1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #pragma ident "%Z%%M% %I% %E% SMI" 27 28 #include <sys/types.h> 29 #include <sys/cmn_err.h> 30 #include <sys/vmem.h> 31 #include <sys/kmem.h> 32 #include <sys/systm.h> 33 #include <sys/machsystm.h> /* for page_freelist_coalesce() */ 34 #include <sys/errno.h> 35 #include <sys/memnode.h> 36 #include <sys/memlist.h> 37 #include <sys/memlist_impl.h> 38 #include <sys/tuneable.h> 39 #include <sys/proc.h> 40 #include <sys/disp.h> 41 #include <sys/debug.h> 42 #include <sys/vm.h> 43 #include <sys/callb.h> 44 #include <sys/memlist_plat.h> /* for installed_top_size() */ 45 #include <sys/condvar_impl.h> /* for CV_HAS_WAITERS() */ 46 #include <sys/dumphdr.h> /* for dump_resize() */ 47 #include <sys/atomic.h> /* for use in stats collection */ 48 #include <sys/rwlock.h> 49 #include <sys/cpuvar.h> 50 #include <vm/seg_kmem.h> 51 #include <vm/seg_kpm.h> 52 #include <vm/page.h> 53 #include <vm/vm_dep.h> 54 #define SUNDDI_IMPL /* so sunddi.h will not redefine splx() et al */ 55 #include <sys/sunddi.h> 56 #include <sys/mem_config.h> 
57 #include <sys/mem_cage.h> 58 #include <sys/lgrp.h> 59 #include <sys/ddi.h> 60 #include <sys/modctl.h> 61 62 extern void memlist_read_lock(void); 63 extern void memlist_read_unlock(void); 64 extern void memlist_write_lock(void); 65 extern void memlist_write_unlock(void); 66 67 extern struct memlist *phys_avail; 68 69 extern void mem_node_add(pfn_t, pfn_t); 70 extern void mem_node_del(pfn_t, pfn_t); 71 72 extern uint_t page_ctrs_adjust(int); 73 static void kphysm_setup_post_add(pgcnt_t); 74 static int kphysm_setup_pre_del(pgcnt_t); 75 static void kphysm_setup_post_del(pgcnt_t, int); 76 77 static int kphysm_split_memseg(pfn_t base, pgcnt_t npgs); 78 79 static int delspan_reserve(pfn_t, pgcnt_t); 80 static void delspan_unreserve(pfn_t, pgcnt_t); 81 82 static kmutex_t memseg_lists_lock; 83 static struct memseg *memseg_va_avail; 84 static struct memseg *memseg_delete_junk; 85 static struct memseg *memseg_edit_junk; 86 void memseg_remap_init(void); 87 static void memseg_remap_to_dummy(caddr_t, pgcnt_t); 88 static void kphysm_addmem_error_undospan(pfn_t, pgcnt_t); 89 static struct memseg *memseg_reuse(pgcnt_t); 90 91 static struct kmem_cache *memseg_cache; 92 93 /* 94 * Add a chunk of memory to the system. page_t's for this memory 95 * are allocated in the first few pages of the chunk. 96 * base: starting PAGESIZE page of new memory. 97 * npgs: length in PAGESIZE pages. 98 * 99 * Adding mem this way doesn't increase the size of the hash tables; 100 * growing them would be too hard. This should be OK, but adding memory 101 * dynamically most likely means more hash misses, since the tables will 102 * be smaller than they otherwise would be. 
103 */ 104 int 105 kphysm_add_memory_dynamic(pfn_t base, pgcnt_t npgs) 106 { 107 page_t *pp; 108 page_t *opp, *oepp; 109 struct memseg *seg; 110 uint64_t avmem; 111 pfn_t pfn; 112 pfn_t pt_base = base; 113 pgcnt_t tpgs = npgs; 114 pgcnt_t metapgs; 115 int exhausted; 116 pfn_t pnum; 117 int mnode; 118 caddr_t vaddr; 119 int reuse; 120 int mlret; 121 void *mapva; 122 pgcnt_t nkpmpgs = 0; 123 offset_t kpm_pages_off; 124 125 cmn_err(CE_CONT, 126 "?kphysm_add_memory_dynamic: adding %ldK at 0x%" PRIx64 "\n", 127 npgs << (PAGESHIFT - 10), (uint64_t)base << PAGESHIFT); 128 129 /* 130 * Add this span in the delete list to prevent interactions. 131 */ 132 if (!delspan_reserve(base, npgs)) { 133 return (KPHYSM_ESPAN); 134 } 135 /* 136 * Check to see if any of the memory span has been added 137 * by trying an add to the installed memory list. This 138 * forms the interlocking process for add. 139 */ 140 141 memlist_write_lock(); 142 143 mlret = memlist_add_span((uint64_t)(pt_base) << PAGESHIFT, 144 (uint64_t)(tpgs) << PAGESHIFT, &phys_install); 145 146 if (mlret == MEML_SPANOP_OK) 147 installed_top_size(phys_install, &physmax, &physinstalled); 148 149 memlist_write_unlock(); 150 151 if (mlret != MEML_SPANOP_OK) { 152 if (mlret == MEML_SPANOP_EALLOC) { 153 delspan_unreserve(pt_base, tpgs); 154 return (KPHYSM_ERESOURCE); 155 } else 156 if (mlret == MEML_SPANOP_ESPAN) { 157 delspan_unreserve(pt_base, tpgs); 158 return (KPHYSM_ESPAN); 159 } else { 160 delspan_unreserve(pt_base, tpgs); 161 return (KPHYSM_ERESOURCE); 162 } 163 } 164 165 /* 166 * We store the page_t's for this new memory in the first 167 * few pages of the chunk. Here, we go and get'em ... 168 */ 169 170 /* 171 * The expression after the '-' gives the number of pages 172 * that will fit in the new memory based on a requirement 173 * of (PAGESIZE + sizeof (page_t)) bytes per page. 
174 */ 175 metapgs = npgs - (((uint64_t)(npgs) << PAGESHIFT) / 176 (PAGESIZE + sizeof (page_t))); 177 178 npgs -= metapgs; 179 base += metapgs; 180 181 ASSERT(btopr(npgs * sizeof (page_t)) <= metapgs); 182 183 exhausted = (metapgs == 0 || npgs == 0); 184 185 if (kpm_enable && !exhausted) { 186 pgcnt_t start, end, nkpmpgs_prelim; 187 size_t ptsz; 188 189 /* 190 * A viable kpm large page mapping must not overlap two 191 * dynamic memsegs. Therefore the total size is checked 192 * to be at least kpm_pgsz and also whether start and end 193 * points are at least kpm_pgsz aligned. 194 */ 195 if (ptokpmp(tpgs) < 1 || pmodkpmp(pt_base) || 196 pmodkpmp(base + npgs)) { 197 198 kphysm_addmem_error_undospan(pt_base, tpgs); 199 200 /* 201 * There is no specific error code for violating 202 * kpm granularity constraints. 203 */ 204 return (KPHYSM_ENOTVIABLE); 205 } 206 207 start = kpmptop(ptokpmp(base)); 208 end = kpmptop(ptokpmp(base + npgs)); 209 nkpmpgs_prelim = ptokpmp(end - start); 210 ptsz = npgs * sizeof (page_t); 211 metapgs = btopr(ptsz + nkpmpgs_prelim * KPMPAGE_T_SZ); 212 exhausted = (tpgs <= metapgs); 213 if (!exhausted) { 214 npgs = tpgs - metapgs; 215 base = pt_base + metapgs; 216 217 /* final nkpmpgs */ 218 start = kpmptop(ptokpmp(base)); 219 nkpmpgs = ptokpmp(end - start); 220 kpm_pages_off = ptsz + 221 (nkpmpgs_prelim - nkpmpgs) * KPMPAGE_T_SZ; 222 } 223 } 224 225 /* 226 * Is memory area supplied too small? 227 */ 228 if (exhausted) { 229 kphysm_addmem_error_undospan(pt_base, tpgs); 230 231 /* 232 * There is no specific error code for 'too small'. 233 */ 234 return (KPHYSM_ERESOURCE); 235 } 236 237 /* 238 * We may re-use a previously allocated VA space for the page_ts 239 * eventually, but we need to initialize and lock the pages first. 240 */ 241 242 /* 243 * Get an address in the kernel address map, map 244 * the page_t pages and see if we can touch them. 
245 */ 246 247 mapva = vmem_alloc(heap_arena, ptob(metapgs), VM_NOSLEEP); 248 if (mapva == NULL) { 249 cmn_err(CE_WARN, "kphysm_add_memory_dynamic:" 250 " Can't allocate VA for page_ts"); 251 252 kphysm_addmem_error_undospan(pt_base, tpgs); 253 254 return (KPHYSM_ERESOURCE); 255 } 256 pp = mapva; 257 258 if (physmax < (pt_base + tpgs)) 259 physmax = (pt_base + tpgs); 260 261 /* 262 * In the remapping code we map one page at a time so we must do 263 * the same here to match mapping sizes. 264 */ 265 pfn = pt_base; 266 vaddr = (caddr_t)pp; 267 for (pnum = 0; pnum < metapgs; pnum++) { 268 hat_devload(kas.a_hat, vaddr, ptob(1), pfn, 269 PROT_READ | PROT_WRITE, 270 HAT_LOAD | HAT_LOAD_LOCK | HAT_LOAD_NOCONSIST); 271 pfn++; 272 vaddr += ptob(1); 273 } 274 275 if (ddi_peek32((dev_info_t *)NULL, 276 (int32_t *)pp, (int32_t *)0) == DDI_FAILURE) { 277 278 cmn_err(CE_PANIC, "kphysm_add_memory_dynamic:" 279 " Can't access pp array at 0x%p [phys 0x%lx]", 280 (void *)pp, pt_base); 281 282 hat_unload(kas.a_hat, (caddr_t)pp, ptob(metapgs), 283 HAT_UNLOAD_UNMAP|HAT_UNLOAD_UNLOCK); 284 285 vmem_free(heap_arena, mapva, ptob(metapgs)); 286 287 kphysm_addmem_error_undospan(pt_base, tpgs); 288 289 return (KPHYSM_EFAULT); 290 } 291 292 /* 293 * Add this memory slice to its memory node translation. 294 * 295 * Note that right now, each node may have only one slice; 296 * this may change with COD or in larger SSM systems with 297 * nested latency groups, so we must not assume that the 298 * node does not yet exist. 299 */ 300 pnum = base + npgs - 1; 301 mem_node_add_slice(base, pnum); 302 303 /* 304 * Allocate or resize page counters as necessary to accomodate 305 * the increase in memory pages. 
306 */ 307 mnode = PFN_2_MEM_NODE(pnum); 308 if (page_ctrs_adjust(mnode) != 0) { 309 310 mem_node_pre_del_slice(base, pnum); 311 mem_node_post_del_slice(base, pnum, 0); 312 313 hat_unload(kas.a_hat, (caddr_t)pp, ptob(metapgs), 314 HAT_UNLOAD_UNMAP|HAT_UNLOAD_UNLOCK); 315 316 vmem_free(heap_arena, mapva, ptob(metapgs)); 317 318 kphysm_addmem_error_undospan(pt_base, tpgs); 319 320 return (KPHYSM_ERESOURCE); 321 } 322 323 /* 324 * Update the phys_avail memory list. 325 * The phys_install list was done at the start. 326 */ 327 328 memlist_write_lock(); 329 330 mlret = memlist_add_span((uint64_t)(base) << PAGESHIFT, 331 (uint64_t)(npgs) << PAGESHIFT, &phys_avail); 332 ASSERT(mlret == MEML_SPANOP_OK); 333 334 memlist_write_unlock(); 335 336 /* See if we can find a memseg to re-use. */ 337 seg = memseg_reuse(metapgs); 338 339 reuse = (seg != NULL); 340 341 /* 342 * Initialize the memseg structure representing this memory 343 * and add it to the existing list of memsegs. Do some basic 344 * initialization and add the memory to the system. 345 * In order to prevent lock deadlocks, the add_physmem() 346 * code is repeated here, but split into several stages. 347 */ 348 if (seg == NULL) { 349 seg = kmem_cache_alloc(memseg_cache, KM_SLEEP); 350 bzero(seg, sizeof (struct memseg)); 351 seg->msegflags = MEMSEG_DYNAMIC; 352 seg->pages = pp; 353 } else { 354 /*EMPTY*/ 355 ASSERT(seg->msegflags & MEMSEG_DYNAMIC); 356 } 357 358 seg->epages = seg->pages + npgs; 359 seg->pages_base = base; 360 seg->pages_end = base + npgs; 361 362 /* 363 * Initialize metadata. The page_ts are set to locked state 364 * ready to be freed. 365 */ 366 bzero((caddr_t)pp, ptob(metapgs)); 367 368 pfn = seg->pages_base; 369 /* Save the original pp base in case we reuse a memseg. 
*/ 370 opp = pp; 371 oepp = opp + npgs; 372 for (pp = opp; pp < oepp; pp++) { 373 pp->p_pagenum = pfn; 374 pfn++; 375 page_iolock_init(pp); 376 while (!page_lock(pp, SE_EXCL, (kmutex_t *)NULL, P_RECLAIM)) 377 continue; 378 pp->p_offset = (u_offset_t)-1; 379 } 380 381 if (reuse) { 382 /* Remap our page_ts to the re-used memseg VA space. */ 383 pfn = pt_base; 384 vaddr = (caddr_t)seg->pages; 385 for (pnum = 0; pnum < metapgs; pnum++) { 386 hat_devload(kas.a_hat, vaddr, ptob(1), pfn, 387 PROT_READ | PROT_WRITE, 388 HAT_LOAD_REMAP | HAT_LOAD | HAT_LOAD_NOCONSIST); 389 pfn++; 390 vaddr += ptob(1); 391 } 392 393 hat_unload(kas.a_hat, (caddr_t)opp, ptob(metapgs), 394 HAT_UNLOAD_UNMAP|HAT_UNLOAD_UNLOCK); 395 396 vmem_free(heap_arena, mapva, ptob(metapgs)); 397 } 398 399 hat_kpm_addmem_mseg_update(seg, nkpmpgs, kpm_pages_off); 400 401 memsegs_lock(1); 402 403 /* 404 * The new memseg is inserted at the beginning of the list. 405 * Not only does this save searching for the tail, but in the 406 * case of a re-used memseg, it solves the problem of what 407 * happens of some process has still got a pointer to the 408 * memseg and follows the next pointer to continue traversing 409 * the memsegs list. 410 */ 411 412 hat_kpm_addmem_mseg_insert(seg); 413 414 seg->next = memsegs; 415 membar_producer(); 416 417 hat_kpm_addmem_memsegs_update(seg); 418 419 memsegs = seg; 420 421 build_pfn_hash(); 422 423 total_pages += npgs; 424 425 /* 426 * Recalculate the paging parameters now total_pages has changed. 427 * This will also cause the clock hands to be reset before next use. 428 */ 429 setupclock(1); 430 431 memsegs_unlock(1); 432 433 PLCNT_MODIFY_MAX(seg->pages_base, (long)npgs); 434 435 /* 436 * Free the pages outside the lock to avoid locking loops. 437 */ 438 for (pp = seg->pages; pp < seg->epages; pp++) { 439 page_free(pp, 1); 440 } 441 442 /* 443 * Now that we've updated the appropriate memory lists we 444 * need to reset a number of globals, since we've increased memory. 
445 * Several have already been updated for us as noted above. The 446 * globals we're interested in at this point are: 447 * physmax - highest page frame number. 448 * physinstalled - number of pages currently installed (done earlier) 449 * maxmem - max free pages in the system 450 * physmem - physical memory pages available 451 * availrmem - real memory available 452 */ 453 454 mutex_enter(&freemem_lock); 455 maxmem += npgs; 456 physmem += npgs; 457 availrmem += npgs; 458 availrmem_initial += npgs; 459 460 mutex_exit(&freemem_lock); 461 462 dump_resize(); 463 464 page_freelist_coalesce_all(mnode); 465 466 kphysm_setup_post_add(npgs); 467 468 cmn_err(CE_CONT, "?kphysm_add_memory_dynamic: mem = %ldK " 469 "(0x%" PRIx64 ")\n", 470 physinstalled << (PAGESHIFT - 10), 471 (uint64_t)physinstalled << PAGESHIFT); 472 473 avmem = (uint64_t)freemem << PAGESHIFT; 474 cmn_err(CE_CONT, "?kphysm_add_memory_dynamic: " 475 "avail mem = %" PRId64 "\n", avmem); 476 477 /* 478 * Update lgroup generation number on single lgroup systems 479 */ 480 if (nlgrps == 1) 481 lgrp_config(LGRP_CONFIG_GEN_UPDATE, 0, 0); 482 483 delspan_unreserve(pt_base, tpgs); 484 return (KPHYSM_OK); /* Successfully added system memory */ 485 486 } 487 488 /* 489 * There are various error conditions in kphysm_add_memory_dynamic() 490 * which require a rollback of already changed global state. 491 */ 492 static void 493 kphysm_addmem_error_undospan(pfn_t pt_base, pgcnt_t tpgs) 494 { 495 int mlret; 496 497 /* Unreserve memory span. */ 498 memlist_write_lock(); 499 500 mlret = memlist_delete_span( 501 (uint64_t)(pt_base) << PAGESHIFT, 502 (uint64_t)(tpgs) << PAGESHIFT, &phys_install); 503 504 ASSERT(mlret == MEML_SPANOP_OK); 505 phys_install_has_changed(); 506 installed_top_size(phys_install, &physmax, &physinstalled); 507 508 memlist_write_unlock(); 509 delspan_unreserve(pt_base, tpgs); 510 } 511 512 /* 513 * Only return an available memseg of exactly the right size. 
514 * When the meta data area has it's own virtual address space 515 * we will need to manage this more carefully and do best fit 516 * allocations, possibly splitting an availble area. 517 */ 518 static struct memseg * 519 memseg_reuse(pgcnt_t metapgs) 520 { 521 struct memseg **segpp, *seg; 522 523 mutex_enter(&memseg_lists_lock); 524 525 segpp = &memseg_va_avail; 526 for (; (seg = *segpp) != NULL; segpp = &seg->lnext) { 527 caddr_t end; 528 529 if (kpm_enable) 530 end = hat_kpm_mseg_reuse(seg); 531 else 532 end = (caddr_t)seg->epages; 533 534 if (btopr(end - (caddr_t)seg->pages) == metapgs) { 535 *segpp = seg->lnext; 536 seg->lnext = NULL; 537 break; 538 } 539 } 540 mutex_exit(&memseg_lists_lock); 541 542 return (seg); 543 } 544 545 static uint_t handle_gen; 546 547 struct memdelspan { 548 struct memdelspan *mds_next; 549 pfn_t mds_base; 550 pgcnt_t mds_npgs; 551 uint_t *mds_bitmap; 552 uint_t *mds_bitmap_retired; 553 }; 554 555 #define NBPBMW (sizeof (uint_t) * NBBY) 556 #define MDS_BITMAPBYTES(MDSP) \ 557 ((((MDSP)->mds_npgs + NBPBMW - 1) / NBPBMW) * sizeof (uint_t)) 558 559 struct transit_list { 560 struct transit_list *trl_next; 561 struct memdelspan *trl_spans; 562 int trl_collect; 563 }; 564 565 struct transit_list_head { 566 kmutex_t trh_lock; 567 struct transit_list *trh_head; 568 }; 569 570 static struct transit_list_head transit_list_head; 571 572 struct mem_handle; 573 static void transit_list_collect(struct mem_handle *, int); 574 static void transit_list_insert(struct transit_list *); 575 static void transit_list_remove(struct transit_list *); 576 577 #ifdef DEBUG 578 #define MEM_DEL_STATS 579 #endif /* DEBUG */ 580 581 #ifdef MEM_DEL_STATS 582 static int mem_del_stat_print = 0; 583 struct mem_del_stat { 584 uint_t nloop; 585 uint_t need_free; 586 uint_t free_loop; 587 uint_t free_low; 588 uint_t free_failed; 589 uint_t ncheck; 590 uint_t nopaget; 591 uint_t lockfail; 592 uint_t nfree; 593 uint_t nreloc; 594 uint_t nrelocfail; 595 uint_t 
already_done; 596 uint_t first_notfree; 597 uint_t npplocked; 598 uint_t nlockreloc; 599 uint_t nnorepl; 600 uint_t nmodreloc; 601 uint_t ndestroy; 602 uint_t nputpage; 603 uint_t nnoreclaim; 604 uint_t ndelay; 605 uint_t demotefail; 606 uint64_t nticks_total; 607 uint64_t nticks_pgrp; 608 uint_t retired; 609 uint_t toxic; 610 uint_t failing; 611 uint_t modtoxic; 612 uint_t npplkdtoxic; 613 uint_t gptlmodfail; 614 uint_t gptllckfail; 615 }; 616 /* 617 * The stat values are only incremented in the delete thread 618 * so no locking or atomic required. 619 */ 620 #define MDSTAT_INCR(MHP, FLD) (MHP)->mh_delstat.FLD++ 621 #define MDSTAT_TOTAL(MHP, ntck) ((MHP)->mh_delstat.nticks_total += (ntck)) 622 #define MDSTAT_PGRP(MHP, ntck) ((MHP)->mh_delstat.nticks_pgrp += (ntck)) 623 static void mem_del_stat_print_func(struct mem_handle *); 624 #define MDSTAT_PRINT(MHP) mem_del_stat_print_func((MHP)) 625 #else /* MEM_DEL_STATS */ 626 #define MDSTAT_INCR(MHP, FLD) 627 #define MDSTAT_TOTAL(MHP, ntck) 628 #define MDSTAT_PGRP(MHP, ntck) 629 #define MDSTAT_PRINT(MHP) 630 #endif /* MEM_DEL_STATS */ 631 632 typedef enum mhnd_state {MHND_FREE = 0, MHND_INIT, MHND_STARTING, 633 MHND_RUNNING, MHND_DONE, MHND_RELEASE} mhnd_state_t; 634 635 /* 636 * mh_mutex must be taken to examine or change mh_exthandle and mh_state. 637 * The mutex may not be required for other fields, dependent on mh_state. 
638 */ 639 struct mem_handle { 640 kmutex_t mh_mutex; 641 struct mem_handle *mh_next; 642 memhandle_t mh_exthandle; 643 mhnd_state_t mh_state; 644 struct transit_list mh_transit; 645 pgcnt_t mh_phys_pages; 646 pgcnt_t mh_vm_pages; 647 pgcnt_t mh_hold_todo; 648 void (*mh_delete_complete)(void *, int error); 649 void *mh_delete_complete_arg; 650 volatile uint_t mh_cancel; 651 volatile uint_t mh_dr_aio_cleanup_cancel; 652 volatile uint_t mh_aio_cleanup_done; 653 kcondvar_t mh_cv; 654 kthread_id_t mh_thread_id; 655 page_t *mh_deleted; /* link through p_next */ 656 #ifdef MEM_DEL_STATS 657 struct mem_del_stat mh_delstat; 658 #endif /* MEM_DEL_STATS */ 659 }; 660 661 static struct mem_handle *mem_handle_head; 662 static kmutex_t mem_handle_list_mutex; 663 664 static struct mem_handle * 665 kphysm_allocate_mem_handle() 666 { 667 struct mem_handle *mhp; 668 669 mhp = kmem_zalloc(sizeof (struct mem_handle), KM_SLEEP); 670 mutex_init(&mhp->mh_mutex, NULL, MUTEX_DEFAULT, NULL); 671 mutex_enter(&mem_handle_list_mutex); 672 mutex_enter(&mhp->mh_mutex); 673 /* handle_gen is protected by list mutex. */ 674 mhp->mh_exthandle = (memhandle_t)(uintptr_t)(++handle_gen); 675 mhp->mh_next = mem_handle_head; 676 mem_handle_head = mhp; 677 mutex_exit(&mem_handle_list_mutex); 678 679 return (mhp); 680 } 681 682 static void 683 kphysm_free_mem_handle(struct mem_handle *mhp) 684 { 685 struct mem_handle **mhpp; 686 687 ASSERT(mutex_owned(&mhp->mh_mutex)); 688 ASSERT(mhp->mh_state == MHND_FREE); 689 /* 690 * Exit the mutex to preserve locking order. This is OK 691 * here as once in the FREE state, the handle cannot 692 * be found by a lookup. 693 */ 694 mutex_exit(&mhp->mh_mutex); 695 696 mutex_enter(&mem_handle_list_mutex); 697 mhpp = &mem_handle_head; 698 while (*mhpp != NULL && *mhpp != mhp) 699 mhpp = &(*mhpp)->mh_next; 700 ASSERT(*mhpp == mhp); 701 /* 702 * No need to lock the handle (mh_mutex) as only 703 * mh_next changing and this is the only thread that 704 * can be referncing mhp. 
705 */ 706 *mhpp = mhp->mh_next; 707 mutex_exit(&mem_handle_list_mutex); 708 709 mutex_destroy(&mhp->mh_mutex); 710 kmem_free(mhp, sizeof (struct mem_handle)); 711 } 712 713 /* 714 * This function finds the internal mem_handle corresponding to an 715 * external handle and returns it with the mh_mutex held. 716 */ 717 static struct mem_handle * 718 kphysm_lookup_mem_handle(memhandle_t handle) 719 { 720 struct mem_handle *mhp; 721 722 mutex_enter(&mem_handle_list_mutex); 723 for (mhp = mem_handle_head; mhp != NULL; mhp = mhp->mh_next) { 724 if (mhp->mh_exthandle == handle) { 725 mutex_enter(&mhp->mh_mutex); 726 /* 727 * The state of the handle could have been changed 728 * by kphysm_del_release() while waiting for mh_mutex. 729 */ 730 if (mhp->mh_state == MHND_FREE) { 731 mutex_exit(&mhp->mh_mutex); 732 continue; 733 } 734 break; 735 } 736 } 737 mutex_exit(&mem_handle_list_mutex); 738 return (mhp); 739 } 740 741 int 742 kphysm_del_gethandle(memhandle_t *xmhp) 743 { 744 struct mem_handle *mhp; 745 746 mhp = kphysm_allocate_mem_handle(); 747 /* 748 * The handle is allocated using KM_SLEEP, so cannot fail. 749 * If the implementation is changed, the correct error to return 750 * here would be KPHYSM_ENOHANDLES. 
751 */ 752 ASSERT(mhp->mh_state == MHND_FREE); 753 mhp->mh_state = MHND_INIT; 754 *xmhp = mhp->mh_exthandle; 755 mutex_exit(&mhp->mh_mutex); 756 return (KPHYSM_OK); 757 } 758 759 static int 760 overlapping(pfn_t b1, pgcnt_t l1, pfn_t b2, pgcnt_t l2) 761 { 762 pfn_t e1, e2; 763 764 e1 = b1 + l1; 765 e2 = b2 + l2; 766 767 return (!(b2 >= e1 || b1 >= e2)); 768 } 769 770 static int can_remove_pgs(pgcnt_t); 771 772 static struct memdelspan * 773 span_to_install(pfn_t base, pgcnt_t npgs) 774 { 775 struct memdelspan *mdsp; 776 struct memdelspan *mdsp_new; 777 uint64_t address, size, thislen; 778 struct memlist *mlp; 779 780 mdsp_new = NULL; 781 782 address = (uint64_t)base << PAGESHIFT; 783 size = (uint64_t)npgs << PAGESHIFT; 784 while (size != 0) { 785 memlist_read_lock(); 786 for (mlp = phys_install; mlp != NULL; mlp = mlp->next) { 787 if (address >= (mlp->address + mlp->size)) 788 continue; 789 if ((address + size) > mlp->address) 790 break; 791 } 792 if (mlp == NULL) { 793 address += size; 794 size = 0; 795 thislen = 0; 796 } else { 797 if (address < mlp->address) { 798 size -= (mlp->address - address); 799 address = mlp->address; 800 } 801 ASSERT(address >= mlp->address); 802 if ((address + size) > (mlp->address + mlp->size)) { 803 thislen = mlp->size - (address - mlp->address); 804 } else { 805 thislen = size; 806 } 807 } 808 memlist_read_unlock(); 809 /* TODO: phys_install could change now */ 810 if (thislen == 0) 811 continue; 812 mdsp = kmem_zalloc(sizeof (struct memdelspan), KM_SLEEP); 813 mdsp->mds_base = btop(address); 814 mdsp->mds_npgs = btop(thislen); 815 mdsp->mds_next = mdsp_new; 816 mdsp_new = mdsp; 817 address += thislen; 818 size -= thislen; 819 } 820 return (mdsp_new); 821 } 822 823 static void 824 free_delspans(struct memdelspan *mdsp) 825 { 826 struct memdelspan *amdsp; 827 828 while ((amdsp = mdsp) != NULL) { 829 mdsp = amdsp->mds_next; 830 kmem_free(amdsp, sizeof (struct memdelspan)); 831 } 832 } 833 834 /* 835 * Concatenate lists. 
No list ordering is required. 836 */ 837 838 static void 839 delspan_concat(struct memdelspan **mdspp, struct memdelspan *mdsp) 840 { 841 while (*mdspp != NULL) 842 mdspp = &(*mdspp)->mds_next; 843 844 *mdspp = mdsp; 845 } 846 847 /* 848 * Given a new list of delspans, check there is no overlap with 849 * all existing span activity (add or delete) and then concatenate 850 * the new spans to the given list. 851 * Return 1 for OK, 0 if overlapping. 852 */ 853 static int 854 delspan_insert( 855 struct transit_list *my_tlp, 856 struct memdelspan *mdsp_new) 857 { 858 struct transit_list_head *trh; 859 struct transit_list *tlp; 860 int ret; 861 862 trh = &transit_list_head; 863 864 ASSERT(my_tlp != NULL); 865 ASSERT(mdsp_new != NULL); 866 867 ret = 1; 868 mutex_enter(&trh->trh_lock); 869 /* ASSERT(my_tlp->trl_spans == NULL || tlp_in_list(trh, my_tlp)); */ 870 for (tlp = trh->trh_head; tlp != NULL; tlp = tlp->trl_next) { 871 struct memdelspan *mdsp; 872 873 for (mdsp = tlp->trl_spans; mdsp != NULL; 874 mdsp = mdsp->mds_next) { 875 struct memdelspan *nmdsp; 876 877 for (nmdsp = mdsp_new; nmdsp != NULL; 878 nmdsp = nmdsp->mds_next) { 879 if (overlapping(mdsp->mds_base, mdsp->mds_npgs, 880 nmdsp->mds_base, nmdsp->mds_npgs)) { 881 ret = 0; 882 goto done; 883 } 884 } 885 } 886 } 887 done: 888 if (ret != 0) { 889 if (my_tlp->trl_spans == NULL) 890 transit_list_insert(my_tlp); 891 delspan_concat(&my_tlp->trl_spans, mdsp_new); 892 } 893 mutex_exit(&trh->trh_lock); 894 return (ret); 895 } 896 897 static void 898 delspan_remove( 899 struct transit_list *my_tlp, 900 pfn_t base, 901 pgcnt_t npgs) 902 { 903 struct transit_list_head *trh; 904 struct memdelspan *mdsp; 905 906 trh = &transit_list_head; 907 908 ASSERT(my_tlp != NULL); 909 910 mutex_enter(&trh->trh_lock); 911 if ((mdsp = my_tlp->trl_spans) != NULL) { 912 if (npgs == 0) { 913 my_tlp->trl_spans = NULL; 914 free_delspans(mdsp); 915 transit_list_remove(my_tlp); 916 } else { 917 struct memdelspan **prv; 918 919 prv = 
&my_tlp->trl_spans; 920 while (mdsp != NULL) { 921 pfn_t p_end; 922 923 p_end = mdsp->mds_base + mdsp->mds_npgs; 924 if (mdsp->mds_base >= base && 925 p_end <= (base + npgs)) { 926 *prv = mdsp->mds_next; 927 mdsp->mds_next = NULL; 928 free_delspans(mdsp); 929 } else { 930 prv = &mdsp->mds_next; 931 } 932 mdsp = *prv; 933 } 934 if (my_tlp->trl_spans == NULL) 935 transit_list_remove(my_tlp); 936 } 937 } 938 mutex_exit(&trh->trh_lock); 939 } 940 941 /* 942 * Reserve interface for add to stop delete before add finished. 943 * This list is only accessed through the delspan_insert/remove 944 * functions and so is fully protected by the mutex in struct transit_list. 945 */ 946 947 static struct transit_list reserve_transit; 948 949 static int 950 delspan_reserve(pfn_t base, pgcnt_t npgs) 951 { 952 struct memdelspan *mdsp; 953 int ret; 954 955 mdsp = kmem_zalloc(sizeof (struct memdelspan), KM_SLEEP); 956 mdsp->mds_base = base; 957 mdsp->mds_npgs = npgs; 958 if ((ret = delspan_insert(&reserve_transit, mdsp)) == 0) { 959 free_delspans(mdsp); 960 } 961 return (ret); 962 } 963 964 static void 965 delspan_unreserve(pfn_t base, pgcnt_t npgs) 966 { 967 delspan_remove(&reserve_transit, base, npgs); 968 } 969 970 /* 971 * Return whether memseg was created by kphysm_add_memory_dynamic(). 972 * If this is the case and startp non zero, return also the start pfn 973 * of the meta data via startp. 
974 */ 975 static int 976 memseg_is_dynamic(struct memseg *seg, pfn_t *startp) 977 { 978 pfn_t pt_start; 979 980 if ((seg->msegflags & MEMSEG_DYNAMIC) == 0) 981 return (0); 982 983 /* Meta data is required to be at the beginning */ 984 ASSERT(hat_getpfnum(kas.a_hat, (caddr_t)seg->epages) < seg->pages_base); 985 986 pt_start = hat_getpfnum(kas.a_hat, (caddr_t)seg->pages); 987 if (startp != NULL) 988 *startp = pt_start; 989 990 return (1); 991 } 992 993 int 994 kphysm_del_span( 995 memhandle_t handle, 996 pfn_t base, 997 pgcnt_t npgs) 998 { 999 struct mem_handle *mhp; 1000 struct memseg *seg; 1001 struct memdelspan *mdsp; 1002 struct memdelspan *mdsp_new; 1003 pgcnt_t phys_pages, vm_pages; 1004 pfn_t p_end; 1005 page_t *pp; 1006 int ret; 1007 1008 mhp = kphysm_lookup_mem_handle(handle); 1009 if (mhp == NULL) { 1010 return (KPHYSM_EHANDLE); 1011 } 1012 if (mhp->mh_state != MHND_INIT) { 1013 mutex_exit(&mhp->mh_mutex); 1014 return (KPHYSM_ESEQUENCE); 1015 } 1016 1017 /* 1018 * Intersect the span with the installed memory list (phys_install). 1019 */ 1020 mdsp_new = span_to_install(base, npgs); 1021 if (mdsp_new == NULL) { 1022 /* 1023 * No physical memory in this range. Is this an 1024 * error? If an attempt to start the delete is made 1025 * for OK returns from del_span such as this, start will 1026 * return an error. 1027 * Could return KPHYSM_ENOWORK. 1028 */ 1029 /* 1030 * It is assumed that there are no error returns 1031 * from span_to_install() due to kmem_alloc failure. 1032 */ 1033 mutex_exit(&mhp->mh_mutex); 1034 return (KPHYSM_OK); 1035 } 1036 /* 1037 * Does this span overlap an existing span? 1038 */ 1039 if (delspan_insert(&mhp->mh_transit, mdsp_new) == 0) { 1040 /* 1041 * Differentiate between already on list for this handle 1042 * (KPHYSM_EDUP) and busy elsewhere (KPHYSM_EBUSY). 
1043 */ 1044 ret = KPHYSM_EBUSY; 1045 for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL; 1046 mdsp = mdsp->mds_next) { 1047 if (overlapping(mdsp->mds_base, mdsp->mds_npgs, 1048 base, npgs)) { 1049 ret = KPHYSM_EDUP; 1050 break; 1051 } 1052 } 1053 mutex_exit(&mhp->mh_mutex); 1054 free_delspans(mdsp_new); 1055 return (ret); 1056 } 1057 /* 1058 * At this point the spans in mdsp_new have been inserted into the 1059 * list of spans for this handle and thereby to the global list of 1060 * spans being processed. Each of these spans must now be checked 1061 * for relocatability. As a side-effect segments in the memseg list 1062 * may be split. 1063 * 1064 * Note that mdsp_new can no longer be used as it is now part of 1065 * a larger list. Select elements of this larger list based 1066 * on base and npgs. 1067 */ 1068 restart: 1069 phys_pages = 0; 1070 vm_pages = 0; 1071 ret = KPHYSM_OK; 1072 for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL; 1073 mdsp = mdsp->mds_next) { 1074 pgcnt_t pages_checked; 1075 1076 if (!overlapping(mdsp->mds_base, mdsp->mds_npgs, base, npgs)) { 1077 continue; 1078 } 1079 p_end = mdsp->mds_base + mdsp->mds_npgs; 1080 /* 1081 * The pages_checked count is a hack. All pages should be 1082 * checked for relocatability. Those not covered by memsegs 1083 * should be tested with arch_kphysm_del_span_ok(). 1084 */ 1085 pages_checked = 0; 1086 for (seg = memsegs; seg; seg = seg->next) { 1087 pfn_t mseg_start; 1088 1089 if (seg->pages_base >= p_end || 1090 seg->pages_end <= mdsp->mds_base) { 1091 /* Span and memseg don't overlap. */ 1092 continue; 1093 } 1094 /* Check that segment is suitable for delete. */ 1095 if (memseg_is_dynamic(seg, &mseg_start)) { 1096 /* 1097 * Can only delete whole added segments 1098 * for the moment. 1099 * Check that this is completely within the 1100 * span. 
1101 */ 1102 if (mseg_start < mdsp->mds_base || 1103 seg->pages_end > p_end) { 1104 ret = KPHYSM_EBUSY; 1105 break; 1106 } 1107 pages_checked += seg->pages_end - mseg_start; 1108 } else { 1109 /* 1110 * Set mseg_start for accounting below. 1111 */ 1112 mseg_start = seg->pages_base; 1113 /* 1114 * If this segment is larger than the span, 1115 * try to split it. After the split, it 1116 * is necessary to restart. 1117 */ 1118 if (seg->pages_base < mdsp->mds_base || 1119 seg->pages_end > p_end) { 1120 pfn_t abase; 1121 pgcnt_t anpgs; 1122 int s_ret; 1123 1124 /* Split required. */ 1125 if (mdsp->mds_base < seg->pages_base) 1126 abase = seg->pages_base; 1127 else 1128 abase = mdsp->mds_base; 1129 if (p_end > seg->pages_end) 1130 anpgs = seg->pages_end - abase; 1131 else 1132 anpgs = p_end - abase; 1133 s_ret = kphysm_split_memseg(abase, 1134 anpgs); 1135 if (s_ret == 0) { 1136 /* Split failed. */ 1137 ret = KPHYSM_ERESOURCE; 1138 break; 1139 } 1140 goto restart; 1141 } 1142 pages_checked += 1143 seg->pages_end - seg->pages_base; 1144 } 1145 /* 1146 * The memseg is wholly within the delete span. 1147 * The individual pages can now be checked. 1148 */ 1149 /* Cage test. */ 1150 for (pp = seg->pages; pp < seg->epages; pp++) { 1151 if (PP_ISNORELOC(pp)) { 1152 ret = KPHYSM_ENONRELOC; 1153 break; 1154 } 1155 } 1156 if (ret != KPHYSM_OK) { 1157 break; 1158 } 1159 phys_pages += (seg->pages_end - mseg_start); 1160 vm_pages += MSEG_NPAGES(seg); 1161 } 1162 if (ret != KPHYSM_OK) 1163 break; 1164 if (pages_checked != mdsp->mds_npgs) { 1165 ret = KPHYSM_ENONRELOC; 1166 break; 1167 } 1168 } 1169 1170 if (ret == KPHYSM_OK) { 1171 mhp->mh_phys_pages += phys_pages; 1172 mhp->mh_vm_pages += vm_pages; 1173 } else { 1174 /* 1175 * Keep holding the mh_mutex to prevent it going away. 
*/
		delspan_remove(&mhp->mh_transit, base, npgs);
	}
	mutex_exit(&mhp->mh_mutex);
	return (ret);
}

/*
 * Report how deletable the pfn range [base, base + npgs) is, without
 * registering the range against any handle.
 *
 * The range is first converted into a private list of spans with
 * span_to_install(); that list is freed again before returning.
 * Results are accumulated into *mqp, all of whose fields are reset on
 * entry:
 *	phys_pages		sum of mds_npgs over the spans
 *	managed			pages visited through a memseg's page_t array
 *	nonrelocatable		pages rejected by arch_kphysm_del_span_ok()
 *				(for gaps not covered by a memseg) or having
 *				PP_ISNORELOC() set (for memseg pages)
 *	first_nonrelocatable	pfn of the first such page found
 *	last_nonrelocatable	pfn of the last such page found
 *
 * Always returns KPHYSM_OK.
 */
int
kphysm_del_span_query(
	pfn_t base,
	pgcnt_t npgs,
	memquery_t *mqp)
{
	struct memdelspan *mdsp;
	struct memdelspan *mdsp_new;
	int done_first_nonreloc;	/* first_nonrelocatable recorded? */

	mqp->phys_pages = 0;
	mqp->managed = 0;
	mqp->nonrelocatable = 0;
	mqp->first_nonrelocatable = 0;
	mqp->last_nonrelocatable = 0;

	mdsp_new = span_to_install(base, npgs);
	/*
	 * It is OK to proceed here if mdsp_new == NULL.
	 */
	done_first_nonreloc = 0;
	for (mdsp = mdsp_new; mdsp != NULL; mdsp = mdsp->mds_next) {
		pfn_t sbase;	/* next pfn still to be accounted for */
		pgcnt_t snpgs;	/* pages of this span still to account for */

		mqp->phys_pages += mdsp->mds_npgs;
		sbase = mdsp->mds_base;
		snpgs = mdsp->mds_npgs;
		/*
		 * Each pass consumes an optional gap followed by (part of)
		 * one memseg, advancing sbase and shrinking snpgs together.
		 */
		while (snpgs != 0) {
			struct memseg *lseg, *seg;
			pfn_t p_end;
			page_t *pp;
			pfn_t mseg_start;

			p_end = sbase + snpgs;
			/*
			 * Find the lowest addressed memseg that starts
			 * after sbase and account for it.
			 * This is to catch dynamic memsegs whose start
			 * is hidden.
			 */
			seg = NULL;
			for (lseg = memsegs; lseg != NULL; lseg = lseg->next) {
				if ((lseg->pages_base >= sbase) ||
				    (lseg->pages_base < p_end &&
				    lseg->pages_end > sbase)) {
					if (seg == NULL ||
					    seg->pages_base > lseg->pages_base)
						seg = lseg;
				}
			}
			if (seg != NULL) {
				/*
				 * For a non-dynamic memseg the extent
				 * starts at pages_base.
				 */
				if (!memseg_is_dynamic(seg, &mseg_start)) {
					mseg_start = seg->pages_base;
				}
				/*
				 * Now have the full extent of the memseg so
				 * do the range check.
				 */
				if (mseg_start >= p_end ||
				    seg->pages_end <= sbase) {
					/* Span does not overlap memseg. */
					seg = NULL;
				}
			}
			/*
			 * Account for gap either before the segment if
			 * there is one or to the end of the span.
			 */
			if (seg == NULL || mseg_start > sbase) {
				pfn_t a_end;

				a_end = (seg == NULL) ? p_end : mseg_start;
				/*
				 * Check with arch layer for relocatability.
				 */
				if (arch_kphysm_del_span_ok(sbase,
				    (a_end - sbase))) {
					/*
					 * No non-relocatable pages in this
					 * area, avoid the fine-grained
					 * test.
					 */
					snpgs -= (a_end - sbase);
					sbase = a_end;
				}
				/*
				 * Only iterates when the bulk test above
				 * failed: probe the gap a page at a time.
				 */
				while (sbase < a_end) {
					if (!arch_kphysm_del_span_ok(sbase,
					    1)) {
						mqp->nonrelocatable++;
						if (!done_first_nonreloc) {
							mqp->
							    first_nonrelocatable
							    = sbase;
							done_first_nonreloc = 1;
						}
						mqp->last_nonrelocatable =
						    sbase;
					}
					sbase++;
					snpgs--;
				}
			}
			if (seg != NULL) {
				ASSERT(mseg_start <= sbase);
				if (seg->pages_base != mseg_start &&
				    seg->pages_base > sbase) {
					pgcnt_t skip_pgs;

					/*
					 * Skip the page_t area of a
					 * dynamic memseg.
					 */
					skip_pgs = seg->pages_base - sbase;
					if (snpgs <= skip_pgs) {
						/* Span ends in that area. */
						sbase += snpgs;
						snpgs = 0;
						continue;
					}
					snpgs -= skip_pgs;
					sbase += skip_pgs;
				}
				ASSERT(snpgs != 0);
				ASSERT(seg->pages_base <= sbase);
				/*
				 * The individual pages can now be checked.
				 */
				for (pp = seg->pages +
				    (sbase - seg->pages_base);
				    snpgs != 0 && pp < seg->epages; pp++) {
					mqp->managed++;
					if (PP_ISNORELOC(pp)) {
						mqp->nonrelocatable++;
						if (!done_first_nonreloc) {
							mqp->
							    first_nonrelocatable
							    = sbase;
							done_first_nonreloc = 1;
						}
						mqp->last_nonrelocatable =
						    sbase;
					}
					sbase++;
					snpgs--;
				}
			}
		}
	}

	free_delspans(mdsp_new);

	return (KPHYSM_OK);
}

/*
 * This release function can be called at any stage as follows:
 *	_gethandle only called
 *	_span(s) only called
 *	_start called but failed
 *	delete thread exited
 *
 * Returns KPHYSM_EHANDLE for an unknown, free or corrupt handle,
 * KPHYSM_ENOTFINISHED while the delete is starting or running,
 * KPHYSM_ESEQUENCE if a release is already in progress, and KPHYSM_OK
 * once the handle has been reset and freed.
 */
int
kphysm_del_release(memhandle_t handle)
{
	struct mem_handle *mhp;

	/*
	 * NOTE(review): every path after a successful lookup drops
	 * mh_mutex, so the lookup apparently returns with it held.
	 */
	mhp = kphysm_lookup_mem_handle(handle);
	if (mhp == NULL) {
		return (KPHYSM_EHANDLE);
	}
	switch (mhp->mh_state) {
	case MHND_STARTING:
	case MHND_RUNNING:
		mutex_exit(&mhp->mh_mutex);
		return (KPHYSM_ENOTFINISHED);
	case MHND_FREE:
		ASSERT(mhp->mh_state != MHND_FREE);
		mutex_exit(&mhp->mh_mutex);
		return (KPHYSM_EHANDLE);
	case MHND_INIT:
		break;
	case MHND_DONE:
		break;
	case MHND_RELEASE:
		mutex_exit(&mhp->mh_mutex);
		return (KPHYSM_ESEQUENCE);
	default:
#ifdef DEBUG
		cmn_err(CE_WARN, "kphysm_del_release(0x%p) state corrupt %d",
		    (void *)mhp, mhp->mh_state);
#endif /* DEBUG */
		mutex_exit(&mhp->mh_mutex);
		return (KPHYSM_EHANDLE);
	}
	/*
	 * Set state so that we can wait if necessary.
	 * Also this means that we have read/write access to all
	 * fields except mh_exthandle and mh_state.
	 */
	mhp->mh_state = MHND_RELEASE;
	/*
	 * The mem_handle cannot be de-allocated by any other operation
	 * now, so no need to hold mh_mutex.
1386 */ 1387 mutex_exit(&mhp->mh_mutex); 1388 1389 delspan_remove(&mhp->mh_transit, 0, 0); 1390 mhp->mh_phys_pages = 0; 1391 mhp->mh_vm_pages = 0; 1392 mhp->mh_hold_todo = 0; 1393 mhp->mh_delete_complete = NULL; 1394 mhp->mh_delete_complete_arg = NULL; 1395 mhp->mh_cancel = 0; 1396 1397 mutex_enter(&mhp->mh_mutex); 1398 ASSERT(mhp->mh_state == MHND_RELEASE); 1399 mhp->mh_state = MHND_FREE; 1400 1401 kphysm_free_mem_handle(mhp); 1402 1403 return (KPHYSM_OK); 1404 } 1405 1406 /* 1407 * This cancel function can only be called with the thread running. 1408 */ 1409 int 1410 kphysm_del_cancel(memhandle_t handle) 1411 { 1412 struct mem_handle *mhp; 1413 1414 mhp = kphysm_lookup_mem_handle(handle); 1415 if (mhp == NULL) { 1416 return (KPHYSM_EHANDLE); 1417 } 1418 if (mhp->mh_state != MHND_STARTING && mhp->mh_state != MHND_RUNNING) { 1419 mutex_exit(&mhp->mh_mutex); 1420 return (KPHYSM_ENOTRUNNING); 1421 } 1422 /* 1423 * Set the cancel flag and wake the delete thread up. 1424 * The thread may be waiting on I/O, so the effect of the cancel 1425 * may be delayed. 1426 */ 1427 if (mhp->mh_cancel == 0) { 1428 mhp->mh_cancel = KPHYSM_ECANCELLED; 1429 cv_signal(&mhp->mh_cv); 1430 } 1431 mutex_exit(&mhp->mh_mutex); 1432 return (KPHYSM_OK); 1433 } 1434 1435 int 1436 kphysm_del_status( 1437 memhandle_t handle, 1438 memdelstat_t *mdstp) 1439 { 1440 struct mem_handle *mhp; 1441 1442 mhp = kphysm_lookup_mem_handle(handle); 1443 if (mhp == NULL) { 1444 return (KPHYSM_EHANDLE); 1445 } 1446 /* 1447 * Calling kphysm_del_status() is allowed before the delete 1448 * is started to allow for status display. 
1449 */ 1450 if (mhp->mh_state != MHND_INIT && mhp->mh_state != MHND_STARTING && 1451 mhp->mh_state != MHND_RUNNING) { 1452 mutex_exit(&mhp->mh_mutex); 1453 return (KPHYSM_ENOTRUNNING); 1454 } 1455 mdstp->phys_pages = mhp->mh_phys_pages; 1456 mdstp->managed = mhp->mh_vm_pages; 1457 mdstp->collected = mhp->mh_vm_pages - mhp->mh_hold_todo; 1458 mutex_exit(&mhp->mh_mutex); 1459 return (KPHYSM_OK); 1460 } 1461 1462 static int mem_delete_additional_pages = 100; 1463 1464 static int 1465 can_remove_pgs(pgcnt_t npgs) 1466 { 1467 /* 1468 * If all pageable pages were paged out, freemem would 1469 * equal availrmem. There is a minimum requirement for 1470 * availrmem. 1471 */ 1472 if ((availrmem - (tune.t_minarmem + mem_delete_additional_pages)) 1473 < npgs) 1474 return (0); 1475 /* TODO: check swap space, etc. */ 1476 return (1); 1477 } 1478 1479 static int 1480 get_availrmem(pgcnt_t npgs) 1481 { 1482 int ret; 1483 1484 mutex_enter(&freemem_lock); 1485 ret = can_remove_pgs(npgs); 1486 if (ret != 0) 1487 availrmem -= npgs; 1488 mutex_exit(&freemem_lock); 1489 return (ret); 1490 } 1491 1492 static void 1493 put_availrmem(pgcnt_t npgs) 1494 { 1495 mutex_enter(&freemem_lock); 1496 availrmem += npgs; 1497 mutex_exit(&freemem_lock); 1498 } 1499 1500 #define FREEMEM_INCR 100 1501 static pgcnt_t freemem_incr = FREEMEM_INCR; 1502 #define DEL_FREE_WAIT_FRAC 4 1503 #define DEL_FREE_WAIT_TICKS ((hz+DEL_FREE_WAIT_FRAC-1)/DEL_FREE_WAIT_FRAC) 1504 1505 #define DEL_BUSY_WAIT_FRAC 20 1506 #define DEL_BUSY_WAIT_TICKS ((hz+DEL_BUSY_WAIT_FRAC-1)/DEL_BUSY_WAIT_FRAC) 1507 1508 static void kphysm_del_cleanup(struct mem_handle *); 1509 1510 static void page_delete_collect(page_t *, struct mem_handle *); 1511 1512 static pgcnt_t 1513 delthr_get_freemem(struct mem_handle *mhp) 1514 { 1515 pgcnt_t free_get; 1516 int ret; 1517 1518 ASSERT(MUTEX_HELD(&mhp->mh_mutex)); 1519 1520 MDSTAT_INCR(mhp, need_free); 1521 /* 1522 * Get up to freemem_incr pages. 
1523 */ 1524 free_get = freemem_incr; 1525 if (free_get > mhp->mh_hold_todo) 1526 free_get = mhp->mh_hold_todo; 1527 /* 1528 * Take free_get pages away from freemem, 1529 * waiting if necessary. 1530 */ 1531 1532 while (!mhp->mh_cancel) { 1533 mutex_exit(&mhp->mh_mutex); 1534 MDSTAT_INCR(mhp, free_loop); 1535 /* 1536 * Duplicate test from page_create_throttle() 1537 * but don't override with !PG_WAIT. 1538 */ 1539 if (freemem < (free_get + throttlefree)) { 1540 MDSTAT_INCR(mhp, free_low); 1541 ret = 0; 1542 } else { 1543 ret = page_create_wait(free_get, 0); 1544 if (ret == 0) { 1545 /* EMPTY */ 1546 MDSTAT_INCR(mhp, free_failed); 1547 } 1548 } 1549 if (ret != 0) { 1550 mutex_enter(&mhp->mh_mutex); 1551 return (free_get); 1552 } 1553 1554 /* 1555 * Put pressure on pageout. 1556 */ 1557 page_needfree(free_get); 1558 cv_signal(&proc_pageout->p_cv); 1559 1560 mutex_enter(&mhp->mh_mutex); 1561 (void) cv_timedwait(&mhp->mh_cv, &mhp->mh_mutex, 1562 (lbolt + DEL_FREE_WAIT_TICKS)); 1563 mutex_exit(&mhp->mh_mutex); 1564 page_needfree(-(spgcnt_t)free_get); 1565 1566 mutex_enter(&mhp->mh_mutex); 1567 } 1568 return (0); 1569 } 1570 1571 #define DR_AIO_CLEANUP_DELAY 25000 /* 0.025secs, in usec */ 1572 #define DR_AIO_CLEANUP_MAXLOOPS_NODELAY 100 1573 /* 1574 * This function is run as a helper thread for delete_memory_thread. 1575 * It is needed in order to force kaio cleanup, so that pages used in kaio 1576 * will be unlocked and subsequently relocated by delete_memory_thread. 1577 * The address of the delete_memory_threads's mem_handle is passed in to 1578 * this thread function, and is used to set the mh_aio_cleanup_done member 1579 * prior to calling thread_exit(). 
*/
static void
dr_aio_cleanup_thread(caddr_t amhp)
{
	proc_t *procp;
	int (*aio_cleanup_dr_delete_memory)(proc_t *);
	int cleaned;	/* sum of cleanup results for the current pass */
	int n = 0;	/* consecutive passes made without a delay */
	struct mem_handle *mhp;
	volatile uint_t *pcancel;

	mhp = (struct mem_handle *)amhp;
	ASSERT(mhp != NULL);
	pcancel = &mhp->mh_dr_aio_cleanup_cancel;
	/*
	 * On either setup failure below, mark the helper as done so that
	 * delete_memory_thread does not wait for it.
	 */
	if (modload("sys", "kaio") == -1) {
		mhp->mh_aio_cleanup_done = 1;
		cmn_err(CE_WARN, "dr_aio_cleanup_thread: cannot load kaio");
		thread_exit();
	}
	aio_cleanup_dr_delete_memory = (int (*)(proc_t *))
	    modgetsymvalue("aio_cleanup_dr_delete_memory", 0);
	if (aio_cleanup_dr_delete_memory == NULL) {
		mhp->mh_aio_cleanup_done = 1;
		cmn_err(CE_WARN,
		    "aio_cleanup_dr_delete_memory not found in kaio");
		thread_exit();
	}
	/* Sweep the active process list until told to stop. */
	do {
		cleaned = 0;
		mutex_enter(&pidlock);
		for (procp = practive; (*pcancel == 0) && (procp != NULL);
		    procp = procp->p_next) {
			mutex_enter(&procp->p_lock);
			if (procp->p_aio != NULL) {
				/* cleanup proc's outstanding kaio */
				cleaned +=
				    (*aio_cleanup_dr_delete_memory)(procp);
			}
			mutex_exit(&procp->p_lock);
		}
		mutex_exit(&pidlock);
		/*
		 * Pause when a pass cleaned nothing, or after
		 * DR_AIO_CLEANUP_MAXLOOPS_NODELAY back-to-back passes.
		 */
		if ((*pcancel == 0) &&
		    (!cleaned || (++n == DR_AIO_CLEANUP_MAXLOOPS_NODELAY))) {
			/* delay a bit before retrying all procs again */
			delay(drv_usectohz(DR_AIO_CLEANUP_DELAY));
			n = 0;
		}
	} while (*pcancel == 0);
	mhp->mh_aio_cleanup_done = 1;
	thread_exit();
}

/*
 * Body of the thread created by kphysm_del_start(); amhp is the
 * struct mem_handle describing the delete.
 *
 * Outline:
 *  1. Move the handle from MHND_STARTING to MHND_RUNNING and set
 *     mh_hold_todo to the number of pages to collect.
 *  2. Reserve the pages from availrmem (KPHYSM_ENOTVIABLE on failure)
 *     and run kphysm_setup_pre_del() (KPHYSM_EREFUSED on failure).
 *  3. Allocate the per-span "collected" and "retired" bitmaps and start
 *     dr_aio_cleanup_thread.
 *  4. Repeatedly scan every pfn of every span, collecting each page by
 *     taking it off the free lists, destroying it, pushing it out with
 *     VOP_PUTPAGE() or relocating it, until mh_hold_todo reaches zero or
 *     mh_cancel is set.
 *  5. Undo or finish: return surplus freemem, clear exclusive-wanted
 *     bits, deal with retired pages, free the bitmaps, and either free
 *     the collected pages (cancel) or call kphysm_del_cleanup().
 *  6. Mark the handle MHND_DONE and invoke the completion callback with
 *     the completion code.
 *
 * mh_mutex is held across the scan loops except around the per-page
 * work; every "continue" and "break" inside the pfn loop is taken with
 * mh_mutex held.
 */
static void
delete_memory_thread(caddr_t amhp)
{
	struct mem_handle *mhp;
	struct memdelspan *mdsp;
	callb_cpr_t cprinfo;
	page_t *pp_targ;
	spgcnt_t freemem_left;	/* pages claimed from freemem, not yet used */
	void (*del_complete_funcp)(void *, int error);
	void *del_complete_arg;
	int comp_code;
	int ret;
	int first_scan;		/* first pass only collects free pages */
	uint_t szc;
#ifdef MEM_DEL_STATS
	uint64_t start_total, ntick_total;
	uint64_t start_pgrp, ntick_pgrp;
#endif /* MEM_DEL_STATS */

	mhp = (struct mem_handle *)amhp;

#ifdef MEM_DEL_STATS
	start_total = ddi_get_lbolt();
#endif /* MEM_DEL_STATS */

	CALLB_CPR_INIT(&cprinfo, &mhp->mh_mutex,
	    callb_generic_cpr, "memdel");

	mutex_enter(&mhp->mh_mutex);
	ASSERT(mhp->mh_state == MHND_STARTING);

	mhp->mh_state = MHND_RUNNING;
	mhp->mh_thread_id = curthread;

	mhp->mh_hold_todo = mhp->mh_vm_pages;
	mutex_exit(&mhp->mh_mutex);

	/* Allocate the remap pages now, if necessary. */
	memseg_remap_init();

	/*
	 * Subtract from availrmem now if possible as availrmem
	 * may not be available by the end of the delete.
	 */
	if (!get_availrmem(mhp->mh_vm_pages)) {
		comp_code = KPHYSM_ENOTVIABLE;
		mutex_enter(&mhp->mh_mutex);
		goto early_exit;
	}

	ret = kphysm_setup_pre_del(mhp->mh_vm_pages);

	mutex_enter(&mhp->mh_mutex);

	if (ret != 0) {
		mhp->mh_cancel = KPHYSM_EREFUSED;
		goto refused;
	}

	transit_list_collect(mhp, 1);

	/* One bit per pfn in each span: collected, and collected-retired. */
	for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
	    mdsp = mdsp->mds_next) {
		ASSERT(mdsp->mds_bitmap == NULL);
		mdsp->mds_bitmap = kmem_zalloc(MDS_BITMAPBYTES(mdsp), KM_SLEEP);
		mdsp->mds_bitmap_retired = kmem_zalloc(MDS_BITMAPBYTES(mdsp),
		    KM_SLEEP);
	}

	first_scan = 1;
	freemem_left = 0;
	/*
	 * Start dr_aio_cleanup_thread, which periodically iterates
	 * through the process list and invokes aio cleanup.  This
	 * is needed in order to avoid a deadly embrace between the
	 * delete_memory_thread (waiting on writer lock for page, with the
	 * exclusive-wanted bit set), kaio read request threads (waiting for a
	 * reader lock on the same page that is wanted by the
	 * delete_memory_thread), and threads waiting for kaio completion
	 * (blocked on spt_amp->lock).
	 */
	mhp->mh_dr_aio_cleanup_cancel = 0;
	mhp->mh_aio_cleanup_done = 0;
	(void) thread_create(NULL, 0, dr_aio_cleanup_thread,
	    (caddr_t)mhp, 0, &p0, TS_RUN, maxclsyspri - 1);
	while ((mhp->mh_hold_todo != 0) && (mhp->mh_cancel == 0)) {
		pgcnt_t collected;	/* pages collected during this pass */

		MDSTAT_INCR(mhp, nloop);
		collected = 0;
		for (mdsp = mhp->mh_transit.trl_spans; (mdsp != NULL) &&
		    (mhp->mh_cancel == 0); mdsp = mdsp->mds_next) {
			pfn_t pfn, p_end;

			if (first_scan) {
				mem_node_pre_del_slice(mdsp->mds_base,
				    mdsp->mds_base + mdsp->mds_npgs - 1);
			}

			p_end = mdsp->mds_base + mdsp->mds_npgs;
			for (pfn = mdsp->mds_base; (pfn < p_end) &&
			    (mhp->mh_cancel == 0); pfn++) {
				page_t *pp, *tpp, *tpp_targ;
				pgcnt_t bit;
				struct vnode *vp;
				u_offset_t offset;
				int mod, result;
				spgcnt_t pgcnt;

				/* Skip pfns already marked as dealt with. */
				bit = pfn - mdsp->mds_base;
				if ((mdsp->mds_bitmap[bit / NBPBMW] &
				    (1 << (bit % NBPBMW))) != 0) {
					MDSTAT_INCR(mhp, already_done);
					continue;
				}
				if (freemem_left == 0) {
					freemem_left += delthr_get_freemem(mhp);
					if (freemem_left == 0)
						break;
				}

				/*
				 * Release mh_mutex - some of this
				 * stuff takes some time (eg PUTPAGE).
				 */

				mutex_exit(&mhp->mh_mutex);
				MDSTAT_INCR(mhp, ncheck);

				pp = page_numtopp_nolock(pfn);
				if (pp == NULL) {
					/*
					 * Not covered by a page_t - will
					 * be dealt with elsewhere.
					 */
					MDSTAT_INCR(mhp, nopaget);
					mutex_enter(&mhp->mh_mutex);
					mdsp->mds_bitmap[bit / NBPBMW] |=
					    (1 << (bit % NBPBMW));
					continue;
				}

				if (!page_try_reclaim_lock(pp, SE_EXCL,
				    SE_EXCL_WANTED | SE_RETIRED)) {
					/*
					 * Page in use elsewhere.  Skip it.
					 */
					MDSTAT_INCR(mhp, lockfail);
					mutex_enter(&mhp->mh_mutex);
					continue;
				}
				/*
				 * See if the cage expanded into the delete.
				 * This can happen as we have to allow the
				 * cage to expand.
				 */
				if (PP_ISNORELOC(pp)) {
					page_unlock(pp);
					mutex_enter(&mhp->mh_mutex);
					mhp->mh_cancel = KPHYSM_ENONRELOC;
					break;
				}
				if (PP_RETIRED(pp)) {
					/*
					 * Page has been retired and is
					 * not part of the cage so we
					 * can now do the accounting for
					 * it.
					 */
					MDSTAT_INCR(mhp, retired);
					mutex_enter(&mhp->mh_mutex);
					mdsp->mds_bitmap[bit / NBPBMW]
					    |= (1 << (bit % NBPBMW));
					mdsp->mds_bitmap_retired[bit /
					    NBPBMW] |=
					    (1 << (bit % NBPBMW));
					mhp->mh_hold_todo--;
					continue;
				}
				ASSERT(freemem_left != 0);
				if (PP_ISFREE(pp)) {
					/*
					 * Like page_reclaim() only 'freemem'
					 * processing is already done.
					 */
					MDSTAT_INCR(mhp, nfree);
				/*
				 * Also entered by goto after a successful
				 * VOP_PUTPAGE() left the page free.
				 */
				free_page_collect:
					if (PP_ISAGED(pp)) {
						page_list_sub(pp,
						    PG_FREE_LIST);
					} else {
						page_list_sub(pp,
						    PG_CACHE_LIST);
					}
					PP_CLRFREE(pp);
					PP_CLRAGED(pp);
					collected++;
					mutex_enter(&mhp->mh_mutex);
					page_delete_collect(pp, mhp);
					mdsp->mds_bitmap[bit / NBPBMW] |=
					    (1 << (bit % NBPBMW));
					freemem_left--;
					continue;
				}
				ASSERT(pp->p_vnode != NULL);
				/* In-use pages are left for later passes. */
				if (first_scan) {
					MDSTAT_INCR(mhp, first_notfree);
					page_unlock(pp);
					mutex_enter(&mhp->mh_mutex);
					continue;
				}
				/*
				 * Keep stats on pages encountered that
				 * are marked for retirement.
				 */
				if (PP_TOXIC(pp)) {
					MDSTAT_INCR(mhp, toxic);
				} else if (PP_PR_REQ(pp)) {
					MDSTAT_INCR(mhp, failing);
				}
				/*
				 * In certain cases below, special exceptions
				 * are made for pages that are toxic.  This
				 * is because the current meaning of toxic
				 * is that an uncorrectable error has been
				 * previously associated with the page.
				 */
				if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
					if (!PP_TOXIC(pp)) {
						/*
						 * Must relocate locked in
						 * memory pages.
						 */
#ifdef MEM_DEL_STATS
						start_pgrp = ddi_get_lbolt();
#endif /* MEM_DEL_STATS */
						/*
						 * Lock all constituent pages
						 * of a large page to ensure
						 * that p_szc won't change.
						 */
						if (!group_page_trylock(pp,
						    SE_EXCL)) {
							MDSTAT_INCR(mhp,
							    gptllckfail);
							page_unlock(pp);
							mutex_enter(
							    &mhp->mh_mutex);
							continue;
						}
						MDSTAT_INCR(mhp, npplocked);
						pp_targ =
						    page_get_replacement_page(
						    pp, NULL, 0);
						if (pp_targ != NULL) {
#ifdef MEM_DEL_STATS
							ntick_pgrp =
							    (uint64_t)
							    ddi_get_lbolt() -
							    start_pgrp;
#endif /* MEM_DEL_STATS */
							MDSTAT_PGRP(mhp,
							    ntick_pgrp);
							MDSTAT_INCR(mhp,
							    nlockreloc);
							goto reloc;
						}
						group_page_unlock(pp);
						page_unlock(pp);
#ifdef MEM_DEL_STATS
						ntick_pgrp =
						    (uint64_t)ddi_get_lbolt() -
						    start_pgrp;
#endif /* MEM_DEL_STATS */
						MDSTAT_PGRP(mhp, ntick_pgrp);
						MDSTAT_INCR(mhp, nnorepl);
						mutex_enter(&mhp->mh_mutex);
						continue;
					} else {
						/*
						 * Cannot do anything about
						 * this page because it is
						 * toxic.
						 */
						MDSTAT_INCR(mhp, npplkdtoxic);
						page_unlock(pp);
						mutex_enter(&mhp->mh_mutex);
						continue;
					}
				}
				/*
				 * Unload the mappings and check if mod bit
				 * is set.
				 */
				ASSERT(!PP_ISKAS(pp));
				(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
				mod = hat_ismod(pp);

#ifdef MEM_DEL_STATS
				start_pgrp = ddi_get_lbolt();
#endif /* MEM_DEL_STATS */
				/*
				 * Prefer relocating a modified page; if no
				 * replacement page is available, fall
				 * through to the page-out path below.
				 */
				if (mod && !PP_TOXIC(pp)) {
					/*
					 * Lock all constituent pages
					 * of a large page to ensure
					 * that p_szc won't change.
					 */
					if (!group_page_trylock(pp, SE_EXCL)) {
						MDSTAT_INCR(mhp, gptlmodfail);
						page_unlock(pp);
						mutex_enter(&mhp->mh_mutex);
						continue;
					}
					pp_targ = page_get_replacement_page(pp,
					    NULL, 0);
					if (pp_targ != NULL) {
						MDSTAT_INCR(mhp, nmodreloc);
#ifdef MEM_DEL_STATS
						ntick_pgrp =
						    (uint64_t)ddi_get_lbolt() -
						    start_pgrp;
#endif /* MEM_DEL_STATS */
						MDSTAT_PGRP(mhp, ntick_pgrp);
						goto reloc;
					}
					group_page_unlock(pp);
				}

				if (!page_try_demote_pages(pp)) {
					MDSTAT_INCR(mhp, demotefail);
					page_unlock(pp);
#ifdef MEM_DEL_STATS
					ntick_pgrp = (uint64_t)ddi_get_lbolt() -
					    start_pgrp;
#endif /* MEM_DEL_STATS */
					MDSTAT_PGRP(mhp, ntick_pgrp);
					mutex_enter(&mhp->mh_mutex);
					continue;
				}

				/*
				 * Regular 'page-out'.
				 */
				if (!mod) {
					MDSTAT_INCR(mhp, ndestroy);
					page_destroy(pp, 1);
					/*
					 * page_destroy was called with
					 * dontfree. As long as p_lckcnt
					 * and p_cowcnt are both zero, the
					 * only additional action of
					 * page_destroy with !dontfree is to
					 * call page_free, so we can collect
					 * the page here.
					 */
					collected++;
#ifdef MEM_DEL_STATS
					ntick_pgrp = (uint64_t)ddi_get_lbolt() -
					    start_pgrp;
#endif /* MEM_DEL_STATS */
					MDSTAT_PGRP(mhp, ntick_pgrp);
					mutex_enter(&mhp->mh_mutex);
					page_delete_collect(pp, mhp);
					mdsp->mds_bitmap[bit / NBPBMW] |=
					    (1 << (bit % NBPBMW));
					continue;
				}
				/*
				 * The page is toxic and the mod bit is
				 * set, we cannot do anything here to deal
				 * with it.
				 */
				if (PP_TOXIC(pp)) {
					page_unlock(pp);
#ifdef MEM_DEL_STATS
					ntick_pgrp = (uint64_t)ddi_get_lbolt() -
					    start_pgrp;
#endif /* MEM_DEL_STATS */
					MDSTAT_PGRP(mhp, ntick_pgrp);
					MDSTAT_INCR(mhp, modtoxic);
					mutex_enter(&mhp->mh_mutex);
					continue;
				}
				MDSTAT_INCR(mhp, nputpage);
				/*
				 * Hold the vnode so that it can still be
				 * used after the page lock is dropped.
				 */
				vp = pp->p_vnode;
				offset = pp->p_offset;
				VN_HOLD(vp);
				page_unlock(pp);
				(void) VOP_PUTPAGE(vp, offset, PAGESIZE,
				    B_INVAL|B_FORCE, kcred);
				VN_RELE(vp);
#ifdef MEM_DEL_STATS
				ntick_pgrp = (uint64_t)ddi_get_lbolt() -
				    start_pgrp;
#endif /* MEM_DEL_STATS */
				MDSTAT_PGRP(mhp, ntick_pgrp);
				/*
				 * Try to get the page back immediately
				 * so that it can be collected.
				 */
				pp = page_numtopp_nolock(pfn);
				if (pp == NULL) {
					MDSTAT_INCR(mhp, nnoreclaim);
					/*
					 * This should not happen as this
					 * thread is deleting the page.
					 * If this code is generalized, this
					 * becomes a reality.
					 */
#ifdef DEBUG
					cmn_err(CE_WARN,
					    "delete_memory_thread(0x%p) "
					    "pfn 0x%lx has no page_t",
					    (void *)mhp, pfn);
#endif /* DEBUG */
					mutex_enter(&mhp->mh_mutex);
					continue;
				}
				if (page_try_reclaim_lock(pp, SE_EXCL,
				    SE_EXCL_WANTED | SE_RETIRED)) {
					if (PP_ISFREE(pp)) {
						goto free_page_collect;
					}
					page_unlock(pp);
				}
				MDSTAT_INCR(mhp, nnoreclaim);
				mutex_enter(&mhp->mh_mutex);
				continue;

			reloc:
				/*
				 * Got some freemem and a target
				 * page, so move the data to avoid
				 * I/O and lock problems.
				 */
				ASSERT(!page_iolock_assert(pp));
				MDSTAT_INCR(mhp, nreloc);
				/*
				 * page_relocate() will return pgcnt: the
				 * number of consecutive pages relocated.
				 * If it is successful, pp will be a
				 * linked list of the page structs that
				 * were relocated. If page_relocate() is
				 * unsuccessful, pp will be unmodified.
				 */
#ifdef MEM_DEL_STATS
				start_pgrp = ddi_get_lbolt();
#endif /* MEM_DEL_STATS */
				result = page_relocate(&pp, &pp_targ, 0, 0,
				    &pgcnt, NULL);
#ifdef MEM_DEL_STATS
				ntick_pgrp = (uint64_t)ddi_get_lbolt() -
				    start_pgrp;
#endif /* MEM_DEL_STATS */
				MDSTAT_PGRP(mhp, ntick_pgrp);
				if (result != 0) {
					MDSTAT_INCR(mhp, nrelocfail);
					/*
					 * We did not succeed. We need
					 * to give the pp_targ pages back.
					 * page_free(pp_targ, 1) without
					 * the freemem accounting.
					 */
					group_page_unlock(pp);
					page_free_replacement_page(pp_targ);
					page_unlock(pp);
					mutex_enter(&mhp->mh_mutex);
					continue;
				}

				/*
				 * We will then collect pgcnt pages.
				 */
				ASSERT(pgcnt > 0);
				mutex_enter(&mhp->mh_mutex);
				/*
				 * We need to make sure freemem_left is
				 * large enough.
				 */
				while ((freemem_left < pgcnt) &&
				    (!mhp->mh_cancel)) {
					freemem_left +=
					    delthr_get_freemem(mhp);
				}

				/*
				 * Do not proceed if mh_cancel is set.
				 */
				if (mhp->mh_cancel) {
					while (pp_targ != NULL) {
						/*
						 * Unlink and unlock each page.
						 */
						tpp_targ = pp_targ;
						page_sub(&pp_targ, tpp_targ);
						page_unlock(tpp_targ);
					}
					/*
					 * We need to give the pp pages back.
					 * page_free(pp, 1) without the
					 * freemem accounting.
					 */
					page_free_replacement_page(pp);
					break;
				}

				/* Now remove pgcnt from freemem_left */
				freemem_left -= pgcnt;
				ASSERT(freemem_left >= 0);
				szc = pp->p_szc;
				while (pp != NULL) {
					/*
					 * pp and pp_targ were passed back as
					 * a linked list of pages.
					 * Unlink and unlock each page.
					 */
					tpp_targ = pp_targ;
					page_sub(&pp_targ, tpp_targ);
					page_unlock(tpp_targ);
					/*
					 * The original page is now free
					 * so remove it from the linked
					 * list and collect it.
					 */
					tpp = pp;
					page_sub(&pp, tpp);
					pfn = page_pptonum(tpp);
					collected++;
					ASSERT(PAGE_EXCL(tpp));
					ASSERT(tpp->p_vnode == NULL);
					ASSERT(!hat_page_is_mapped(tpp));
					ASSERT(tpp->p_szc == szc);
					tpp->p_szc = 0;
					page_delete_collect(tpp, mhp);
					bit = pfn - mdsp->mds_base;
					mdsp->mds_bitmap[bit / NBPBMW] |=
					    (1 << (bit % NBPBMW));
				}
				ASSERT(pp_targ == NULL);
			}
		}
		first_scan = 0;
		if ((mhp->mh_cancel == 0) && (mhp->mh_hold_todo != 0) &&
		    (collected == 0)) {
			/*
			 * This code is needed as we cannot wait
			 * for a page to be locked OR the delete to
			 * be cancelled.  Also, we must delay so
			 * that other threads get a chance to run
			 * on our cpu, otherwise page locks may be
			 * held indefinitely by those threads.
			 */
			MDSTAT_INCR(mhp, ndelay);
			CALLB_CPR_SAFE_BEGIN(&cprinfo);
			(void) cv_timedwait(&mhp->mh_cv, &mhp->mh_mutex,
			    (lbolt + DEL_BUSY_WAIT_TICKS));
			CALLB_CPR_SAFE_END(&cprinfo, &mhp->mh_mutex);
		}
	}
	/* stop the dr aio cleanup thread */
	mhp->mh_dr_aio_cleanup_cancel = 1;
	transit_list_collect(mhp, 0);
	if (freemem_left != 0) {
		/* Return any surplus. */
		page_create_putback(freemem_left);
		freemem_left = 0;
	}
	for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
	    mdsp = mdsp->mds_next) {
		mem_node_post_del_slice(mdsp->mds_base,
		    mdsp->mds_base + mdsp->mds_npgs - 1,
		    (mhp->mh_cancel != 0));
	}
#ifdef MEM_DEL_STATS
	ntick_total = (uint64_t)ddi_get_lbolt() - start_total;
#endif /* MEM_DEL_STATS */
	MDSTAT_TOTAL(mhp, ntick_total);
	MDSTAT_PRINT(mhp);

	/*
	 * If the memory delete was cancelled, exclusive-wanted bits must
	 * be cleared. If there are retired pages being deleted, they need
	 * to be unretired.
	 */
	for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
	    mdsp = mdsp->mds_next) {
		pfn_t pfn, p_end;

		p_end = mdsp->mds_base + mdsp->mds_npgs;
		for (pfn = mdsp->mds_base; pfn < p_end; pfn++) {
			page_t *pp;
			pgcnt_t bit;

			bit = pfn - mdsp->mds_base;
			if (mhp->mh_cancel) {
				pp = page_numtopp_nolock(pfn);
				if (pp != NULL) {
					/* Only pages that were not collected. */
					if ((mdsp->mds_bitmap[bit / NBPBMW] &
					    (1 << (bit % NBPBMW))) == 0) {
						page_lock_clr_exclwanted(pp);
					}
				}
			} else {
				pp = NULL;
			}
			if ((mdsp->mds_bitmap_retired[bit / NBPBMW] &
			    (1 << (bit % NBPBMW))) != 0) {
				/* do we already have pp? */
				if (pp == NULL) {
					pp = page_numtopp_nolock(pfn);
				}
				ASSERT(pp != NULL);
				ASSERT(PP_RETIRED(pp));
				if (mhp->mh_cancel != 0) {
					page_unlock(pp);
					/*
					 * To satisfy ASSERT below in
					 * cancel code.
					 */
					mhp->mh_hold_todo++;
				} else {
					(void) page_unretire_pp(pp,
					    PR_UNR_CLEAN);
				}
			}
		}
	}
	/*
	 * Free retired page bitmap and collected page bitmap
	 */
	for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
	    mdsp = mdsp->mds_next) {
		ASSERT(mdsp->mds_bitmap_retired != NULL);
		kmem_free(mdsp->mds_bitmap_retired, MDS_BITMAPBYTES(mdsp));
		mdsp->mds_bitmap_retired = NULL;	/* Paranoia. */
		ASSERT(mdsp->mds_bitmap != NULL);
		kmem_free(mdsp->mds_bitmap, MDS_BITMAPBYTES(mdsp));
		mdsp->mds_bitmap = NULL;	/* Paranoia. */
	}

	/* wait for our dr aio cancel thread to exit */
	while (!(mhp->mh_aio_cleanup_done)) {
		CALLB_CPR_SAFE_BEGIN(&cprinfo);
		delay(drv_usectohz(DR_AIO_CLEANUP_DELAY));
		CALLB_CPR_SAFE_END(&cprinfo, &mhp->mh_mutex);
	}
refused:
	/* Reached with mh_mutex held, also directly on KPHYSM_EREFUSED. */
	if (mhp->mh_cancel != 0) {
		page_t *pp;

		comp_code = mhp->mh_cancel;
		/*
		 * Go through list of deleted pages (mh_deleted) freeing
		 * them.
		 */
		while ((pp = mhp->mh_deleted) != NULL) {
			mhp->mh_deleted = pp->p_next;
			mhp->mh_hold_todo++;
			mutex_exit(&mhp->mh_mutex);
			/* Restore p_next. */
			pp->p_next = pp->p_prev;
			if (PP_ISFREE(pp)) {
				cmn_err(CE_PANIC,
				    "page %p is free",
				    (void *)pp);
			}
			page_free(pp, 1);
			mutex_enter(&mhp->mh_mutex);
		}
		ASSERT(mhp->mh_hold_todo == mhp->mh_vm_pages);

		/* Undo the get_availrmem() reservation made at the start. */
		mutex_exit(&mhp->mh_mutex);
		put_availrmem(mhp->mh_vm_pages);
		mutex_enter(&mhp->mh_mutex);

		goto t_exit;
	}

	/*
	 * All the pages are no longer in use and are exclusively locked.
	 */

	mhp->mh_deleted = NULL;

	kphysm_del_cleanup(mhp);

	comp_code = KPHYSM_OK;

t_exit:
	mutex_exit(&mhp->mh_mutex);
	kphysm_setup_post_del(mhp->mh_vm_pages,
	    (comp_code == KPHYSM_OK) ? 0 : 1);
	mutex_enter(&mhp->mh_mutex);

early_exit:
	/* mhp->mh_mutex exited by CALLB_CPR_EXIT() */
	mhp->mh_state = MHND_DONE;
	/* Copy the callback out while mh_mutex is still held. */
	del_complete_funcp = mhp->mh_delete_complete;
	del_complete_arg = mhp->mh_delete_complete_arg;
	CALLB_CPR_EXIT(&cprinfo);
	(*del_complete_funcp)(del_complete_arg, comp_code);
	thread_exit();
	/*NOTREACHED*/
}

/*
 * Start the delete of the memory from the system.
2339 */ 2340 int 2341 kphysm_del_start( 2342 memhandle_t handle, 2343 void (*complete)(void *, int), 2344 void *complete_arg) 2345 { 2346 struct mem_handle *mhp; 2347 2348 mhp = kphysm_lookup_mem_handle(handle); 2349 if (mhp == NULL) { 2350 return (KPHYSM_EHANDLE); 2351 } 2352 switch (mhp->mh_state) { 2353 case MHND_FREE: 2354 ASSERT(mhp->mh_state != MHND_FREE); 2355 mutex_exit(&mhp->mh_mutex); 2356 return (KPHYSM_EHANDLE); 2357 case MHND_INIT: 2358 break; 2359 case MHND_STARTING: 2360 case MHND_RUNNING: 2361 mutex_exit(&mhp->mh_mutex); 2362 return (KPHYSM_ESEQUENCE); 2363 case MHND_DONE: 2364 mutex_exit(&mhp->mh_mutex); 2365 return (KPHYSM_ESEQUENCE); 2366 case MHND_RELEASE: 2367 mutex_exit(&mhp->mh_mutex); 2368 return (KPHYSM_ESEQUENCE); 2369 default: 2370 #ifdef DEBUG 2371 cmn_err(CE_WARN, "kphysm_del_start(0x%p) state corrupt %d", 2372 (void *)mhp, mhp->mh_state); 2373 #endif /* DEBUG */ 2374 mutex_exit(&mhp->mh_mutex); 2375 return (KPHYSM_EHANDLE); 2376 } 2377 2378 if (mhp->mh_transit.trl_spans == NULL) { 2379 mutex_exit(&mhp->mh_mutex); 2380 return (KPHYSM_ENOWORK); 2381 } 2382 2383 ASSERT(complete != NULL); 2384 mhp->mh_delete_complete = complete; 2385 mhp->mh_delete_complete_arg = complete_arg; 2386 mhp->mh_state = MHND_STARTING; 2387 /* 2388 * Release the mutex in case thread_create sleeps. 2389 */ 2390 mutex_exit(&mhp->mh_mutex); 2391 2392 /* 2393 * The "obvious" process for this thread is pageout (proc_pageout) 2394 * but this gives the thread too much power over freemem 2395 * which results in freemem starvation. 2396 */ 2397 (void) thread_create(NULL, 0, delete_memory_thread, mhp, 0, &p0, 2398 TS_RUN, maxclsyspri - 1); 2399 2400 return (KPHYSM_OK); 2401 } 2402 2403 static kmutex_t pp_dummy_lock; /* Protects init. of pp_dummy. */ 2404 static caddr_t pp_dummy; 2405 static pgcnt_t pp_dummy_npages; 2406 static pfn_t *pp_dummy_pfn; /* Array of dummy pfns. 
*/ 2407 2408 static void 2409 memseg_remap_init_pages(page_t *pages, page_t *epages) 2410 { 2411 page_t *pp; 2412 2413 for (pp = pages; pp < epages; pp++) { 2414 pp->p_pagenum = PFN_INVALID; /* XXXX */ 2415 pp->p_offset = (u_offset_t)-1; 2416 page_iolock_init(pp); 2417 while (!page_lock(pp, SE_EXCL, (kmutex_t *)NULL, P_RECLAIM)) 2418 continue; 2419 page_lock_delete(pp); 2420 } 2421 } 2422 2423 void 2424 memseg_remap_init() 2425 { 2426 mutex_enter(&pp_dummy_lock); 2427 if (pp_dummy == NULL) { 2428 uint_t dpages; 2429 int i; 2430 2431 /* 2432 * dpages starts off as the size of the structure and 2433 * ends up as the minimum number of pages that will 2434 * hold a whole number of page_t structures. 2435 */ 2436 dpages = sizeof (page_t); 2437 ASSERT(dpages != 0); 2438 ASSERT(dpages <= MMU_PAGESIZE); 2439 2440 while ((dpages & 1) == 0) 2441 dpages >>= 1; 2442 2443 pp_dummy_npages = dpages; 2444 /* 2445 * Allocate pp_dummy pages directly from static_arena, 2446 * since these are whole page allocations and are 2447 * referenced by physical address. This also has the 2448 * nice fringe benefit of hiding the memory from 2449 * ::findleaks since it doesn't deal well with allocated 2450 * kernel heap memory that doesn't have any mappings. 2451 */ 2452 pp_dummy = vmem_xalloc(static_arena, ptob(pp_dummy_npages), 2453 PAGESIZE, 0, 0, NULL, NULL, VM_SLEEP); 2454 bzero(pp_dummy, ptob(pp_dummy_npages)); 2455 ASSERT(((uintptr_t)pp_dummy & MMU_PAGEOFFSET) == 0); 2456 pp_dummy_pfn = kmem_alloc(sizeof (*pp_dummy_pfn) * 2457 pp_dummy_npages, KM_SLEEP); 2458 for (i = 0; i < pp_dummy_npages; i++) { 2459 pp_dummy_pfn[i] = hat_getpfnum(kas.a_hat, 2460 &pp_dummy[MMU_PAGESIZE * i]); 2461 ASSERT(pp_dummy_pfn[i] != PFN_INVALID); 2462 } 2463 /* 2464 * Initialize the page_t's to a known 'deleted' state 2465 * that matches the state of deleted pages. 
2466 */ 2467 memseg_remap_init_pages((page_t *)pp_dummy, 2468 (page_t *)(pp_dummy + 2469 ptob(pp_dummy_npages))); 2470 /* Remove kmem mappings for the pages for safety. */ 2471 hat_unload(kas.a_hat, pp_dummy, ptob(pp_dummy_npages), 2472 HAT_UNLOAD_UNLOCK); 2473 /* Leave pp_dummy pointer set as flag that init is done. */ 2474 } 2475 mutex_exit(&pp_dummy_lock); 2476 } 2477 2478 static void 2479 memseg_remap_to_dummy(caddr_t pp, pgcnt_t metapgs) 2480 { 2481 ASSERT(pp_dummy != NULL); 2482 2483 while (metapgs != 0) { 2484 pgcnt_t n; 2485 int i; 2486 2487 n = pp_dummy_npages; 2488 if (n > metapgs) 2489 n = metapgs; 2490 for (i = 0; i < n; i++) { 2491 hat_devload(kas.a_hat, pp, ptob(1), pp_dummy_pfn[i], 2492 PROT_READ, 2493 HAT_LOAD | HAT_LOAD_NOCONSIST | 2494 HAT_LOAD_REMAP); 2495 pp += ptob(1); 2496 } 2497 metapgs -= n; 2498 } 2499 } 2500 2501 /* 2502 * Transition all the deleted pages to the deleted state so that 2503 * page_lock will not wait. The page_lock_delete call will 2504 * also wake up any waiters. 2505 */ 2506 static void 2507 memseg_lock_delete_all(struct memseg *seg) 2508 { 2509 page_t *pp; 2510 2511 for (pp = seg->pages; pp < seg->epages; pp++) { 2512 pp->p_pagenum = PFN_INVALID; /* XXXX */ 2513 page_lock_delete(pp); 2514 } 2515 } 2516 2517 static void 2518 kphysm_del_cleanup(struct mem_handle *mhp) 2519 { 2520 struct memdelspan *mdsp; 2521 struct memseg *seg; 2522 struct memseg **segpp; 2523 struct memseg *seglist; 2524 pfn_t p_end; 2525 uint64_t avmem; 2526 pgcnt_t avpgs; 2527 pgcnt_t npgs; 2528 2529 avpgs = mhp->mh_vm_pages; 2530 2531 memsegs_lock(1); 2532 2533 /* 2534 * remove from main segment list. 2535 */ 2536 npgs = 0; 2537 seglist = NULL; 2538 for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL; 2539 mdsp = mdsp->mds_next) { 2540 p_end = mdsp->mds_base + mdsp->mds_npgs; 2541 for (segpp = &memsegs; (seg = *segpp) != NULL; ) { 2542 if (seg->pages_base >= p_end || 2543 seg->pages_end <= mdsp->mds_base) { 2544 /* Span and memseg don't overlap. 
*/ 2545 segpp = &((*segpp)->next); 2546 continue; 2547 } 2548 ASSERT(seg->pages_base >= mdsp->mds_base); 2549 ASSERT(seg->pages_end <= p_end); 2550 2551 PLCNT_MODIFY_MAX(seg->pages_base, 2552 seg->pages_base - seg->pages_end); 2553 2554 /* Hide the memseg from future scans. */ 2555 hat_kpm_delmem_mseg_update(seg, segpp); 2556 *segpp = seg->next; 2557 membar_producer(); /* TODO: Needed? */ 2558 npgs += MSEG_NPAGES(seg); 2559 2560 /* 2561 * Leave the deleted segment's next pointer intact 2562 * in case a memsegs scanning loop is walking this 2563 * segment concurrently. 2564 */ 2565 seg->lnext = seglist; 2566 seglist = seg; 2567 } 2568 } 2569 2570 build_pfn_hash(); 2571 2572 ASSERT(npgs < total_pages); 2573 total_pages -= npgs; 2574 2575 /* 2576 * Recalculate the paging parameters now total_pages has changed. 2577 * This will also cause the clock hands to be reset before next use. 2578 */ 2579 setupclock(1); 2580 2581 memsegs_unlock(1); 2582 2583 mutex_exit(&mhp->mh_mutex); 2584 2585 while ((seg = seglist) != NULL) { 2586 pfn_t mseg_start; 2587 pfn_t mseg_base, mseg_end; 2588 pgcnt_t mseg_npgs; 2589 page_t *pp; 2590 pgcnt_t metapgs; 2591 int dynamic; 2592 int mlret; 2593 2594 seglist = seg->lnext; 2595 2596 /* 2597 * Put the page_t's into the deleted state to stop 2598 * cv_wait()s on the pages. When we remap, the dummy 2599 * page_t's will be in the same state. 2600 */ 2601 memseg_lock_delete_all(seg); 2602 /* 2603 * Collect up information based on pages_base and pages_end 2604 * early so that we can flag early that the memseg has been 2605 * deleted by setting pages_end == pages_base. 2606 */ 2607 mseg_base = seg->pages_base; 2608 mseg_end = seg->pages_end; 2609 mseg_npgs = MSEG_NPAGES(seg); 2610 dynamic = memseg_is_dynamic(seg, &mseg_start); 2611 2612 seg->pages_end = seg->pages_base; 2613 2614 if (dynamic) { 2615 pp = seg->pages; 2616 metapgs = mseg_base - mseg_start; 2617 ASSERT(metapgs != 0); 2618 2619 /* Remap the meta data to our special dummy area. 
*/ 2620 memseg_remap_to_dummy((caddr_t)pp, metapgs); 2621 2622 mutex_enter(&memseg_lists_lock); 2623 seg->lnext = memseg_va_avail; 2624 memseg_va_avail = seg; 2625 mutex_exit(&memseg_lists_lock); 2626 } else { 2627 /* 2628 * Set for clean-up below. 2629 */ 2630 mseg_start = seg->pages_base; 2631 /* 2632 * For memory whose page_ts were allocated 2633 * at boot, we need to find a new use for 2634 * the page_t memory. 2635 * For the moment, just leak it. 2636 * (It is held in the memseg_delete_junk list.) 2637 */ 2638 2639 mutex_enter(&memseg_lists_lock); 2640 seg->lnext = memseg_delete_junk; 2641 memseg_delete_junk = seg; 2642 mutex_exit(&memseg_lists_lock); 2643 } 2644 2645 /* Must not use seg now as it could be re-used. */ 2646 2647 memlist_write_lock(); 2648 2649 mlret = memlist_delete_span( 2650 (uint64_t)(mseg_base) << PAGESHIFT, 2651 (uint64_t)(mseg_npgs) << PAGESHIFT, 2652 &phys_avail); 2653 ASSERT(mlret == MEML_SPANOP_OK); 2654 2655 mlret = memlist_delete_span( 2656 (uint64_t)(mseg_start) << PAGESHIFT, 2657 (uint64_t)(mseg_end - mseg_start) << 2658 PAGESHIFT, 2659 &phys_install); 2660 ASSERT(mlret == MEML_SPANOP_OK); 2661 phys_install_has_changed(); 2662 2663 memlist_write_unlock(); 2664 } 2665 2666 memlist_read_lock(); 2667 installed_top_size(phys_install, &physmax, &physinstalled); 2668 memlist_read_unlock(); 2669 2670 mutex_enter(&freemem_lock); 2671 maxmem -= avpgs; 2672 physmem -= avpgs; 2673 /* availrmem is adjusted during the delete. 
*/ 2674 availrmem_initial -= avpgs; 2675 2676 mutex_exit(&freemem_lock); 2677 2678 dump_resize(); 2679 2680 cmn_err(CE_CONT, "?kphysm_delete: mem = %ldK " 2681 "(0x%" PRIx64 ")\n", 2682 physinstalled << (PAGESHIFT - 10), 2683 (uint64_t)physinstalled << PAGESHIFT); 2684 2685 avmem = (uint64_t)freemem << PAGESHIFT; 2686 cmn_err(CE_CONT, "?kphysm_delete: " 2687 "avail mem = %" PRId64 "\n", avmem); 2688 2689 /* 2690 * Update lgroup generation number on single lgroup systems 2691 */ 2692 if (nlgrps == 1) 2693 lgrp_config(LGRP_CONFIG_GEN_UPDATE, 0, 0); 2694 2695 /* Successfully deleted system memory */ 2696 mutex_enter(&mhp->mh_mutex); 2697 } 2698 2699 static uint_t mdel_nullvp_waiter; 2700 2701 static void 2702 page_delete_collect( 2703 page_t *pp, 2704 struct mem_handle *mhp) 2705 { 2706 if (pp->p_vnode) { 2707 page_hashout(pp, (kmutex_t *)NULL); 2708 /* do not do PP_SETAGED(pp); */ 2709 } else { 2710 kmutex_t *sep; 2711 2712 sep = page_se_mutex(pp); 2713 mutex_enter(sep); 2714 if (CV_HAS_WAITERS(&pp->p_cv)) { 2715 mdel_nullvp_waiter++; 2716 cv_broadcast(&pp->p_cv); 2717 } 2718 mutex_exit(sep); 2719 } 2720 ASSERT(pp->p_next == pp->p_prev); 2721 ASSERT(pp->p_next == NULL || pp->p_next == pp); 2722 pp->p_next = mhp->mh_deleted; 2723 mhp->mh_deleted = pp; 2724 ASSERT(mhp->mh_hold_todo != 0); 2725 mhp->mh_hold_todo--; 2726 } 2727 2728 static void 2729 transit_list_collect(struct mem_handle *mhp, int v) 2730 { 2731 struct transit_list_head *trh; 2732 2733 trh = &transit_list_head; 2734 mutex_enter(&trh->trh_lock); 2735 mhp->mh_transit.trl_collect = v; 2736 mutex_exit(&trh->trh_lock); 2737 } 2738 2739 static void 2740 transit_list_insert(struct transit_list *tlp) 2741 { 2742 struct transit_list_head *trh; 2743 2744 trh = &transit_list_head; 2745 ASSERT(MUTEX_HELD(&trh->trh_lock)); 2746 tlp->trl_next = trh->trh_head; 2747 trh->trh_head = tlp; 2748 } 2749 2750 static void 2751 transit_list_remove(struct transit_list *tlp) 2752 { 2753 struct transit_list_head *trh; 2754 struct 
transit_list **tlpp; 2755 2756 trh = &transit_list_head; 2757 tlpp = &trh->trh_head; 2758 ASSERT(MUTEX_HELD(&trh->trh_lock)); 2759 while (*tlpp != NULL && *tlpp != tlp) 2760 tlpp = &(*tlpp)->trl_next; 2761 ASSERT(*tlpp != NULL); 2762 if (*tlpp == tlp) 2763 *tlpp = tlp->trl_next; 2764 tlp->trl_next = NULL; 2765 } 2766 2767 static struct transit_list * 2768 pfnum_to_transit_list(struct transit_list_head *trh, pfn_t pfnum) 2769 { 2770 struct transit_list *tlp; 2771 2772 for (tlp = trh->trh_head; tlp != NULL; tlp = tlp->trl_next) { 2773 struct memdelspan *mdsp; 2774 2775 for (mdsp = tlp->trl_spans; mdsp != NULL; 2776 mdsp = mdsp->mds_next) { 2777 if (pfnum >= mdsp->mds_base && 2778 pfnum < (mdsp->mds_base + mdsp->mds_npgs)) { 2779 return (tlp); 2780 } 2781 } 2782 } 2783 return (NULL); 2784 } 2785 2786 int 2787 pfn_is_being_deleted(pfn_t pfnum) 2788 { 2789 struct transit_list_head *trh; 2790 struct transit_list *tlp; 2791 int ret; 2792 2793 trh = &transit_list_head; 2794 if (trh->trh_head == NULL) 2795 return (0); 2796 2797 mutex_enter(&trh->trh_lock); 2798 tlp = pfnum_to_transit_list(trh, pfnum); 2799 ret = (tlp != NULL && tlp->trl_collect); 2800 mutex_exit(&trh->trh_lock); 2801 2802 return (ret); 2803 } 2804 2805 #ifdef MEM_DEL_STATS 2806 extern int hz; 2807 static void 2808 mem_del_stat_print_func(struct mem_handle *mhp) 2809 { 2810 uint64_t tmp; 2811 2812 if (mem_del_stat_print) { 2813 printf("memory delete loop %x/%x, statistics%s\n", 2814 (uint_t)mhp->mh_transit.trl_spans->mds_base, 2815 (uint_t)mhp->mh_transit.trl_spans->mds_npgs, 2816 (mhp->mh_cancel ? 
" (cancelled)" : "")); 2817 printf("\t%8u nloop\n", mhp->mh_delstat.nloop); 2818 printf("\t%8u need_free\n", mhp->mh_delstat.need_free); 2819 printf("\t%8u free_loop\n", mhp->mh_delstat.free_loop); 2820 printf("\t%8u free_low\n", mhp->mh_delstat.free_low); 2821 printf("\t%8u free_failed\n", mhp->mh_delstat.free_failed); 2822 printf("\t%8u ncheck\n", mhp->mh_delstat.ncheck); 2823 printf("\t%8u nopaget\n", mhp->mh_delstat.nopaget); 2824 printf("\t%8u lockfail\n", mhp->mh_delstat.lockfail); 2825 printf("\t%8u nfree\n", mhp->mh_delstat.nfree); 2826 printf("\t%8u nreloc\n", mhp->mh_delstat.nreloc); 2827 printf("\t%8u nrelocfail\n", mhp->mh_delstat.nrelocfail); 2828 printf("\t%8u already_done\n", mhp->mh_delstat.already_done); 2829 printf("\t%8u first_notfree\n", mhp->mh_delstat.first_notfree); 2830 printf("\t%8u npplocked\n", mhp->mh_delstat.npplocked); 2831 printf("\t%8u nlockreloc\n", mhp->mh_delstat.nlockreloc); 2832 printf("\t%8u nnorepl\n", mhp->mh_delstat.nnorepl); 2833 printf("\t%8u nmodreloc\n", mhp->mh_delstat.nmodreloc); 2834 printf("\t%8u ndestroy\n", mhp->mh_delstat.ndestroy); 2835 printf("\t%8u nputpage\n", mhp->mh_delstat.nputpage); 2836 printf("\t%8u nnoreclaim\n", mhp->mh_delstat.nnoreclaim); 2837 printf("\t%8u ndelay\n", mhp->mh_delstat.ndelay); 2838 printf("\t%8u demotefail\n", mhp->mh_delstat.demotefail); 2839 printf("\t%8u retired\n", mhp->mh_delstat.retired); 2840 printf("\t%8u toxic\n", mhp->mh_delstat.toxic); 2841 printf("\t%8u failing\n", mhp->mh_delstat.failing); 2842 printf("\t%8u modtoxic\n", mhp->mh_delstat.modtoxic); 2843 printf("\t%8u npplkdtoxic\n", mhp->mh_delstat.npplkdtoxic); 2844 printf("\t%8u gptlmodfail\n", mhp->mh_delstat.gptlmodfail); 2845 printf("\t%8u gptllckfail\n", mhp->mh_delstat.gptllckfail); 2846 tmp = mhp->mh_delstat.nticks_total / hz; /* seconds */ 2847 printf( 2848 "\t%"PRIu64" nticks_total - %"PRIu64" min %"PRIu64" sec\n", 2849 mhp->mh_delstat.nticks_total, tmp / 60, tmp % 60); 2850 2851 tmp = mhp->mh_delstat.nticks_pgrp 
/ hz; /* seconds */ 2852 printf( 2853 "\t%"PRIu64" nticks_pgrp - %"PRIu64" min %"PRIu64" sec\n", 2854 mhp->mh_delstat.nticks_pgrp, tmp / 60, tmp % 60); 2855 } 2856 } 2857 #endif /* MEM_DEL_STATS */ 2858 2859 struct mem_callback { 2860 kphysm_setup_vector_t *vec; 2861 void *arg; 2862 }; 2863 2864 #define NMEMCALLBACKS 100 2865 2866 static struct mem_callback mem_callbacks[NMEMCALLBACKS]; 2867 static uint_t nmemcallbacks; 2868 static krwlock_t mem_callback_rwlock; 2869 2870 int 2871 kphysm_setup_func_register(kphysm_setup_vector_t *vec, void *arg) 2872 { 2873 uint_t i, found; 2874 2875 /* 2876 * This test will become more complicated when the version must 2877 * change. 2878 */ 2879 if (vec->version != KPHYSM_SETUP_VECTOR_VERSION) 2880 return (EINVAL); 2881 2882 if (vec->post_add == NULL || vec->pre_del == NULL || 2883 vec->post_del == NULL) 2884 return (EINVAL); 2885 2886 rw_enter(&mem_callback_rwlock, RW_WRITER); 2887 for (i = 0, found = 0; i < nmemcallbacks; i++) { 2888 if (mem_callbacks[i].vec == NULL && found == 0) 2889 found = i + 1; 2890 if (mem_callbacks[i].vec == vec && 2891 mem_callbacks[i].arg == arg) { 2892 #ifdef DEBUG 2893 /* Catch this in DEBUG kernels. 
*/ 2894 cmn_err(CE_WARN, "kphysm_setup_func_register" 2895 "(0x%p, 0x%p) duplicate registration from 0x%p", 2896 (void *)vec, arg, (void *)caller()); 2897 #endif /* DEBUG */ 2898 rw_exit(&mem_callback_rwlock); 2899 return (EEXIST); 2900 } 2901 } 2902 if (found != 0) { 2903 i = found - 1; 2904 } else { 2905 ASSERT(nmemcallbacks < NMEMCALLBACKS); 2906 if (nmemcallbacks == NMEMCALLBACKS) { 2907 rw_exit(&mem_callback_rwlock); 2908 return (ENOMEM); 2909 } 2910 i = nmemcallbacks++; 2911 } 2912 mem_callbacks[i].vec = vec; 2913 mem_callbacks[i].arg = arg; 2914 rw_exit(&mem_callback_rwlock); 2915 return (0); 2916 } 2917 2918 void 2919 kphysm_setup_func_unregister(kphysm_setup_vector_t *vec, void *arg) 2920 { 2921 uint_t i; 2922 2923 rw_enter(&mem_callback_rwlock, RW_WRITER); 2924 for (i = 0; i < nmemcallbacks; i++) { 2925 if (mem_callbacks[i].vec == vec && 2926 mem_callbacks[i].arg == arg) { 2927 mem_callbacks[i].vec = NULL; 2928 mem_callbacks[i].arg = NULL; 2929 if (i == (nmemcallbacks - 1)) 2930 nmemcallbacks--; 2931 break; 2932 } 2933 } 2934 rw_exit(&mem_callback_rwlock); 2935 } 2936 2937 static void 2938 kphysm_setup_post_add(pgcnt_t delta_pages) 2939 { 2940 uint_t i; 2941 2942 rw_enter(&mem_callback_rwlock, RW_READER); 2943 for (i = 0; i < nmemcallbacks; i++) { 2944 if (mem_callbacks[i].vec != NULL) { 2945 (*mem_callbacks[i].vec->post_add) 2946 (mem_callbacks[i].arg, delta_pages); 2947 } 2948 } 2949 rw_exit(&mem_callback_rwlock); 2950 } 2951 2952 /* 2953 * Note the locking between pre_del and post_del: The reader lock is held 2954 * between the two calls to stop the set of functions from changing. 
2955 */ 2956 2957 static int 2958 kphysm_setup_pre_del(pgcnt_t delta_pages) 2959 { 2960 uint_t i; 2961 int ret; 2962 int aret; 2963 2964 ret = 0; 2965 rw_enter(&mem_callback_rwlock, RW_READER); 2966 for (i = 0; i < nmemcallbacks; i++) { 2967 if (mem_callbacks[i].vec != NULL) { 2968 aret = (*mem_callbacks[i].vec->pre_del) 2969 (mem_callbacks[i].arg, delta_pages); 2970 ret |= aret; 2971 } 2972 } 2973 2974 return (ret); 2975 } 2976 2977 static void 2978 kphysm_setup_post_del(pgcnt_t delta_pages, int cancelled) 2979 { 2980 uint_t i; 2981 2982 for (i = 0; i < nmemcallbacks; i++) { 2983 if (mem_callbacks[i].vec != NULL) { 2984 (*mem_callbacks[i].vec->post_del) 2985 (mem_callbacks[i].arg, delta_pages, cancelled); 2986 } 2987 } 2988 rw_exit(&mem_callback_rwlock); 2989 } 2990 2991 static int 2992 kphysm_split_memseg( 2993 pfn_t base, 2994 pgcnt_t npgs) 2995 { 2996 struct memseg *seg; 2997 struct memseg **segpp; 2998 pgcnt_t size_low, size_high; 2999 struct memseg *seg_low, *seg_mid, *seg_high; 3000 3001 /* 3002 * Lock the memsegs list against other updates now 3003 */ 3004 memsegs_lock(1); 3005 3006 /* 3007 * Find boot time memseg that wholly covers this area. 3008 */ 3009 3010 /* First find the memseg with page 'base' in it. */ 3011 for (segpp = &memsegs; (seg = *segpp) != NULL; 3012 segpp = &((*segpp)->next)) { 3013 if (base >= seg->pages_base && base < seg->pages_end) 3014 break; 3015 } 3016 if (seg == NULL) { 3017 memsegs_unlock(1); 3018 return (0); 3019 } 3020 if (memseg_is_dynamic(seg, (pfn_t *)NULL)) { 3021 memsegs_unlock(1); 3022 return (0); 3023 } 3024 if ((base + npgs) > seg->pages_end) { 3025 memsegs_unlock(1); 3026 return (0); 3027 } 3028 3029 /* 3030 * Work out the size of the two segments that will 3031 * surround the new segment, one for low address 3032 * and one for high. 
3033 */ 3034 ASSERT(base >= seg->pages_base); 3035 size_low = base - seg->pages_base; 3036 ASSERT(seg->pages_end >= (base + npgs)); 3037 size_high = seg->pages_end - (base + npgs); 3038 3039 /* 3040 * Sanity check. 3041 */ 3042 if ((size_low + size_high) == 0) { 3043 memsegs_unlock(1); 3044 return (0); 3045 } 3046 3047 /* 3048 * Allocate the new structures. The old memseg will not be freed 3049 * as there may be a reference to it. 3050 */ 3051 seg_low = NULL; 3052 seg_high = NULL; 3053 3054 if (size_low != 0) { 3055 seg_low = kmem_cache_alloc(memseg_cache, KM_SLEEP); 3056 bzero(seg_low, sizeof (struct memseg)); 3057 } 3058 3059 seg_mid = kmem_cache_alloc(memseg_cache, KM_SLEEP); 3060 bzero(seg_mid, sizeof (struct memseg)); 3061 3062 if (size_high != 0) { 3063 seg_high = kmem_cache_alloc(memseg_cache, KM_SLEEP); 3064 bzero(seg_high, sizeof (struct memseg)); 3065 } 3066 3067 /* 3068 * All allocation done now. 3069 */ 3070 if (size_low != 0) { 3071 seg_low->pages = seg->pages; 3072 seg_low->epages = seg_low->pages + size_low; 3073 seg_low->pages_base = seg->pages_base; 3074 seg_low->pages_end = seg_low->pages_base + size_low; 3075 seg_low->next = seg_mid; 3076 } 3077 if (size_high != 0) { 3078 seg_high->pages = seg->epages - size_high; 3079 seg_high->epages = seg_high->pages + size_high; 3080 seg_high->pages_base = seg->pages_end - size_high; 3081 seg_high->pages_end = seg_high->pages_base + size_high; 3082 seg_high->next = seg->next; 3083 } 3084 3085 seg_mid->pages = seg->pages + size_low; 3086 seg_mid->pages_base = seg->pages_base + size_low; 3087 seg_mid->epages = seg->epages - size_high; 3088 seg_mid->pages_end = seg->pages_end - size_high; 3089 seg_mid->next = (seg_high != NULL) ? seg_high : seg->next; 3090 3091 /* 3092 * Update hat_kpm specific info of all involved memsegs and 3093 * allow hat_kpm specific global chain updates. 
3094 */ 3095 hat_kpm_split_mseg_update(seg, segpp, seg_low, seg_mid, seg_high); 3096 3097 /* 3098 * At this point we have two equivalent memseg sub-chains, 3099 * seg and seg_low/seg_mid/seg_high, which both chain on to 3100 * the same place in the global chain. By re-writing the pointer 3101 * in the previous element we switch atomically from using the old 3102 * (seg) to the new. 3103 */ 3104 *segpp = (seg_low != NULL) ? seg_low : seg_mid; 3105 3106 membar_enter(); 3107 3108 build_pfn_hash(); 3109 memsegs_unlock(1); 3110 3111 /* 3112 * We leave the old segment, 'seg', intact as there may be 3113 * references to it. Also, as the value of total_pages has not 3114 * changed and the memsegs list is effectively the same when 3115 * accessed via the old or the new pointer, we do not have to 3116 * cause pageout_scanner() to re-evaluate its hand pointers. 3117 * 3118 * We currently do not re-use or reclaim the page_t memory. 3119 * If we do, then this may have to change. 3120 */ 3121 3122 mutex_enter(&memseg_lists_lock); 3123 seg->lnext = memseg_edit_junk; 3124 memseg_edit_junk = seg; 3125 mutex_exit(&memseg_lists_lock); 3126 3127 return (1); 3128 } 3129 3130 /* 3131 * The memsegs lock is only taken when modifying the memsegs list 3132 * and rebuilding the pfn hash table (after boot). 3133 * No lock is needed for read as memseg structure are never de-allocated 3134 * and the pointer linkage is never updated until the memseg is ready. 3135 */ 3136 krwlock_t memsegslock; 3137 3138 void 3139 memsegs_lock(int writer) 3140 { 3141 rw_enter(&memsegslock, writer ? RW_WRITER : RW_READER); 3142 } 3143 3144 /*ARGSUSED*/ 3145 void 3146 memsegs_unlock(int writer) 3147 { 3148 rw_exit(&memsegslock); 3149 } 3150 3151 /* 3152 * memlist (phys_install, phys_avail) locking. 3153 */ 3154 3155 /* 3156 * A read/write lock might be better here. 
3157 */ 3158 static kmutex_t memlists_mutex; 3159 3160 void 3161 memlist_read_lock() 3162 { 3163 mutex_enter(&memlists_mutex); 3164 } 3165 3166 void 3167 memlist_read_unlock() 3168 { 3169 mutex_exit(&memlists_mutex); 3170 } 3171 3172 void 3173 memlist_write_lock() 3174 { 3175 mutex_enter(&memlists_mutex); 3176 } 3177 3178 void 3179 memlist_write_unlock() 3180 { 3181 mutex_exit(&memlists_mutex); 3182 } 3183 3184 /* 3185 * The sfmmu hat layer (e.g.) accesses some parts of the memseg 3186 * structure using physical addresses. Therefore a kmem_cache is 3187 * used with KMC_NOHASH to avoid page crossings within a memseg 3188 * structure. KMC_NOHASH requires that no external (outside of 3189 * slab) information is allowed. This, in turn, implies that the 3190 * cache's slabsize must be exactly a single page, since per-slab 3191 * information (e.g. the freelist for the slab) is kept at the 3192 * end of the slab, where it is easy to locate. Should be changed 3193 * when a more obvious kmem_cache interface/flag will become 3194 * available. 3195 */ 3196 void 3197 mem_config_init() 3198 { 3199 memseg_cache = kmem_cache_create("memseg_cache", sizeof (struct memseg), 3200 0, NULL, NULL, NULL, NULL, static_arena, KMC_NOHASH); 3201 } 3202