1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License, Version 1.0 only 6 * (the "License"). You may not use this file except in compliance 7 * with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or http://www.opensolaris.org/os/licensing. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 /* 23 * Copyright 2005 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 29 #include <sys/types.h> 30 #include <sys/cmn_err.h> 31 #include <sys/vmem.h> 32 #include <sys/kmem.h> 33 #include <sys/systm.h> 34 #include <sys/machsystm.h> /* for page_freelist_coalesce() */ 35 #include <sys/errno.h> 36 #include <sys/memnode.h> 37 #include <sys/memlist.h> 38 #include <sys/memlist_impl.h> 39 #include <sys/tuneable.h> 40 #include <sys/proc.h> 41 #include <sys/disp.h> 42 #include <sys/debug.h> 43 #include <sys/vm.h> 44 #include <sys/callb.h> 45 #include <sys/memlist_plat.h> /* for installed_top_size() */ 46 #include <sys/condvar_impl.h> /* for CV_HAS_WAITERS() */ 47 #include <sys/dumphdr.h> /* for dump_resize() */ 48 #include <sys/atomic.h> /* for use in stats collection */ 49 #include <sys/rwlock.h> 50 #include <sys/cpuvar.h> 51 #include <vm/seg_kmem.h> 52 #include <vm/seg_kpm.h> 53 #include <vm/page.h> 54 #define SUNDDI_IMPL /* so sunddi.h will not redefine splx() et al */ 55 #include <sys/sunddi.h> 56 #include <sys/mem_config.h> 57 
#include <sys/mem_cage.h> 58 #include <sys/lgrp.h> 59 #include <sys/ddi.h> 60 #include <sys/modctl.h> 61 62 extern void memlist_read_lock(void); 63 extern void memlist_read_unlock(void); 64 extern void memlist_write_lock(void); 65 extern void memlist_write_unlock(void); 66 67 extern struct memlist *phys_avail; 68 69 extern void mem_node_add(pfn_t, pfn_t); 70 extern void mem_node_del(pfn_t, pfn_t); 71 72 extern uint_t page_ctrs_adjust(int); 73 static void kphysm_setup_post_add(pgcnt_t); 74 static int kphysm_setup_pre_del(pgcnt_t); 75 static void kphysm_setup_post_del(pgcnt_t, int); 76 77 static int kphysm_split_memseg(pfn_t base, pgcnt_t npgs); 78 79 static int delspan_reserve(pfn_t, pgcnt_t); 80 static void delspan_unreserve(pfn_t, pgcnt_t); 81 82 static kmutex_t memseg_lists_lock; 83 static struct memseg *memseg_va_avail; 84 static struct memseg *memseg_delete_junk; 85 static struct memseg *memseg_edit_junk; 86 void memseg_remap_init(void); 87 static void memseg_remap_to_dummy(caddr_t, pgcnt_t); 88 static void kphysm_addmem_error_undospan(pfn_t, pgcnt_t); 89 static struct memseg *memseg_reuse(pgcnt_t); 90 91 static struct kmem_cache *memseg_cache; 92 93 /* 94 * Add a chunk of memory to the system. page_t's for this memory 95 * are allocated in the first few pages of the chunk. 96 * base: starting PAGESIZE page of new memory. 97 * npgs: length in PAGESIZE pages. 98 * 99 * Adding mem this way doesn't increase the size of the hash tables; 100 * growing them would be too hard. This should be OK, but adding memory 101 * dynamically most likely means more hash misses, since the tables will 102 * be smaller than they otherwise would be. 
103 */ 104 int 105 kphysm_add_memory_dynamic(pfn_t base, pgcnt_t npgs) 106 { 107 page_t *pp; 108 page_t *opp, *oepp; 109 struct memseg *seg; 110 uint64_t avmem; 111 pfn_t pfn; 112 pfn_t pt_base = base; 113 pgcnt_t tpgs = npgs; 114 pgcnt_t metapgs; 115 int exhausted; 116 pfn_t pnum; 117 int mnode; 118 caddr_t vaddr; 119 int reuse; 120 int mlret; 121 void *mapva; 122 pgcnt_t nkpmpgs = 0; 123 offset_t kpm_pages_off; 124 125 cmn_err(CE_CONT, 126 "?kphysm_add_memory_dynamic: adding %ldK at 0x%" PRIx64 "\n", 127 npgs << (PAGESHIFT - 10), (uint64_t)base << PAGESHIFT); 128 129 /* 130 * Add this span in the delete list to prevent interactions. 131 */ 132 if (!delspan_reserve(base, npgs)) { 133 return (KPHYSM_ESPAN); 134 } 135 /* 136 * Check to see if any of the memory span has been added 137 * by trying an add to the installed memory list. This 138 * forms the interlocking process for add. 139 */ 140 141 memlist_write_lock(); 142 143 mlret = memlist_add_span((uint64_t)(pt_base) << PAGESHIFT, 144 (uint64_t)(tpgs) << PAGESHIFT, &phys_install); 145 146 if (mlret == MEML_SPANOP_OK) 147 installed_top_size(phys_install, &physmax, &physinstalled); 148 149 memlist_write_unlock(); 150 151 if (mlret != MEML_SPANOP_OK) { 152 if (mlret == MEML_SPANOP_EALLOC) { 153 delspan_unreserve(pt_base, tpgs); 154 return (KPHYSM_ERESOURCE); 155 } else 156 if (mlret == MEML_SPANOP_ESPAN) { 157 delspan_unreserve(pt_base, tpgs); 158 return (KPHYSM_ESPAN); 159 } else { 160 delspan_unreserve(pt_base, tpgs); 161 return (KPHYSM_ERESOURCE); 162 } 163 } 164 165 /* 166 * We store the page_t's for this new memory in the first 167 * few pages of the chunk. Here, we go and get'em ... 168 */ 169 170 /* 171 * The expression after the '-' gives the number of pages 172 * that will fit in the new memory based on a requirement 173 * of (PAGESIZE + sizeof (page_t)) bytes per page. 
174 */ 175 metapgs = npgs - (((uint64_t)(npgs) << PAGESHIFT) / 176 (PAGESIZE + sizeof (page_t))); 177 178 npgs -= metapgs; 179 base += metapgs; 180 181 ASSERT(btopr(npgs * sizeof (page_t)) <= metapgs); 182 183 exhausted = (metapgs == 0 || npgs == 0); 184 185 if (kpm_enable && !exhausted) { 186 pgcnt_t start, end, nkpmpgs_prelim; 187 size_t ptsz; 188 189 /* 190 * A viable kpm large page mapping must not overlap two 191 * dynamic memsegs. Therefore the total size is checked 192 * to be at least kpm_pgsz and also whether start and end 193 * points are at least kpm_pgsz aligned. 194 */ 195 if (ptokpmp(tpgs) < 1 || pmodkpmp(pt_base) || 196 pmodkpmp(base + npgs)) { 197 198 kphysm_addmem_error_undospan(pt_base, tpgs); 199 200 /* 201 * There is no specific error code for violating 202 * kpm granularity constraints. 203 */ 204 return (KPHYSM_ENOTVIABLE); 205 } 206 207 start = kpmptop(ptokpmp(base)); 208 end = kpmptop(ptokpmp(base + npgs)); 209 nkpmpgs_prelim = ptokpmp(end - start); 210 ptsz = npgs * sizeof (page_t); 211 metapgs = btopr(ptsz + nkpmpgs_prelim * KPMPAGE_T_SZ); 212 exhausted = (tpgs <= metapgs); 213 if (!exhausted) { 214 npgs = tpgs - metapgs; 215 base = pt_base + metapgs; 216 217 /* final nkpmpgs */ 218 start = kpmptop(ptokpmp(base)); 219 nkpmpgs = ptokpmp(end - start); 220 kpm_pages_off = ptsz + 221 (nkpmpgs_prelim - nkpmpgs) * KPMPAGE_T_SZ; 222 } 223 } 224 225 /* 226 * Is memory area supplied too small? 227 */ 228 if (exhausted) { 229 kphysm_addmem_error_undospan(pt_base, tpgs); 230 231 /* 232 * There is no specific error code for 'too small'. 233 */ 234 return (KPHYSM_ERESOURCE); 235 } 236 237 /* 238 * We may re-use a previously allocated VA space for the page_ts 239 * eventually, but we need to initialize and lock the pages first. 240 */ 241 242 /* 243 * Get an address in the kernel address map, map 244 * the page_t pages and see if we can touch them. 
245 */ 246 247 mapva = vmem_alloc(heap_arena, ptob(metapgs), VM_NOSLEEP); 248 if (mapva == NULL) { 249 cmn_err(CE_WARN, "kphysm_add_memory_dynamic:" 250 " Can't allocate VA for page_ts"); 251 252 kphysm_addmem_error_undospan(pt_base, tpgs); 253 254 return (KPHYSM_ERESOURCE); 255 } 256 pp = mapva; 257 258 if (physmax < (pt_base + tpgs)) 259 physmax = (pt_base + tpgs); 260 261 /* 262 * In the remapping code we map one page at a time so we must do 263 * the same here to match mapping sizes. 264 */ 265 pfn = pt_base; 266 vaddr = (caddr_t)pp; 267 for (pnum = 0; pnum < metapgs; pnum++) { 268 hat_devload(kas.a_hat, vaddr, ptob(1), pfn, 269 PROT_READ | PROT_WRITE, 270 HAT_LOAD | HAT_LOAD_LOCK | HAT_LOAD_NOCONSIST); 271 pfn++; 272 vaddr += ptob(1); 273 } 274 275 if (ddi_peek32((dev_info_t *)NULL, 276 (int32_t *)pp, (int32_t *)0) == DDI_FAILURE) { 277 278 cmn_err(CE_PANIC, "kphysm_add_memory_dynamic:" 279 " Can't access pp array at 0x%p [phys 0x%lx]", 280 (void *)pp, pt_base); 281 282 hat_unload(kas.a_hat, (caddr_t)pp, ptob(metapgs), 283 HAT_UNLOAD_UNMAP|HAT_UNLOAD_UNLOCK); 284 285 vmem_free(heap_arena, mapva, ptob(metapgs)); 286 287 kphysm_addmem_error_undospan(pt_base, tpgs); 288 289 return (KPHYSM_EFAULT); 290 } 291 292 /* 293 * Add this memory slice to its memory node translation. 294 * 295 * Note that right now, each node may have only one slice; 296 * this may change with COD or in larger SSM systems with 297 * nested latency groups, so we must not assume that the 298 * node does not yet exist. 299 */ 300 pnum = base + npgs - 1; 301 mem_node_add_slice(base, pnum); 302 303 /* 304 * Allocate or resize page counters as necessary to accomodate 305 * the increase in memory pages. 
306 */ 307 mnode = PFN_2_MEM_NODE(pnum); 308 if (page_ctrs_adjust(mnode) != 0) { 309 310 mem_node_pre_del_slice(base, pnum); 311 mem_node_post_del_slice(base, pnum, 0); 312 313 hat_unload(kas.a_hat, (caddr_t)pp, ptob(metapgs), 314 HAT_UNLOAD_UNMAP|HAT_UNLOAD_UNLOCK); 315 316 vmem_free(heap_arena, mapva, ptob(metapgs)); 317 318 kphysm_addmem_error_undospan(pt_base, tpgs); 319 320 return (KPHYSM_ERESOURCE); 321 } 322 323 /* 324 * Update the phys_avail memory list. 325 * The phys_install list was done at the start. 326 */ 327 328 memlist_write_lock(); 329 330 mlret = memlist_add_span((uint64_t)(base) << PAGESHIFT, 331 (uint64_t)(npgs) << PAGESHIFT, &phys_avail); 332 ASSERT(mlret == MEML_SPANOP_OK); 333 334 memlist_write_unlock(); 335 336 /* See if we can find a memseg to re-use. */ 337 seg = memseg_reuse(metapgs); 338 339 reuse = (seg != NULL); 340 341 /* 342 * Initialize the memseg structure representing this memory 343 * and add it to the existing list of memsegs. Do some basic 344 * initialization and add the memory to the system. 345 * In order to prevent lock deadlocks, the add_physmem() 346 * code is repeated here, but split into several stages. 347 */ 348 if (seg == NULL) { 349 seg = kmem_cache_alloc(memseg_cache, KM_SLEEP); 350 bzero(seg, sizeof (struct memseg)); 351 seg->msegflags = MEMSEG_DYNAMIC; 352 seg->pages = pp; 353 } else { 354 /*EMPTY*/ 355 ASSERT(seg->msegflags & MEMSEG_DYNAMIC); 356 } 357 358 seg->epages = seg->pages + npgs; 359 seg->pages_base = base; 360 seg->pages_end = base + npgs; 361 362 /* 363 * Initialize metadata. The page_ts are set to locked state 364 * ready to be freed. 365 */ 366 bzero((caddr_t)pp, ptob(metapgs)); 367 368 pfn = seg->pages_base; 369 /* Save the original pp base in case we reuse a memseg. 
*/ 370 opp = pp; 371 oepp = opp + npgs; 372 for (pp = opp; pp < oepp; pp++) { 373 pp->p_pagenum = pfn; 374 pfn++; 375 page_iolock_init(pp); 376 while (!page_lock(pp, SE_EXCL, (kmutex_t *)NULL, P_RECLAIM)) 377 continue; 378 pp->p_offset = (u_offset_t)-1; 379 } 380 381 if (reuse) { 382 /* Remap our page_ts to the re-used memseg VA space. */ 383 pfn = pt_base; 384 vaddr = (caddr_t)seg->pages; 385 for (pnum = 0; pnum < metapgs; pnum++) { 386 hat_devload(kas.a_hat, vaddr, ptob(1), pfn, 387 PROT_READ | PROT_WRITE, 388 HAT_LOAD_REMAP | HAT_LOAD | HAT_LOAD_NOCONSIST); 389 pfn++; 390 vaddr += ptob(1); 391 } 392 393 hat_unload(kas.a_hat, (caddr_t)opp, ptob(metapgs), 394 HAT_UNLOAD_UNMAP|HAT_UNLOAD_UNLOCK); 395 396 vmem_free(heap_arena, mapva, ptob(metapgs)); 397 } 398 399 hat_kpm_addmem_mseg_update(seg, nkpmpgs, kpm_pages_off); 400 401 memsegs_lock(1); 402 403 /* 404 * The new memseg is inserted at the beginning of the list. 405 * Not only does this save searching for the tail, but in the 406 * case of a re-used memseg, it solves the problem of what 407 * happens of some process has still got a pointer to the 408 * memseg and follows the next pointer to continue traversing 409 * the memsegs list. 410 */ 411 412 hat_kpm_addmem_mseg_insert(seg); 413 414 seg->next = memsegs; 415 membar_producer(); 416 417 hat_kpm_addmem_memsegs_update(seg); 418 419 memsegs = seg; 420 421 build_pfn_hash(); 422 423 total_pages += npgs; 424 425 /* 426 * Recalculate the paging parameters now total_pages has changed. 427 * This will also cause the clock hands to be reset before next use. 428 */ 429 setupclock(1); 430 431 memsegs_unlock(1); 432 433 /* 434 * Free the pages outside the lock to avoid locking loops. 435 */ 436 for (pp = seg->pages; pp < seg->epages; pp++) { 437 page_free(pp, 1); 438 } 439 440 /* 441 * Now that we've updated the appropriate memory lists we 442 * need to reset a number of globals, since we've increased memory. 443 * Several have already been updated for us as noted above. 
The 444 * globals we're interested in at this point are: 445 * physmax - highest page frame number. 446 * physinstalled - number of pages currently installed (done earlier) 447 * maxmem - max free pages in the system 448 * physmem - physical memory pages available 449 * availrmem - real memory available 450 */ 451 452 mutex_enter(&freemem_lock); 453 maxmem += npgs; 454 physmem += npgs; 455 availrmem += npgs; 456 availrmem_initial += npgs; 457 458 mutex_exit(&freemem_lock); 459 460 dump_resize(); 461 462 page_freelist_coalesce_all(mnode); 463 464 kphysm_setup_post_add(npgs); 465 466 cmn_err(CE_CONT, "?kphysm_add_memory_dynamic: mem = %ldK " 467 "(0x%" PRIx64 ")\n", 468 physinstalled << (PAGESHIFT - 10), 469 (uint64_t)physinstalled << PAGESHIFT); 470 471 avmem = (uint64_t)freemem << PAGESHIFT; 472 cmn_err(CE_CONT, "?kphysm_add_memory_dynamic: " 473 "avail mem = %" PRId64 "\n", avmem); 474 475 /* 476 * Update lgroup generation number on single lgroup systems 477 */ 478 if (nlgrps == 1) 479 lgrp_config(LGRP_CONFIG_GEN_UPDATE, 0, 0); 480 481 delspan_unreserve(pt_base, tpgs); 482 return (KPHYSM_OK); /* Successfully added system memory */ 483 484 } 485 486 /* 487 * There are various error conditions in kphysm_add_memory_dynamic() 488 * which require a rollback of already changed global state. 489 */ 490 static void 491 kphysm_addmem_error_undospan(pfn_t pt_base, pgcnt_t tpgs) 492 { 493 int mlret; 494 495 /* Unreserve memory span. */ 496 memlist_write_lock(); 497 498 mlret = memlist_delete_span( 499 (uint64_t)(pt_base) << PAGESHIFT, 500 (uint64_t)(tpgs) << PAGESHIFT, &phys_install); 501 502 ASSERT(mlret == MEML_SPANOP_OK); 503 phys_install_has_changed(); 504 installed_top_size(phys_install, &physmax, &physinstalled); 505 506 memlist_write_unlock(); 507 delspan_unreserve(pt_base, tpgs); 508 } 509 510 /* 511 * Only return an available memseg of exactly the right size. 
512 * When the meta data area has it's own virtual address space 513 * we will need to manage this more carefully and do best fit 514 * allocations, possibly splitting an availble area. 515 */ 516 static struct memseg * 517 memseg_reuse(pgcnt_t metapgs) 518 { 519 struct memseg **segpp, *seg; 520 521 mutex_enter(&memseg_lists_lock); 522 523 segpp = &memseg_va_avail; 524 for (; (seg = *segpp) != NULL; segpp = &seg->lnext) { 525 caddr_t end; 526 527 if (kpm_enable) 528 end = hat_kpm_mseg_reuse(seg); 529 else 530 end = (caddr_t)seg->epages; 531 532 if (btopr(end - (caddr_t)seg->pages) == metapgs) { 533 *segpp = seg->lnext; 534 seg->lnext = NULL; 535 break; 536 } 537 } 538 mutex_exit(&memseg_lists_lock); 539 540 return (seg); 541 } 542 543 static uint_t handle_gen; 544 545 struct memdelspan { 546 struct memdelspan *mds_next; 547 pfn_t mds_base; 548 pgcnt_t mds_npgs; 549 uint_t *mds_bitmap; 550 uint_t *mds_bitmap_retired; 551 }; 552 553 #define NBPBMW (sizeof (uint_t) * NBBY) 554 #define MDS_BITMAPBYTES(MDSP) \ 555 ((((MDSP)->mds_npgs + NBPBMW - 1) / NBPBMW) * sizeof (uint_t)) 556 557 struct transit_list { 558 struct transit_list *trl_next; 559 struct memdelspan *trl_spans; 560 int trl_collect; 561 }; 562 563 struct transit_list_head { 564 kmutex_t trh_lock; 565 struct transit_list *trh_head; 566 }; 567 568 static struct transit_list_head transit_list_head; 569 570 struct mem_handle; 571 static void transit_list_collect(struct mem_handle *, int); 572 static void transit_list_insert(struct transit_list *); 573 static void transit_list_remove(struct transit_list *); 574 575 #ifdef DEBUG 576 #define MEM_DEL_STATS 577 #endif /* DEBUG */ 578 579 #ifdef MEM_DEL_STATS 580 static int mem_del_stat_print = 0; 581 struct mem_del_stat { 582 uint_t nloop; 583 uint_t need_free; 584 uint_t free_loop; 585 uint_t free_low; 586 uint_t free_failed; 587 uint_t ncheck; 588 uint_t nopaget; 589 uint_t lockfail; 590 uint_t nfree; 591 uint_t nreloc; 592 uint_t nrelocfail; 593 uint_t 
already_done; 594 uint_t first_notfree; 595 uint_t npplocked; 596 uint_t nlockreloc; 597 uint_t nnorepl; 598 uint_t nmodreloc; 599 uint_t ndestroy; 600 uint_t nputpage; 601 uint_t nnoreclaim; 602 uint_t ndelay; 603 uint_t demotefail; 604 uint64_t nticks_total; 605 uint64_t nticks_pgrp; 606 uint_t retired; 607 uint_t toxic; 608 uint_t failing; 609 uint_t modtoxic; 610 uint_t npplkdtoxic; 611 uint_t gptlmodfail; 612 uint_t gptllckfail; 613 }; 614 /* 615 * The stat values are only incremented in the delete thread 616 * so no locking or atomic required. 617 */ 618 #define MDSTAT_INCR(MHP, FLD) (MHP)->mh_delstat.FLD++ 619 #define MDSTAT_TOTAL(MHP, ntck) ((MHP)->mh_delstat.nticks_total += (ntck)) 620 #define MDSTAT_PGRP(MHP, ntck) ((MHP)->mh_delstat.nticks_pgrp += (ntck)) 621 static void mem_del_stat_print_func(struct mem_handle *); 622 #define MDSTAT_PRINT(MHP) mem_del_stat_print_func((MHP)) 623 #else /* MEM_DEL_STATS */ 624 #define MDSTAT_INCR(MHP, FLD) 625 #define MDSTAT_TOTAL(MHP, ntck) 626 #define MDSTAT_PGRP(MHP, ntck) 627 #define MDSTAT_PRINT(MHP) 628 #endif /* MEM_DEL_STATS */ 629 630 typedef enum mhnd_state {MHND_FREE = 0, MHND_INIT, MHND_STARTING, 631 MHND_RUNNING, MHND_DONE, MHND_RELEASE} mhnd_state_t; 632 633 /* 634 * mh_mutex must be taken to examine or change mh_exthandle and mh_state. 635 * The mutex may not be required for other fields, dependent on mh_state. 
636 */ 637 struct mem_handle { 638 kmutex_t mh_mutex; 639 struct mem_handle *mh_next; 640 memhandle_t mh_exthandle; 641 mhnd_state_t mh_state; 642 struct transit_list mh_transit; 643 pgcnt_t mh_phys_pages; 644 pgcnt_t mh_vm_pages; 645 pgcnt_t mh_hold_todo; 646 void (*mh_delete_complete)(void *, int error); 647 void *mh_delete_complete_arg; 648 volatile uint_t mh_cancel; 649 volatile uint_t mh_dr_aio_cleanup_cancel; 650 volatile uint_t mh_aio_cleanup_done; 651 kcondvar_t mh_cv; 652 kthread_id_t mh_thread_id; 653 page_t *mh_deleted; /* link through p_next */ 654 #ifdef MEM_DEL_STATS 655 struct mem_del_stat mh_delstat; 656 #endif /* MEM_DEL_STATS */ 657 }; 658 659 static struct mem_handle *mem_handle_head; 660 static kmutex_t mem_handle_list_mutex; 661 662 static struct mem_handle * 663 kphysm_allocate_mem_handle() 664 { 665 struct mem_handle *mhp; 666 667 mhp = kmem_zalloc(sizeof (struct mem_handle), KM_SLEEP); 668 mutex_init(&mhp->mh_mutex, NULL, MUTEX_DEFAULT, NULL); 669 mutex_enter(&mem_handle_list_mutex); 670 mutex_enter(&mhp->mh_mutex); 671 /* handle_gen is protected by list mutex. */ 672 mhp->mh_exthandle = (memhandle_t)(uintptr_t)(++handle_gen); 673 mhp->mh_next = mem_handle_head; 674 mem_handle_head = mhp; 675 mutex_exit(&mem_handle_list_mutex); 676 677 return (mhp); 678 } 679 680 static void 681 kphysm_free_mem_handle(struct mem_handle *mhp) 682 { 683 struct mem_handle **mhpp; 684 685 ASSERT(mutex_owned(&mhp->mh_mutex)); 686 ASSERT(mhp->mh_state == MHND_FREE); 687 /* 688 * Exit the mutex to preserve locking order. This is OK 689 * here as once in the FREE state, the handle cannot 690 * be found by a lookup. 691 */ 692 mutex_exit(&mhp->mh_mutex); 693 694 mutex_enter(&mem_handle_list_mutex); 695 mhpp = &mem_handle_head; 696 while (*mhpp != NULL && *mhpp != mhp) 697 mhpp = &(*mhpp)->mh_next; 698 ASSERT(*mhpp == mhp); 699 /* 700 * No need to lock the handle (mh_mutex) as only 701 * mh_next changing and this is the only thread that 702 * can be referncing mhp. 
703 */ 704 *mhpp = mhp->mh_next; 705 mutex_exit(&mem_handle_list_mutex); 706 707 mutex_destroy(&mhp->mh_mutex); 708 kmem_free(mhp, sizeof (struct mem_handle)); 709 } 710 711 /* 712 * This function finds the internal mem_handle corresponding to an 713 * external handle and returns it with the mh_mutex held. 714 */ 715 static struct mem_handle * 716 kphysm_lookup_mem_handle(memhandle_t handle) 717 { 718 struct mem_handle *mhp; 719 720 mutex_enter(&mem_handle_list_mutex); 721 for (mhp = mem_handle_head; mhp != NULL; mhp = mhp->mh_next) { 722 if (mhp->mh_exthandle == handle) { 723 mutex_enter(&mhp->mh_mutex); 724 /* 725 * The state of the handle could have been changed 726 * by kphysm_del_release() while waiting for mh_mutex. 727 */ 728 if (mhp->mh_state == MHND_FREE) { 729 mutex_exit(&mhp->mh_mutex); 730 continue; 731 } 732 break; 733 } 734 } 735 mutex_exit(&mem_handle_list_mutex); 736 return (mhp); 737 } 738 739 int 740 kphysm_del_gethandle(memhandle_t *xmhp) 741 { 742 struct mem_handle *mhp; 743 744 mhp = kphysm_allocate_mem_handle(); 745 /* 746 * The handle is allocated using KM_SLEEP, so cannot fail. 747 * If the implementation is changed, the correct error to return 748 * here would be KPHYSM_ENOHANDLES. 
749 */ 750 ASSERT(mhp->mh_state == MHND_FREE); 751 mhp->mh_state = MHND_INIT; 752 *xmhp = mhp->mh_exthandle; 753 mutex_exit(&mhp->mh_mutex); 754 return (KPHYSM_OK); 755 } 756 757 static int 758 overlapping(pfn_t b1, pgcnt_t l1, pfn_t b2, pgcnt_t l2) 759 { 760 pfn_t e1, e2; 761 762 e1 = b1 + l1; 763 e2 = b2 + l2; 764 765 return (!(b2 >= e1 || b1 >= e2)); 766 } 767 768 static int can_remove_pgs(pgcnt_t); 769 770 static struct memdelspan * 771 span_to_install(pfn_t base, pgcnt_t npgs) 772 { 773 struct memdelspan *mdsp; 774 struct memdelspan *mdsp_new; 775 uint64_t address, size, thislen; 776 struct memlist *mlp; 777 778 mdsp_new = NULL; 779 780 address = (uint64_t)base << PAGESHIFT; 781 size = (uint64_t)npgs << PAGESHIFT; 782 while (size != 0) { 783 memlist_read_lock(); 784 for (mlp = phys_install; mlp != NULL; mlp = mlp->next) { 785 if (address >= (mlp->address + mlp->size)) 786 continue; 787 if ((address + size) > mlp->address) 788 break; 789 } 790 if (mlp == NULL) { 791 address += size; 792 size = 0; 793 thislen = 0; 794 } else { 795 if (address < mlp->address) { 796 size -= (mlp->address - address); 797 address = mlp->address; 798 } 799 ASSERT(address >= mlp->address); 800 if ((address + size) > (mlp->address + mlp->size)) { 801 thislen = mlp->size - (address - mlp->address); 802 } else { 803 thislen = size; 804 } 805 } 806 memlist_read_unlock(); 807 /* TODO: phys_install could change now */ 808 if (thislen == 0) 809 continue; 810 mdsp = kmem_zalloc(sizeof (struct memdelspan), KM_SLEEP); 811 mdsp->mds_base = btop(address); 812 mdsp->mds_npgs = btop(thislen); 813 mdsp->mds_next = mdsp_new; 814 mdsp_new = mdsp; 815 address += thislen; 816 size -= thislen; 817 } 818 return (mdsp_new); 819 } 820 821 static void 822 free_delspans(struct memdelspan *mdsp) 823 { 824 struct memdelspan *amdsp; 825 826 while ((amdsp = mdsp) != NULL) { 827 mdsp = amdsp->mds_next; 828 kmem_free(amdsp, sizeof (struct memdelspan)); 829 } 830 } 831 832 /* 833 * Concatenate lists. 
No list ordering is required. 834 */ 835 836 static void 837 delspan_concat(struct memdelspan **mdspp, struct memdelspan *mdsp) 838 { 839 while (*mdspp != NULL) 840 mdspp = &(*mdspp)->mds_next; 841 842 *mdspp = mdsp; 843 } 844 845 /* 846 * Given a new list of delspans, check there is no overlap with 847 * all existing span activity (add or delete) and then concatenate 848 * the new spans to the given list. 849 * Return 1 for OK, 0 if overlapping. 850 */ 851 static int 852 delspan_insert( 853 struct transit_list *my_tlp, 854 struct memdelspan *mdsp_new) 855 { 856 struct transit_list_head *trh; 857 struct transit_list *tlp; 858 int ret; 859 860 trh = &transit_list_head; 861 862 ASSERT(my_tlp != NULL); 863 ASSERT(mdsp_new != NULL); 864 865 ret = 1; 866 mutex_enter(&trh->trh_lock); 867 /* ASSERT(my_tlp->trl_spans == NULL || tlp_in_list(trh, my_tlp)); */ 868 for (tlp = trh->trh_head; tlp != NULL; tlp = tlp->trl_next) { 869 struct memdelspan *mdsp; 870 871 for (mdsp = tlp->trl_spans; mdsp != NULL; 872 mdsp = mdsp->mds_next) { 873 struct memdelspan *nmdsp; 874 875 for (nmdsp = mdsp_new; nmdsp != NULL; 876 nmdsp = nmdsp->mds_next) { 877 if (overlapping(mdsp->mds_base, mdsp->mds_npgs, 878 nmdsp->mds_base, nmdsp->mds_npgs)) { 879 ret = 0; 880 goto done; 881 } 882 } 883 } 884 } 885 done: 886 if (ret != 0) { 887 if (my_tlp->trl_spans == NULL) 888 transit_list_insert(my_tlp); 889 delspan_concat(&my_tlp->trl_spans, mdsp_new); 890 } 891 mutex_exit(&trh->trh_lock); 892 return (ret); 893 } 894 895 static void 896 delspan_remove( 897 struct transit_list *my_tlp, 898 pfn_t base, 899 pgcnt_t npgs) 900 { 901 struct transit_list_head *trh; 902 struct memdelspan *mdsp; 903 904 trh = &transit_list_head; 905 906 ASSERT(my_tlp != NULL); 907 908 mutex_enter(&trh->trh_lock); 909 if ((mdsp = my_tlp->trl_spans) != NULL) { 910 if (npgs == 0) { 911 my_tlp->trl_spans = NULL; 912 free_delspans(mdsp); 913 transit_list_remove(my_tlp); 914 } else { 915 struct memdelspan **prv; 916 917 prv = 
&my_tlp->trl_spans; 918 while (mdsp != NULL) { 919 pfn_t p_end; 920 921 p_end = mdsp->mds_base + mdsp->mds_npgs; 922 if (mdsp->mds_base >= base && 923 p_end <= (base + npgs)) { 924 *prv = mdsp->mds_next; 925 mdsp->mds_next = NULL; 926 free_delspans(mdsp); 927 } else { 928 prv = &mdsp->mds_next; 929 } 930 mdsp = *prv; 931 } 932 if (my_tlp->trl_spans == NULL) 933 transit_list_remove(my_tlp); 934 } 935 } 936 mutex_exit(&trh->trh_lock); 937 } 938 939 /* 940 * Reserve interface for add to stop delete before add finished. 941 * This list is only accessed through the delspan_insert/remove 942 * functions and so is fully protected by the mutex in struct transit_list. 943 */ 944 945 static struct transit_list reserve_transit; 946 947 static int 948 delspan_reserve(pfn_t base, pgcnt_t npgs) 949 { 950 struct memdelspan *mdsp; 951 int ret; 952 953 mdsp = kmem_zalloc(sizeof (struct memdelspan), KM_SLEEP); 954 mdsp->mds_base = base; 955 mdsp->mds_npgs = npgs; 956 if ((ret = delspan_insert(&reserve_transit, mdsp)) == 0) { 957 free_delspans(mdsp); 958 } 959 return (ret); 960 } 961 962 static void 963 delspan_unreserve(pfn_t base, pgcnt_t npgs) 964 { 965 delspan_remove(&reserve_transit, base, npgs); 966 } 967 968 /* 969 * Return whether memseg was created by kphysm_add_memory_dynamic(). 970 * If this is the case and startp non zero, return also the start pfn 971 * of the meta data via startp. 
972 */ 973 static int 974 memseg_is_dynamic(struct memseg *seg, pfn_t *startp) 975 { 976 pfn_t pt_start; 977 978 if ((seg->msegflags & MEMSEG_DYNAMIC) == 0) 979 return (0); 980 981 /* Meta data is required to be at the beginning */ 982 ASSERT(hat_getpfnum(kas.a_hat, (caddr_t)seg->epages) < seg->pages_base); 983 984 pt_start = hat_getpfnum(kas.a_hat, (caddr_t)seg->pages); 985 if (startp != NULL) 986 *startp = pt_start; 987 988 return (1); 989 } 990 991 int 992 kphysm_del_span( 993 memhandle_t handle, 994 pfn_t base, 995 pgcnt_t npgs) 996 { 997 struct mem_handle *mhp; 998 struct memseg *seg; 999 struct memdelspan *mdsp; 1000 struct memdelspan *mdsp_new; 1001 pgcnt_t phys_pages, vm_pages; 1002 pfn_t p_end; 1003 page_t *pp; 1004 int ret; 1005 1006 mhp = kphysm_lookup_mem_handle(handle); 1007 if (mhp == NULL) { 1008 return (KPHYSM_EHANDLE); 1009 } 1010 if (mhp->mh_state != MHND_INIT) { 1011 mutex_exit(&mhp->mh_mutex); 1012 return (KPHYSM_ESEQUENCE); 1013 } 1014 1015 /* 1016 * Intersect the span with the installed memory list (phys_install). 1017 */ 1018 mdsp_new = span_to_install(base, npgs); 1019 if (mdsp_new == NULL) { 1020 /* 1021 * No physical memory in this range. Is this an 1022 * error? If an attempt to start the delete is made 1023 * for OK returns from del_span such as this, start will 1024 * return an error. 1025 * Could return KPHYSM_ENOWORK. 1026 */ 1027 /* 1028 * It is assumed that there are no error returns 1029 * from span_to_install() due to kmem_alloc failure. 1030 */ 1031 mutex_exit(&mhp->mh_mutex); 1032 return (KPHYSM_OK); 1033 } 1034 /* 1035 * Does this span overlap an existing span? 1036 */ 1037 if (delspan_insert(&mhp->mh_transit, mdsp_new) == 0) { 1038 /* 1039 * Differentiate between already on list for this handle 1040 * (KPHYSM_EDUP) and busy elsewhere (KPHYSM_EBUSY). 
1041 */ 1042 ret = KPHYSM_EBUSY; 1043 for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL; 1044 mdsp = mdsp->mds_next) { 1045 if (overlapping(mdsp->mds_base, mdsp->mds_npgs, 1046 base, npgs)) { 1047 ret = KPHYSM_EDUP; 1048 break; 1049 } 1050 } 1051 mutex_exit(&mhp->mh_mutex); 1052 free_delspans(mdsp_new); 1053 return (ret); 1054 } 1055 /* 1056 * At this point the spans in mdsp_new have been inserted into the 1057 * list of spans for this handle and thereby to the global list of 1058 * spans being processed. Each of these spans must now be checked 1059 * for relocatability. As a side-effect segments in the memseg list 1060 * may be split. 1061 * 1062 * Note that mdsp_new can no longer be used as it is now part of 1063 * a larger list. Select elements of this larger list based 1064 * on base and npgs. 1065 */ 1066 restart: 1067 phys_pages = 0; 1068 vm_pages = 0; 1069 ret = KPHYSM_OK; 1070 for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL; 1071 mdsp = mdsp->mds_next) { 1072 pgcnt_t pages_checked; 1073 1074 if (!overlapping(mdsp->mds_base, mdsp->mds_npgs, base, npgs)) { 1075 continue; 1076 } 1077 p_end = mdsp->mds_base + mdsp->mds_npgs; 1078 /* 1079 * The pages_checked count is a hack. All pages should be 1080 * checked for relocatability. Those not covered by memsegs 1081 * should be tested with arch_kphysm_del_span_ok(). 1082 */ 1083 pages_checked = 0; 1084 for (seg = memsegs; seg; seg = seg->next) { 1085 pfn_t mseg_start; 1086 1087 if (seg->pages_base >= p_end || 1088 seg->pages_end <= mdsp->mds_base) { 1089 /* Span and memseg don't overlap. */ 1090 continue; 1091 } 1092 /* Check that segment is suitable for delete. */ 1093 if (memseg_is_dynamic(seg, &mseg_start)) { 1094 /* 1095 * Can only delete whole added segments 1096 * for the moment. 1097 * Check that this is completely within the 1098 * span. 
1099 */ 1100 if (mseg_start < mdsp->mds_base || 1101 seg->pages_end > p_end) { 1102 ret = KPHYSM_EBUSY; 1103 break; 1104 } 1105 pages_checked += seg->pages_end - mseg_start; 1106 } else { 1107 /* 1108 * Set mseg_start for accounting below. 1109 */ 1110 mseg_start = seg->pages_base; 1111 /* 1112 * If this segment is larger than the span, 1113 * try to split it. After the split, it 1114 * is necessary to restart. 1115 */ 1116 if (seg->pages_base < mdsp->mds_base || 1117 seg->pages_end > p_end) { 1118 pfn_t abase; 1119 pgcnt_t anpgs; 1120 int s_ret; 1121 1122 /* Split required. */ 1123 if (mdsp->mds_base < seg->pages_base) 1124 abase = seg->pages_base; 1125 else 1126 abase = mdsp->mds_base; 1127 if (p_end > seg->pages_end) 1128 anpgs = seg->pages_end - abase; 1129 else 1130 anpgs = p_end - abase; 1131 s_ret = kphysm_split_memseg(abase, 1132 anpgs); 1133 if (s_ret == 0) { 1134 /* Split failed. */ 1135 ret = KPHYSM_ERESOURCE; 1136 break; 1137 } 1138 goto restart; 1139 } 1140 pages_checked += 1141 seg->pages_end - seg->pages_base; 1142 } 1143 /* 1144 * The memseg is wholly within the delete span. 1145 * The individual pages can now be checked. 1146 */ 1147 /* Cage test. */ 1148 for (pp = seg->pages; pp < seg->epages; pp++) { 1149 if (PP_ISNORELOC(pp)) { 1150 ret = KPHYSM_ENONRELOC; 1151 break; 1152 } 1153 } 1154 if (ret != KPHYSM_OK) { 1155 break; 1156 } 1157 phys_pages += (seg->pages_end - mseg_start); 1158 vm_pages += MSEG_NPAGES(seg); 1159 } 1160 if (ret != KPHYSM_OK) 1161 break; 1162 if (pages_checked != mdsp->mds_npgs) { 1163 ret = KPHYSM_ENONRELOC; 1164 break; 1165 } 1166 } 1167 1168 if (ret == KPHYSM_OK) { 1169 mhp->mh_phys_pages += phys_pages; 1170 mhp->mh_vm_pages += vm_pages; 1171 } else { 1172 /* 1173 * Keep holding the mh_mutex to prevent it going away. 
/*
 * Report on the pages in the span [base, base + npgs) without touching
 * any delete handle.
 *
 * On return *mqp holds:
 *	phys_pages		sum of mds_npgs over the spans returned by
 *				span_to_install()
 *	managed			number of pages that have a page_t in a memseg
 *	nonrelocatable		number of pages found non-relocatable, either
 *				by PP_ISNORELOC() on the page_t or, for pages
 *				outside any memseg, by
 *				arch_kphysm_del_span_ok()
 *	first_nonrelocatable	pfn of the first such page seen (0 if none)
 *	last_nonrelocatable	pfn of the last such page seen (0 if none)
 *
 * The temporary span list is freed before returning.  Always returns
 * KPHYSM_OK.
 *
 * NOTE(review): the memsegs list is walked here with no lock taken in
 * this function - confirm that callers provide any protection needed.
 */
int
kphysm_del_span_query(
	pfn_t base,
	pgcnt_t npgs,
	memquery_t *mqp)
{
	struct memdelspan *mdsp;
	struct memdelspan *mdsp_new;
	int done_first_nonreloc;

	mqp->phys_pages = 0;
	mqp->managed = 0;
	mqp->nonrelocatable = 0;
	mqp->first_nonrelocatable = 0;
	mqp->last_nonrelocatable = 0;

	mdsp_new = span_to_install(base, npgs);
	/*
	 * It is OK to proceed here if mdsp_new == NULL.
	 */
	done_first_nonreloc = 0;
	for (mdsp = mdsp_new; mdsp != NULL; mdsp = mdsp->mds_next) {
		pfn_t sbase;
		pgcnt_t snpgs;

		mqp->phys_pages += mdsp->mds_npgs;
		/*
		 * sbase/snpgs describe the part of this span still to be
		 * accounted for; each pass of the loop below consumes the
		 * gap (if any) in front of the next memseg and then the
		 * pages of that memseg.
		 */
		sbase = mdsp->mds_base;
		snpgs = mdsp->mds_npgs;
		while (snpgs != 0) {
			struct memseg *lseg, *seg;
			pfn_t p_end;
			page_t *pp;
			pfn_t mseg_start;

			p_end = sbase + snpgs;
			/*
			 * Find the lowest addressed memseg that starts
			 * after sbase and account for it.
			 * This is to catch dynamic memsegs whose start
			 * is hidden.
			 */
			seg = NULL;
			for (lseg = memsegs; lseg != NULL; lseg = lseg->next) {
				if ((lseg->pages_base >= sbase) ||
				    (lseg->pages_base < p_end &&
				    lseg->pages_end > sbase)) {
					if (seg == NULL ||
					    seg->pages_base > lseg->pages_base)
						seg = lseg;
				}
			}
			if (seg != NULL) {
				/*
				 * For a dynamic memseg, memseg_is_dynamic()
				 * supplies mseg_start; otherwise the memseg
				 * starts at pages_base.
				 */
				if (!memseg_is_dynamic(seg, &mseg_start)) {
					mseg_start = seg->pages_base;
				}
				/*
				 * Now have the full extent of the memseg so
				 * do the range check.
				 */
				if (mseg_start >= p_end ||
				    seg->pages_end <= sbase) {
					/* Span does not overlap memseg. */
					seg = NULL;
				}
			}
			/*
			 * Account for gap either before the segment if
			 * there is one or to the end of the span.
			 *
			 * mseg_start is only read when seg != NULL, in
			 * which case it was set above.
			 */
			if (seg == NULL || mseg_start > sbase) {
				pfn_t a_end;

				a_end = (seg == NULL) ? p_end : mseg_start;
				/*
				 * Check with arch layer for relocatability.
				 */
				if (arch_kphysm_del_span_ok(sbase,
				    (a_end - sbase))) {
					/*
					 * No non-relocatable pages in this
					 * area, avoid the fine-grained
					 * test.
					 */
					snpgs -= (a_end - sbase);
					sbase = a_end;
				}
				/*
				 * Fine-grained test, one pfn at a time.  This
				 * loop does nothing if the whole gap was
				 * accepted above (sbase == a_end).
				 */
				while (sbase < a_end) {
					if (!arch_kphysm_del_span_ok(sbase,
					    1)) {
						mqp->nonrelocatable++;
						if (!done_first_nonreloc) {
							mqp->
							    first_nonrelocatable
							    = sbase;
							done_first_nonreloc = 1;
						}
						mqp->last_nonrelocatable =
						    sbase;
					}
					sbase++;
					snpgs--;
				}
			}
			if (seg != NULL) {
				ASSERT(mseg_start <= sbase);
				if (seg->pages_base != mseg_start &&
				    seg->pages_base > sbase) {
					pgcnt_t skip_pgs;

					/*
					 * Skip the page_t area of a
					 * dynamic memseg.
					 */
					skip_pgs = seg->pages_base - sbase;
					if (snpgs <= skip_pgs) {
						/*
						 * The span ends inside the
						 * skipped area: finished
						 * with this span.
						 */
						sbase += snpgs;
						snpgs = 0;
						continue;
					}
					snpgs -= skip_pgs;
					sbase += skip_pgs;
				}
				ASSERT(snpgs != 0);
				ASSERT(seg->pages_base <= sbase);
				/*
				 * The individual pages can now be checked.
				 * Stops at the end of the span or the end of
				 * the memseg, whichever comes first.
				 */
				for (pp = seg->pages +
				    (sbase - seg->pages_base);
				    snpgs != 0 && pp < seg->epages; pp++) {
					mqp->managed++;
					if (PP_ISNORELOC(pp)) {
						mqp->nonrelocatable++;
						if (!done_first_nonreloc) {
							mqp->
							    first_nonrelocatable
							    = sbase;
							done_first_nonreloc = 1;
						}
						mqp->last_nonrelocatable =
						    sbase;
					}
					sbase++;
					snpgs--;
				}
			}
		}
	}

	free_delspans(mdsp_new);

	return (KPHYSM_OK);
}
1307 */ 1308 for (pp = seg->pages + 1309 (sbase - seg->pages_base); 1310 snpgs != 0 && pp < seg->epages; pp++) { 1311 mqp->managed++; 1312 if (PP_ISNORELOC(pp)) { 1313 mqp->nonrelocatable++; 1314 if (!done_first_nonreloc) { 1315 mqp-> 1316 first_nonrelocatable 1317 = sbase; 1318 done_first_nonreloc = 1; 1319 } 1320 mqp->last_nonrelocatable = 1321 sbase; 1322 } 1323 sbase++; 1324 snpgs--; 1325 } 1326 } 1327 } 1328 } 1329 1330 free_delspans(mdsp_new); 1331 1332 return (KPHYSM_OK); 1333 } 1334 1335 /* 1336 * This release function can be called at any stage as follows: 1337 * _gethandle only called 1338 * _span(s) only called 1339 * _start called but failed 1340 * delete thread exited 1341 */ 1342 int 1343 kphysm_del_release(memhandle_t handle) 1344 { 1345 struct mem_handle *mhp; 1346 1347 mhp = kphysm_lookup_mem_handle(handle); 1348 if (mhp == NULL) { 1349 return (KPHYSM_EHANDLE); 1350 } 1351 switch (mhp->mh_state) { 1352 case MHND_STARTING: 1353 case MHND_RUNNING: 1354 mutex_exit(&mhp->mh_mutex); 1355 return (KPHYSM_ENOTFINISHED); 1356 case MHND_FREE: 1357 ASSERT(mhp->mh_state != MHND_FREE); 1358 mutex_exit(&mhp->mh_mutex); 1359 return (KPHYSM_EHANDLE); 1360 case MHND_INIT: 1361 break; 1362 case MHND_DONE: 1363 break; 1364 case MHND_RELEASE: 1365 mutex_exit(&mhp->mh_mutex); 1366 return (KPHYSM_ESEQUENCE); 1367 default: 1368 #ifdef DEBUG 1369 cmn_err(CE_WARN, "kphysm_del_release(0x%p) state corrupt %d", 1370 (void *)mhp, mhp->mh_state); 1371 #endif /* DEBUG */ 1372 mutex_exit(&mhp->mh_mutex); 1373 return (KPHYSM_EHANDLE); 1374 } 1375 /* 1376 * Set state so that we can wait if necessary. 1377 * Also this means that we have read/write access to all 1378 * fields except mh_exthandle and mh_state. 1379 */ 1380 mhp->mh_state = MHND_RELEASE; 1381 /* 1382 * The mem_handle cannot be de-allocated by any other operation 1383 * now, so no need to hold mh_mutex. 
1384 */ 1385 mutex_exit(&mhp->mh_mutex); 1386 1387 delspan_remove(&mhp->mh_transit, 0, 0); 1388 mhp->mh_phys_pages = 0; 1389 mhp->mh_vm_pages = 0; 1390 mhp->mh_hold_todo = 0; 1391 mhp->mh_delete_complete = NULL; 1392 mhp->mh_delete_complete_arg = NULL; 1393 mhp->mh_cancel = 0; 1394 1395 mutex_enter(&mhp->mh_mutex); 1396 ASSERT(mhp->mh_state == MHND_RELEASE); 1397 mhp->mh_state = MHND_FREE; 1398 1399 kphysm_free_mem_handle(mhp); 1400 1401 return (KPHYSM_OK); 1402 } 1403 1404 /* 1405 * This cancel function can only be called with the thread running. 1406 */ 1407 int 1408 kphysm_del_cancel(memhandle_t handle) 1409 { 1410 struct mem_handle *mhp; 1411 1412 mhp = kphysm_lookup_mem_handle(handle); 1413 if (mhp == NULL) { 1414 return (KPHYSM_EHANDLE); 1415 } 1416 if (mhp->mh_state != MHND_STARTING && mhp->mh_state != MHND_RUNNING) { 1417 mutex_exit(&mhp->mh_mutex); 1418 return (KPHYSM_ENOTRUNNING); 1419 } 1420 /* 1421 * Set the cancel flag and wake the delete thread up. 1422 * The thread may be waiting on I/O, so the effect of the cancel 1423 * may be delayed. 1424 */ 1425 if (mhp->mh_cancel == 0) { 1426 mhp->mh_cancel = KPHYSM_ECANCELLED; 1427 cv_signal(&mhp->mh_cv); 1428 } 1429 mutex_exit(&mhp->mh_mutex); 1430 return (KPHYSM_OK); 1431 } 1432 1433 int 1434 kphysm_del_status( 1435 memhandle_t handle, 1436 memdelstat_t *mdstp) 1437 { 1438 struct mem_handle *mhp; 1439 1440 mhp = kphysm_lookup_mem_handle(handle); 1441 if (mhp == NULL) { 1442 return (KPHYSM_EHANDLE); 1443 } 1444 /* 1445 * Calling kphysm_del_status() is allowed before the delete 1446 * is started to allow for status display. 
1447 */ 1448 if (mhp->mh_state != MHND_INIT && mhp->mh_state != MHND_STARTING && 1449 mhp->mh_state != MHND_RUNNING) { 1450 mutex_exit(&mhp->mh_mutex); 1451 return (KPHYSM_ENOTRUNNING); 1452 } 1453 mdstp->phys_pages = mhp->mh_phys_pages; 1454 mdstp->managed = mhp->mh_vm_pages; 1455 mdstp->collected = mhp->mh_vm_pages - mhp->mh_hold_todo; 1456 mutex_exit(&mhp->mh_mutex); 1457 return (KPHYSM_OK); 1458 } 1459 1460 static int mem_delete_additional_pages = 100; 1461 1462 static int 1463 can_remove_pgs(pgcnt_t npgs) 1464 { 1465 /* 1466 * If all pageable pages were paged out, freemem would 1467 * equal availrmem. There is a minimum requirement for 1468 * availrmem. 1469 */ 1470 if ((availrmem - (tune.t_minarmem + mem_delete_additional_pages)) 1471 < npgs) 1472 return (0); 1473 /* TODO: check swap space, etc. */ 1474 return (1); 1475 } 1476 1477 static int 1478 get_availrmem(pgcnt_t npgs) 1479 { 1480 int ret; 1481 1482 mutex_enter(&freemem_lock); 1483 ret = can_remove_pgs(npgs); 1484 if (ret != 0) 1485 availrmem -= npgs; 1486 mutex_exit(&freemem_lock); 1487 return (ret); 1488 } 1489 1490 static void 1491 put_availrmem(pgcnt_t npgs) 1492 { 1493 mutex_enter(&freemem_lock); 1494 availrmem += npgs; 1495 mutex_exit(&freemem_lock); 1496 } 1497 1498 #define FREEMEM_INCR 100 1499 static pgcnt_t freemem_incr = FREEMEM_INCR; 1500 #define DEL_FREE_WAIT_FRAC 4 1501 #define DEL_FREE_WAIT_TICKS ((hz+DEL_FREE_WAIT_FRAC-1)/DEL_FREE_WAIT_FRAC) 1502 1503 #define DEL_BUSY_WAIT_FRAC 20 1504 #define DEL_BUSY_WAIT_TICKS ((hz+DEL_BUSY_WAIT_FRAC-1)/DEL_BUSY_WAIT_FRAC) 1505 1506 static void kphysm_del_cleanup(struct mem_handle *); 1507 1508 static void page_delete_collect(page_t *, struct mem_handle *); 1509 1510 static pgcnt_t 1511 delthr_get_freemem(struct mem_handle *mhp) 1512 { 1513 pgcnt_t free_get; 1514 int ret; 1515 1516 ASSERT(MUTEX_HELD(&mhp->mh_mutex)); 1517 1518 MDSTAT_INCR(mhp, need_free); 1519 /* 1520 * Get up to freemem_incr pages. 
1521 */ 1522 free_get = freemem_incr; 1523 if (free_get > mhp->mh_hold_todo) 1524 free_get = mhp->mh_hold_todo; 1525 /* 1526 * Take free_get pages away from freemem, 1527 * waiting if necessary. 1528 */ 1529 1530 while (!mhp->mh_cancel) { 1531 mutex_exit(&mhp->mh_mutex); 1532 MDSTAT_INCR(mhp, free_loop); 1533 /* 1534 * Duplicate test from page_create_throttle() 1535 * but don't override with !PG_WAIT. 1536 */ 1537 if (freemem < (free_get + throttlefree)) { 1538 MDSTAT_INCR(mhp, free_low); 1539 ret = 0; 1540 } else { 1541 ret = page_create_wait(free_get, 0); 1542 if (ret == 0) { 1543 /* EMPTY */ 1544 MDSTAT_INCR(mhp, free_failed); 1545 } 1546 } 1547 if (ret != 0) { 1548 mutex_enter(&mhp->mh_mutex); 1549 return (free_get); 1550 } 1551 1552 /* 1553 * Put pressure on pageout. 1554 */ 1555 page_needfree(free_get); 1556 cv_signal(&proc_pageout->p_cv); 1557 1558 mutex_enter(&mhp->mh_mutex); 1559 (void) cv_timedwait(&mhp->mh_cv, &mhp->mh_mutex, 1560 (lbolt + DEL_FREE_WAIT_TICKS)); 1561 mutex_exit(&mhp->mh_mutex); 1562 page_needfree(-(spgcnt_t)free_get); 1563 1564 mutex_enter(&mhp->mh_mutex); 1565 } 1566 return (0); 1567 } 1568 1569 #define DR_AIO_CLEANUP_DELAY 25000 /* 0.025secs, in usec */ 1570 #define DR_AIO_CLEANUP_MAXLOOPS_NODELAY 100 1571 /* 1572 * This function is run as a helper thread for delete_memory_thread. 1573 * It is needed in order to force kaio cleanup, so that pages used in kaio 1574 * will be unlocked and subsequently relocated by delete_memory_thread. 1575 * The address of the delete_memory_threads's mem_handle is passed in to 1576 * this thread function, and is used to set the mh_aio_cleanup_done member 1577 * prior to calling thread_exit(). 
1578 */ 1579 static void 1580 dr_aio_cleanup_thread(caddr_t amhp) 1581 { 1582 proc_t *procp; 1583 int (*aio_cleanup_dr_delete_memory)(proc_t *); 1584 int cleaned; 1585 int n = 0; 1586 struct mem_handle *mhp; 1587 volatile uint_t *pcancel; 1588 1589 mhp = (struct mem_handle *)amhp; 1590 ASSERT(mhp != NULL); 1591 pcancel = &mhp->mh_dr_aio_cleanup_cancel; 1592 if (modload("sys", "kaio") == -1) { 1593 mhp->mh_aio_cleanup_done = 1; 1594 cmn_err(CE_WARN, "dr_aio_cleanup_thread: cannot load kaio"); 1595 thread_exit(); 1596 } 1597 aio_cleanup_dr_delete_memory = (int (*)(proc_t *)) 1598 modgetsymvalue("aio_cleanup_dr_delete_memory", 0); 1599 if (aio_cleanup_dr_delete_memory == NULL) { 1600 mhp->mh_aio_cleanup_done = 1; 1601 cmn_err(CE_WARN, 1602 "aio_cleanup_dr_delete_memory not found in kaio"); 1603 thread_exit(); 1604 } 1605 do { 1606 cleaned = 0; 1607 mutex_enter(&pidlock); 1608 for (procp = practive; (*pcancel == 0) && (procp != NULL); 1609 procp = procp->p_next) { 1610 mutex_enter(&procp->p_lock); 1611 if (procp->p_aio != NULL) { 1612 /* cleanup proc's outstanding kaio */ 1613 cleaned += 1614 (*aio_cleanup_dr_delete_memory)(procp); 1615 } 1616 mutex_exit(&procp->p_lock); 1617 } 1618 mutex_exit(&pidlock); 1619 if ((*pcancel == 0) && 1620 (!cleaned || (++n == DR_AIO_CLEANUP_MAXLOOPS_NODELAY))) { 1621 /* delay a bit before retrying all procs again */ 1622 delay(drv_usectohz(DR_AIO_CLEANUP_DELAY)); 1623 n = 0; 1624 } 1625 } while (*pcancel == 0); 1626 mhp->mh_aio_cleanup_done = 1; 1627 thread_exit(); 1628 } 1629 1630 static void 1631 delete_memory_thread(caddr_t amhp) 1632 { 1633 struct mem_handle *mhp; 1634 struct memdelspan *mdsp; 1635 callb_cpr_t cprinfo; 1636 page_t *pp_targ; 1637 spgcnt_t freemem_left; 1638 void (*del_complete_funcp)(void *, int error); 1639 void *del_complete_arg; 1640 int comp_code; 1641 int ret; 1642 int first_scan; 1643 uint_t szc; 1644 #ifdef MEM_DEL_STATS 1645 uint64_t start_total, ntick_total; 1646 uint64_t start_pgrp, ntick_pgrp; 1647 #endif 
/*
 * Body of the thread that carries out a memory delete.  It is created
 * by kphysm_del_start() with the delete's struct mem_handle as amhp.
 *
 * Outline:
 *   - move the handle from MHND_STARTING to MHND_RUNNING and set
 *     mh_hold_todo to mh_vm_pages;
 *   - reserve mh_vm_pages from availrmem (failure: KPHYSM_ENOTVIABLE)
 *     and call kphysm_setup_pre_del() (non-zero: KPHYSM_EREFUSED);
 *   - repeatedly scan every pfn of every span, collecting free pages
 *     and destroying, relocating or pushing out pages in use, until
 *     mh_hold_todo reaches zero or mh_cancel is set;
 *   - on success call kphysm_del_cleanup(); on cancel free the collected
 *     pages again and return the availrmem reservation;
 *   - set the state to MHND_DONE and call the completion function with
 *     the resulting code.
 *
 * mh_mutex is dropped around the slow per-page work and must be held
 * again before each "continue" or "break" of the pfn loop.
 */
static void
delete_memory_thread(caddr_t amhp)
{
	struct mem_handle *mhp;
	struct memdelspan *mdsp;
	callb_cpr_t cprinfo;
	page_t *pp_targ;
	spgcnt_t freemem_left;
	void (*del_complete_funcp)(void *, int error);
	void *del_complete_arg;
	int comp_code;
	int ret;
	int first_scan;
	uint_t szc;
#ifdef MEM_DEL_STATS
	uint64_t start_total, ntick_total;
	uint64_t start_pgrp, ntick_pgrp;
#endif /* MEM_DEL_STATS */

	mhp = (struct mem_handle *)amhp;

#ifdef MEM_DEL_STATS
	start_total = ddi_get_lbolt();
#endif /* MEM_DEL_STATS */

	CALLB_CPR_INIT(&cprinfo, &mhp->mh_mutex,
	    callb_generic_cpr, "memdel");

	mutex_enter(&mhp->mh_mutex);
	ASSERT(mhp->mh_state == MHND_STARTING);

	mhp->mh_state = MHND_RUNNING;
	mhp->mh_thread_id = curthread;

	/* Every managed page of the delete is still to be collected. */
	mhp->mh_hold_todo = mhp->mh_vm_pages;
	mutex_exit(&mhp->mh_mutex);

	/* Allocate the remap pages now, if necessary. */
	memseg_remap_init();

	/*
	 * Subtract from availrmem now if possible as availrmem
	 * may not be available by the end of the delete.
	 */
	if (!get_availrmem(mhp->mh_vm_pages)) {
		comp_code = KPHYSM_ENOTVIABLE;
		mutex_enter(&mhp->mh_mutex);
		goto early_exit;
	}

	ret = kphysm_setup_pre_del(mhp->mh_vm_pages);

	mutex_enter(&mhp->mh_mutex);

	if (ret != 0) {
		mhp->mh_cancel = KPHYSM_EREFUSED;
		goto refused;
	}

	transit_list_collect(mhp, 1);

	/*
	 * Two bitmaps per span, one bit per pfn: mds_bitmap marks pfns
	 * that have been dealt with, mds_bitmap_retired marks those that
	 * were found to be retired pages.
	 */
	for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
	    mdsp = mdsp->mds_next) {
		ASSERT(mdsp->mds_bitmap == NULL);
		mdsp->mds_bitmap = kmem_zalloc(MDS_BITMAPBYTES(mdsp), KM_SLEEP);
		mdsp->mds_bitmap_retired = kmem_zalloc(MDS_BITMAPBYTES(mdsp),
		    KM_SLEEP);
	}

	first_scan = 1;
	freemem_left = 0;
	/*
	 * Start dr_aio_cleanup_thread, which periodically iterates
	 * through the process list and invokes aio cleanup.  This
	 * is needed in order to avoid a deadly embrace between the
	 * delete_memory_thread (waiting on writer lock for page, with the
	 * exclusive-wanted bit set), kaio read request threads (waiting for a
	 * reader lock on the same page that is wanted by the
	 * delete_memory_thread), and threads waiting for kaio completion
	 * (blocked on spt_amp->lock).
	 */
	mhp->mh_dr_aio_cleanup_cancel = 0;
	mhp->mh_aio_cleanup_done = 0;
	(void) thread_create(NULL, 0, dr_aio_cleanup_thread,
	    (caddr_t)mhp, 0, &p0, TS_RUN, maxclsyspri - 1);
	/*
	 * Main collection loop: one pass over all spans per iteration.
	 * mh_mutex is held at the top of every loop level.
	 */
	while ((mhp->mh_hold_todo != 0) && (mhp->mh_cancel == 0)) {
		pgcnt_t collected;

		MDSTAT_INCR(mhp, nloop);
		collected = 0;
		for (mdsp = mhp->mh_transit.trl_spans; (mdsp != NULL) &&
		    (mhp->mh_cancel == 0); mdsp = mdsp->mds_next) {
			pfn_t pfn, p_end;

			if (first_scan) {
				mem_node_pre_del_slice(mdsp->mds_base,
				    mdsp->mds_base + mdsp->mds_npgs - 1);
			}

			p_end = mdsp->mds_base + mdsp->mds_npgs;
			for (pfn = mdsp->mds_base; (pfn < p_end) &&
			    (mhp->mh_cancel == 0); pfn++) {
				page_t *pp, *tpp, *tpp_targ;
				pgcnt_t bit;
				struct vnode *vp;
				u_offset_t offset;
				int mod, result;
				spgcnt_t pgcnt;

				/* Skip pfns already dealt with. */
				bit = pfn - mdsp->mds_base;
				if ((mdsp->mds_bitmap[bit / NBPBMW] &
				    (1 << (bit % NBPBMW))) != 0) {
					MDSTAT_INCR(mhp, already_done);
					continue;
				}
				/*
				 * Make sure some freemem has been taken
				 * for the page about to be collected.  A
				 * zero return means the delete was
				 * cancelled.
				 */
				if (freemem_left == 0) {
					freemem_left += delthr_get_freemem(mhp);
					if (freemem_left == 0)
						break;
				}

				/*
				 * Release mh_mutex - some of this
				 * stuff takes some time (eg PUTPAGE).
				 */

				mutex_exit(&mhp->mh_mutex);
				MDSTAT_INCR(mhp, ncheck);

				pp = page_numtopp_nolock(pfn);
				if (pp == NULL) {
					/*
					 * Not covered by a page_t - will
					 * be dealt with elsewhere.
					 */
					MDSTAT_INCR(mhp, nopaget);
					mutex_enter(&mhp->mh_mutex);
					mdsp->mds_bitmap[bit / NBPBMW] |=
					    (1 << (bit % NBPBMW));
					continue;
				}

				if (!page_try_reclaim_lock(pp, SE_EXCL,
				    SE_EXCL_WANTED | SE_RETIRED)) {
					/*
					 * Page in use elsewhere.  Skip it.
					 */
					MDSTAT_INCR(mhp, lockfail);
					mutex_enter(&mhp->mh_mutex);
					continue;
				}
				/*
				 * See if the cage expanded into the delete.
				 * This can happen as we have to allow the
				 * cage to expand.
				 */
				if (PP_ISNORELOC(pp)) {
					page_unlock(pp);
					mutex_enter(&mhp->mh_mutex);
					mhp->mh_cancel = KPHYSM_ENONRELOC;
					break;
				}
				if (PP_RETIRED(pp)) {
					/*
					 * Page has been retired and is
					 * not part of the cage so we
					 * can now do the accounting for
					 * it.
					 *
					 * The page is not unlocked here; the
					 * retired bitmap is used after the
					 * main loop to unlock or unretire it.
					 */
					MDSTAT_INCR(mhp, retired);
					mutex_enter(&mhp->mh_mutex);
					mdsp->mds_bitmap[bit / NBPBMW]
					    |= (1 << (bit % NBPBMW));
					mdsp->mds_bitmap_retired[bit /
					    NBPBMW] |=
					    (1 << (bit % NBPBMW));
					mhp->mh_hold_todo--;
					continue;
				}
				ASSERT(freemem_left != 0);
				if (PP_ISFREE(pp)) {
					/*
					 * Like page_reclaim() only 'freemem'
					 * processing is already done.
					 */
					MDSTAT_INCR(mhp, nfree);
				/*
				 * Also entered by goto from the PUTPAGE path
				 * below once the page has become free.
				 */
free_page_collect:
					if (PP_ISAGED(pp)) {
						page_list_sub(pp,
						    PG_FREE_LIST);
					} else {
						page_list_sub(pp,
						    PG_CACHE_LIST);
					}
					PP_CLRFREE(pp);
					PP_CLRAGED(pp);
					collected++;
					mutex_enter(&mhp->mh_mutex);
					page_delete_collect(pp, mhp);
					mdsp->mds_bitmap[bit / NBPBMW] |=
					    (1 << (bit % NBPBMW));
					freemem_left--;
					continue;
				}
				ASSERT(pp->p_vnode != NULL);
				/*
				 * The first scan only collects pages that
				 * are already free; pages in use are left
				 * for later scans.
				 */
				if (first_scan) {
					MDSTAT_INCR(mhp, first_notfree);
					page_unlock(pp);
					mutex_enter(&mhp->mh_mutex);
					continue;
				}
				/*
				 * Keep stats on pages encountered that
				 * are marked for retirement.
				 */
				if (PP_TOXIC(pp)) {
					MDSTAT_INCR(mhp, toxic);
				} else if (PP_PR_REQ(pp)) {
					MDSTAT_INCR(mhp, failing);
				}
				/*
				 * In certain cases below, special exceptions
				 * are made for pages that are toxic.  This
				 * is because the current meaning of toxic
				 * is that an uncorrectable error has been
				 * previously associated with the page.
				 */
				if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
					if (!PP_TOXIC(pp)) {
						/*
						 * Must relocate locked in
						 * memory pages.
						 */
#ifdef MEM_DEL_STATS
						start_pgrp = ddi_get_lbolt();
#endif /* MEM_DEL_STATS */
						/*
						 * Lock all constituent pages
						 * of a large page to ensure
						 * that p_szc won't change.
						 */
						if (!group_page_trylock(pp,
						    SE_EXCL)) {
							MDSTAT_INCR(mhp,
							    gptllckfail);
							page_unlock(pp);
							mutex_enter(
							    &mhp->mh_mutex);
							continue;
						}
						MDSTAT_INCR(mhp, npplocked);
						pp_targ =
						    page_get_replacement_page(
						    pp, NULL, 0);
						if (pp_targ != NULL) {
#ifdef MEM_DEL_STATS
							ntick_pgrp =
							    (uint64_t)
							    ddi_get_lbolt() -
							    start_pgrp;
#endif /* MEM_DEL_STATS */
							MDSTAT_PGRP(mhp,
							    ntick_pgrp);
							MDSTAT_INCR(mhp,
							    nlockreloc);
							goto reloc;
						}
						/* No replacement page. */
						group_page_unlock(pp);
						page_unlock(pp);
#ifdef MEM_DEL_STATS
						ntick_pgrp =
						    (uint64_t)ddi_get_lbolt() -
						    start_pgrp;
#endif /* MEM_DEL_STATS */
						MDSTAT_PGRP(mhp, ntick_pgrp);
						MDSTAT_INCR(mhp, nnorepl);
						mutex_enter(&mhp->mh_mutex);
						continue;
					} else {
						/*
						 * Cannot do anything about
						 * this page because it is
						 * toxic.
						 */
						MDSTAT_INCR(mhp, npplkdtoxic);
						page_unlock(pp);
						mutex_enter(&mhp->mh_mutex);
						continue;
					}
				}
				/*
				 * Unload the mappings and check if mod bit
				 * is set.
				 */
				ASSERT(pp->p_vnode != &kvp);
				(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
				mod = hat_ismod(pp);

#ifdef MEM_DEL_STATS
				start_pgrp = ddi_get_lbolt();
#endif /* MEM_DEL_STATS */
				/*
				 * A modified, non-toxic page is relocated if
				 * a replacement page can be had; otherwise
				 * it falls through to the page-out below.
				 */
				if (mod && !PP_TOXIC(pp)) {
					/*
					 * Lock all constituent pages
					 * of a large page to ensure
					 * that p_szc won't change.
					 */
					if (!group_page_trylock(pp, SE_EXCL)) {
						MDSTAT_INCR(mhp, gptlmodfail);
						page_unlock(pp);
						mutex_enter(&mhp->mh_mutex);
						continue;
					}
					pp_targ = page_get_replacement_page(pp,
					    NULL, 0);
					if (pp_targ != NULL) {
						MDSTAT_INCR(mhp, nmodreloc);
#ifdef MEM_DEL_STATS
						ntick_pgrp =
						    (uint64_t)ddi_get_lbolt() -
						    start_pgrp;
#endif /* MEM_DEL_STATS */
						MDSTAT_PGRP(mhp, ntick_pgrp);
						goto reloc;
					}
					group_page_unlock(pp);
				}

				if (!page_try_demote_pages(pp)) {
					MDSTAT_INCR(mhp, demotefail);
					page_unlock(pp);
#ifdef MEM_DEL_STATS
					ntick_pgrp = (uint64_t)ddi_get_lbolt() -
					    start_pgrp;
#endif /* MEM_DEL_STATS */
					MDSTAT_PGRP(mhp, ntick_pgrp);
					mutex_enter(&mhp->mh_mutex);
					continue;
				}

				/*
				 * Regular 'page-out'.
				 */
				if (!mod) {
					MDSTAT_INCR(mhp, ndestroy);
					page_destroy(pp, 1);
					/*
					 * page_destroy was called with
					 * dontfree. As long as p_lckcnt
					 * and p_cowcnt are both zero, the
					 * only additional action of
					 * page_destroy with !dontfree is to
					 * call page_free, so we can collect
					 * the page here.
					 */
					collected++;
#ifdef MEM_DEL_STATS
					ntick_pgrp = (uint64_t)ddi_get_lbolt() -
					    start_pgrp;
#endif /* MEM_DEL_STATS */
					MDSTAT_PGRP(mhp, ntick_pgrp);
					mutex_enter(&mhp->mh_mutex);
					page_delete_collect(pp, mhp);
					mdsp->mds_bitmap[bit / NBPBMW] |=
					    (1 << (bit % NBPBMW));
					continue;
				}
				/*
				 * The page is toxic and the mod bit is
				 * set, we cannot do anything here to deal
				 * with it.
				 */
				if (PP_TOXIC(pp)) {
					page_unlock(pp);
#ifdef MEM_DEL_STATS
					ntick_pgrp = (uint64_t)ddi_get_lbolt() -
					    start_pgrp;
#endif /* MEM_DEL_STATS */
					MDSTAT_PGRP(mhp, ntick_pgrp);
					MDSTAT_INCR(mhp, modtoxic);
					mutex_enter(&mhp->mh_mutex);
					continue;
				}
				/*
				 * Modified page with no replacement page:
				 * push it out through its vnode.  The vnode
				 * is held across the call because the page
				 * lock is dropped first.
				 */
				MDSTAT_INCR(mhp, nputpage);
				vp = pp->p_vnode;
				offset = pp->p_offset;
				VN_HOLD(vp);
				page_unlock(pp);
				(void) VOP_PUTPAGE(vp, offset, PAGESIZE,
				    B_INVAL|B_FORCE, kcred);
				VN_RELE(vp);
#ifdef MEM_DEL_STATS
				ntick_pgrp = (uint64_t)ddi_get_lbolt() -
				    start_pgrp;
#endif /* MEM_DEL_STATS */
				MDSTAT_PGRP(mhp, ntick_pgrp);
				/*
				 * Try to get the page back immediately
				 * so that it can be collected.
				 */
				pp = page_numtopp_nolock(pfn);
				if (pp == NULL) {
					MDSTAT_INCR(mhp, nnoreclaim);
					/*
					 * This should not happen as this
					 * thread is deleting the page.
					 * If this code is generalized, this
					 * becomes a reality.
					 */
#ifdef DEBUG
					cmn_err(CE_WARN,
					    "delete_memory_thread(0x%p) "
					    "pfn 0x%lx has no page_t",
					    (void *)mhp, pfn);
#endif /* DEBUG */
					mutex_enter(&mhp->mh_mutex);
					continue;
				}
				if (page_try_reclaim_lock(pp, SE_EXCL,
				    SE_EXCL_WANTED | SE_RETIRED)) {
					if (PP_ISFREE(pp)) {
						goto free_page_collect;
					}
					page_unlock(pp);
				}
				MDSTAT_INCR(mhp, nnoreclaim);
				mutex_enter(&mhp->mh_mutex);
				continue;

			reloc:
				/*
				 * Got some freemem and a target
				 * page, so move the data to avoid
				 * I/O and lock problems.
				 */
				ASSERT(!page_iolock_assert(pp));
				MDSTAT_INCR(mhp, nreloc);
				/*
				 * page_relocate() will return pgcnt: the
				 * number of consecutive pages relocated.
				 * If it is successful, pp will be a
				 * linked list of the page structs that
				 * were relocated. If page_relocate() is
				 * unsuccessful, pp will be unmodified.
				 */
#ifdef MEM_DEL_STATS
				start_pgrp = ddi_get_lbolt();
#endif /* MEM_DEL_STATS */
				result = page_relocate(&pp, &pp_targ, 0, 0,
				    &pgcnt, NULL);
#ifdef MEM_DEL_STATS
				ntick_pgrp = (uint64_t)ddi_get_lbolt() -
				    start_pgrp;
#endif /* MEM_DEL_STATS */
				MDSTAT_PGRP(mhp, ntick_pgrp);
				if (result != 0) {
					MDSTAT_INCR(mhp, nrelocfail);
					/*
					 * We did not succeed. We need
					 * to give the pp_targ pages back.
					 * page_free(pp_targ, 1) without
					 * the freemem accounting.
					 */
					group_page_unlock(pp);
					page_free_replacement_page(pp_targ);
					page_unlock(pp);
					mutex_enter(&mhp->mh_mutex);
					continue;
				}

				/*
				 * We will then collect pgcnt pages.
				 */
				ASSERT(pgcnt > 0);
				mutex_enter(&mhp->mh_mutex);
				/*
				 * We need to make sure freemem_left is
				 * large enough.
				 */
				while ((freemem_left < pgcnt) &&
				    (!mhp->mh_cancel)) {
					freemem_left +=
					    delthr_get_freemem(mhp);
				}

				/*
				 * Do not proceed if mh_cancel is set.
				 */
				if (mhp->mh_cancel) {
					while (pp_targ != NULL) {
						/*
						 * Unlink and unlock each page.
						 */
						tpp_targ = pp_targ;
						page_sub(&pp_targ, tpp_targ);
						page_unlock(tpp_targ);
					}
					/*
					 * We need to give the pp pages back.
					 * page_free(pp, 1) without the
					 * freemem accounting.
					 */
					page_free_replacement_page(pp);
					break;
				}

				/* Now remove pgcnt from freemem_left */
				freemem_left -= pgcnt;
				ASSERT(freemem_left >= 0);
				szc = pp->p_szc;
				while (pp != NULL) {
					/*
					 * pp and pp_targ were passed back as
					 * a linked list of pages.
					 * Unlink and unlock each page.
					 */
					tpp_targ = pp_targ;
					page_sub(&pp_targ, tpp_targ);
					page_unlock(tpp_targ);
					/*
					 * The original page is now free
					 * so remove it from the linked
					 * list and collect it.
					 *
					 * Note that pfn and bit are
					 * overwritten here with the values
					 * for each relocated page.
					 */
					tpp = pp;
					page_sub(&pp, tpp);
					pfn = page_pptonum(tpp);
					collected++;
					ASSERT(PAGE_EXCL(tpp));
					ASSERT(tpp->p_vnode == NULL);
					ASSERT(!hat_page_is_mapped(tpp));
					ASSERT(tpp->p_szc == szc);
					tpp->p_szc = 0;
					page_delete_collect(tpp, mhp);
					bit = pfn - mdsp->mds_base;
					mdsp->mds_bitmap[bit / NBPBMW] |=
					    (1 << (bit % NBPBMW));
				}
				ASSERT(pp_targ == NULL);
			}
		}
		first_scan = 0;
		if ((mhp->mh_cancel == 0) && (mhp->mh_hold_todo != 0) &&
		    (collected == 0)) {
			/*
			 * This code is needed as we cannot wait
			 * for a page to be locked OR the delete to
			 * be cancelled.  Also, we must delay so
			 * that other threads get a chance to run
			 * on our cpu, otherwise page locks may be
			 * held indefinitely by those threads.
			 */
			MDSTAT_INCR(mhp, ndelay);
			CALLB_CPR_SAFE_BEGIN(&cprinfo);
			(void) cv_timedwait(&mhp->mh_cv, &mhp->mh_mutex,
			    (lbolt + DEL_BUSY_WAIT_TICKS));
			CALLB_CPR_SAFE_END(&cprinfo, &mhp->mh_mutex);
		}
	}
	/* stop the dr aio cleanup thread */
	mhp->mh_dr_aio_cleanup_cancel = 1;
	transit_list_collect(mhp, 0);
	if (freemem_left != 0) {
		/* Return any surplus. */
		page_create_putback(freemem_left);
		freemem_left = 0;
	}
	for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
	    mdsp = mdsp->mds_next) {
		mem_node_post_del_slice(mdsp->mds_base,
		    mdsp->mds_base + mdsp->mds_npgs - 1,
		    (mhp->mh_cancel != 0));
	}
#ifdef MEM_DEL_STATS
	ntick_total = (uint64_t)ddi_get_lbolt() - start_total;
#endif /* MEM_DEL_STATS */
	MDSTAT_TOTAL(mhp, ntick_total);
	MDSTAT_PRINT(mhp);

	/*
	 * If the memory delete was cancelled, exclusive-wanted bits must
	 * be cleared. If there are retired pages being deleted, they need
	 * to be unretired.
	 */
	for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
	    mdsp = mdsp->mds_next) {
		pfn_t pfn, p_end;

		p_end = mdsp->mds_base + mdsp->mds_npgs;
		for (pfn = mdsp->mds_base; pfn < p_end; pfn++) {
			page_t *pp;
			pgcnt_t bit;

			bit = pfn - mdsp->mds_base;
			if (mhp->mh_cancel) {
				/*
				 * Only pages that were not dealt with can
				 * still have the exclusive-wanted bit set
				 * by this thread.
				 */
				pp = page_numtopp_nolock(pfn);
				if (pp != NULL) {
					if ((mdsp->mds_bitmap[bit / NBPBMW] &
					    (1 << (bit % NBPBMW))) == 0) {
						page_lock_clr_exclwanted(pp);
					}
				}
			} else {
				pp = NULL;
			}
			if ((mdsp->mds_bitmap_retired[bit / NBPBMW] &
			    (1 << (bit % NBPBMW))) != 0) {
				/* do we already have pp? */
				if (pp == NULL) {
					pp = page_numtopp_nolock(pfn);
				}
				ASSERT(pp != NULL);
				ASSERT(PP_RETIRED(pp));
				if (mhp->mh_cancel != 0) {
					page_unlock(pp);
					/*
					 * To satisfy ASSERT below in
					 * cancel code.
					 */
					mhp->mh_hold_todo++;
				} else {
					(void) page_unretire_pp(pp, 0);
				}
			}
		}
	}
	/*
	 * Free retired page bitmap and collected page bitmap
	 */
	for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
	    mdsp = mdsp->mds_next) {
		ASSERT(mdsp->mds_bitmap_retired != NULL);
		kmem_free(mdsp->mds_bitmap_retired, MDS_BITMAPBYTES(mdsp));
		mdsp->mds_bitmap_retired = NULL;	/* Paranoia. */
		ASSERT(mdsp->mds_bitmap != NULL);
		kmem_free(mdsp->mds_bitmap, MDS_BITMAPBYTES(mdsp));
		mdsp->mds_bitmap = NULL;	/* Paranoia. */
	}

	/*
	 * wait for our dr aio cancel thread to exit
	 *
	 * NOTE(review): mh_mutex appears to be held across delay() here -
	 * confirm that this is intended.
	 */
	while (!(mhp->mh_aio_cleanup_done)) {
		CALLB_CPR_SAFE_BEGIN(&cprinfo);
		delay(drv_usectohz(DR_AIO_CLEANUP_DELAY));
		CALLB_CPR_SAFE_END(&cprinfo, &mhp->mh_mutex);
	}
/*
 * Reached by falling through, and directly when kphysm_setup_pre_del()
 * refused the delete.  mh_mutex is held.
 */
refused:
	if (mhp->mh_cancel != 0) {
		page_t *pp;

		comp_code = mhp->mh_cancel;
		/*
		 * Go through list of deleted pages (mh_deleted) freeing
		 * them.
		 */
		while ((pp = mhp->mh_deleted) != NULL) {
			mhp->mh_deleted = pp->p_next;
			mhp->mh_hold_todo++;
			mutex_exit(&mhp->mh_mutex);
			/* Restore p_next. */
			pp->p_next = pp->p_prev;
			if (PP_ISFREE(pp)) {
				cmn_err(CE_PANIC,
				    "page %p is free",
				    (void *)pp);
			}
			page_free(pp, 1);
			mutex_enter(&mhp->mh_mutex);
		}
		ASSERT(mhp->mh_hold_todo == mhp->mh_vm_pages);

		/* Return the availrmem reserved at the start. */
		mutex_exit(&mhp->mh_mutex);
		put_availrmem(mhp->mh_vm_pages);
		mutex_enter(&mhp->mh_mutex);

		goto t_exit;
	}

	/*
	 * All the pages are no longer in use and are exclusively locked.
	 */

	mhp->mh_deleted = NULL;

	kphysm_del_cleanup(mhp);

	comp_code = KPHYSM_OK;

/* Pairs with the earlier kphysm_setup_pre_del() call. */
t_exit:
	mutex_exit(&mhp->mh_mutex);
	kphysm_setup_post_del(mhp->mh_vm_pages,
	    (comp_code == KPHYSM_OK) ? 0 : 1);
	mutex_enter(&mhp->mh_mutex);

/*
 * Also reached directly when availrmem could not be reserved, in which
 * case kphysm_setup_pre_del() was never called.
 */
early_exit:
	/* mhp->mh_mutex exited by CALLB_CPR_EXIT() */
	mhp->mh_state = MHND_DONE;
	/*
	 * Take copies of the callback and its argument while the mutex
	 * is still held; the callback itself runs without it.
	 */
	del_complete_funcp = mhp->mh_delete_complete;
	del_complete_arg = mhp->mh_delete_complete_arg;
	CALLB_CPR_EXIT(&cprinfo);
	(*del_complete_funcp)(del_complete_arg, comp_code);
	thread_exit();
	/*NOTREACHED*/
}
2336 */ 2337 int 2338 kphysm_del_start( 2339 memhandle_t handle, 2340 void (*complete)(void *, int), 2341 void *complete_arg) 2342 { 2343 struct mem_handle *mhp; 2344 2345 mhp = kphysm_lookup_mem_handle(handle); 2346 if (mhp == NULL) { 2347 return (KPHYSM_EHANDLE); 2348 } 2349 switch (mhp->mh_state) { 2350 case MHND_FREE: 2351 ASSERT(mhp->mh_state != MHND_FREE); 2352 mutex_exit(&mhp->mh_mutex); 2353 return (KPHYSM_EHANDLE); 2354 case MHND_INIT: 2355 break; 2356 case MHND_STARTING: 2357 case MHND_RUNNING: 2358 mutex_exit(&mhp->mh_mutex); 2359 return (KPHYSM_ESEQUENCE); 2360 case MHND_DONE: 2361 mutex_exit(&mhp->mh_mutex); 2362 return (KPHYSM_ESEQUENCE); 2363 case MHND_RELEASE: 2364 mutex_exit(&mhp->mh_mutex); 2365 return (KPHYSM_ESEQUENCE); 2366 default: 2367 #ifdef DEBUG 2368 cmn_err(CE_WARN, "kphysm_del_start(0x%p) state corrupt %d", 2369 (void *)mhp, mhp->mh_state); 2370 #endif /* DEBUG */ 2371 mutex_exit(&mhp->mh_mutex); 2372 return (KPHYSM_EHANDLE); 2373 } 2374 2375 if (mhp->mh_transit.trl_spans == NULL) { 2376 mutex_exit(&mhp->mh_mutex); 2377 return (KPHYSM_ENOWORK); 2378 } 2379 2380 ASSERT(complete != NULL); 2381 mhp->mh_delete_complete = complete; 2382 mhp->mh_delete_complete_arg = complete_arg; 2383 mhp->mh_state = MHND_STARTING; 2384 /* 2385 * Release the mutex in case thread_create sleeps. 2386 */ 2387 mutex_exit(&mhp->mh_mutex); 2388 2389 /* 2390 * The "obvious" process for this thread is pageout (proc_pageout) 2391 * but this gives the thread too much power over freemem 2392 * which results in freemem starvation. 2393 */ 2394 (void) thread_create(NULL, 0, delete_memory_thread, mhp, 0, &p0, 2395 TS_RUN, maxclsyspri - 1); 2396 2397 return (KPHYSM_OK); 2398 } 2399 2400 static kmutex_t pp_dummy_lock; /* Protects init. of pp_dummy. */ 2401 static caddr_t pp_dummy; 2402 static pgcnt_t pp_dummy_npages; 2403 static pfn_t *pp_dummy_pfn; /* Array of dummy pfns. 
*/ 2404 2405 static void 2406 memseg_remap_init_pages(page_t *pages, page_t *epages) 2407 { 2408 page_t *pp; 2409 2410 for (pp = pages; pp < epages; pp++) { 2411 pp->p_pagenum = PFN_INVALID; /* XXXX */ 2412 pp->p_offset = (u_offset_t)-1; 2413 page_iolock_init(pp); 2414 while (!page_lock(pp, SE_EXCL, (kmutex_t *)NULL, P_RECLAIM)) 2415 continue; 2416 page_lock_delete(pp); 2417 } 2418 } 2419 2420 void 2421 memseg_remap_init() 2422 { 2423 mutex_enter(&pp_dummy_lock); 2424 if (pp_dummy == NULL) { 2425 uint_t dpages; 2426 int i; 2427 2428 /* 2429 * dpages starts off as the size of the structure and 2430 * ends up as the minimum number of pages that will 2431 * hold a whole number of page_t structures. 2432 */ 2433 dpages = sizeof (page_t); 2434 ASSERT(dpages != 0); 2435 ASSERT(dpages <= MMU_PAGESIZE); 2436 2437 while ((dpages & 1) == 0) 2438 dpages >>= 1; 2439 2440 pp_dummy_npages = dpages; 2441 /* 2442 * Allocate pp_dummy pages directly from static_arena, 2443 * since these are whole page allocations and are 2444 * referenced by physical address. This also has the 2445 * nice fringe benefit of hiding the memory from 2446 * ::findleaks since it doesn't deal well with allocated 2447 * kernel heap memory that doesn't have any mappings. 2448 */ 2449 pp_dummy = vmem_xalloc(static_arena, ptob(pp_dummy_npages), 2450 PAGESIZE, 0, 0, NULL, NULL, VM_SLEEP); 2451 bzero(pp_dummy, ptob(pp_dummy_npages)); 2452 ASSERT(((uintptr_t)pp_dummy & MMU_PAGEOFFSET) == 0); 2453 pp_dummy_pfn = kmem_alloc(sizeof (*pp_dummy_pfn) * 2454 pp_dummy_npages, KM_SLEEP); 2455 for (i = 0; i < pp_dummy_npages; i++) { 2456 pp_dummy_pfn[i] = hat_getpfnum(kas.a_hat, 2457 &pp_dummy[MMU_PAGESIZE * i]); 2458 ASSERT(pp_dummy_pfn[i] != PFN_INVALID); 2459 } 2460 /* 2461 * Initialize the page_t's to a known 'deleted' state 2462 * that matches the state of deleted pages. 
2463 */ 2464 memseg_remap_init_pages((page_t *)pp_dummy, 2465 (page_t *)(pp_dummy + 2466 ptob(pp_dummy_npages))); 2467 /* Remove kmem mappings for the pages for safety. */ 2468 hat_unload(kas.a_hat, pp_dummy, ptob(pp_dummy_npages), 2469 HAT_UNLOAD_UNLOCK); 2470 /* Leave pp_dummy pointer set as flag that init is done. */ 2471 } 2472 mutex_exit(&pp_dummy_lock); 2473 } 2474 2475 static void 2476 memseg_remap_to_dummy(caddr_t pp, pgcnt_t metapgs) 2477 { 2478 ASSERT(pp_dummy != NULL); 2479 2480 while (metapgs != 0) { 2481 pgcnt_t n; 2482 int i; 2483 2484 n = pp_dummy_npages; 2485 if (n > metapgs) 2486 n = metapgs; 2487 for (i = 0; i < n; i++) { 2488 hat_devload(kas.a_hat, pp, ptob(1), pp_dummy_pfn[i], 2489 PROT_READ, 2490 HAT_LOAD | HAT_LOAD_NOCONSIST | 2491 HAT_LOAD_REMAP); 2492 pp += ptob(1); 2493 } 2494 metapgs -= n; 2495 } 2496 } 2497 2498 /* 2499 * Transition all the deleted pages to the deleted state so that 2500 * page_lock will not wait. The page_lock_delete call will 2501 * also wake up any waiters. 2502 */ 2503 static void 2504 memseg_lock_delete_all(struct memseg *seg) 2505 { 2506 page_t *pp; 2507 2508 for (pp = seg->pages; pp < seg->epages; pp++) { 2509 pp->p_pagenum = PFN_INVALID; /* XXXX */ 2510 page_lock_delete(pp); 2511 } 2512 } 2513 2514 static void 2515 kphysm_del_cleanup(struct mem_handle *mhp) 2516 { 2517 struct memdelspan *mdsp; 2518 struct memseg *seg; 2519 struct memseg **segpp; 2520 struct memseg *seglist; 2521 pfn_t p_end; 2522 uint64_t avmem; 2523 pgcnt_t avpgs; 2524 pgcnt_t npgs; 2525 2526 avpgs = mhp->mh_vm_pages; 2527 2528 memsegs_lock(1); 2529 2530 /* 2531 * remove from main segment list. 2532 */ 2533 npgs = 0; 2534 seglist = NULL; 2535 for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL; 2536 mdsp = mdsp->mds_next) { 2537 p_end = mdsp->mds_base + mdsp->mds_npgs; 2538 for (segpp = &memsegs; (seg = *segpp) != NULL; ) { 2539 if (seg->pages_base >= p_end || 2540 seg->pages_end <= mdsp->mds_base) { 2541 /* Span and memseg don't overlap. 
*/ 2542 segpp = &((*segpp)->next); 2543 continue; 2544 } 2545 ASSERT(seg->pages_base >= mdsp->mds_base); 2546 ASSERT(seg->pages_end <= p_end); 2547 2548 /* Hide the memseg from future scans. */ 2549 hat_kpm_delmem_mseg_update(seg, segpp); 2550 *segpp = seg->next; 2551 membar_producer(); /* TODO: Needed? */ 2552 npgs += MSEG_NPAGES(seg); 2553 2554 /* 2555 * Leave the deleted segment's next pointer intact 2556 * in case a memsegs scanning loop is walking this 2557 * segment concurrently. 2558 */ 2559 seg->lnext = seglist; 2560 seglist = seg; 2561 } 2562 } 2563 2564 build_pfn_hash(); 2565 2566 ASSERT(npgs < total_pages); 2567 total_pages -= npgs; 2568 2569 /* 2570 * Recalculate the paging parameters now total_pages has changed. 2571 * This will also cause the clock hands to be reset before next use. 2572 */ 2573 setupclock(1); 2574 2575 memsegs_unlock(1); 2576 2577 mutex_exit(&mhp->mh_mutex); 2578 2579 while ((seg = seglist) != NULL) { 2580 pfn_t mseg_start; 2581 pfn_t mseg_base, mseg_end; 2582 pgcnt_t mseg_npgs; 2583 page_t *pp; 2584 pgcnt_t metapgs; 2585 int dynamic; 2586 int mlret; 2587 2588 seglist = seg->lnext; 2589 2590 /* 2591 * Put the page_t's into the deleted state to stop 2592 * cv_wait()s on the pages. When we remap, the dummy 2593 * page_t's will be in the same state. 2594 */ 2595 memseg_lock_delete_all(seg); 2596 /* 2597 * Collect up information based on pages_base and pages_end 2598 * early so that we can flag early that the memseg has been 2599 * deleted by setting pages_end == pages_base. 2600 */ 2601 mseg_base = seg->pages_base; 2602 mseg_end = seg->pages_end; 2603 mseg_npgs = MSEG_NPAGES(seg); 2604 dynamic = memseg_is_dynamic(seg, &mseg_start); 2605 2606 seg->pages_end = seg->pages_base; 2607 2608 if (dynamic) { 2609 pp = seg->pages; 2610 metapgs = mseg_base - mseg_start; 2611 ASSERT(metapgs != 0); 2612 2613 /* Remap the meta data to our special dummy area. 
*/ 2614 memseg_remap_to_dummy((caddr_t)pp, metapgs); 2615 2616 mutex_enter(&memseg_lists_lock); 2617 seg->lnext = memseg_va_avail; 2618 memseg_va_avail = seg; 2619 mutex_exit(&memseg_lists_lock); 2620 } else { 2621 /* 2622 * Set for clean-up below. 2623 */ 2624 mseg_start = seg->pages_base; 2625 /* 2626 * For memory whose page_ts were allocated 2627 * at boot, we need to find a new use for 2628 * the page_t memory. 2629 * For the moment, just leak it. 2630 * (It is held in the memseg_delete_junk list.) 2631 */ 2632 2633 mutex_enter(&memseg_lists_lock); 2634 seg->lnext = memseg_delete_junk; 2635 memseg_delete_junk = seg; 2636 mutex_exit(&memseg_lists_lock); 2637 } 2638 2639 /* Must not use seg now as it could be re-used. */ 2640 2641 memlist_write_lock(); 2642 2643 mlret = memlist_delete_span( 2644 (uint64_t)(mseg_base) << PAGESHIFT, 2645 (uint64_t)(mseg_npgs) << PAGESHIFT, 2646 &phys_avail); 2647 ASSERT(mlret == MEML_SPANOP_OK); 2648 2649 mlret = memlist_delete_span( 2650 (uint64_t)(mseg_start) << PAGESHIFT, 2651 (uint64_t)(mseg_end - mseg_start) << 2652 PAGESHIFT, 2653 &phys_install); 2654 ASSERT(mlret == MEML_SPANOP_OK); 2655 phys_install_has_changed(); 2656 2657 memlist_write_unlock(); 2658 } 2659 2660 memlist_read_lock(); 2661 installed_top_size(phys_install, &physmax, &physinstalled); 2662 memlist_read_unlock(); 2663 2664 mutex_enter(&freemem_lock); 2665 maxmem -= avpgs; 2666 physmem -= avpgs; 2667 /* availrmem is adjusted during the delete. 
*/ 2668 availrmem_initial -= avpgs; 2669 2670 mutex_exit(&freemem_lock); 2671 2672 dump_resize(); 2673 2674 cmn_err(CE_CONT, "?kphysm_delete: mem = %ldK " 2675 "(0x%" PRIx64 ")\n", 2676 physinstalled << (PAGESHIFT - 10), 2677 (uint64_t)physinstalled << PAGESHIFT); 2678 2679 avmem = (uint64_t)freemem << PAGESHIFT; 2680 cmn_err(CE_CONT, "?kphysm_delete: " 2681 "avail mem = %" PRId64 "\n", avmem); 2682 2683 /* 2684 * Update lgroup generation number on single lgroup systems 2685 */ 2686 if (nlgrps == 1) 2687 lgrp_config(LGRP_CONFIG_GEN_UPDATE, 0, 0); 2688 2689 /* Successfully deleted system memory */ 2690 mutex_enter(&mhp->mh_mutex); 2691 } 2692 2693 static uint_t mdel_nullvp_waiter; 2694 2695 static void 2696 page_delete_collect( 2697 page_t *pp, 2698 struct mem_handle *mhp) 2699 { 2700 if (pp->p_vnode) { 2701 page_hashout(pp, (kmutex_t *)NULL); 2702 /* do not do PP_SETAGED(pp); */ 2703 } else { 2704 kmutex_t *sep; 2705 2706 sep = page_se_mutex(pp); 2707 mutex_enter(sep); 2708 if (CV_HAS_WAITERS(&pp->p_cv)) { 2709 mdel_nullvp_waiter++; 2710 cv_broadcast(&pp->p_cv); 2711 } 2712 mutex_exit(sep); 2713 } 2714 ASSERT(pp->p_next == pp->p_prev); 2715 ASSERT(pp->p_next == NULL || pp->p_next == pp); 2716 pp->p_next = mhp->mh_deleted; 2717 mhp->mh_deleted = pp; 2718 ASSERT(mhp->mh_hold_todo != 0); 2719 mhp->mh_hold_todo--; 2720 } 2721 2722 static void 2723 transit_list_collect(struct mem_handle *mhp, int v) 2724 { 2725 struct transit_list_head *trh; 2726 2727 trh = &transit_list_head; 2728 mutex_enter(&trh->trh_lock); 2729 mhp->mh_transit.trl_collect = v; 2730 mutex_exit(&trh->trh_lock); 2731 } 2732 2733 static void 2734 transit_list_insert(struct transit_list *tlp) 2735 { 2736 struct transit_list_head *trh; 2737 2738 trh = &transit_list_head; 2739 ASSERT(MUTEX_HELD(&trh->trh_lock)); 2740 tlp->trl_next = trh->trh_head; 2741 trh->trh_head = tlp; 2742 } 2743 2744 static void 2745 transit_list_remove(struct transit_list *tlp) 2746 { 2747 struct transit_list_head *trh; 2748 struct 
transit_list **tlpp; 2749 2750 trh = &transit_list_head; 2751 tlpp = &trh->trh_head; 2752 ASSERT(MUTEX_HELD(&trh->trh_lock)); 2753 while (*tlpp != NULL && *tlpp != tlp) 2754 tlpp = &(*tlpp)->trl_next; 2755 ASSERT(*tlpp != NULL); 2756 if (*tlpp == tlp) 2757 *tlpp = tlp->trl_next; 2758 tlp->trl_next = NULL; 2759 } 2760 2761 static struct transit_list * 2762 pfnum_to_transit_list(struct transit_list_head *trh, pfn_t pfnum) 2763 { 2764 struct transit_list *tlp; 2765 2766 for (tlp = trh->trh_head; tlp != NULL; tlp = tlp->trl_next) { 2767 struct memdelspan *mdsp; 2768 2769 for (mdsp = tlp->trl_spans; mdsp != NULL; 2770 mdsp = mdsp->mds_next) { 2771 if (pfnum >= mdsp->mds_base && 2772 pfnum < (mdsp->mds_base + mdsp->mds_npgs)) { 2773 return (tlp); 2774 } 2775 } 2776 } 2777 return (NULL); 2778 } 2779 2780 int 2781 pfn_is_being_deleted(pfn_t pfnum) 2782 { 2783 struct transit_list_head *trh; 2784 struct transit_list *tlp; 2785 int ret; 2786 2787 trh = &transit_list_head; 2788 if (trh->trh_head == NULL) 2789 return (0); 2790 2791 mutex_enter(&trh->trh_lock); 2792 tlp = pfnum_to_transit_list(trh, pfnum); 2793 ret = (tlp != NULL && tlp->trl_collect); 2794 mutex_exit(&trh->trh_lock); 2795 2796 return (ret); 2797 } 2798 2799 #ifdef MEM_DEL_STATS 2800 extern int hz; 2801 static void 2802 mem_del_stat_print_func(struct mem_handle *mhp) 2803 { 2804 uint64_t tmp; 2805 2806 if (mem_del_stat_print) { 2807 printf("memory delete loop %x/%x, statistics%s\n", 2808 (uint_t)mhp->mh_transit.trl_spans->mds_base, 2809 (uint_t)mhp->mh_transit.trl_spans->mds_npgs, 2810 (mhp->mh_cancel ? 
" (cancelled)" : "")); 2811 printf("\t%8u nloop\n", mhp->mh_delstat.nloop); 2812 printf("\t%8u need_free\n", mhp->mh_delstat.need_free); 2813 printf("\t%8u free_loop\n", mhp->mh_delstat.free_loop); 2814 printf("\t%8u free_low\n", mhp->mh_delstat.free_low); 2815 printf("\t%8u free_failed\n", mhp->mh_delstat.free_failed); 2816 printf("\t%8u ncheck\n", mhp->mh_delstat.ncheck); 2817 printf("\t%8u nopaget\n", mhp->mh_delstat.nopaget); 2818 printf("\t%8u lockfail\n", mhp->mh_delstat.lockfail); 2819 printf("\t%8u nfree\n", mhp->mh_delstat.nfree); 2820 printf("\t%8u nreloc\n", mhp->mh_delstat.nreloc); 2821 printf("\t%8u nrelocfail\n", mhp->mh_delstat.nrelocfail); 2822 printf("\t%8u already_done\n", mhp->mh_delstat.already_done); 2823 printf("\t%8u first_notfree\n", mhp->mh_delstat.first_notfree); 2824 printf("\t%8u npplocked\n", mhp->mh_delstat.npplocked); 2825 printf("\t%8u nlockreloc\n", mhp->mh_delstat.nlockreloc); 2826 printf("\t%8u nnorepl\n", mhp->mh_delstat.nnorepl); 2827 printf("\t%8u nmodreloc\n", mhp->mh_delstat.nmodreloc); 2828 printf("\t%8u ndestroy\n", mhp->mh_delstat.ndestroy); 2829 printf("\t%8u nputpage\n", mhp->mh_delstat.nputpage); 2830 printf("\t%8u nnoreclaim\n", mhp->mh_delstat.nnoreclaim); 2831 printf("\t%8u ndelay\n", mhp->mh_delstat.ndelay); 2832 printf("\t%8u demotefail\n", mhp->mh_delstat.demotefail); 2833 printf("\t%8u retired\n", mhp->mh_delstat.retired); 2834 printf("\t%8u toxic\n", mhp->mh_delstat.toxic); 2835 printf("\t%8u failing\n", mhp->mh_delstat.failing); 2836 printf("\t%8u modtoxic\n", mhp->mh_delstat.modtoxic); 2837 printf("\t%8u npplkdtoxic\n", mhp->mh_delstat.npplkdtoxic); 2838 printf("\t%8u gptlmodfail\n", mhp->mh_delstat.gptlmodfail); 2839 printf("\t%8u gptllckfail\n", mhp->mh_delstat.gptllckfail); 2840 tmp = mhp->mh_delstat.nticks_total / hz; /* seconds */ 2841 printf( 2842 "\t%"PRIu64" nticks_total - %"PRIu64" min %"PRIu64" sec\n", 2843 mhp->mh_delstat.nticks_total, tmp / 60, tmp % 60); 2844 2845 tmp = mhp->mh_delstat.nticks_pgrp 
/ hz; /* seconds */ 2846 printf( 2847 "\t%"PRIu64" nticks_pgrp - %"PRIu64" min %"PRIu64" sec\n", 2848 mhp->mh_delstat.nticks_pgrp, tmp / 60, tmp % 60); 2849 } 2850 } 2851 #endif /* MEM_DEL_STATS */ 2852 2853 struct mem_callback { 2854 kphysm_setup_vector_t *vec; 2855 void *arg; 2856 }; 2857 2858 #define NMEMCALLBACKS 100 2859 2860 static struct mem_callback mem_callbacks[NMEMCALLBACKS]; 2861 static uint_t nmemcallbacks; 2862 static krwlock_t mem_callback_rwlock; 2863 2864 int 2865 kphysm_setup_func_register(kphysm_setup_vector_t *vec, void *arg) 2866 { 2867 uint_t i, found; 2868 2869 /* 2870 * This test will become more complicated when the version must 2871 * change. 2872 */ 2873 if (vec->version != KPHYSM_SETUP_VECTOR_VERSION) 2874 return (EINVAL); 2875 2876 if (vec->post_add == NULL || vec->pre_del == NULL || 2877 vec->post_del == NULL) 2878 return (EINVAL); 2879 2880 rw_enter(&mem_callback_rwlock, RW_WRITER); 2881 for (i = 0, found = 0; i < nmemcallbacks; i++) { 2882 if (mem_callbacks[i].vec == NULL && found == 0) 2883 found = i + 1; 2884 if (mem_callbacks[i].vec == vec && 2885 mem_callbacks[i].arg == arg) { 2886 #ifdef DEBUG 2887 /* Catch this in DEBUG kernels. 
*/ 2888 cmn_err(CE_WARN, "kphysm_setup_func_register" 2889 "(0x%p, 0x%p) duplicate registration from 0x%p", 2890 (void *)vec, arg, (void *)caller()); 2891 #endif /* DEBUG */ 2892 rw_exit(&mem_callback_rwlock); 2893 return (EEXIST); 2894 } 2895 } 2896 if (found != 0) { 2897 i = found - 1; 2898 } else { 2899 ASSERT(nmemcallbacks < NMEMCALLBACKS); 2900 if (nmemcallbacks == NMEMCALLBACKS) { 2901 rw_exit(&mem_callback_rwlock); 2902 return (ENOMEM); 2903 } 2904 i = nmemcallbacks++; 2905 } 2906 mem_callbacks[i].vec = vec; 2907 mem_callbacks[i].arg = arg; 2908 rw_exit(&mem_callback_rwlock); 2909 return (0); 2910 } 2911 2912 void 2913 kphysm_setup_func_unregister(kphysm_setup_vector_t *vec, void *arg) 2914 { 2915 uint_t i; 2916 2917 rw_enter(&mem_callback_rwlock, RW_WRITER); 2918 for (i = 0; i < nmemcallbacks; i++) { 2919 if (mem_callbacks[i].vec == vec && 2920 mem_callbacks[i].arg == arg) { 2921 mem_callbacks[i].vec = NULL; 2922 mem_callbacks[i].arg = NULL; 2923 if (i == (nmemcallbacks - 1)) 2924 nmemcallbacks--; 2925 break; 2926 } 2927 } 2928 rw_exit(&mem_callback_rwlock); 2929 } 2930 2931 static void 2932 kphysm_setup_post_add(pgcnt_t delta_pages) 2933 { 2934 uint_t i; 2935 2936 rw_enter(&mem_callback_rwlock, RW_READER); 2937 for (i = 0; i < nmemcallbacks; i++) { 2938 if (mem_callbacks[i].vec != NULL) { 2939 (*mem_callbacks[i].vec->post_add) 2940 (mem_callbacks[i].arg, delta_pages); 2941 } 2942 } 2943 rw_exit(&mem_callback_rwlock); 2944 } 2945 2946 /* 2947 * Note the locking between pre_del and post_del: The reader lock is held 2948 * between the two calls to stop the set of functions from changing. 
2949 */ 2950 2951 static int 2952 kphysm_setup_pre_del(pgcnt_t delta_pages) 2953 { 2954 uint_t i; 2955 int ret; 2956 int aret; 2957 2958 ret = 0; 2959 rw_enter(&mem_callback_rwlock, RW_READER); 2960 for (i = 0; i < nmemcallbacks; i++) { 2961 if (mem_callbacks[i].vec != NULL) { 2962 aret = (*mem_callbacks[i].vec->pre_del) 2963 (mem_callbacks[i].arg, delta_pages); 2964 ret |= aret; 2965 } 2966 } 2967 2968 return (ret); 2969 } 2970 2971 static void 2972 kphysm_setup_post_del(pgcnt_t delta_pages, int cancelled) 2973 { 2974 uint_t i; 2975 2976 for (i = 0; i < nmemcallbacks; i++) { 2977 if (mem_callbacks[i].vec != NULL) { 2978 (*mem_callbacks[i].vec->post_del) 2979 (mem_callbacks[i].arg, delta_pages, cancelled); 2980 } 2981 } 2982 rw_exit(&mem_callback_rwlock); 2983 } 2984 2985 static int 2986 kphysm_split_memseg( 2987 pfn_t base, 2988 pgcnt_t npgs) 2989 { 2990 struct memseg *seg; 2991 struct memseg **segpp; 2992 pgcnt_t size_low, size_high; 2993 struct memseg *seg_low, *seg_mid, *seg_high; 2994 2995 /* 2996 * Lock the memsegs list against other updates now 2997 */ 2998 memsegs_lock(1); 2999 3000 /* 3001 * Find boot time memseg that wholly covers this area. 3002 */ 3003 3004 /* First find the memseg with page 'base' in it. */ 3005 for (segpp = &memsegs; (seg = *segpp) != NULL; 3006 segpp = &((*segpp)->next)) { 3007 if (base >= seg->pages_base && base < seg->pages_end) 3008 break; 3009 } 3010 if (seg == NULL) { 3011 memsegs_unlock(1); 3012 return (0); 3013 } 3014 if (memseg_is_dynamic(seg, (pfn_t *)NULL)) { 3015 memsegs_unlock(1); 3016 return (0); 3017 } 3018 if ((base + npgs) > seg->pages_end) { 3019 memsegs_unlock(1); 3020 return (0); 3021 } 3022 3023 /* 3024 * Work out the size of the two segments that will 3025 * surround the new segment, one for low address 3026 * and one for high. 
3027 */ 3028 ASSERT(base >= seg->pages_base); 3029 size_low = base - seg->pages_base; 3030 ASSERT(seg->pages_end >= (base + npgs)); 3031 size_high = seg->pages_end - (base + npgs); 3032 3033 /* 3034 * Sanity check. 3035 */ 3036 if ((size_low + size_high) == 0) { 3037 memsegs_unlock(1); 3038 return (0); 3039 } 3040 3041 /* 3042 * Allocate the new structures. The old memseg will not be freed 3043 * as there may be a reference to it. 3044 */ 3045 seg_low = NULL; 3046 seg_high = NULL; 3047 3048 if (size_low != 0) { 3049 seg_low = kmem_cache_alloc(memseg_cache, KM_SLEEP); 3050 bzero(seg_low, sizeof (struct memseg)); 3051 } 3052 3053 seg_mid = kmem_cache_alloc(memseg_cache, KM_SLEEP); 3054 bzero(seg_mid, sizeof (struct memseg)); 3055 3056 if (size_high != 0) { 3057 seg_high = kmem_cache_alloc(memseg_cache, KM_SLEEP); 3058 bzero(seg_high, sizeof (struct memseg)); 3059 } 3060 3061 /* 3062 * All allocation done now. 3063 */ 3064 if (size_low != 0) { 3065 seg_low->pages = seg->pages; 3066 seg_low->epages = seg_low->pages + size_low; 3067 seg_low->pages_base = seg->pages_base; 3068 seg_low->pages_end = seg_low->pages_base + size_low; 3069 seg_low->next = seg_mid; 3070 } 3071 if (size_high != 0) { 3072 seg_high->pages = seg->epages - size_high; 3073 seg_high->epages = seg_high->pages + size_high; 3074 seg_high->pages_base = seg->pages_end - size_high; 3075 seg_high->pages_end = seg_high->pages_base + size_high; 3076 seg_high->next = seg->next; 3077 } 3078 3079 seg_mid->pages = seg->pages + size_low; 3080 seg_mid->pages_base = seg->pages_base + size_low; 3081 seg_mid->epages = seg->epages - size_high; 3082 seg_mid->pages_end = seg->pages_end - size_high; 3083 seg_mid->next = (seg_high != NULL) ? seg_high : seg->next; 3084 3085 /* 3086 * Update hat_kpm specific info of all involved memsegs and 3087 * allow hat_kpm specific global chain updates. 
3088 */ 3089 hat_kpm_split_mseg_update(seg, segpp, seg_low, seg_mid, seg_high); 3090 3091 /* 3092 * At this point we have two equivalent memseg sub-chains, 3093 * seg and seg_low/seg_mid/seg_high, which both chain on to 3094 * the same place in the global chain. By re-writing the pointer 3095 * in the previous element we switch atomically from using the old 3096 * (seg) to the new. 3097 */ 3098 *segpp = (seg_low != NULL) ? seg_low : seg_mid; 3099 3100 membar_enter(); 3101 3102 build_pfn_hash(); 3103 memsegs_unlock(1); 3104 3105 /* 3106 * We leave the old segment, 'seg', intact as there may be 3107 * references to it. Also, as the value of total_pages has not 3108 * changed and the memsegs list is effectively the same when 3109 * accessed via the old or the new pointer, we do not have to 3110 * cause pageout_scanner() to re-evaluate its hand pointers. 3111 * 3112 * We currently do not re-use or reclaim the page_t memory. 3113 * If we do, then this may have to change. 3114 */ 3115 3116 mutex_enter(&memseg_lists_lock); 3117 seg->lnext = memseg_edit_junk; 3118 memseg_edit_junk = seg; 3119 mutex_exit(&memseg_lists_lock); 3120 3121 return (1); 3122 } 3123 3124 /* 3125 * The memsegs lock is only taken when modifying the memsegs list 3126 * and rebuilding the pfn hash table (after boot). 3127 * No lock is needed for read as memseg structure are never de-allocated 3128 * and the pointer linkage is never updated until the memseg is ready. 3129 */ 3130 krwlock_t memsegslock; 3131 3132 void 3133 memsegs_lock(int writer) 3134 { 3135 rw_enter(&memsegslock, writer ? RW_WRITER : RW_READER); 3136 } 3137 3138 /*ARGSUSED*/ 3139 void 3140 memsegs_unlock(int writer) 3141 { 3142 rw_exit(&memsegslock); 3143 } 3144 3145 /* 3146 * memlist (phys_install, phys_avail) locking. 3147 */ 3148 3149 /* 3150 * A read/write lock might be better here. 
3151 */ 3152 static kmutex_t memlists_mutex; 3153 3154 void 3155 memlist_read_lock() 3156 { 3157 mutex_enter(&memlists_mutex); 3158 } 3159 3160 void 3161 memlist_read_unlock() 3162 { 3163 mutex_exit(&memlists_mutex); 3164 } 3165 3166 void 3167 memlist_write_lock() 3168 { 3169 mutex_enter(&memlists_mutex); 3170 } 3171 3172 void 3173 memlist_write_unlock() 3174 { 3175 mutex_exit(&memlists_mutex); 3176 } 3177 3178 /* 3179 * The sfmmu hat layer (e.g.) accesses some parts of the memseg 3180 * structure using physical addresses. Therefore a kmem_cache is 3181 * used with KMC_NOHASH to avoid page crossings within a memseg 3182 * structure. KMC_NOHASH requires that no external (outside of 3183 * slab) information is allowed. This, in turn, implies that the 3184 * cache's slabsize must be exactly a single page, since per-slab 3185 * information (e.g. the freelist for the slab) is kept at the 3186 * end of the slab, where it is easy to locate. Should be changed 3187 * when a more obvious kmem_cache interface/flag will become 3188 * available. 3189 */ 3190 void 3191 mem_config_init() 3192 { 3193 memseg_cache = kmem_cache_create("memseg_cache", sizeof (struct memseg), 3194 0, NULL, NULL, NULL, NULL, static_arena, KMC_NOHASH); 3195 } 3196