1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #include <sys/types.h> 27 #include <sys/cmn_err.h> 28 #include <sys/vmem.h> 29 #include <sys/kmem.h> 30 #include <sys/systm.h> 31 #include <sys/machsystm.h> /* for page_freelist_coalesce() */ 32 #include <sys/errno.h> 33 #include <sys/memnode.h> 34 #include <sys/memlist.h> 35 #include <sys/memlist_impl.h> 36 #include <sys/tuneable.h> 37 #include <sys/proc.h> 38 #include <sys/disp.h> 39 #include <sys/debug.h> 40 #include <sys/vm.h> 41 #include <sys/callb.h> 42 #include <sys/memlist_plat.h> /* for installed_top_size() */ 43 #include <sys/condvar_impl.h> /* for CV_HAS_WAITERS() */ 44 #include <sys/dumphdr.h> /* for dump_resize() */ 45 #include <sys/atomic.h> /* for use in stats collection */ 46 #include <sys/rwlock.h> 47 #include <sys/cpuvar.h> 48 #include <vm/seg_kmem.h> 49 #include <vm/seg_kpm.h> 50 #include <vm/page.h> 51 #include <vm/vm_dep.h> 52 #define SUNDDI_IMPL /* so sunddi.h will not redefine splx() et al */ 53 #include <sys/sunddi.h> 54 #include <sys/mem_config.h> 55 #include <sys/mem_cage.h> 56 #include <sys/lgrp.h> 57 #include <sys/ddi.h> 58 #include <sys/modctl.h> 59 60 extern struct memlist *phys_avail; 61 62 extern void mem_node_add(pfn_t, pfn_t); 63 extern void mem_node_del(pfn_t, pfn_t); 64 65 extern uint_t page_ctrs_adjust(int); 66 static void kphysm_setup_post_add(pgcnt_t); 67 static int kphysm_setup_pre_del(pgcnt_t); 68 static void kphysm_setup_post_del(pgcnt_t, int); 69 70 static int kphysm_split_memseg(pfn_t base, pgcnt_t npgs); 71 72 static int delspan_reserve(pfn_t, pgcnt_t); 73 static void delspan_unreserve(pfn_t, pgcnt_t); 74 75 static kmutex_t memseg_lists_lock; 76 static struct memseg *memseg_va_avail; 77 static struct memseg *memseg_delete_junk; 78 static struct memseg *memseg_edit_junk; 79 void memseg_remap_init(void); 80 static void memseg_remap_to_dummy(caddr_t, pgcnt_t); 81 static void kphysm_addmem_error_undospan(pfn_t, pgcnt_t); 82 static struct memseg *memseg_reuse(pgcnt_t); 83 84 static struct kmem_cache *memseg_cache; 85 86 /* 87 * Add a chunk of memory to the system. page_t's for this memory 88 * are allocated in the first few pages of the chunk. 89 * base: starting PAGESIZE page of new memory. 90 * npgs: length in PAGESIZE pages. 91 * 92 * Adding mem this way doesn't increase the size of the hash tables; 93 * growing them would be too hard. This should be OK, but adding memory 94 * dynamically most likely means more hash misses, since the tables will 95 * be smaller than they otherwise would be. 
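 *
 * As an illustration only (a hedged sketch, compiled out behind a
 * hypothetical MEM_CONFIG_EXAMPLES guard that is never defined), a
 * platform DR driver holding a physical address and a byte length
 * would call this entry point roughly as follows:
 */
#ifdef	MEM_CONFIG_EXAMPLES
static int
example_add_board_memory(uint64_t pa, uint64_t nbytes)
{
	/* Illustrative helper; assumes pa and nbytes are page aligned. */
	pfn_t base = (pfn_t)(pa >> PAGESHIFT);
	pgcnt_t npgs = (pgcnt_t)(nbytes >> PAGESHIFT);

	/* Both arguments to the add are in units of PAGESIZE pages. */
	return (kphysm_add_memory_dynamic(base, npgs));
}
#endif	/* MEM_CONFIG_EXAMPLES */
/*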
96 */ 97 int 98 kphysm_add_memory_dynamic(pfn_t base, pgcnt_t npgs) 99 { 100 page_t *pp; 101 page_t *opp, *oepp; 102 struct memseg *seg; 103 uint64_t avmem; 104 pfn_t pfn; 105 pfn_t pt_base = base; 106 pgcnt_t tpgs = npgs; 107 pgcnt_t metapgs; 108 int exhausted; 109 pfn_t pnum; 110 int mnode; 111 caddr_t vaddr; 112 int reuse; 113 int mlret; 114 void *mapva; 115 pgcnt_t nkpmpgs = 0; 116 offset_t kpm_pages_off; 117 118 cmn_err(CE_CONT, 119 "?kphysm_add_memory_dynamic: adding %ldK at 0x%" PRIx64 "\n", 120 npgs << (PAGESHIFT - 10), (uint64_t)base << PAGESHIFT); 121 122 /* 123 * Add this span in the delete list to prevent interactions. 124 */ 125 if (!delspan_reserve(base, npgs)) { 126 return (KPHYSM_ESPAN); 127 } 128 /* 129 * Check to see if any of the memory span has been added 130 * by trying an add to the installed memory list. This 131 * forms the interlocking process for add. 132 */ 133 134 memlist_write_lock(); 135 136 mlret = memlist_add_span((uint64_t)(pt_base) << PAGESHIFT, 137 (uint64_t)(tpgs) << PAGESHIFT, &phys_install); 138 139 if (mlret == MEML_SPANOP_OK) 140 installed_top_size(phys_install, &physmax, &physinstalled); 141 142 memlist_write_unlock(); 143 144 if (mlret != MEML_SPANOP_OK) { 145 if (mlret == MEML_SPANOP_EALLOC) { 146 delspan_unreserve(pt_base, tpgs); 147 return (KPHYSM_ERESOURCE); 148 } else 149 if (mlret == MEML_SPANOP_ESPAN) { 150 delspan_unreserve(pt_base, tpgs); 151 return (KPHYSM_ESPAN); 152 } else { 153 delspan_unreserve(pt_base, tpgs); 154 return (KPHYSM_ERESOURCE); 155 } 156 } 157 158 /* 159 * We store the page_t's for this new memory in the first 160 * few pages of the chunk. Here, we go and get'em ... 161 */ 162 163 /* 164 * The expression after the '-' gives the number of pages 165 * that will fit in the new memory based on a requirement 166 * of (PAGESIZE + sizeof (page_t)) bytes per page. 167 */ 168 metapgs = npgs - (((uint64_t)(npgs) << PAGESHIFT) / 169 (PAGESIZE + sizeof (page_t))); 170 171 npgs -= metapgs; 172 base += metapgs; 173 174 ASSERT(btopr(npgs * sizeof (page_t)) <= metapgs); 175 176 exhausted = (metapgs == 0 || npgs == 0); 177 178 if (kpm_enable && !exhausted) { 179 pgcnt_t start, end, nkpmpgs_prelim; 180 size_t ptsz; 181 182 /* 183 * A viable kpm large page mapping must not overlap two 184 * dynamic memsegs. Therefore the total size is checked 185 * to be at least kpm_pgsz and also whether start and end 186 * points are at least kpm_pgsz aligned. 187 */ 188 if (ptokpmp(tpgs) < 1 || pmodkpmp(pt_base) || 189 pmodkpmp(base + npgs)) { 190 191 kphysm_addmem_error_undospan(pt_base, tpgs); 192 193 /* 194 * There is no specific error code for violating 195 * kpm granularity constraints. 196 */ 197 return (KPHYSM_ENOTVIABLE); 198 } 199 200 start = kpmptop(ptokpmp(base)); 201 end = kpmptop(ptokpmp(base + npgs)); 202 nkpmpgs_prelim = ptokpmp(end - start); 203 ptsz = npgs * sizeof (page_t); 204 metapgs = btopr(ptsz + nkpmpgs_prelim * KPMPAGE_T_SZ); 205 exhausted = (tpgs <= metapgs); 206 if (!exhausted) { 207 npgs = tpgs - metapgs; 208 base = pt_base + metapgs; 209 210 /* final nkpmpgs */ 211 start = kpmptop(ptokpmp(base)); 212 nkpmpgs = ptokpmp(end - start); 213 kpm_pages_off = ptsz + 214 (nkpmpgs_prelim - nkpmpgs) * KPMPAGE_T_SZ; 215 } 216 } 217 218 /* 219 * Is memory area supplied too small? 220 */ 221 if (exhausted) { 222 kphysm_addmem_error_undospan(pt_base, tpgs); 223 224 /* 225 * There is no specific error code for 'too small'. 
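	 *
	 * For a sense of scale (the figures are illustrative assumptions
	 * only): with 8K pages and a 128-byte page_t, a 65536-page span
	 * would set aside 65536 - (65536 * 8192) / (8192 + 128) = 1009
	 * metadata pages and leave 64527 usable ones, so "exhausted" is
	 * only reached when the span is so small that the page_t (and,
	 * with kpm enabled, the kpm) metadata consumes all of it.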
226 */ 227 return (KPHYSM_ERESOURCE); 228 } 229 230 /* 231 * We may re-use a previously allocated VA space for the page_ts 232 * eventually, but we need to initialize and lock the pages first. 233 */ 234 235 /* 236 * Get an address in the kernel address map, map 237 * the page_t pages and see if we can touch them. 238 */ 239 240 mapva = vmem_alloc(heap_arena, ptob(metapgs), VM_NOSLEEP); 241 if (mapva == NULL) { 242 cmn_err(CE_WARN, "kphysm_add_memory_dynamic:" 243 " Can't allocate VA for page_ts"); 244 245 kphysm_addmem_error_undospan(pt_base, tpgs); 246 247 return (KPHYSM_ERESOURCE); 248 } 249 pp = mapva; 250 251 if (physmax < (pt_base + tpgs)) 252 physmax = (pt_base + tpgs); 253 254 /* 255 * In the remapping code we map one page at a time so we must do 256 * the same here to match mapping sizes. 257 */ 258 pfn = pt_base; 259 vaddr = (caddr_t)pp; 260 for (pnum = 0; pnum < metapgs; pnum++) { 261 hat_devload(kas.a_hat, vaddr, ptob(1), pfn, 262 PROT_READ | PROT_WRITE, 263 HAT_LOAD | HAT_LOAD_LOCK | HAT_LOAD_NOCONSIST); 264 pfn++; 265 vaddr += ptob(1); 266 } 267 268 if (ddi_peek32((dev_info_t *)NULL, 269 (int32_t *)pp, (int32_t *)0) == DDI_FAILURE) { 270 271 cmn_err(CE_WARN, "kphysm_add_memory_dynamic:" 272 " Can't access pp array at 0x%p [phys 0x%lx]", 273 (void *)pp, pt_base); 274 275 hat_unload(kas.a_hat, (caddr_t)pp, ptob(metapgs), 276 HAT_UNLOAD_UNMAP|HAT_UNLOAD_UNLOCK); 277 278 vmem_free(heap_arena, mapva, ptob(metapgs)); 279 280 kphysm_addmem_error_undospan(pt_base, tpgs); 281 282 return (KPHYSM_EFAULT); 283 } 284 285 /* 286 * Add this memory slice to its memory node translation. 287 * 288 * Note that right now, each node may have only one slice; 289 * this may change with COD or in larger SSM systems with 290 * nested latency groups, so we must not assume that the 291 * node does not yet exist. 292 * 293 * Also, using pt_base (page table base address) 294 * and tpgs (total number of pages) to mimic the case when a 295 * memory board is already installed in a system at boot 296 * time. This will ensure the entire address range is 297 * specified in order to have proper deletion. 298 */ 299 pnum = pt_base + tpgs - 1; 300 mem_node_add_slice(pt_base, pnum); 301 302 /* 303 * Allocate or resize page counters as necessary to accommodate 304 * the increase in memory pages. 305 */ 306 mnode = PFN_2_MEM_NODE(pnum); 307 if (page_ctrs_adjust(mnode) != 0) { 308 309 mem_node_pre_del_slice(pt_base, pnum); 310 mem_node_post_del_slice(pt_base, pnum, 0); 311 312 hat_unload(kas.a_hat, (caddr_t)pp, ptob(metapgs), 313 HAT_UNLOAD_UNMAP|HAT_UNLOAD_UNLOCK); 314 315 vmem_free(heap_arena, mapva, ptob(metapgs)); 316 317 kphysm_addmem_error_undospan(pt_base, tpgs); 318 319 return (KPHYSM_ERESOURCE); 320 } 321 322 /* 323 * Update the phys_avail memory list. 324 * The phys_install list was done at the start. 325 */ 326 327 memlist_write_lock(); 328 329 mlret = memlist_add_span((uint64_t)(base) << PAGESHIFT, 330 (uint64_t)(npgs) << PAGESHIFT, &phys_avail); 331 ASSERT(mlret == MEML_SPANOP_OK); 332 333 memlist_write_unlock(); 334 335 /* See if we can find a memseg to re-use. */ 336 seg = memseg_reuse(metapgs); 337 338 reuse = (seg != NULL); 339 340 /* 341 * Initialize the memseg structure representing this memory 342 * and add it to the existing list of memsegs. Do some basic 343 * initialization and add the memory to the system. 344 * In order to prevent lock deadlocks, the add_physmem() 345 * code is repeated here, but split into several stages. 
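	 *
	 * The stages, roughly in order: obtain a memseg (a fresh
	 * allocation or one recycled via memseg_reuse()); fill in its
	 * pages/epages and pages_base/pages_end; zero the metadata and
	 * initialize each page_t, taking it SE_EXCL locked; if re-using
	 * a memseg, remap the page_t pages into the recycled VA range
	 * and drop the temporary mapping; insert the memseg at the head
	 * of the memsegs list under memsegs_lock() and rebuild the pfn
	 * hash; then free the pages and update the memory globals
	 * outside that lock.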
346 */ 347 if (seg == NULL) { 348 seg = kmem_cache_alloc(memseg_cache, KM_SLEEP); 349 bzero(seg, sizeof (struct memseg)); 350 seg->msegflags = MEMSEG_DYNAMIC; 351 seg->pages = pp; 352 } else { 353 /*EMPTY*/ 354 ASSERT(seg->msegflags & MEMSEG_DYNAMIC); 355 } 356 357 seg->epages = seg->pages + npgs; 358 seg->pages_base = base; 359 seg->pages_end = base + npgs; 360 361 /* 362 * Initialize metadata. The page_ts are set to locked state 363 * ready to be freed. 364 */ 365 bzero((caddr_t)pp, ptob(metapgs)); 366 367 pfn = seg->pages_base; 368 /* Save the original pp base in case we reuse a memseg. */ 369 opp = pp; 370 oepp = opp + npgs; 371 for (pp = opp; pp < oepp; pp++) { 372 pp->p_pagenum = pfn; 373 pfn++; 374 page_iolock_init(pp); 375 while (!page_lock(pp, SE_EXCL, (kmutex_t *)NULL, P_RECLAIM)) 376 continue; 377 pp->p_offset = (u_offset_t)-1; 378 } 379 380 if (reuse) { 381 /* Remap our page_ts to the re-used memseg VA space. */ 382 pfn = pt_base; 383 vaddr = (caddr_t)seg->pages; 384 for (pnum = 0; pnum < metapgs; pnum++) { 385 hat_devload(kas.a_hat, vaddr, ptob(1), pfn, 386 PROT_READ | PROT_WRITE, 387 HAT_LOAD_REMAP | HAT_LOAD | HAT_LOAD_NOCONSIST); 388 pfn++; 389 vaddr += ptob(1); 390 } 391 392 hat_unload(kas.a_hat, (caddr_t)opp, ptob(metapgs), 393 HAT_UNLOAD_UNMAP|HAT_UNLOAD_UNLOCK); 394 395 vmem_free(heap_arena, mapva, ptob(metapgs)); 396 } 397 398 hat_kpm_addmem_mseg_update(seg, nkpmpgs, kpm_pages_off); 399 400 memsegs_lock(1); 401 402 /* 403 * The new memseg is inserted at the beginning of the list. 404 * Not only does this save searching for the tail, but in the 405 * case of a re-used memseg, it solves the problem of what 406 * happens if some process has still got a pointer to the 407 * memseg and follows the next pointer to continue traversing 408 * the memsegs list. 409 */ 410 411 hat_kpm_addmem_mseg_insert(seg); 412 413 seg->next = memsegs; 414 membar_producer(); 415 416 hat_kpm_addmem_memsegs_update(seg); 417 418 memsegs = seg; 419 420 build_pfn_hash(); 421 422 total_pages += npgs; 423 424 /* 425 * Recalculate the paging parameters now total_pages has changed. 426 * This will also cause the clock hands to be reset before next use. 427 */ 428 setupclock(1); 429 430 memsegs_unlock(1); 431 432 PLCNT_MODIFY_MAX(seg->pages_base, (long)npgs); 433 434 /* 435 * Free the pages outside the lock to avoid locking loops. 436 */ 437 for (pp = seg->pages; pp < seg->epages; pp++) { 438 page_free(pp, 1); 439 } 440 441 /* 442 * Now that we've updated the appropriate memory lists we 443 * need to reset a number of globals, since we've increased memory. 444 * Several have already been updated for us as noted above. The 445 * globals we're interested in at this point are: 446 * physmax - highest page frame number. 
	 *	physinstalled - number of pages currently installed (done earlier)
	 *	maxmem - max free pages in the system
	 *	physmem - physical memory pages available
	 *	availrmem - real memory available
	 */

	mutex_enter(&freemem_lock);
	maxmem += npgs;
	physmem += npgs;
	availrmem += npgs;
	availrmem_initial += npgs;

	mutex_exit(&freemem_lock);

	dump_resize();

	page_freelist_coalesce_all(mnode);

	kphysm_setup_post_add(npgs);

	cmn_err(CE_CONT, "?kphysm_add_memory_dynamic: mem = %ldK "
	    "(0x%" PRIx64 ")\n",
	    physinstalled << (PAGESHIFT - 10),
	    (uint64_t)physinstalled << PAGESHIFT);

	avmem = (uint64_t)freemem << PAGESHIFT;
	cmn_err(CE_CONT, "?kphysm_add_memory_dynamic: "
	    "avail mem = %" PRId64 "\n", avmem);

	/*
	 * Update lgroup generation number on single lgroup systems
	 */
	if (nlgrps == 1)
		lgrp_config(LGRP_CONFIG_GEN_UPDATE, 0, 0);

	delspan_unreserve(pt_base, tpgs);
	return (KPHYSM_OK);		/* Successfully added system memory */

}

/*
 * There are various error conditions in kphysm_add_memory_dynamic()
 * which require a rollback of already changed global state.
 */
static void
kphysm_addmem_error_undospan(pfn_t pt_base, pgcnt_t tpgs)
{
	int mlret;

	/* Unreserve memory span. */
	memlist_write_lock();

	mlret = memlist_delete_span(
	    (uint64_t)(pt_base) << PAGESHIFT,
	    (uint64_t)(tpgs) << PAGESHIFT, &phys_install);

	ASSERT(mlret == MEML_SPANOP_OK);
	phys_install_has_changed();
	installed_top_size(phys_install, &physmax, &physinstalled);

	memlist_write_unlock();
	delspan_unreserve(pt_base, tpgs);
}

/*
 * Only return an available memseg of exactly the right size.
 * When the meta data area has its own virtual address space
 * we will need to manage this more carefully and do best fit
 * allocations, possibly splitting an available area.
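 *
 * The search below uses the pointer-to-pointer unlink idiom that also
 * appears in kphysm_free_mem_handle(); a minimal generic sketch of the
 * idiom (compiled out behind a hypothetical MEM_CONFIG_EXAMPLES guard
 * that is never defined) is:
 */
#ifdef	MEM_CONFIG_EXAMPLES
/* Illustrative list node, not used anywhere in this file. */
struct example_node {
	struct example_node *n_next;
	int n_key;
};

static struct example_node *
example_unlink(struct example_node **headp, int key)
{
	struct example_node **npp, *np;

	for (npp = headp; (np = *npp) != NULL; npp = &np->n_next) {
		if (np->n_key == key) {
			*npp = np->n_next;	/* works for head or middle */
			np->n_next = NULL;
			return (np);
		}
	}
	return (NULL);
}
#endif	/* MEM_CONFIG_EXAMPLES */
/*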
516 */ 517 static struct memseg * 518 memseg_reuse(pgcnt_t metapgs) 519 { 520 struct memseg **segpp, *seg; 521 522 mutex_enter(&memseg_lists_lock); 523 524 segpp = &memseg_va_avail; 525 for (; (seg = *segpp) != NULL; segpp = &seg->lnext) { 526 caddr_t end; 527 528 if (kpm_enable) 529 end = hat_kpm_mseg_reuse(seg); 530 else 531 end = (caddr_t)seg->epages; 532 533 if (btopr(end - (caddr_t)seg->pages) == metapgs) { 534 *segpp = seg->lnext; 535 seg->lnext = NULL; 536 break; 537 } 538 } 539 mutex_exit(&memseg_lists_lock); 540 541 return (seg); 542 } 543 544 static uint_t handle_gen; 545 546 struct memdelspan { 547 struct memdelspan *mds_next; 548 pfn_t mds_base; 549 pgcnt_t mds_npgs; 550 uint_t *mds_bitmap; 551 uint_t *mds_bitmap_retired; 552 }; 553 554 #define NBPBMW (sizeof (uint_t) * NBBY) 555 #define MDS_BITMAPBYTES(MDSP) \ 556 ((((MDSP)->mds_npgs + NBPBMW - 1) / NBPBMW) * sizeof (uint_t)) 557 558 struct transit_list { 559 struct transit_list *trl_next; 560 struct memdelspan *trl_spans; 561 int trl_collect; 562 }; 563 564 struct transit_list_head { 565 kmutex_t trh_lock; 566 struct transit_list *trh_head; 567 }; 568 569 static struct transit_list_head transit_list_head; 570 571 struct mem_handle; 572 static void transit_list_collect(struct mem_handle *, int); 573 static void transit_list_insert(struct transit_list *); 574 static void transit_list_remove(struct transit_list *); 575 576 #ifdef DEBUG 577 #define MEM_DEL_STATS 578 #endif /* DEBUG */ 579 580 #ifdef MEM_DEL_STATS 581 static int mem_del_stat_print = 0; 582 struct mem_del_stat { 583 uint_t nloop; 584 uint_t need_free; 585 uint_t free_loop; 586 uint_t free_low; 587 uint_t free_failed; 588 uint_t ncheck; 589 uint_t nopaget; 590 uint_t lockfail; 591 uint_t nfree; 592 uint_t nreloc; 593 uint_t nrelocfail; 594 uint_t already_done; 595 uint_t first_notfree; 596 uint_t npplocked; 597 uint_t nlockreloc; 598 uint_t nnorepl; 599 uint_t nmodreloc; 600 uint_t ndestroy; 601 uint_t nputpage; 602 uint_t nnoreclaim; 603 uint_t ndelay; 604 uint_t demotefail; 605 uint64_t nticks_total; 606 uint64_t nticks_pgrp; 607 uint_t retired; 608 uint_t toxic; 609 uint_t failing; 610 uint_t modtoxic; 611 uint_t npplkdtoxic; 612 uint_t gptlmodfail; 613 uint_t gptllckfail; 614 }; 615 /* 616 * The stat values are only incremented in the delete thread 617 * so no locking or atomic required. 618 */ 619 #define MDSTAT_INCR(MHP, FLD) (MHP)->mh_delstat.FLD++ 620 #define MDSTAT_TOTAL(MHP, ntck) ((MHP)->mh_delstat.nticks_total += (ntck)) 621 #define MDSTAT_PGRP(MHP, ntck) ((MHP)->mh_delstat.nticks_pgrp += (ntck)) 622 static void mem_del_stat_print_func(struct mem_handle *); 623 #define MDSTAT_PRINT(MHP) mem_del_stat_print_func((MHP)) 624 #else /* MEM_DEL_STATS */ 625 #define MDSTAT_INCR(MHP, FLD) 626 #define MDSTAT_TOTAL(MHP, ntck) 627 #define MDSTAT_PGRP(MHP, ntck) 628 #define MDSTAT_PRINT(MHP) 629 #endif /* MEM_DEL_STATS */ 630 631 typedef enum mhnd_state {MHND_FREE = 0, MHND_INIT, MHND_STARTING, 632 MHND_RUNNING, MHND_DONE, MHND_RELEASE} mhnd_state_t; 633 634 /* 635 * mh_mutex must be taken to examine or change mh_exthandle and mh_state. 636 * The mutex may not be required for other fields, dependent on mh_state. 
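 *
 * For reference, the normal life cycle of a handle (see the state
 * checks in kphysm_del_start(), kphysm_del_cancel() and
 * kphysm_del_release()) is:
 *
 *	MHND_FREE -> MHND_INIT -> MHND_STARTING -> MHND_RUNNING ->
 *	    MHND_DONE -> MHND_RELEASE -> MHND_FREE
 *
 * with kphysm_del_release() also accepting a handle that is still in
 * MHND_INIT (nothing was started).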
 */
struct mem_handle {
	kmutex_t mh_mutex;
	struct mem_handle *mh_next;
	memhandle_t mh_exthandle;
	mhnd_state_t mh_state;
	struct transit_list mh_transit;
	pgcnt_t mh_phys_pages;
	pgcnt_t mh_vm_pages;
	pgcnt_t mh_hold_todo;
	void (*mh_delete_complete)(void *, int error);
	void *mh_delete_complete_arg;
	volatile uint_t mh_cancel;
	volatile uint_t mh_dr_aio_cleanup_cancel;
	volatile uint_t mh_aio_cleanup_done;
	kcondvar_t mh_cv;
	kthread_id_t mh_thread_id;
	page_t *mh_deleted;	/* link through p_next */
#ifdef MEM_DEL_STATS
	struct mem_del_stat mh_delstat;
#endif /* MEM_DEL_STATS */
};

static struct mem_handle *mem_handle_head;
static kmutex_t mem_handle_list_mutex;

static struct mem_handle *
kphysm_allocate_mem_handle()
{
	struct mem_handle *mhp;

	mhp = kmem_zalloc(sizeof (struct mem_handle), KM_SLEEP);
	mutex_init(&mhp->mh_mutex, NULL, MUTEX_DEFAULT, NULL);
	mutex_enter(&mem_handle_list_mutex);
	mutex_enter(&mhp->mh_mutex);
	/* handle_gen is protected by list mutex. */
	mhp->mh_exthandle = (memhandle_t)(uintptr_t)(++handle_gen);
	mhp->mh_next = mem_handle_head;
	mem_handle_head = mhp;
	mutex_exit(&mem_handle_list_mutex);

	return (mhp);
}

static void
kphysm_free_mem_handle(struct mem_handle *mhp)
{
	struct mem_handle **mhpp;

	ASSERT(mutex_owned(&mhp->mh_mutex));
	ASSERT(mhp->mh_state == MHND_FREE);
	/*
	 * Exit the mutex to preserve locking order. This is OK
	 * here as once in the FREE state, the handle cannot
	 * be found by a lookup.
	 */
	mutex_exit(&mhp->mh_mutex);

	mutex_enter(&mem_handle_list_mutex);
	mhpp = &mem_handle_head;
	while (*mhpp != NULL && *mhpp != mhp)
		mhpp = &(*mhpp)->mh_next;
	ASSERT(*mhpp == mhp);
	/*
	 * No need to lock the handle (mh_mutex) as only
	 * mh_next is changing and this is the only thread that
	 * can be referencing mhp.
	 */
	*mhpp = mhp->mh_next;
	mutex_exit(&mem_handle_list_mutex);

	mutex_destroy(&mhp->mh_mutex);
	kmem_free(mhp, sizeof (struct mem_handle));
}

/*
 * This function finds the internal mem_handle corresponding to an
 * external handle and returns it with the mh_mutex held.
 */
static struct mem_handle *
kphysm_lookup_mem_handle(memhandle_t handle)
{
	struct mem_handle *mhp;

	mutex_enter(&mem_handle_list_mutex);
	for (mhp = mem_handle_head; mhp != NULL; mhp = mhp->mh_next) {
		if (mhp->mh_exthandle == handle) {
			mutex_enter(&mhp->mh_mutex);
			/*
			 * The state of the handle could have been changed
			 * by kphysm_del_release() while waiting for mh_mutex.
			 */
			if (mhp->mh_state == MHND_FREE) {
				mutex_exit(&mhp->mh_mutex);
				continue;
			}
			break;
		}
	}
	mutex_exit(&mem_handle_list_mutex);
	return (mhp);
}

int
kphysm_del_gethandle(memhandle_t *xmhp)
{
	struct mem_handle *mhp;

	mhp = kphysm_allocate_mem_handle();
	/*
	 * The handle is allocated using KM_SLEEP, so cannot fail.
	 * If the implementation is changed, the correct error to return
	 * here would be KPHYSM_ENOHANDLES.
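	 *
	 * A typical caller sequence for the delete interface is:
	 * kphysm_del_gethandle(), one or more kphysm_del_span() calls,
	 * kphysm_del_start() with a completion callback, optionally
	 * kphysm_del_cancel(), and finally kphysm_del_release() once the
	 * callback has been invoked (or directly from MHND_INIT if the
	 * delete was never started).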
750 */ 751 ASSERT(mhp->mh_state == MHND_FREE); 752 mhp->mh_state = MHND_INIT; 753 *xmhp = mhp->mh_exthandle; 754 mutex_exit(&mhp->mh_mutex); 755 return (KPHYSM_OK); 756 } 757 758 static int 759 overlapping(pfn_t b1, pgcnt_t l1, pfn_t b2, pgcnt_t l2) 760 { 761 pfn_t e1, e2; 762 763 e1 = b1 + l1; 764 e2 = b2 + l2; 765 766 return (!(b2 >= e1 || b1 >= e2)); 767 } 768 769 static int can_remove_pgs(pgcnt_t); 770 771 static struct memdelspan * 772 span_to_install(pfn_t base, pgcnt_t npgs) 773 { 774 struct memdelspan *mdsp; 775 struct memdelspan *mdsp_new; 776 uint64_t address, size, thislen; 777 struct memlist *mlp; 778 779 mdsp_new = NULL; 780 781 address = (uint64_t)base << PAGESHIFT; 782 size = (uint64_t)npgs << PAGESHIFT; 783 while (size != 0) { 784 memlist_read_lock(); 785 for (mlp = phys_install; mlp != NULL; mlp = mlp->next) { 786 if (address >= (mlp->address + mlp->size)) 787 continue; 788 if ((address + size) > mlp->address) 789 break; 790 } 791 if (mlp == NULL) { 792 address += size; 793 size = 0; 794 thislen = 0; 795 } else { 796 if (address < mlp->address) { 797 size -= (mlp->address - address); 798 address = mlp->address; 799 } 800 ASSERT(address >= mlp->address); 801 if ((address + size) > (mlp->address + mlp->size)) { 802 thislen = mlp->size - (address - mlp->address); 803 } else { 804 thislen = size; 805 } 806 } 807 memlist_read_unlock(); 808 /* TODO: phys_install could change now */ 809 if (thislen == 0) 810 continue; 811 mdsp = kmem_zalloc(sizeof (struct memdelspan), KM_SLEEP); 812 mdsp->mds_base = btop(address); 813 mdsp->mds_npgs = btop(thislen); 814 mdsp->mds_next = mdsp_new; 815 mdsp_new = mdsp; 816 address += thislen; 817 size -= thislen; 818 } 819 return (mdsp_new); 820 } 821 822 static void 823 free_delspans(struct memdelspan *mdsp) 824 { 825 struct memdelspan *amdsp; 826 827 while ((amdsp = mdsp) != NULL) { 828 mdsp = amdsp->mds_next; 829 kmem_free(amdsp, sizeof (struct memdelspan)); 830 } 831 } 832 833 /* 834 * Concatenate lists. No list ordering is required. 835 */ 836 837 static void 838 delspan_concat(struct memdelspan **mdspp, struct memdelspan *mdsp) 839 { 840 while (*mdspp != NULL) 841 mdspp = &(*mdspp)->mds_next; 842 843 *mdspp = mdsp; 844 } 845 846 /* 847 * Given a new list of delspans, check there is no overlap with 848 * all existing span activity (add or delete) and then concatenate 849 * the new spans to the given list. 850 * Return 1 for OK, 0 if overlapping. 
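 *
 * Purely as an illustration of the span arithmetic (a sketch, compiled
 * out behind a hypothetical MEM_CONFIG_EXAMPLES guard that is never
 * defined):
 */
#ifdef	MEM_CONFIG_EXAMPLES
/*
 * Spans are half-open pfn ranges: [base, base + npgs).  For example,
 * [100, 200) and [150, 160) overlap, while [100, 200) and [200, 300)
 * touch but do not overlap.
 */
static void
example_overlap_checks(void)
{
	ASSERT(overlapping(100, 100, 150, 10) == 1);
	ASSERT(overlapping(100, 100, 200, 100) == 0);
}
#endif	/* MEM_CONFIG_EXAMPLES */
/*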
851 */ 852 static int 853 delspan_insert( 854 struct transit_list *my_tlp, 855 struct memdelspan *mdsp_new) 856 { 857 struct transit_list_head *trh; 858 struct transit_list *tlp; 859 int ret; 860 861 trh = &transit_list_head; 862 863 ASSERT(my_tlp != NULL); 864 ASSERT(mdsp_new != NULL); 865 866 ret = 1; 867 mutex_enter(&trh->trh_lock); 868 /* ASSERT(my_tlp->trl_spans == NULL || tlp_in_list(trh, my_tlp)); */ 869 for (tlp = trh->trh_head; tlp != NULL; tlp = tlp->trl_next) { 870 struct memdelspan *mdsp; 871 872 for (mdsp = tlp->trl_spans; mdsp != NULL; 873 mdsp = mdsp->mds_next) { 874 struct memdelspan *nmdsp; 875 876 for (nmdsp = mdsp_new; nmdsp != NULL; 877 nmdsp = nmdsp->mds_next) { 878 if (overlapping(mdsp->mds_base, mdsp->mds_npgs, 879 nmdsp->mds_base, nmdsp->mds_npgs)) { 880 ret = 0; 881 goto done; 882 } 883 } 884 } 885 } 886 done: 887 if (ret != 0) { 888 if (my_tlp->trl_spans == NULL) 889 transit_list_insert(my_tlp); 890 delspan_concat(&my_tlp->trl_spans, mdsp_new); 891 } 892 mutex_exit(&trh->trh_lock); 893 return (ret); 894 } 895 896 static void 897 delspan_remove( 898 struct transit_list *my_tlp, 899 pfn_t base, 900 pgcnt_t npgs) 901 { 902 struct transit_list_head *trh; 903 struct memdelspan *mdsp; 904 905 trh = &transit_list_head; 906 907 ASSERT(my_tlp != NULL); 908 909 mutex_enter(&trh->trh_lock); 910 if ((mdsp = my_tlp->trl_spans) != NULL) { 911 if (npgs == 0) { 912 my_tlp->trl_spans = NULL; 913 free_delspans(mdsp); 914 transit_list_remove(my_tlp); 915 } else { 916 struct memdelspan **prv; 917 918 prv = &my_tlp->trl_spans; 919 while (mdsp != NULL) { 920 pfn_t p_end; 921 922 p_end = mdsp->mds_base + mdsp->mds_npgs; 923 if (mdsp->mds_base >= base && 924 p_end <= (base + npgs)) { 925 *prv = mdsp->mds_next; 926 mdsp->mds_next = NULL; 927 free_delspans(mdsp); 928 } else { 929 prv = &mdsp->mds_next; 930 } 931 mdsp = *prv; 932 } 933 if (my_tlp->trl_spans == NULL) 934 transit_list_remove(my_tlp); 935 } 936 } 937 mutex_exit(&trh->trh_lock); 938 } 939 940 /* 941 * Reserve interface for add to stop delete before add finished. 942 * This list is only accessed through the delspan_insert/remove 943 * functions and so is fully protected by the mutex in struct transit_list. 944 */ 945 946 static struct transit_list reserve_transit; 947 948 static int 949 delspan_reserve(pfn_t base, pgcnt_t npgs) 950 { 951 struct memdelspan *mdsp; 952 int ret; 953 954 mdsp = kmem_zalloc(sizeof (struct memdelspan), KM_SLEEP); 955 mdsp->mds_base = base; 956 mdsp->mds_npgs = npgs; 957 if ((ret = delspan_insert(&reserve_transit, mdsp)) == 0) { 958 free_delspans(mdsp); 959 } 960 return (ret); 961 } 962 963 static void 964 delspan_unreserve(pfn_t base, pgcnt_t npgs) 965 { 966 delspan_remove(&reserve_transit, base, npgs); 967 } 968 969 /* 970 * Return whether memseg was created by kphysm_add_memory_dynamic(). 971 * If this is the case and startp non zero, return also the start pfn 972 * of the meta data via startp. 
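 *
 * Layout of a memseg created by kphysm_add_memory_dynamic():
 *
 *	pt_base                      base = pt_base + metapgs
 *	|<------- metapgs -------->|<----------- npgs ----------->|
 *	|  page_t (and kpm)        |  pages described by          |
 *	|  metadata for the span   |  seg->pages .. seg->epages   |
 *
 * so the pfn returned via startp is the start of the metadata, which
 * lies below seg->pages_base.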
973 */ 974 static int 975 memseg_is_dynamic(struct memseg *seg, pfn_t *startp) 976 { 977 pfn_t pt_start; 978 979 if ((seg->msegflags & MEMSEG_DYNAMIC) == 0) 980 return (0); 981 982 /* Meta data is required to be at the beginning */ 983 ASSERT(hat_getpfnum(kas.a_hat, (caddr_t)seg->epages) < seg->pages_base); 984 985 pt_start = hat_getpfnum(kas.a_hat, (caddr_t)seg->pages); 986 if (startp != NULL) 987 *startp = pt_start; 988 989 return (1); 990 } 991 992 int 993 kphysm_del_span( 994 memhandle_t handle, 995 pfn_t base, 996 pgcnt_t npgs) 997 { 998 struct mem_handle *mhp; 999 struct memseg *seg; 1000 struct memdelspan *mdsp; 1001 struct memdelspan *mdsp_new; 1002 pgcnt_t phys_pages, vm_pages; 1003 pfn_t p_end; 1004 page_t *pp; 1005 int ret; 1006 1007 mhp = kphysm_lookup_mem_handle(handle); 1008 if (mhp == NULL) { 1009 return (KPHYSM_EHANDLE); 1010 } 1011 if (mhp->mh_state != MHND_INIT) { 1012 mutex_exit(&mhp->mh_mutex); 1013 return (KPHYSM_ESEQUENCE); 1014 } 1015 1016 /* 1017 * Intersect the span with the installed memory list (phys_install). 1018 */ 1019 mdsp_new = span_to_install(base, npgs); 1020 if (mdsp_new == NULL) { 1021 /* 1022 * No physical memory in this range. Is this an 1023 * error? If an attempt to start the delete is made 1024 * for OK returns from del_span such as this, start will 1025 * return an error. 1026 * Could return KPHYSM_ENOWORK. 1027 */ 1028 /* 1029 * It is assumed that there are no error returns 1030 * from span_to_install() due to kmem_alloc failure. 1031 */ 1032 mutex_exit(&mhp->mh_mutex); 1033 return (KPHYSM_OK); 1034 } 1035 /* 1036 * Does this span overlap an existing span? 1037 */ 1038 if (delspan_insert(&mhp->mh_transit, mdsp_new) == 0) { 1039 /* 1040 * Differentiate between already on list for this handle 1041 * (KPHYSM_EDUP) and busy elsewhere (KPHYSM_EBUSY). 1042 */ 1043 ret = KPHYSM_EBUSY; 1044 for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL; 1045 mdsp = mdsp->mds_next) { 1046 if (overlapping(mdsp->mds_base, mdsp->mds_npgs, 1047 base, npgs)) { 1048 ret = KPHYSM_EDUP; 1049 break; 1050 } 1051 } 1052 mutex_exit(&mhp->mh_mutex); 1053 free_delspans(mdsp_new); 1054 return (ret); 1055 } 1056 /* 1057 * At this point the spans in mdsp_new have been inserted into the 1058 * list of spans for this handle and thereby to the global list of 1059 * spans being processed. Each of these spans must now be checked 1060 * for relocatability. As a side-effect segments in the memseg list 1061 * may be split. 1062 * 1063 * Note that mdsp_new can no longer be used as it is now part of 1064 * a larger list. Select elements of this larger list based 1065 * on base and npgs. 1066 */ 1067 restart: 1068 phys_pages = 0; 1069 vm_pages = 0; 1070 ret = KPHYSM_OK; 1071 for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL; 1072 mdsp = mdsp->mds_next) { 1073 pgcnt_t pages_checked; 1074 1075 if (!overlapping(mdsp->mds_base, mdsp->mds_npgs, base, npgs)) { 1076 continue; 1077 } 1078 p_end = mdsp->mds_base + mdsp->mds_npgs; 1079 /* 1080 * The pages_checked count is a hack. All pages should be 1081 * checked for relocatability. Those not covered by memsegs 1082 * should be tested with arch_kphysm_del_span_ok(). 1083 */ 1084 pages_checked = 0; 1085 for (seg = memsegs; seg; seg = seg->next) { 1086 pfn_t mseg_start; 1087 1088 if (seg->pages_base >= p_end || 1089 seg->pages_end <= mdsp->mds_base) { 1090 /* Span and memseg don't overlap. */ 1091 continue; 1092 } 1093 /* Check that segment is suitable for delete. 
*/ 1094 if (memseg_is_dynamic(seg, &mseg_start)) { 1095 /* 1096 * Can only delete whole added segments 1097 * for the moment. 1098 * Check that this is completely within the 1099 * span. 1100 */ 1101 if (mseg_start < mdsp->mds_base || 1102 seg->pages_end > p_end) { 1103 ret = KPHYSM_EBUSY; 1104 break; 1105 } 1106 pages_checked += seg->pages_end - mseg_start; 1107 } else { 1108 /* 1109 * Set mseg_start for accounting below. 1110 */ 1111 mseg_start = seg->pages_base; 1112 /* 1113 * If this segment is larger than the span, 1114 * try to split it. After the split, it 1115 * is necessary to restart. 1116 */ 1117 if (seg->pages_base < mdsp->mds_base || 1118 seg->pages_end > p_end) { 1119 pfn_t abase; 1120 pgcnt_t anpgs; 1121 int s_ret; 1122 1123 /* Split required. */ 1124 if (mdsp->mds_base < seg->pages_base) 1125 abase = seg->pages_base; 1126 else 1127 abase = mdsp->mds_base; 1128 if (p_end > seg->pages_end) 1129 anpgs = seg->pages_end - abase; 1130 else 1131 anpgs = p_end - abase; 1132 s_ret = kphysm_split_memseg(abase, 1133 anpgs); 1134 if (s_ret == 0) { 1135 /* Split failed. */ 1136 ret = KPHYSM_ERESOURCE; 1137 break; 1138 } 1139 goto restart; 1140 } 1141 pages_checked += 1142 seg->pages_end - seg->pages_base; 1143 } 1144 /* 1145 * The memseg is wholly within the delete span. 1146 * The individual pages can now be checked. 1147 */ 1148 /* Cage test. */ 1149 for (pp = seg->pages; pp < seg->epages; pp++) { 1150 if (PP_ISNORELOC(pp)) { 1151 ret = KPHYSM_ENONRELOC; 1152 break; 1153 } 1154 } 1155 if (ret != KPHYSM_OK) { 1156 break; 1157 } 1158 phys_pages += (seg->pages_end - mseg_start); 1159 vm_pages += MSEG_NPAGES(seg); 1160 } 1161 if (ret != KPHYSM_OK) 1162 break; 1163 if (pages_checked != mdsp->mds_npgs) { 1164 ret = KPHYSM_ENONRELOC; 1165 break; 1166 } 1167 } 1168 1169 if (ret == KPHYSM_OK) { 1170 mhp->mh_phys_pages += phys_pages; 1171 mhp->mh_vm_pages += vm_pages; 1172 } else { 1173 /* 1174 * Keep holding the mh_mutex to prevent it going away. 1175 */ 1176 delspan_remove(&mhp->mh_transit, base, npgs); 1177 } 1178 mutex_exit(&mhp->mh_mutex); 1179 return (ret); 1180 } 1181 1182 int 1183 kphysm_del_span_query( 1184 pfn_t base, 1185 pgcnt_t npgs, 1186 memquery_t *mqp) 1187 { 1188 struct memdelspan *mdsp; 1189 struct memdelspan *mdsp_new; 1190 int done_first_nonreloc; 1191 1192 mqp->phys_pages = 0; 1193 mqp->managed = 0; 1194 mqp->nonrelocatable = 0; 1195 mqp->first_nonrelocatable = 0; 1196 mqp->last_nonrelocatable = 0; 1197 1198 mdsp_new = span_to_install(base, npgs); 1199 /* 1200 * It is OK to proceed here if mdsp_new == NULL. 1201 */ 1202 done_first_nonreloc = 0; 1203 for (mdsp = mdsp_new; mdsp != NULL; mdsp = mdsp->mds_next) { 1204 pfn_t sbase; 1205 pgcnt_t snpgs; 1206 1207 mqp->phys_pages += mdsp->mds_npgs; 1208 sbase = mdsp->mds_base; 1209 snpgs = mdsp->mds_npgs; 1210 while (snpgs != 0) { 1211 struct memseg *lseg, *seg; 1212 pfn_t p_end; 1213 page_t *pp; 1214 pfn_t mseg_start; 1215 1216 p_end = sbase + snpgs; 1217 /* 1218 * Find the lowest addressed memseg that starts 1219 * after sbase and account for it. 1220 * This is to catch dynamic memsegs whose start 1221 * is hidden. 
			 */
			seg = NULL;
			for (lseg = memsegs; lseg != NULL; lseg = lseg->next) {
				if ((lseg->pages_base >= sbase) ||
				    (lseg->pages_base < p_end &&
				    lseg->pages_end > sbase)) {
					if (seg == NULL ||
					    seg->pages_base > lseg->pages_base)
						seg = lseg;
				}
			}
			if (seg != NULL) {
				if (!memseg_is_dynamic(seg, &mseg_start)) {
					mseg_start = seg->pages_base;
				}
				/*
				 * Now have the full extent of the memseg so
				 * do the range check.
				 */
				if (mseg_start >= p_end ||
				    seg->pages_end <= sbase) {
					/* Span does not overlap memseg. */
					seg = NULL;
				}
			}
			/*
			 * Account for gap either before the segment if
			 * there is one or to the end of the span.
			 */
			if (seg == NULL || mseg_start > sbase) {
				pfn_t a_end;

				a_end = (seg == NULL) ? p_end : mseg_start;
				/*
				 * Check with arch layer for relocatability.
				 */
				if (arch_kphysm_del_span_ok(sbase,
				    (a_end - sbase))) {
					/*
					 * No non-relocatable pages in this
					 * area, avoid the fine-grained
					 * test.
					 */
					snpgs -= (a_end - sbase);
					sbase = a_end;
				}
				while (sbase < a_end) {
					if (!arch_kphysm_del_span_ok(sbase,
					    1)) {
						mqp->nonrelocatable++;
						if (!done_first_nonreloc) {
							mqp->
							    first_nonrelocatable
							    = sbase;
							done_first_nonreloc = 1;
						}
						mqp->last_nonrelocatable =
						    sbase;
					}
					sbase++;
					snpgs--;
				}
			}
			if (seg != NULL) {
				ASSERT(mseg_start <= sbase);
				if (seg->pages_base != mseg_start &&
				    seg->pages_base > sbase) {
					pgcnt_t skip_pgs;

					/*
					 * Skip the page_t area of a
					 * dynamic memseg.
					 */
					skip_pgs = seg->pages_base - sbase;
					if (snpgs <= skip_pgs) {
						sbase += snpgs;
						snpgs = 0;
						continue;
					}
					snpgs -= skip_pgs;
					sbase += skip_pgs;
				}
				ASSERT(snpgs != 0);
				ASSERT(seg->pages_base <= sbase);
				/*
				 * The individual pages can now be checked.
1308 */ 1309 for (pp = seg->pages + 1310 (sbase - seg->pages_base); 1311 snpgs != 0 && pp < seg->epages; pp++) { 1312 mqp->managed++; 1313 if (PP_ISNORELOC(pp)) { 1314 mqp->nonrelocatable++; 1315 if (!done_first_nonreloc) { 1316 mqp-> 1317 first_nonrelocatable 1318 = sbase; 1319 done_first_nonreloc = 1; 1320 } 1321 mqp->last_nonrelocatable = 1322 sbase; 1323 } 1324 sbase++; 1325 snpgs--; 1326 } 1327 } 1328 } 1329 } 1330 1331 free_delspans(mdsp_new); 1332 1333 return (KPHYSM_OK); 1334 } 1335 1336 /* 1337 * This release function can be called at any stage as follows: 1338 * _gethandle only called 1339 * _span(s) only called 1340 * _start called but failed 1341 * delete thread exited 1342 */ 1343 int 1344 kphysm_del_release(memhandle_t handle) 1345 { 1346 struct mem_handle *mhp; 1347 1348 mhp = kphysm_lookup_mem_handle(handle); 1349 if (mhp == NULL) { 1350 return (KPHYSM_EHANDLE); 1351 } 1352 switch (mhp->mh_state) { 1353 case MHND_STARTING: 1354 case MHND_RUNNING: 1355 mutex_exit(&mhp->mh_mutex); 1356 return (KPHYSM_ENOTFINISHED); 1357 case MHND_FREE: 1358 ASSERT(mhp->mh_state != MHND_FREE); 1359 mutex_exit(&mhp->mh_mutex); 1360 return (KPHYSM_EHANDLE); 1361 case MHND_INIT: 1362 break; 1363 case MHND_DONE: 1364 break; 1365 case MHND_RELEASE: 1366 mutex_exit(&mhp->mh_mutex); 1367 return (KPHYSM_ESEQUENCE); 1368 default: 1369 #ifdef DEBUG 1370 cmn_err(CE_WARN, "kphysm_del_release(0x%p) state corrupt %d", 1371 (void *)mhp, mhp->mh_state); 1372 #endif /* DEBUG */ 1373 mutex_exit(&mhp->mh_mutex); 1374 return (KPHYSM_EHANDLE); 1375 } 1376 /* 1377 * Set state so that we can wait if necessary. 1378 * Also this means that we have read/write access to all 1379 * fields except mh_exthandle and mh_state. 1380 */ 1381 mhp->mh_state = MHND_RELEASE; 1382 /* 1383 * The mem_handle cannot be de-allocated by any other operation 1384 * now, so no need to hold mh_mutex. 1385 */ 1386 mutex_exit(&mhp->mh_mutex); 1387 1388 delspan_remove(&mhp->mh_transit, 0, 0); 1389 mhp->mh_phys_pages = 0; 1390 mhp->mh_vm_pages = 0; 1391 mhp->mh_hold_todo = 0; 1392 mhp->mh_delete_complete = NULL; 1393 mhp->mh_delete_complete_arg = NULL; 1394 mhp->mh_cancel = 0; 1395 1396 mutex_enter(&mhp->mh_mutex); 1397 ASSERT(mhp->mh_state == MHND_RELEASE); 1398 mhp->mh_state = MHND_FREE; 1399 1400 kphysm_free_mem_handle(mhp); 1401 1402 return (KPHYSM_OK); 1403 } 1404 1405 /* 1406 * This cancel function can only be called with the thread running. 1407 */ 1408 int 1409 kphysm_del_cancel(memhandle_t handle) 1410 { 1411 struct mem_handle *mhp; 1412 1413 mhp = kphysm_lookup_mem_handle(handle); 1414 if (mhp == NULL) { 1415 return (KPHYSM_EHANDLE); 1416 } 1417 if (mhp->mh_state != MHND_STARTING && mhp->mh_state != MHND_RUNNING) { 1418 mutex_exit(&mhp->mh_mutex); 1419 return (KPHYSM_ENOTRUNNING); 1420 } 1421 /* 1422 * Set the cancel flag and wake the delete thread up. 1423 * The thread may be waiting on I/O, so the effect of the cancel 1424 * may be delayed. 1425 */ 1426 if (mhp->mh_cancel == 0) { 1427 mhp->mh_cancel = KPHYSM_ECANCELLED; 1428 cv_signal(&mhp->mh_cv); 1429 } 1430 mutex_exit(&mhp->mh_mutex); 1431 return (KPHYSM_OK); 1432 } 1433 1434 int 1435 kphysm_del_status( 1436 memhandle_t handle, 1437 memdelstat_t *mdstp) 1438 { 1439 struct mem_handle *mhp; 1440 1441 mhp = kphysm_lookup_mem_handle(handle); 1442 if (mhp == NULL) { 1443 return (KPHYSM_EHANDLE); 1444 } 1445 /* 1446 * Calling kphysm_del_status() is allowed before the delete 1447 * is started to allow for status display. 
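	 *
	 * The returned counts are snapshots: phys_pages and managed are
	 * fixed once all spans have been registered, while collected
	 * (managed minus the pages still to be captured) grows as the
	 * delete thread makes progress, so a caller can poll this
	 * routine to display a progress fraction.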
1448 */ 1449 if (mhp->mh_state != MHND_INIT && mhp->mh_state != MHND_STARTING && 1450 mhp->mh_state != MHND_RUNNING) { 1451 mutex_exit(&mhp->mh_mutex); 1452 return (KPHYSM_ENOTRUNNING); 1453 } 1454 mdstp->phys_pages = mhp->mh_phys_pages; 1455 mdstp->managed = mhp->mh_vm_pages; 1456 mdstp->collected = mhp->mh_vm_pages - mhp->mh_hold_todo; 1457 mutex_exit(&mhp->mh_mutex); 1458 return (KPHYSM_OK); 1459 } 1460 1461 static int mem_delete_additional_pages = 100; 1462 1463 static int 1464 can_remove_pgs(pgcnt_t npgs) 1465 { 1466 /* 1467 * If all pageable pages were paged out, freemem would 1468 * equal availrmem. There is a minimum requirement for 1469 * availrmem. 1470 */ 1471 if ((availrmem - (tune.t_minarmem + mem_delete_additional_pages)) 1472 < npgs) 1473 return (0); 1474 /* TODO: check swap space, etc. */ 1475 return (1); 1476 } 1477 1478 static int 1479 get_availrmem(pgcnt_t npgs) 1480 { 1481 int ret; 1482 1483 mutex_enter(&freemem_lock); 1484 ret = can_remove_pgs(npgs); 1485 if (ret != 0) 1486 availrmem -= npgs; 1487 mutex_exit(&freemem_lock); 1488 return (ret); 1489 } 1490 1491 static void 1492 put_availrmem(pgcnt_t npgs) 1493 { 1494 mutex_enter(&freemem_lock); 1495 availrmem += npgs; 1496 mutex_exit(&freemem_lock); 1497 } 1498 1499 #define FREEMEM_INCR 100 1500 static pgcnt_t freemem_incr = FREEMEM_INCR; 1501 #define DEL_FREE_WAIT_FRAC 4 1502 #define DEL_FREE_WAIT_TICKS ((hz+DEL_FREE_WAIT_FRAC-1)/DEL_FREE_WAIT_FRAC) 1503 1504 #define DEL_BUSY_WAIT_FRAC 20 1505 #define DEL_BUSY_WAIT_TICKS ((hz+DEL_BUSY_WAIT_FRAC-1)/DEL_BUSY_WAIT_FRAC) 1506 1507 static void kphysm_del_cleanup(struct mem_handle *); 1508 1509 static void page_delete_collect(page_t *, struct mem_handle *); 1510 1511 static pgcnt_t 1512 delthr_get_freemem(struct mem_handle *mhp) 1513 { 1514 pgcnt_t free_get; 1515 int ret; 1516 1517 ASSERT(MUTEX_HELD(&mhp->mh_mutex)); 1518 1519 MDSTAT_INCR(mhp, need_free); 1520 /* 1521 * Get up to freemem_incr pages. 1522 */ 1523 free_get = freemem_incr; 1524 if (free_get > mhp->mh_hold_todo) 1525 free_get = mhp->mh_hold_todo; 1526 /* 1527 * Take free_get pages away from freemem, 1528 * waiting if necessary. 1529 */ 1530 1531 while (!mhp->mh_cancel) { 1532 mutex_exit(&mhp->mh_mutex); 1533 MDSTAT_INCR(mhp, free_loop); 1534 /* 1535 * Duplicate test from page_create_throttle() 1536 * but don't override with !PG_WAIT. 1537 */ 1538 if (freemem < (free_get + throttlefree)) { 1539 MDSTAT_INCR(mhp, free_low); 1540 ret = 0; 1541 } else { 1542 ret = page_create_wait(free_get, 0); 1543 if (ret == 0) { 1544 /* EMPTY */ 1545 MDSTAT_INCR(mhp, free_failed); 1546 } 1547 } 1548 if (ret != 0) { 1549 mutex_enter(&mhp->mh_mutex); 1550 return (free_get); 1551 } 1552 1553 /* 1554 * Put pressure on pageout. 1555 */ 1556 page_needfree(free_get); 1557 cv_signal(&proc_pageout->p_cv); 1558 1559 mutex_enter(&mhp->mh_mutex); 1560 (void) cv_timedwait(&mhp->mh_cv, &mhp->mh_mutex, 1561 (lbolt + DEL_FREE_WAIT_TICKS)); 1562 mutex_exit(&mhp->mh_mutex); 1563 page_needfree(-(spgcnt_t)free_get); 1564 1565 mutex_enter(&mhp->mh_mutex); 1566 } 1567 return (0); 1568 } 1569 1570 #define DR_AIO_CLEANUP_DELAY 25000 /* 0.025secs, in usec */ 1571 #define DR_AIO_CLEANUP_MAXLOOPS_NODELAY 100 1572 /* 1573 * This function is run as a helper thread for delete_memory_thread. 1574 * It is needed in order to force kaio cleanup, so that pages used in kaio 1575 * will be unlocked and subsequently relocated by delete_memory_thread. 
1576 * The address of the delete_memory_threads's mem_handle is passed in to 1577 * this thread function, and is used to set the mh_aio_cleanup_done member 1578 * prior to calling thread_exit(). 1579 */ 1580 static void 1581 dr_aio_cleanup_thread(caddr_t amhp) 1582 { 1583 proc_t *procp; 1584 int (*aio_cleanup_dr_delete_memory)(proc_t *); 1585 int cleaned; 1586 int n = 0; 1587 struct mem_handle *mhp; 1588 volatile uint_t *pcancel; 1589 1590 mhp = (struct mem_handle *)amhp; 1591 ASSERT(mhp != NULL); 1592 pcancel = &mhp->mh_dr_aio_cleanup_cancel; 1593 if (modload("sys", "kaio") == -1) { 1594 mhp->mh_aio_cleanup_done = 1; 1595 cmn_err(CE_WARN, "dr_aio_cleanup_thread: cannot load kaio"); 1596 thread_exit(); 1597 } 1598 aio_cleanup_dr_delete_memory = (int (*)(proc_t *)) 1599 modgetsymvalue("aio_cleanup_dr_delete_memory", 0); 1600 if (aio_cleanup_dr_delete_memory == NULL) { 1601 mhp->mh_aio_cleanup_done = 1; 1602 cmn_err(CE_WARN, 1603 "aio_cleanup_dr_delete_memory not found in kaio"); 1604 thread_exit(); 1605 } 1606 do { 1607 cleaned = 0; 1608 mutex_enter(&pidlock); 1609 for (procp = practive; (*pcancel == 0) && (procp != NULL); 1610 procp = procp->p_next) { 1611 mutex_enter(&procp->p_lock); 1612 if (procp->p_aio != NULL) { 1613 /* cleanup proc's outstanding kaio */ 1614 cleaned += 1615 (*aio_cleanup_dr_delete_memory)(procp); 1616 } 1617 mutex_exit(&procp->p_lock); 1618 } 1619 mutex_exit(&pidlock); 1620 if ((*pcancel == 0) && 1621 (!cleaned || (++n == DR_AIO_CLEANUP_MAXLOOPS_NODELAY))) { 1622 /* delay a bit before retrying all procs again */ 1623 delay(drv_usectohz(DR_AIO_CLEANUP_DELAY)); 1624 n = 0; 1625 } 1626 } while (*pcancel == 0); 1627 mhp->mh_aio_cleanup_done = 1; 1628 thread_exit(); 1629 } 1630 1631 static void 1632 delete_memory_thread(caddr_t amhp) 1633 { 1634 struct mem_handle *mhp; 1635 struct memdelspan *mdsp; 1636 callb_cpr_t cprinfo; 1637 page_t *pp_targ; 1638 spgcnt_t freemem_left; 1639 void (*del_complete_funcp)(void *, int error); 1640 void *del_complete_arg; 1641 int comp_code; 1642 int ret; 1643 int first_scan; 1644 uint_t szc; 1645 #ifdef MEM_DEL_STATS 1646 uint64_t start_total, ntick_total; 1647 uint64_t start_pgrp, ntick_pgrp; 1648 #endif /* MEM_DEL_STATS */ 1649 1650 mhp = (struct mem_handle *)amhp; 1651 1652 #ifdef MEM_DEL_STATS 1653 start_total = ddi_get_lbolt(); 1654 #endif /* MEM_DEL_STATS */ 1655 1656 CALLB_CPR_INIT(&cprinfo, &mhp->mh_mutex, 1657 callb_generic_cpr, "memdel"); 1658 1659 mutex_enter(&mhp->mh_mutex); 1660 ASSERT(mhp->mh_state == MHND_STARTING); 1661 1662 mhp->mh_state = MHND_RUNNING; 1663 mhp->mh_thread_id = curthread; 1664 1665 mhp->mh_hold_todo = mhp->mh_vm_pages; 1666 mutex_exit(&mhp->mh_mutex); 1667 1668 /* Allocate the remap pages now, if necessary. */ 1669 memseg_remap_init(); 1670 1671 /* 1672 * Subtract from availrmem now if possible as availrmem 1673 * may not be available by the end of the delete. 
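	 *
	 * get_availrmem() both performs the can_remove_pgs() viability
	 * check and debits availrmem under freemem_lock in one step; if
	 * the delete is later refused or cancelled, the refused path
	 * below credits the same amount back via put_availrmem().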
1674 */ 1675 if (!get_availrmem(mhp->mh_vm_pages)) { 1676 comp_code = KPHYSM_ENOTVIABLE; 1677 mutex_enter(&mhp->mh_mutex); 1678 goto early_exit; 1679 } 1680 1681 ret = kphysm_setup_pre_del(mhp->mh_vm_pages); 1682 1683 mutex_enter(&mhp->mh_mutex); 1684 1685 if (ret != 0) { 1686 mhp->mh_cancel = KPHYSM_EREFUSED; 1687 goto refused; 1688 } 1689 1690 transit_list_collect(mhp, 1); 1691 1692 for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL; 1693 mdsp = mdsp->mds_next) { 1694 ASSERT(mdsp->mds_bitmap == NULL); 1695 mdsp->mds_bitmap = kmem_zalloc(MDS_BITMAPBYTES(mdsp), KM_SLEEP); 1696 mdsp->mds_bitmap_retired = kmem_zalloc(MDS_BITMAPBYTES(mdsp), 1697 KM_SLEEP); 1698 } 1699 1700 first_scan = 1; 1701 freemem_left = 0; 1702 /* 1703 * Start dr_aio_cleanup_thread, which periodically iterates 1704 * through the process list and invokes aio cleanup. This 1705 * is needed in order to avoid a deadly embrace between the 1706 * delete_memory_thread (waiting on writer lock for page, with the 1707 * exclusive-wanted bit set), kaio read request threads (waiting for a 1708 * reader lock on the same page that is wanted by the 1709 * delete_memory_thread), and threads waiting for kaio completion 1710 * (blocked on spt_amp->lock). 1711 */ 1712 mhp->mh_dr_aio_cleanup_cancel = 0; 1713 mhp->mh_aio_cleanup_done = 0; 1714 (void) thread_create(NULL, 0, dr_aio_cleanup_thread, 1715 (caddr_t)mhp, 0, &p0, TS_RUN, maxclsyspri - 1); 1716 while ((mhp->mh_hold_todo != 0) && (mhp->mh_cancel == 0)) { 1717 pgcnt_t collected; 1718 1719 MDSTAT_INCR(mhp, nloop); 1720 collected = 0; 1721 for (mdsp = mhp->mh_transit.trl_spans; (mdsp != NULL) && 1722 (mhp->mh_cancel == 0); mdsp = mdsp->mds_next) { 1723 pfn_t pfn, p_end; 1724 1725 if (first_scan) { 1726 mem_node_pre_del_slice(mdsp->mds_base, 1727 mdsp->mds_base + mdsp->mds_npgs - 1); 1728 } 1729 1730 p_end = mdsp->mds_base + mdsp->mds_npgs; 1731 for (pfn = mdsp->mds_base; (pfn < p_end) && 1732 (mhp->mh_cancel == 0); pfn++) { 1733 page_t *pp, *tpp, *tpp_targ; 1734 pgcnt_t bit; 1735 struct vnode *vp; 1736 u_offset_t offset; 1737 int mod, result; 1738 spgcnt_t pgcnt; 1739 1740 bit = pfn - mdsp->mds_base; 1741 if ((mdsp->mds_bitmap[bit / NBPBMW] & 1742 (1 << (bit % NBPBMW))) != 0) { 1743 MDSTAT_INCR(mhp, already_done); 1744 continue; 1745 } 1746 if (freemem_left == 0) { 1747 freemem_left += delthr_get_freemem(mhp); 1748 if (freemem_left == 0) 1749 break; 1750 } 1751 1752 /* 1753 * Release mh_mutex - some of this 1754 * stuff takes some time (eg PUTPAGE). 1755 */ 1756 1757 mutex_exit(&mhp->mh_mutex); 1758 MDSTAT_INCR(mhp, ncheck); 1759 1760 pp = page_numtopp_nolock(pfn); 1761 if (pp == NULL) { 1762 /* 1763 * Not covered by a page_t - will 1764 * be dealt with elsewhere. 1765 */ 1766 MDSTAT_INCR(mhp, nopaget); 1767 mutex_enter(&mhp->mh_mutex); 1768 mdsp->mds_bitmap[bit / NBPBMW] |= 1769 (1 << (bit % NBPBMW)); 1770 continue; 1771 } 1772 1773 if (!page_try_reclaim_lock(pp, SE_EXCL, 1774 SE_EXCL_WANTED | SE_RETIRED)) { 1775 /* 1776 * Page in use elsewhere. Skip it. 1777 */ 1778 MDSTAT_INCR(mhp, lockfail); 1779 mutex_enter(&mhp->mh_mutex); 1780 continue; 1781 } 1782 /* 1783 * See if the cage expanded into the delete. 1784 * This can happen as we have to allow the 1785 * cage to expand. 1786 */ 1787 if (PP_ISNORELOC(pp)) { 1788 page_unlock(pp); 1789 mutex_enter(&mhp->mh_mutex); 1790 mhp->mh_cancel = KPHYSM_ENONRELOC; 1791 break; 1792 } 1793 if (PP_RETIRED(pp)) { 1794 /* 1795 * Page has been retired and is 1796 * not part of the cage so we 1797 * can now do the accounting for 1798 * it. 
1799 */ 1800 MDSTAT_INCR(mhp, retired); 1801 mutex_enter(&mhp->mh_mutex); 1802 mdsp->mds_bitmap[bit / NBPBMW] 1803 |= (1 << (bit % NBPBMW)); 1804 mdsp->mds_bitmap_retired[bit / 1805 NBPBMW] |= 1806 (1 << (bit % NBPBMW)); 1807 mhp->mh_hold_todo--; 1808 continue; 1809 } 1810 ASSERT(freemem_left != 0); 1811 if (PP_ISFREE(pp)) { 1812 /* 1813 * Like page_reclaim() only 'freemem' 1814 * processing is already done. 1815 */ 1816 MDSTAT_INCR(mhp, nfree); 1817 free_page_collect: 1818 if (PP_ISAGED(pp)) { 1819 page_list_sub(pp, 1820 PG_FREE_LIST); 1821 } else { 1822 page_list_sub(pp, 1823 PG_CACHE_LIST); 1824 } 1825 PP_CLRFREE(pp); 1826 PP_CLRAGED(pp); 1827 collected++; 1828 mutex_enter(&mhp->mh_mutex); 1829 page_delete_collect(pp, mhp); 1830 mdsp->mds_bitmap[bit / NBPBMW] |= 1831 (1 << (bit % NBPBMW)); 1832 freemem_left--; 1833 continue; 1834 } 1835 ASSERT(pp->p_vnode != NULL); 1836 if (first_scan) { 1837 MDSTAT_INCR(mhp, first_notfree); 1838 page_unlock(pp); 1839 mutex_enter(&mhp->mh_mutex); 1840 continue; 1841 } 1842 /* 1843 * Keep stats on pages encountered that 1844 * are marked for retirement. 1845 */ 1846 if (PP_TOXIC(pp)) { 1847 MDSTAT_INCR(mhp, toxic); 1848 } else if (PP_PR_REQ(pp)) { 1849 MDSTAT_INCR(mhp, failing); 1850 } 1851 /* 1852 * In certain cases below, special exceptions 1853 * are made for pages that are toxic. This 1854 * is because the current meaning of toxic 1855 * is that an uncorrectable error has been 1856 * previously associated with the page. 1857 */ 1858 if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) { 1859 if (!PP_TOXIC(pp)) { 1860 /* 1861 * Must relocate locked in 1862 * memory pages. 1863 */ 1864 #ifdef MEM_DEL_STATS 1865 start_pgrp = ddi_get_lbolt(); 1866 #endif /* MEM_DEL_STATS */ 1867 /* 1868 * Lock all constituent pages 1869 * of a large page to ensure 1870 * that p_szc won't change. 1871 */ 1872 if (!group_page_trylock(pp, 1873 SE_EXCL)) { 1874 MDSTAT_INCR(mhp, 1875 gptllckfail); 1876 page_unlock(pp); 1877 mutex_enter( 1878 &mhp->mh_mutex); 1879 continue; 1880 } 1881 MDSTAT_INCR(mhp, npplocked); 1882 pp_targ = 1883 page_get_replacement_page( 1884 pp, NULL, 0); 1885 if (pp_targ != NULL) { 1886 #ifdef MEM_DEL_STATS 1887 ntick_pgrp = 1888 (uint64_t) 1889 ddi_get_lbolt() - 1890 start_pgrp; 1891 #endif /* MEM_DEL_STATS */ 1892 MDSTAT_PGRP(mhp, 1893 ntick_pgrp); 1894 MDSTAT_INCR(mhp, 1895 nlockreloc); 1896 goto reloc; 1897 } 1898 group_page_unlock(pp); 1899 page_unlock(pp); 1900 #ifdef MEM_DEL_STATS 1901 ntick_pgrp = 1902 (uint64_t)ddi_get_lbolt() - 1903 start_pgrp; 1904 #endif /* MEM_DEL_STATS */ 1905 MDSTAT_PGRP(mhp, ntick_pgrp); 1906 MDSTAT_INCR(mhp, nnorepl); 1907 mutex_enter(&mhp->mh_mutex); 1908 continue; 1909 } else { 1910 /* 1911 * Cannot do anything about 1912 * this page because it is 1913 * toxic. 1914 */ 1915 MDSTAT_INCR(mhp, npplkdtoxic); 1916 page_unlock(pp); 1917 mutex_enter(&mhp->mh_mutex); 1918 continue; 1919 } 1920 } 1921 /* 1922 * Unload the mappings and check if mod bit 1923 * is set. 1924 */ 1925 ASSERT(!PP_ISKAS(pp)); 1926 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); 1927 mod = hat_ismod(pp); 1928 1929 #ifdef MEM_DEL_STATS 1930 start_pgrp = ddi_get_lbolt(); 1931 #endif /* MEM_DEL_STATS */ 1932 if (mod && !PP_TOXIC(pp)) { 1933 /* 1934 * Lock all constituent pages 1935 * of a large page to ensure 1936 * that p_szc won't change. 
1937 */ 1938 if (!group_page_trylock(pp, SE_EXCL)) { 1939 MDSTAT_INCR(mhp, gptlmodfail); 1940 page_unlock(pp); 1941 mutex_enter(&mhp->mh_mutex); 1942 continue; 1943 } 1944 pp_targ = page_get_replacement_page(pp, 1945 NULL, 0); 1946 if (pp_targ != NULL) { 1947 MDSTAT_INCR(mhp, nmodreloc); 1948 #ifdef MEM_DEL_STATS 1949 ntick_pgrp = 1950 (uint64_t)ddi_get_lbolt() - 1951 start_pgrp; 1952 #endif /* MEM_DEL_STATS */ 1953 MDSTAT_PGRP(mhp, ntick_pgrp); 1954 goto reloc; 1955 } 1956 group_page_unlock(pp); 1957 } 1958 1959 if (!page_try_demote_pages(pp)) { 1960 MDSTAT_INCR(mhp, demotefail); 1961 page_unlock(pp); 1962 #ifdef MEM_DEL_STATS 1963 ntick_pgrp = (uint64_t)ddi_get_lbolt() - 1964 start_pgrp; 1965 #endif /* MEM_DEL_STATS */ 1966 MDSTAT_PGRP(mhp, ntick_pgrp); 1967 mutex_enter(&mhp->mh_mutex); 1968 continue; 1969 } 1970 1971 /* 1972 * Regular 'page-out'. 1973 */ 1974 if (!mod) { 1975 MDSTAT_INCR(mhp, ndestroy); 1976 page_destroy(pp, 1); 1977 /* 1978 * page_destroy was called with 1979 * dontfree. As long as p_lckcnt 1980 * and p_cowcnt are both zero, the 1981 * only additional action of 1982 * page_destroy with !dontfree is to 1983 * call page_free, so we can collect 1984 * the page here. 1985 */ 1986 collected++; 1987 #ifdef MEM_DEL_STATS 1988 ntick_pgrp = (uint64_t)ddi_get_lbolt() - 1989 start_pgrp; 1990 #endif /* MEM_DEL_STATS */ 1991 MDSTAT_PGRP(mhp, ntick_pgrp); 1992 mutex_enter(&mhp->mh_mutex); 1993 page_delete_collect(pp, mhp); 1994 mdsp->mds_bitmap[bit / NBPBMW] |= 1995 (1 << (bit % NBPBMW)); 1996 continue; 1997 } 1998 /* 1999 * The page is toxic and the mod bit is 2000 * set, we cannot do anything here to deal 2001 * with it. 2002 */ 2003 if (PP_TOXIC(pp)) { 2004 page_unlock(pp); 2005 #ifdef MEM_DEL_STATS 2006 ntick_pgrp = (uint64_t)ddi_get_lbolt() - 2007 start_pgrp; 2008 #endif /* MEM_DEL_STATS */ 2009 MDSTAT_PGRP(mhp, ntick_pgrp); 2010 MDSTAT_INCR(mhp, modtoxic); 2011 mutex_enter(&mhp->mh_mutex); 2012 continue; 2013 } 2014 MDSTAT_INCR(mhp, nputpage); 2015 vp = pp->p_vnode; 2016 offset = pp->p_offset; 2017 VN_HOLD(vp); 2018 page_unlock(pp); 2019 (void) VOP_PUTPAGE(vp, offset, PAGESIZE, 2020 B_INVAL|B_FORCE, kcred, NULL); 2021 VN_RELE(vp); 2022 #ifdef MEM_DEL_STATS 2023 ntick_pgrp = (uint64_t)ddi_get_lbolt() - 2024 start_pgrp; 2025 #endif /* MEM_DEL_STATS */ 2026 MDSTAT_PGRP(mhp, ntick_pgrp); 2027 /* 2028 * Try to get the page back immediately 2029 * so that it can be collected. 2030 */ 2031 pp = page_numtopp_nolock(pfn); 2032 if (pp == NULL) { 2033 MDSTAT_INCR(mhp, nnoreclaim); 2034 /* 2035 * This should not happen as this 2036 * thread is deleting the page. 2037 * If this code is generalized, this 2038 * becomes a reality. 2039 */ 2040 #ifdef DEBUG 2041 cmn_err(CE_WARN, 2042 "delete_memory_thread(0x%p) " 2043 "pfn 0x%lx has no page_t", 2044 (void *)mhp, pfn); 2045 #endif /* DEBUG */ 2046 mutex_enter(&mhp->mh_mutex); 2047 continue; 2048 } 2049 if (page_try_reclaim_lock(pp, SE_EXCL, 2050 SE_EXCL_WANTED | SE_RETIRED)) { 2051 if (PP_ISFREE(pp)) { 2052 goto free_page_collect; 2053 } 2054 page_unlock(pp); 2055 } 2056 MDSTAT_INCR(mhp, nnoreclaim); 2057 mutex_enter(&mhp->mh_mutex); 2058 continue; 2059 2060 reloc: 2061 /* 2062 * Got some freemem and a target 2063 * page, so move the data to avoid 2064 * I/O and lock problems. 2065 */ 2066 ASSERT(!page_iolock_assert(pp)); 2067 MDSTAT_INCR(mhp, nreloc); 2068 /* 2069 * page_relocate() will return pgcnt: the 2070 * number of consecutive pages relocated. 
2071 * If it is successful, pp will be a 2072 * linked list of the page structs that 2073 * were relocated. If page_relocate() is 2074 * unsuccessful, pp will be unmodified. 2075 */ 2076 #ifdef MEM_DEL_STATS 2077 start_pgrp = ddi_get_lbolt(); 2078 #endif /* MEM_DEL_STATS */ 2079 result = page_relocate(&pp, &pp_targ, 0, 0, 2080 &pgcnt, NULL); 2081 #ifdef MEM_DEL_STATS 2082 ntick_pgrp = (uint64_t)ddi_get_lbolt() - 2083 start_pgrp; 2084 #endif /* MEM_DEL_STATS */ 2085 MDSTAT_PGRP(mhp, ntick_pgrp); 2086 if (result != 0) { 2087 MDSTAT_INCR(mhp, nrelocfail); 2088 /* 2089 * We did not succeed. We need 2090 * to give the pp_targ pages back. 2091 * page_free(pp_targ, 1) without 2092 * the freemem accounting. 2093 */ 2094 group_page_unlock(pp); 2095 page_free_replacement_page(pp_targ); 2096 page_unlock(pp); 2097 mutex_enter(&mhp->mh_mutex); 2098 continue; 2099 } 2100 2101 /* 2102 * We will then collect pgcnt pages. 2103 */ 2104 ASSERT(pgcnt > 0); 2105 mutex_enter(&mhp->mh_mutex); 2106 /* 2107 * We need to make sure freemem_left is 2108 * large enough. 2109 */ 2110 while ((freemem_left < pgcnt) && 2111 (!mhp->mh_cancel)) { 2112 freemem_left += 2113 delthr_get_freemem(mhp); 2114 } 2115 2116 /* 2117 * Do not proceed if mh_cancel is set. 2118 */ 2119 if (mhp->mh_cancel) { 2120 while (pp_targ != NULL) { 2121 /* 2122 * Unlink and unlock each page. 2123 */ 2124 tpp_targ = pp_targ; 2125 page_sub(&pp_targ, tpp_targ); 2126 page_unlock(tpp_targ); 2127 } 2128 /* 2129 * We need to give the pp pages back. 2130 * page_free(pp, 1) without the 2131 * freemem accounting. 2132 */ 2133 page_free_replacement_page(pp); 2134 break; 2135 } 2136 2137 /* Now remove pgcnt from freemem_left */ 2138 freemem_left -= pgcnt; 2139 ASSERT(freemem_left >= 0); 2140 szc = pp->p_szc; 2141 while (pp != NULL) { 2142 /* 2143 * pp and pp_targ were passed back as 2144 * a linked list of pages. 2145 * Unlink and unlock each page. 2146 */ 2147 tpp_targ = pp_targ; 2148 page_sub(&pp_targ, tpp_targ); 2149 page_unlock(tpp_targ); 2150 /* 2151 * The original page is now free 2152 * so remove it from the linked 2153 * list and collect it. 2154 */ 2155 tpp = pp; 2156 page_sub(&pp, tpp); 2157 pfn = page_pptonum(tpp); 2158 collected++; 2159 ASSERT(PAGE_EXCL(tpp)); 2160 ASSERT(tpp->p_vnode == NULL); 2161 ASSERT(!hat_page_is_mapped(tpp)); 2162 ASSERT(tpp->p_szc == szc); 2163 tpp->p_szc = 0; 2164 page_delete_collect(tpp, mhp); 2165 bit = pfn - mdsp->mds_base; 2166 mdsp->mds_bitmap[bit / NBPBMW] |= 2167 (1 << (bit % NBPBMW)); 2168 } 2169 ASSERT(pp_targ == NULL); 2170 } 2171 } 2172 first_scan = 0; 2173 if ((mhp->mh_cancel == 0) && (mhp->mh_hold_todo != 0) && 2174 (collected == 0)) { 2175 /* 2176 * This code is needed as we cannot wait 2177 * for a page to be locked OR the delete to 2178 * be cancelled. Also, we must delay so 2179 * that other threads get a chance to run 2180 * on our cpu, otherwise page locks may be 2181 * held indefinitely by those threads. 2182 */ 2183 MDSTAT_INCR(mhp, ndelay); 2184 CALLB_CPR_SAFE_BEGIN(&cprinfo); 2185 (void) cv_timedwait(&mhp->mh_cv, &mhp->mh_mutex, 2186 (lbolt + DEL_BUSY_WAIT_TICKS)); 2187 CALLB_CPR_SAFE_END(&cprinfo, &mhp->mh_mutex); 2188 } 2189 } 2190 /* stop the dr aio cleanup thread */ 2191 mhp->mh_dr_aio_cleanup_cancel = 1; 2192 transit_list_collect(mhp, 0); 2193 if (freemem_left != 0) { 2194 /* Return any surplus. 
*/ 2195 page_create_putback(freemem_left); 2196 freemem_left = 0; 2197 } 2198 #ifdef MEM_DEL_STATS 2199 ntick_total = (uint64_t)ddi_get_lbolt() - start_total; 2200 #endif /* MEM_DEL_STATS */ 2201 MDSTAT_TOTAL(mhp, ntick_total); 2202 MDSTAT_PRINT(mhp); 2203 2204 /* 2205 * If the memory delete was cancelled, exclusive-wanted bits must 2206 * be cleared. If there are retired pages being deleted, they need 2207 * to be unretired. 2208 */ 2209 for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL; 2210 mdsp = mdsp->mds_next) { 2211 pfn_t pfn, p_end; 2212 2213 p_end = mdsp->mds_base + mdsp->mds_npgs; 2214 for (pfn = mdsp->mds_base; pfn < p_end; pfn++) { 2215 page_t *pp; 2216 pgcnt_t bit; 2217 2218 bit = pfn - mdsp->mds_base; 2219 if (mhp->mh_cancel) { 2220 pp = page_numtopp_nolock(pfn); 2221 if (pp != NULL) { 2222 if ((mdsp->mds_bitmap[bit / NBPBMW] & 2223 (1 << (bit % NBPBMW))) == 0) { 2224 page_lock_clr_exclwanted(pp); 2225 } 2226 } 2227 } else { 2228 pp = NULL; 2229 } 2230 if ((mdsp->mds_bitmap_retired[bit / NBPBMW] & 2231 (1 << (bit % NBPBMW))) != 0) { 2232 /* do we already have pp? */ 2233 if (pp == NULL) { 2234 pp = page_numtopp_nolock(pfn); 2235 } 2236 ASSERT(pp != NULL); 2237 ASSERT(PP_RETIRED(pp)); 2238 if (mhp->mh_cancel != 0) { 2239 page_unlock(pp); 2240 /* 2241 * To satisfy ASSERT below in 2242 * cancel code. 2243 */ 2244 mhp->mh_hold_todo++; 2245 } else { 2246 (void) page_unretire_pp(pp, 2247 PR_UNR_CLEAN); 2248 } 2249 } 2250 } 2251 } 2252 /* 2253 * Free retired page bitmap and collected page bitmap 2254 */ 2255 for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL; 2256 mdsp = mdsp->mds_next) { 2257 ASSERT(mdsp->mds_bitmap_retired != NULL); 2258 kmem_free(mdsp->mds_bitmap_retired, MDS_BITMAPBYTES(mdsp)); 2259 mdsp->mds_bitmap_retired = NULL; /* Paranoia. */ 2260 ASSERT(mdsp->mds_bitmap != NULL); 2261 kmem_free(mdsp->mds_bitmap, MDS_BITMAPBYTES(mdsp)); 2262 mdsp->mds_bitmap = NULL; /* Paranoia. */ 2263 } 2264 2265 /* wait for our dr aio cancel thread to exit */ 2266 while (!(mhp->mh_aio_cleanup_done)) { 2267 CALLB_CPR_SAFE_BEGIN(&cprinfo); 2268 delay(drv_usectohz(DR_AIO_CLEANUP_DELAY)); 2269 CALLB_CPR_SAFE_END(&cprinfo, &mhp->mh_mutex); 2270 } 2271 refused: 2272 if (mhp->mh_cancel != 0) { 2273 page_t *pp; 2274 2275 comp_code = mhp->mh_cancel; 2276 /* 2277 * Go through list of deleted pages (mh_deleted) freeing 2278 * them. 2279 */ 2280 while ((pp = mhp->mh_deleted) != NULL) { 2281 mhp->mh_deleted = pp->p_next; 2282 mhp->mh_hold_todo++; 2283 mutex_exit(&mhp->mh_mutex); 2284 /* Restore p_next. */ 2285 pp->p_next = pp->p_prev; 2286 if (PP_ISFREE(pp)) { 2287 cmn_err(CE_PANIC, 2288 "page %p is free", 2289 (void *)pp); 2290 } 2291 page_free(pp, 1); 2292 mutex_enter(&mhp->mh_mutex); 2293 } 2294 ASSERT(mhp->mh_hold_todo == mhp->mh_vm_pages); 2295 2296 mutex_exit(&mhp->mh_mutex); 2297 put_availrmem(mhp->mh_vm_pages); 2298 mutex_enter(&mhp->mh_mutex); 2299 2300 goto t_exit; 2301 } 2302 2303 /* 2304 * All the pages are no longer in use and are exclusively locked. 2305 */ 2306 2307 mhp->mh_deleted = NULL; 2308 2309 kphysm_del_cleanup(mhp); 2310 2311 /* 2312 * mem_node_post_del_slice needs to be after kphysm_del_cleanup so 2313 * that the mem_node_config[] will remain intact for the cleanup. 
2314 */ 2315 for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL; 2316 mdsp = mdsp->mds_next) { 2317 mem_node_post_del_slice(mdsp->mds_base, 2318 mdsp->mds_base + mdsp->mds_npgs - 1, 0); 2319 } 2320 2321 comp_code = KPHYSM_OK; 2322 2323 t_exit: 2324 mutex_exit(&mhp->mh_mutex); 2325 kphysm_setup_post_del(mhp->mh_vm_pages, 2326 (comp_code == KPHYSM_OK) ? 0 : 1); 2327 mutex_enter(&mhp->mh_mutex); 2328 2329 early_exit: 2330 /* mhp->mh_mutex exited by CALLB_CPR_EXIT() */ 2331 mhp->mh_state = MHND_DONE; 2332 del_complete_funcp = mhp->mh_delete_complete; 2333 del_complete_arg = mhp->mh_delete_complete_arg; 2334 CALLB_CPR_EXIT(&cprinfo); 2335 (*del_complete_funcp)(del_complete_arg, comp_code); 2336 thread_exit(); 2337 /*NOTREACHED*/ 2338 } 2339 2340 /* 2341 * Start the delete of the memory from the system. 2342 */ 2343 int 2344 kphysm_del_start( 2345 memhandle_t handle, 2346 void (*complete)(void *, int), 2347 void *complete_arg) 2348 { 2349 struct mem_handle *mhp; 2350 2351 mhp = kphysm_lookup_mem_handle(handle); 2352 if (mhp == NULL) { 2353 return (KPHYSM_EHANDLE); 2354 } 2355 switch (mhp->mh_state) { 2356 case MHND_FREE: 2357 ASSERT(mhp->mh_state != MHND_FREE); 2358 mutex_exit(&mhp->mh_mutex); 2359 return (KPHYSM_EHANDLE); 2360 case MHND_INIT: 2361 break; 2362 case MHND_STARTING: 2363 case MHND_RUNNING: 2364 mutex_exit(&mhp->mh_mutex); 2365 return (KPHYSM_ESEQUENCE); 2366 case MHND_DONE: 2367 mutex_exit(&mhp->mh_mutex); 2368 return (KPHYSM_ESEQUENCE); 2369 case MHND_RELEASE: 2370 mutex_exit(&mhp->mh_mutex); 2371 return (KPHYSM_ESEQUENCE); 2372 default: 2373 #ifdef DEBUG 2374 cmn_err(CE_WARN, "kphysm_del_start(0x%p) state corrupt %d", 2375 (void *)mhp, mhp->mh_state); 2376 #endif /* DEBUG */ 2377 mutex_exit(&mhp->mh_mutex); 2378 return (KPHYSM_EHANDLE); 2379 } 2380 2381 if (mhp->mh_transit.trl_spans == NULL) { 2382 mutex_exit(&mhp->mh_mutex); 2383 return (KPHYSM_ENOWORK); 2384 } 2385 2386 ASSERT(complete != NULL); 2387 mhp->mh_delete_complete = complete; 2388 mhp->mh_delete_complete_arg = complete_arg; 2389 mhp->mh_state = MHND_STARTING; 2390 /* 2391 * Release the mutex in case thread_create sleeps. 2392 */ 2393 mutex_exit(&mhp->mh_mutex); 2394 2395 /* 2396 * The "obvious" process for this thread is pageout (proc_pageout) 2397 * but this gives the thread too much power over freemem 2398 * which results in freemem starvation. 2399 */ 2400 (void) thread_create(NULL, 0, delete_memory_thread, mhp, 0, &p0, 2401 TS_RUN, maxclsyspri - 1); 2402 2403 return (KPHYSM_OK); 2404 } 2405 2406 static kmutex_t pp_dummy_lock; /* Protects init. of pp_dummy. */ 2407 static caddr_t pp_dummy; 2408 static pgcnt_t pp_dummy_npages; 2409 static pfn_t *pp_dummy_pfn; /* Array of dummy pfns. */ 2410 2411 static void 2412 memseg_remap_init_pages(page_t *pages, page_t *epages) 2413 { 2414 page_t *pp; 2415 2416 for (pp = pages; pp < epages; pp++) { 2417 pp->p_pagenum = PFN_INVALID; /* XXXX */ 2418 pp->p_offset = (u_offset_t)-1; 2419 page_iolock_init(pp); 2420 while (!page_lock(pp, SE_EXCL, (kmutex_t *)NULL, P_RECLAIM)) 2421 continue; 2422 page_lock_delete(pp); 2423 } 2424 } 2425 2426 void 2427 memseg_remap_init() 2428 { 2429 mutex_enter(&pp_dummy_lock); 2430 if (pp_dummy == NULL) { 2431 uint_t dpages; 2432 int i; 2433 2434 /* 2435 * dpages starts off as the size of the structure and 2436 * ends up as the minimum number of pages that will 2437 * hold a whole number of page_t structures. 
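* Since MMU_PAGESIZE is a power of two, that minimum is simply sizeof (page_t) with its factors of two stripped off, which is what the shift loop below computes.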
2438 */ 2439 dpages = sizeof (page_t); 2440 ASSERT(dpages != 0); 2441 ASSERT(dpages <= MMU_PAGESIZE); 2442 2443 while ((dpages & 1) == 0) 2444 dpages >>= 1; 2445 2446 pp_dummy_npages = dpages; 2447 /* 2448 * Allocate pp_dummy pages directly from static_arena, 2449 * since these are whole page allocations and are 2450 * referenced by physical address. This also has the 2451 * nice fringe benefit of hiding the memory from 2452 * ::findleaks since it doesn't deal well with allocated 2453 * kernel heap memory that doesn't have any mappings. 2454 */ 2455 pp_dummy = vmem_xalloc(static_arena, ptob(pp_dummy_npages), 2456 PAGESIZE, 0, 0, NULL, NULL, VM_SLEEP); 2457 bzero(pp_dummy, ptob(pp_dummy_npages)); 2458 ASSERT(((uintptr_t)pp_dummy & MMU_PAGEOFFSET) == 0); 2459 pp_dummy_pfn = kmem_alloc(sizeof (*pp_dummy_pfn) * 2460 pp_dummy_npages, KM_SLEEP); 2461 for (i = 0; i < pp_dummy_npages; i++) { 2462 pp_dummy_pfn[i] = hat_getpfnum(kas.a_hat, 2463 &pp_dummy[MMU_PAGESIZE * i]); 2464 ASSERT(pp_dummy_pfn[i] != PFN_INVALID); 2465 } 2466 /* 2467 * Initialize the page_t's to a known 'deleted' state 2468 * that matches the state of deleted pages. 2469 */ 2470 memseg_remap_init_pages((page_t *)pp_dummy, 2471 (page_t *)(pp_dummy + ptob(pp_dummy_npages))); 2472 /* Remove kmem mappings for the pages for safety. */ 2473 hat_unload(kas.a_hat, pp_dummy, ptob(pp_dummy_npages), 2474 HAT_UNLOAD_UNLOCK); 2475 /* Leave pp_dummy pointer set as flag that init is done. */ 2476 } 2477 mutex_exit(&pp_dummy_lock); 2478 } 2479 2480 static void 2481 memseg_remap_to_dummy(caddr_t pp, pgcnt_t metapgs) 2482 { 2483 ASSERT(pp_dummy != NULL); 2484 2485 while (metapgs != 0) { 2486 pgcnt_t n; 2487 int i; 2488 2489 n = pp_dummy_npages; 2490 if (n > metapgs) 2491 n = metapgs; 2492 for (i = 0; i < n; i++) { 2493 hat_devload(kas.a_hat, pp, ptob(1), pp_dummy_pfn[i], 2494 PROT_READ, 2495 HAT_LOAD | HAT_LOAD_NOCONSIST | 2496 HAT_LOAD_REMAP); 2497 pp += ptob(1); 2498 } 2499 metapgs -= n; 2500 } 2501 } 2502 2503 /* 2504 * Transition all the deleted pages to the deleted state so that 2505 * page_lock will not wait. The page_lock_delete call will 2506 * also wake up any waiters. 2507 */ 2508 static void 2509 memseg_lock_delete_all(struct memseg *seg) 2510 { 2511 page_t *pp; 2512 2513 for (pp = seg->pages; pp < seg->epages; pp++) { 2514 pp->p_pagenum = PFN_INVALID; /* XXXX */ 2515 page_lock_delete(pp); 2516 } 2517 } 2518 2519 static void 2520 kphysm_del_cleanup(struct mem_handle *mhp) 2521 { 2522 struct memdelspan *mdsp; 2523 struct memseg *seg; 2524 struct memseg **segpp; 2525 struct memseg *seglist; 2526 pfn_t p_end; 2527 uint64_t avmem; 2528 pgcnt_t avpgs; 2529 pgcnt_t npgs; 2530 2531 avpgs = mhp->mh_vm_pages; 2532 2533 memsegs_lock(1); 2534 2535 /* 2536 * remove from main segment list. 2537 */ 2538 npgs = 0; 2539 seglist = NULL; 2540 for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL; 2541 mdsp = mdsp->mds_next) { 2542 p_end = mdsp->mds_base + mdsp->mds_npgs; 2543 for (segpp = &memsegs; (seg = *segpp) != NULL; ) { 2544 if (seg->pages_base >= p_end || 2545 seg->pages_end <= mdsp->mds_base) { 2546 /* Span and memseg don't overlap. */ 2547 segpp = &((*segpp)->next); 2548 continue; 2549 } 2550 ASSERT(seg->pages_base >= mdsp->mds_base); 2551 ASSERT(seg->pages_end <= p_end); 2552 2553 PLCNT_MODIFY_MAX(seg->pages_base, 2554 seg->pages_base - seg->pages_end); 2555 2556 /* Hide the memseg from future scans. */ 2557 hat_kpm_delmem_mseg_update(seg, segpp); 2558 *segpp = seg->next; 2559 membar_producer(); /* TODO: Needed? 
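* If it is needed, it would be to make the unlink store above globally visible before the later stores that mark the memseg as deleted.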
*/ 2560 npgs += MSEG_NPAGES(seg); 2561 2562 /* 2563 * Leave the deleted segment's next pointer intact 2564 * in case a memsegs scanning loop is walking this 2565 * segment concurrently. 2566 */ 2567 seg->lnext = seglist; 2568 seglist = seg; 2569 } 2570 } 2571 2572 build_pfn_hash(); 2573 2574 ASSERT(npgs < total_pages); 2575 total_pages -= npgs; 2576 2577 /* 2578 * Recalculate the paging parameters now total_pages has changed. 2579 * This will also cause the clock hands to be reset before next use. 2580 */ 2581 setupclock(1); 2582 2583 memsegs_unlock(1); 2584 2585 mutex_exit(&mhp->mh_mutex); 2586 2587 while ((seg = seglist) != NULL) { 2588 pfn_t mseg_start; 2589 pfn_t mseg_base, mseg_end; 2590 pgcnt_t mseg_npgs; 2591 page_t *pp; 2592 pgcnt_t metapgs; 2593 int dynamic; 2594 int mlret; 2595 2596 seglist = seg->lnext; 2597 2598 /* 2599 * Put the page_t's into the deleted state to stop 2600 * cv_wait()s on the pages. When we remap, the dummy 2601 * page_t's will be in the same state. 2602 */ 2603 memseg_lock_delete_all(seg); 2604 /* 2605 * Collect up information based on pages_base and pages_end 2606 * early so that we can flag early that the memseg has been 2607 * deleted by setting pages_end == pages_base. 2608 */ 2609 mseg_base = seg->pages_base; 2610 mseg_end = seg->pages_end; 2611 mseg_npgs = MSEG_NPAGES(seg); 2612 dynamic = memseg_is_dynamic(seg, &mseg_start); 2613 2614 seg->pages_end = seg->pages_base; 2615 2616 if (dynamic) { 2617 pp = seg->pages; 2618 metapgs = mseg_base - mseg_start; 2619 ASSERT(metapgs != 0); 2620 2621 /* Remap the meta data to our special dummy area. */ 2622 memseg_remap_to_dummy((caddr_t)pp, metapgs); 2623 2624 mutex_enter(&memseg_lists_lock); 2625 seg->lnext = memseg_va_avail; 2626 memseg_va_avail = seg; 2627 mutex_exit(&memseg_lists_lock); 2628 } else { 2629 /* 2630 * Set for clean-up below. 2631 */ 2632 mseg_start = seg->pages_base; 2633 /* 2634 * For memory whose page_ts were allocated 2635 * at boot, we need to find a new use for 2636 * the page_t memory. 2637 * For the moment, just leak it. 2638 * (It is held in the memseg_delete_junk list.) 2639 */ 2640 2641 mutex_enter(&memseg_lists_lock); 2642 seg->lnext = memseg_delete_junk; 2643 memseg_delete_junk = seg; 2644 mutex_exit(&memseg_lists_lock); 2645 } 2646 2647 /* Must not use seg now as it could be re-used. */ 2648 2649 memlist_write_lock(); 2650 2651 mlret = memlist_delete_span( 2652 (uint64_t)(mseg_base) << PAGESHIFT, 2653 (uint64_t)(mseg_npgs) << PAGESHIFT, 2654 &phys_avail); 2655 ASSERT(mlret == MEML_SPANOP_OK); 2656 2657 mlret = memlist_delete_span( 2658 (uint64_t)(mseg_start) << PAGESHIFT, 2659 (uint64_t)(mseg_end - mseg_start) << 2660 PAGESHIFT, 2661 &phys_install); 2662 ASSERT(mlret == MEML_SPANOP_OK); 2663 phys_install_has_changed(); 2664 2665 memlist_write_unlock(); 2666 } 2667 2668 memlist_read_lock(); 2669 installed_top_size(phys_install, &physmax, &physinstalled); 2670 memlist_read_unlock(); 2671 2672 mutex_enter(&freemem_lock); 2673 maxmem -= avpgs; 2674 physmem -= avpgs; 2675 /* availrmem is adjusted during the delete. 
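* Only availrmem_initial still has to be brought into line here.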
*/ 2676 availrmem_initial -= avpgs; 2677 2678 mutex_exit(&freemem_lock); 2679 2680 dump_resize(); 2681 2682 cmn_err(CE_CONT, "?kphysm_delete: mem = %ldK " 2683 "(0x%" PRIx64 ")\n", 2684 physinstalled << (PAGESHIFT - 10), 2685 (uint64_t)physinstalled << PAGESHIFT); 2686 2687 avmem = (uint64_t)freemem << PAGESHIFT; 2688 cmn_err(CE_CONT, "?kphysm_delete: " 2689 "avail mem = %" PRId64 "\n", avmem); 2690 2691 /* 2692 * Update lgroup generation number on single lgroup systems 2693 */ 2694 if (nlgrps == 1) 2695 lgrp_config(LGRP_CONFIG_GEN_UPDATE, 0, 0); 2696 2697 /* Successfully deleted system memory */ 2698 mutex_enter(&mhp->mh_mutex); 2699 } 2700 2701 static uint_t mdel_nullvp_waiter; 2702 2703 static void 2704 page_delete_collect( 2705 page_t *pp, 2706 struct mem_handle *mhp) 2707 { 2708 if (pp->p_vnode) { 2709 page_hashout(pp, (kmutex_t *)NULL); 2710 /* do not do PP_SETAGED(pp); */ 2711 } else { 2712 kmutex_t *sep; 2713 2714 sep = page_se_mutex(pp); 2715 mutex_enter(sep); 2716 if (CV_HAS_WAITERS(&pp->p_cv)) { 2717 mdel_nullvp_waiter++; 2718 cv_broadcast(&pp->p_cv); 2719 } 2720 mutex_exit(sep); 2721 } 2722 ASSERT(pp->p_next == pp->p_prev); 2723 ASSERT(pp->p_next == NULL || pp->p_next == pp); 2724 pp->p_next = mhp->mh_deleted; 2725 mhp->mh_deleted = pp; 2726 ASSERT(mhp->mh_hold_todo != 0); 2727 mhp->mh_hold_todo--; 2728 } 2729 2730 static void 2731 transit_list_collect(struct mem_handle *mhp, int v) 2732 { 2733 struct transit_list_head *trh; 2734 2735 trh = &transit_list_head; 2736 mutex_enter(&trh->trh_lock); 2737 mhp->mh_transit.trl_collect = v; 2738 mutex_exit(&trh->trh_lock); 2739 } 2740 2741 static void 2742 transit_list_insert(struct transit_list *tlp) 2743 { 2744 struct transit_list_head *trh; 2745 2746 trh = &transit_list_head; 2747 ASSERT(MUTEX_HELD(&trh->trh_lock)); 2748 tlp->trl_next = trh->trh_head; 2749 trh->trh_head = tlp; 2750 } 2751 2752 static void 2753 transit_list_remove(struct transit_list *tlp) 2754 { 2755 struct transit_list_head *trh; 2756 struct transit_list **tlpp; 2757 2758 trh = &transit_list_head; 2759 tlpp = &trh->trh_head; 2760 ASSERT(MUTEX_HELD(&trh->trh_lock)); 2761 while (*tlpp != NULL && *tlpp != tlp) 2762 tlpp = &(*tlpp)->trl_next; 2763 ASSERT(*tlpp != NULL); 2764 if (*tlpp == tlp) 2765 *tlpp = tlp->trl_next; 2766 tlp->trl_next = NULL; 2767 } 2768 2769 static struct transit_list * 2770 pfnum_to_transit_list(struct transit_list_head *trh, pfn_t pfnum) 2771 { 2772 struct transit_list *tlp; 2773 2774 for (tlp = trh->trh_head; tlp != NULL; tlp = tlp->trl_next) { 2775 struct memdelspan *mdsp; 2776 2777 for (mdsp = tlp->trl_spans; mdsp != NULL; 2778 mdsp = mdsp->mds_next) { 2779 if (pfnum >= mdsp->mds_base && 2780 pfnum < (mdsp->mds_base + mdsp->mds_npgs)) { 2781 return (tlp); 2782 } 2783 } 2784 } 2785 return (NULL); 2786 } 2787 2788 int 2789 pfn_is_being_deleted(pfn_t pfnum) 2790 { 2791 struct transit_list_head *trh; 2792 struct transit_list *tlp; 2793 int ret; 2794 2795 trh = &transit_list_head; 2796 if (trh->trh_head == NULL) 2797 return (0); 2798 2799 mutex_enter(&trh->trh_lock); 2800 tlp = pfnum_to_transit_list(trh, pfnum); 2801 ret = (tlp != NULL && tlp->trl_collect); 2802 mutex_exit(&trh->trh_lock); 2803 2804 return (ret); 2805 } 2806 2807 #ifdef MEM_DEL_STATS 2808 extern int hz; 2809 static void 2810 mem_del_stat_print_func(struct mem_handle *mhp) 2811 { 2812 uint64_t tmp; 2813 2814 if (mem_del_stat_print) { 2815 printf("memory delete loop %x/%x, statistics%s\n", 2816 (uint_t)mhp->mh_transit.trl_spans->mds_base, 2817 
(uint_t)mhp->mh_transit.trl_spans->mds_npgs, 2818 (mhp->mh_cancel ? " (cancelled)" : "")); 2819 printf("\t%8u nloop\n", mhp->mh_delstat.nloop); 2820 printf("\t%8u need_free\n", mhp->mh_delstat.need_free); 2821 printf("\t%8u free_loop\n", mhp->mh_delstat.free_loop); 2822 printf("\t%8u free_low\n", mhp->mh_delstat.free_low); 2823 printf("\t%8u free_failed\n", mhp->mh_delstat.free_failed); 2824 printf("\t%8u ncheck\n", mhp->mh_delstat.ncheck); 2825 printf("\t%8u nopaget\n", mhp->mh_delstat.nopaget); 2826 printf("\t%8u lockfail\n", mhp->mh_delstat.lockfail); 2827 printf("\t%8u nfree\n", mhp->mh_delstat.nfree); 2828 printf("\t%8u nreloc\n", mhp->mh_delstat.nreloc); 2829 printf("\t%8u nrelocfail\n", mhp->mh_delstat.nrelocfail); 2830 printf("\t%8u already_done\n", mhp->mh_delstat.already_done); 2831 printf("\t%8u first_notfree\n", mhp->mh_delstat.first_notfree); 2832 printf("\t%8u npplocked\n", mhp->mh_delstat.npplocked); 2833 printf("\t%8u nlockreloc\n", mhp->mh_delstat.nlockreloc); 2834 printf("\t%8u nnorepl\n", mhp->mh_delstat.nnorepl); 2835 printf("\t%8u nmodreloc\n", mhp->mh_delstat.nmodreloc); 2836 printf("\t%8u ndestroy\n", mhp->mh_delstat.ndestroy); 2837 printf("\t%8u nputpage\n", mhp->mh_delstat.nputpage); 2838 printf("\t%8u nnoreclaim\n", mhp->mh_delstat.nnoreclaim); 2839 printf("\t%8u ndelay\n", mhp->mh_delstat.ndelay); 2840 printf("\t%8u demotefail\n", mhp->mh_delstat.demotefail); 2841 printf("\t%8u retired\n", mhp->mh_delstat.retired); 2842 printf("\t%8u toxic\n", mhp->mh_delstat.toxic); 2843 printf("\t%8u failing\n", mhp->mh_delstat.failing); 2844 printf("\t%8u modtoxic\n", mhp->mh_delstat.modtoxic); 2845 printf("\t%8u npplkdtoxic\n", mhp->mh_delstat.npplkdtoxic); 2846 printf("\t%8u gptlmodfail\n", mhp->mh_delstat.gptlmodfail); 2847 printf("\t%8u gptllckfail\n", mhp->mh_delstat.gptllckfail); 2848 tmp = mhp->mh_delstat.nticks_total / hz; /* seconds */ 2849 printf( 2850 "\t%"PRIu64" nticks_total - %"PRIu64" min %"PRIu64" sec\n", 2851 mhp->mh_delstat.nticks_total, tmp / 60, tmp % 60); 2852 2853 tmp = mhp->mh_delstat.nticks_pgrp / hz; /* seconds */ 2854 printf( 2855 "\t%"PRIu64" nticks_pgrp - %"PRIu64" min %"PRIu64" sec\n", 2856 mhp->mh_delstat.nticks_pgrp, tmp / 60, tmp % 60); 2857 } 2858 } 2859 #endif /* MEM_DEL_STATS */ 2860 2861 struct mem_callback { 2862 kphysm_setup_vector_t *vec; 2863 void *arg; 2864 }; 2865 2866 #define NMEMCALLBACKS 100 2867 2868 static struct mem_callback mem_callbacks[NMEMCALLBACKS]; 2869 static uint_t nmemcallbacks; 2870 static krwlock_t mem_callback_rwlock; 2871 2872 int 2873 kphysm_setup_func_register(kphysm_setup_vector_t *vec, void *arg) 2874 { 2875 uint_t i, found; 2876 2877 /* 2878 * This test will become more complicated when the version must 2879 * change. 2880 */ 2881 if (vec->version != KPHYSM_SETUP_VECTOR_VERSION) 2882 return (EINVAL); 2883 2884 if (vec->post_add == NULL || vec->pre_del == NULL || 2885 vec->post_del == NULL) 2886 return (EINVAL); 2887 2888 rw_enter(&mem_callback_rwlock, RW_WRITER); 2889 for (i = 0, found = 0; i < nmemcallbacks; i++) { 2890 if (mem_callbacks[i].vec == NULL && found == 0) 2891 found = i + 1; 2892 if (mem_callbacks[i].vec == vec && 2893 mem_callbacks[i].arg == arg) { 2894 #ifdef DEBUG 2895 /* Catch this in DEBUG kernels. 
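* Non-DEBUG kernels still reject the duplicate registration with EEXIST below.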
*/ 2896 cmn_err(CE_WARN, "kphysm_setup_func_register" 2897 "(0x%p, 0x%p) duplicate registration from 0x%p", 2898 (void *)vec, arg, (void *)caller()); 2899 #endif /* DEBUG */ 2900 rw_exit(&mem_callback_rwlock); 2901 return (EEXIST); 2902 } 2903 } 2904 if (found != 0) { 2905 i = found - 1; 2906 } else { 2907 ASSERT(nmemcallbacks < NMEMCALLBACKS); 2908 if (nmemcallbacks == NMEMCALLBACKS) { 2909 rw_exit(&mem_callback_rwlock); 2910 return (ENOMEM); 2911 } 2912 i = nmemcallbacks++; 2913 } 2914 mem_callbacks[i].vec = vec; 2915 mem_callbacks[i].arg = arg; 2916 rw_exit(&mem_callback_rwlock); 2917 return (0); 2918 } 2919 2920 void 2921 kphysm_setup_func_unregister(kphysm_setup_vector_t *vec, void *arg) 2922 { 2923 uint_t i; 2924 2925 rw_enter(&mem_callback_rwlock, RW_WRITER); 2926 for (i = 0; i < nmemcallbacks; i++) { 2927 if (mem_callbacks[i].vec == vec && 2928 mem_callbacks[i].arg == arg) { 2929 mem_callbacks[i].vec = NULL; 2930 mem_callbacks[i].arg = NULL; 2931 if (i == (nmemcallbacks - 1)) 2932 nmemcallbacks--; 2933 break; 2934 } 2935 } 2936 rw_exit(&mem_callback_rwlock); 2937 } 2938 2939 static void 2940 kphysm_setup_post_add(pgcnt_t delta_pages) 2941 { 2942 uint_t i; 2943 2944 rw_enter(&mem_callback_rwlock, RW_READER); 2945 for (i = 0; i < nmemcallbacks; i++) { 2946 if (mem_callbacks[i].vec != NULL) { 2947 (*mem_callbacks[i].vec->post_add) 2948 (mem_callbacks[i].arg, delta_pages); 2949 } 2950 } 2951 rw_exit(&mem_callback_rwlock); 2952 } 2953 2954 /* 2955 * Note the locking between pre_del and post_del: The reader lock is held 2956 * between the two calls to stop the set of functions from changing. 2957 */ 2958 2959 static int 2960 kphysm_setup_pre_del(pgcnt_t delta_pages) 2961 { 2962 uint_t i; 2963 int ret; 2964 int aret; 2965 2966 ret = 0; 2967 rw_enter(&mem_callback_rwlock, RW_READER); 2968 for (i = 0; i < nmemcallbacks; i++) { 2969 if (mem_callbacks[i].vec != NULL) { 2970 aret = (*mem_callbacks[i].vec->pre_del) 2971 (mem_callbacks[i].arg, delta_pages); 2972 ret |= aret; 2973 } 2974 } 2975 2976 return (ret); 2977 } 2978 2979 static void 2980 kphysm_setup_post_del(pgcnt_t delta_pages, int cancelled) 2981 { 2982 uint_t i; 2983 2984 for (i = 0; i < nmemcallbacks; i++) { 2985 if (mem_callbacks[i].vec != NULL) { 2986 (*mem_callbacks[i].vec->post_del) 2987 (mem_callbacks[i].arg, delta_pages, cancelled); 2988 } 2989 } 2990 rw_exit(&mem_callback_rwlock); 2991 } 2992 2993 static int 2994 kphysm_split_memseg( 2995 pfn_t base, 2996 pgcnt_t npgs) 2997 { 2998 struct memseg *seg; 2999 struct memseg **segpp; 3000 pgcnt_t size_low, size_high; 3001 struct memseg *seg_low, *seg_mid, *seg_high; 3002 3003 /* 3004 * Lock the memsegs list against other updates now 3005 */ 3006 memsegs_lock(1); 3007 3008 /* 3009 * Find boot time memseg that wholly covers this area. 3010 */ 3011 3012 /* First find the memseg with page 'base' in it. */ 3013 for (segpp = &memsegs; (seg = *segpp) != NULL; 3014 segpp = &((*segpp)->next)) { 3015 if (base >= seg->pages_base && base < seg->pages_end) 3016 break; 3017 } 3018 if (seg == NULL) { 3019 memsegs_unlock(1); 3020 return (0); 3021 } 3022 if (memseg_is_dynamic(seg, (pfn_t *)NULL)) { 3023 memsegs_unlock(1); 3024 return (0); 3025 } 3026 if ((base + npgs) > seg->pages_end) { 3027 memsegs_unlock(1); 3028 return (0); 3029 } 3030 3031 /* 3032 * Work out the size of the two segments that will 3033 * surround the new segment, one for low address 3034 * and one for high. 
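* Either size may be zero, in which case the corresponding memseg is simply not allocated.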
3035 */ 3036 ASSERT(base >= seg->pages_base); 3037 size_low = base - seg->pages_base; 3038 ASSERT(seg->pages_end >= (base + npgs)); 3039 size_high = seg->pages_end - (base + npgs); 3040 3041 /* 3042 * Sanity check. 3043 */ 3044 if ((size_low + size_high) == 0) { 3045 memsegs_unlock(1); 3046 return (0); 3047 } 3048 3049 /* 3050 * Allocate the new structures. The old memseg will not be freed 3051 * as there may be a reference to it. 3052 */ 3053 seg_low = NULL; 3054 seg_high = NULL; 3055 3056 if (size_low != 0) { 3057 seg_low = kmem_cache_alloc(memseg_cache, KM_SLEEP); 3058 bzero(seg_low, sizeof (struct memseg)); 3059 } 3060 3061 seg_mid = kmem_cache_alloc(memseg_cache, KM_SLEEP); 3062 bzero(seg_mid, sizeof (struct memseg)); 3063 3064 if (size_high != 0) { 3065 seg_high = kmem_cache_alloc(memseg_cache, KM_SLEEP); 3066 bzero(seg_high, sizeof (struct memseg)); 3067 } 3068 3069 /* 3070 * All allocation done now. 3071 */ 3072 if (size_low != 0) { 3073 seg_low->pages = seg->pages; 3074 seg_low->epages = seg_low->pages + size_low; 3075 seg_low->pages_base = seg->pages_base; 3076 seg_low->pages_end = seg_low->pages_base + size_low; 3077 seg_low->next = seg_mid; 3078 } 3079 if (size_high != 0) { 3080 seg_high->pages = seg->epages - size_high; 3081 seg_high->epages = seg_high->pages + size_high; 3082 seg_high->pages_base = seg->pages_end - size_high; 3083 seg_high->pages_end = seg_high->pages_base + size_high; 3084 seg_high->next = seg->next; 3085 } 3086 3087 seg_mid->pages = seg->pages + size_low; 3088 seg_mid->pages_base = seg->pages_base + size_low; 3089 seg_mid->epages = seg->epages - size_high; 3090 seg_mid->pages_end = seg->pages_end - size_high; 3091 seg_mid->next = (seg_high != NULL) ? seg_high : seg->next; 3092 3093 /* 3094 * Update hat_kpm specific info of all involved memsegs and 3095 * allow hat_kpm specific global chain updates. 3096 */ 3097 hat_kpm_split_mseg_update(seg, segpp, seg_low, seg_mid, seg_high); 3098 3099 /* 3100 * At this point we have two equivalent memseg sub-chains, 3101 * seg and seg_low/seg_mid/seg_high, which both chain on to 3102 * the same place in the global chain. By re-writing the pointer 3103 * in the previous element we switch atomically from using the old 3104 * (seg) to the new. 3105 */ 3106 *segpp = (seg_low != NULL) ? seg_low : seg_mid; 3107 3108 membar_enter(); 3109 3110 build_pfn_hash(); 3111 memsegs_unlock(1); 3112 3113 /* 3114 * We leave the old segment, 'seg', intact as there may be 3115 * references to it. Also, as the value of total_pages has not 3116 * changed and the memsegs list is effectively the same when 3117 * accessed via the old or the new pointer, we do not have to 3118 * cause pageout_scanner() to re-evaluate its hand pointers. 3119 * 3120 * We currently do not re-use or reclaim the page_t memory. 3121 * If we do, then this may have to change. 3122 */ 3123 3124 mutex_enter(&memseg_lists_lock); 3125 seg->lnext = memseg_edit_junk; 3126 memseg_edit_junk = seg; 3127 mutex_exit(&memseg_lists_lock); 3128 3129 return (1); 3130 } 3131 3132 /* 3133 * The sfmmu hat layer (e.g.) accesses some parts of the memseg 3134 * structure using physical addresses. Therefore a kmem_cache is 3135 * used with KMC_NOHASH to avoid page crossings within a memseg 3136 * structure. KMC_NOHASH requires that no external (outside of 3137 * slab) information is allowed. This, in turn, implies that the 3138 * cache's slabsize must be exactly a single page, since per-slab 3139 * information (e.g. 
the freelist for the slab) is kept at the 3140 * end of the slab, where it is easy to locate. This should be changed 3141 * when a more obvious kmem_cache interface/flag becomes 3142 * available. 3143 */ 3144 void 3145 mem_config_init() 3146 { 3147 memseg_cache = kmem_cache_create("memseg_cache", sizeof (struct memseg), 3148 0, NULL, NULL, NULL, NULL, static_arena, KMC_NOHASH); 3149 } 3150
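/*
 * Illustrative sketch (comment only, not compiled): how a client module
 * might use the registration interface above.  The kphysm_setup_vector_t
 * member names and callback signatures are inferred from the way the
 * vectors are invoked in kphysm_setup_post_add(), kphysm_setup_pre_del()
 * and kphysm_setup_post_del(); everything prefixed mymod_ is hypothetical.
 *
 *	static void
 *	mymod_post_add(void *arg, pgcnt_t delta_pages)
 *	{
 *		... grow per-module state to cover delta_pages new pages ...
 *	}
 *
 *	static int
 *	mymod_pre_del(void *arg, pgcnt_t delta_pages)
 *	{
 *		return (0);	... returns are OR-ed by kphysm_setup_pre_del() ...
 *	}
 *
 *	static void
 *	mymod_post_del(void *arg, pgcnt_t delta_pages, int cancelled)
 *	{
 *		... shrink per-module state unless cancelled is set ...
 *	}
 *
 *	static kphysm_setup_vector_t mymod_mem_vec;
 *
 *	mymod_mem_vec.version = KPHYSM_SETUP_VECTOR_VERSION;
 *	mymod_mem_vec.post_add = mymod_post_add;
 *	mymod_mem_vec.pre_del = mymod_pre_del;
 *	mymod_mem_vec.post_del = mymod_post_del;
 *	(void) kphysm_setup_func_register(&mymod_mem_vec, NULL);
 *
 *	and, before the module unloads:
 *
 *	kphysm_setup_func_unregister(&mymod_mem_vec, NULL);
 */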