/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M% %I% %E% SMI"

#include <sys/types.h>
#include <sys/cmn_err.h>
#include <sys/vmem.h>
#include <sys/kmem.h>
#include <sys/systm.h>
#include <sys/machsystm.h>	/* for page_freelist_coalesce() */
#include <sys/errno.h>
#include <sys/memnode.h>
#include <sys/memlist.h>
#include <sys/memlist_impl.h>
#include <sys/tuneable.h>
#include <sys/proc.h>
#include <sys/disp.h>
#include <sys/debug.h>
#include <sys/vm.h>
#include <sys/callb.h>
#include <sys/memlist_plat.h>	/* for installed_top_size() */
#include <sys/condvar_impl.h>	/* for CV_HAS_WAITERS() */
#include <sys/dumphdr.h>	/* for dump_resize() */
#include <sys/atomic.h>		/* for use in stats collection */
#include <sys/rwlock.h>
#include <sys/cpuvar.h>
#include <vm/seg_kmem.h>
#include <vm/seg_kpm.h>
#include <vm/page.h>
#include <vm/vm_dep.h>
#define	SUNDDI_IMPL		/* so sunddi.h will not redefine splx() et al */
#include <sys/sunddi.h>
#include <sys/mem_config.h>
#include <sys/mem_cage.h>
#include <sys/lgrp.h>
#include <sys/ddi.h>
#include <sys/modctl.h>

extern void memlist_read_lock(void);
extern void memlist_read_unlock(void);
extern void memlist_write_lock(void);
extern void memlist_write_unlock(void);

extern struct memlist *phys_avail;

extern void mem_node_add(pfn_t, pfn_t);
extern void mem_node_del(pfn_t, pfn_t);

extern uint_t page_ctrs_adjust(int);
static void kphysm_setup_post_add(pgcnt_t);
static int kphysm_setup_pre_del(pgcnt_t);
static void kphysm_setup_post_del(pgcnt_t, int);

static int kphysm_split_memseg(pfn_t base, pgcnt_t npgs);

static int delspan_reserve(pfn_t, pgcnt_t);
static void delspan_unreserve(pfn_t, pgcnt_t);

static kmutex_t memseg_lists_lock;
static struct memseg *memseg_va_avail;
static struct memseg *memseg_delete_junk;
static struct memseg *memseg_edit_junk;
void memseg_remap_init(void);
static void memseg_remap_to_dummy(caddr_t, pgcnt_t);
static void kphysm_addmem_error_undospan(pfn_t, pgcnt_t);
static struct memseg *memseg_reuse(pgcnt_t);

static struct kmem_cache *memseg_cache;

/*
 * Add a chunk of memory to the system.  page_t's for this memory
 * are allocated in the first few pages of the chunk.
 * base: starting PAGESIZE page of new memory.
 * npgs: length in PAGESIZE pages.
 *
 * Adding mem this way doesn't increase the size of the hash tables;
 * growing them would be too hard.  This should be OK, but adding memory
 * dynamically most likely means more hash misses, since the tables will
 * be smaller than they otherwise would be.
 */
int
kphysm_add_memory_dynamic(pfn_t base, pgcnt_t npgs)
{
	page_t *pp;
	page_t *opp, *oepp;
	struct memseg *seg;
	uint64_t avmem;
	pfn_t pfn;
	pfn_t pt_base = base;
	pgcnt_t tpgs = npgs;
	pgcnt_t metapgs;
	int exhausted;
	pfn_t pnum;
	int mnode;
	caddr_t vaddr;
	int reuse;
	int mlret;
	void *mapva;
	pgcnt_t nkpmpgs = 0;
	offset_t kpm_pages_off;

	cmn_err(CE_CONT,
	    "?kphysm_add_memory_dynamic: adding %ldK at 0x%" PRIx64 "\n",
	    npgs << (PAGESHIFT - 10), (uint64_t)base << PAGESHIFT);

	/*
	 * Add this span in the delete list to prevent interactions.
	 */
	if (!delspan_reserve(base, npgs)) {
		return (KPHYSM_ESPAN);
	}
	/*
	 * Check to see if any of the memory span has been added
	 * by trying an add to the installed memory list. This
	 * forms the interlocking process for add.
	 */

	memlist_write_lock();

	mlret = memlist_add_span((uint64_t)(pt_base) << PAGESHIFT,
	    (uint64_t)(tpgs) << PAGESHIFT, &phys_install);

	if (mlret == MEML_SPANOP_OK)
		installed_top_size(phys_install, &physmax, &physinstalled);

	memlist_write_unlock();

	if (mlret != MEML_SPANOP_OK) {
		if (mlret == MEML_SPANOP_EALLOC) {
			delspan_unreserve(pt_base, tpgs);
			return (KPHYSM_ERESOURCE);
		} else if (mlret == MEML_SPANOP_ESPAN) {
			delspan_unreserve(pt_base, tpgs);
			return (KPHYSM_ESPAN);
		} else {
			delspan_unreserve(pt_base, tpgs);
			return (KPHYSM_ERESOURCE);
		}
	}

	/*
	 * We store the page_t's for this new memory in the first
	 * few pages of the chunk. Here, we go and get'em ...
	 */

	/*
	 * The expression after the '-' gives the number of pages
	 * that will fit in the new memory based on a requirement
	 * of (PAGESIZE + sizeof (page_t)) bytes per page.
	 */
	metapgs = npgs - (((uint64_t)(npgs) << PAGESHIFT) /
	    (PAGESIZE + sizeof (page_t)));

	npgs -= metapgs;
	base += metapgs;

	ASSERT(btopr(npgs * sizeof (page_t)) <= metapgs);

	exhausted = (metapgs == 0 || npgs == 0);
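	/*
	 * Worked example for the split above (illustrative only; 8K pages
	 * and sizeof (page_t) == 120 bytes are assumed values, not taken
	 * from this file).  For a 1 GB donation, npgs == 131072, so:
	 *
	 *	usable pages = (131072 * 8192) / (8192 + 120) = 129179
	 *	metapgs      = 131072 - 129179               =   1893
	 *
	 * i.e. about 1.4% of the span is set aside for page_t metadata,
	 * and the ASSERT above checks that btopr(npgs * sizeof (page_t))
	 * really fits in that reservation.
	 */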
	if (kpm_enable && !exhausted) {
		pgcnt_t start, end, nkpmpgs_prelim;
		size_t ptsz;

		/*
		 * A viable kpm large page mapping must not overlap two
		 * dynamic memsegs. Therefore the total size is checked
		 * to be at least kpm_pgsz, and the start and end points
		 * must be kpm_pgsz aligned.
		 */
		if (ptokpmp(tpgs) < 1 || pmodkpmp(pt_base) ||
		    pmodkpmp(base + npgs)) {

			kphysm_addmem_error_undospan(pt_base, tpgs);

			/*
			 * There is no specific error code for violating
			 * kpm granularity constraints.
			 */
			return (KPHYSM_ENOTVIABLE);
		}

		start = kpmptop(ptokpmp(base));
		end = kpmptop(ptokpmp(base + npgs));
		nkpmpgs_prelim = ptokpmp(end - start);
		ptsz = npgs * sizeof (page_t);
		metapgs = btopr(ptsz + nkpmpgs_prelim * KPMPAGE_T_SZ);
		exhausted = (tpgs <= metapgs);
		if (!exhausted) {
			npgs = tpgs - metapgs;
			base = pt_base + metapgs;

			/* final nkpmpgs */
			start = kpmptop(ptokpmp(base));
			nkpmpgs = ptokpmp(end - start);
			kpm_pages_off = ptsz +
			    (nkpmpgs_prelim - nkpmpgs) * KPMPAGE_T_SZ;
		}
	}
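	/*
	 * Illustrative numbers for the kpm recalculation above (the kpm
	 * page size used here is an assumption, not taken from this file).
	 * With a 4 MB kpm page and 8K base pages, one kpm page covers 512
	 * base pages, so the span must start and end on 512-page boundaries
	 * and be at least one kpm page long.  The metadata area then has to
	 * hold both the page_t array (ptsz) and one kpm page structure per
	 * kpm chunk (nkpmpgs_prelim * KPMPAGE_T_SZ), which is why metapgs
	 * is recomputed and base/npgs adjusted a second time.
	 */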
	/*
	 * Is memory area supplied too small?
	 */
	if (exhausted) {
		kphysm_addmem_error_undospan(pt_base, tpgs);

		/*
		 * There is no specific error code for 'too small'.
		 */
		return (KPHYSM_ERESOURCE);
	}

	/*
	 * We may re-use a previously allocated VA space for the page_ts
	 * eventually, but we need to initialize and lock the pages first.
	 */

	/*
	 * Get an address in the kernel address map, map
	 * the page_t pages and see if we can touch them.
	 */

	mapva = vmem_alloc(heap_arena, ptob(metapgs), VM_NOSLEEP);
	if (mapva == NULL) {
		cmn_err(CE_WARN, "kphysm_add_memory_dynamic:"
		    " Can't allocate VA for page_ts");

		kphysm_addmem_error_undospan(pt_base, tpgs);

		return (KPHYSM_ERESOURCE);
	}
	pp = mapva;

	if (physmax < (pt_base + tpgs))
		physmax = (pt_base + tpgs);

	/*
	 * In the remapping code we map one page at a time so we must do
	 * the same here to match mapping sizes.
	 */
	pfn = pt_base;
	vaddr = (caddr_t)pp;
	for (pnum = 0; pnum < metapgs; pnum++) {
		hat_devload(kas.a_hat, vaddr, ptob(1), pfn,
		    PROT_READ | PROT_WRITE,
		    HAT_LOAD | HAT_LOAD_LOCK | HAT_LOAD_NOCONSIST);
		pfn++;
		vaddr += ptob(1);
	}

	if (ddi_peek32((dev_info_t *)NULL,
	    (int32_t *)pp, (int32_t *)0) == DDI_FAILURE) {

		cmn_err(CE_PANIC, "kphysm_add_memory_dynamic:"
		    " Can't access pp array at 0x%p [phys 0x%lx]",
		    (void *)pp, pt_base);

		hat_unload(kas.a_hat, (caddr_t)pp, ptob(metapgs),
		    HAT_UNLOAD_UNMAP|HAT_UNLOAD_UNLOCK);

		vmem_free(heap_arena, mapva, ptob(metapgs));

		kphysm_addmem_error_undospan(pt_base, tpgs);

		return (KPHYSM_EFAULT);
	}

	/*
	 * Add this memory slice to its memory node translation.
	 *
	 * Note that right now, each node may have only one slice;
	 * this may change with COD or in larger SSM systems with
	 * nested latency groups, so we must not assume that the
	 * node does not yet exist.
	 */
	pnum = base + npgs - 1;
	mem_node_add_slice(base, pnum);

	/*
	 * Allocate or resize page counters as necessary to accommodate
	 * the increase in memory pages.
	 */
	mnode = PFN_2_MEM_NODE(pnum);
	if (page_ctrs_adjust(mnode) != 0) {

		mem_node_pre_del_slice(base, pnum);
		mem_node_post_del_slice(base, pnum, 0);

		hat_unload(kas.a_hat, (caddr_t)pp, ptob(metapgs),
		    HAT_UNLOAD_UNMAP|HAT_UNLOAD_UNLOCK);

		vmem_free(heap_arena, mapva, ptob(metapgs));

		kphysm_addmem_error_undospan(pt_base, tpgs);

		return (KPHYSM_ERESOURCE);
	}

	/*
	 * Update the phys_avail memory list.
	 * The phys_install list was done at the start.
	 */

	memlist_write_lock();

	mlret = memlist_add_span((uint64_t)(base) << PAGESHIFT,
	    (uint64_t)(npgs) << PAGESHIFT, &phys_avail);
	ASSERT(mlret == MEML_SPANOP_OK);

	memlist_write_unlock();

	/* See if we can find a memseg to re-use. */
	seg = memseg_reuse(metapgs);

	reuse = (seg != NULL);

	/*
	 * Initialize the memseg structure representing this memory
	 * and add it to the existing list of memsegs. Do some basic
	 * initialization and add the memory to the system.
	 * In order to prevent lock deadlocks, the add_physmem()
	 * code is repeated here, but split into several stages.
	 */
	if (seg == NULL) {
		seg = kmem_cache_alloc(memseg_cache, KM_SLEEP);
		bzero(seg, sizeof (struct memseg));
		seg->msegflags = MEMSEG_DYNAMIC;
		seg->pages = pp;
	} else {
		/*EMPTY*/
		ASSERT(seg->msegflags & MEMSEG_DYNAMIC);
	}

	seg->epages = seg->pages + npgs;
	seg->pages_base = base;
	seg->pages_end = base + npgs;

	/*
	 * Initialize metadata. The page_ts are set to locked state
	 * ready to be freed.
	 */
	bzero((caddr_t)pp, ptob(metapgs));

	pfn = seg->pages_base;
	/* Save the original pp base in case we reuse a memseg. */
	opp = pp;
	oepp = opp + npgs;
	for (pp = opp; pp < oepp; pp++) {
		pp->p_pagenum = pfn;
		pfn++;
		page_iolock_init(pp);
		while (!page_lock(pp, SE_EXCL, (kmutex_t *)NULL, P_RECLAIM))
			continue;
		pp->p_offset = (u_offset_t)-1;
	}

	if (reuse) {
		/* Remap our page_ts to the re-used memseg VA space. */
		pfn = pt_base;
		vaddr = (caddr_t)seg->pages;
		for (pnum = 0; pnum < metapgs; pnum++) {
			hat_devload(kas.a_hat, vaddr, ptob(1), pfn,
			    PROT_READ | PROT_WRITE,
			    HAT_LOAD_REMAP | HAT_LOAD | HAT_LOAD_NOCONSIST);
			pfn++;
			vaddr += ptob(1);
		}

		hat_unload(kas.a_hat, (caddr_t)opp, ptob(metapgs),
		    HAT_UNLOAD_UNMAP|HAT_UNLOAD_UNLOCK);

		vmem_free(heap_arena, mapva, ptob(metapgs));
	}

	hat_kpm_addmem_mseg_update(seg, nkpmpgs, kpm_pages_off);

	memsegs_lock(1);

	/*
	 * The new memseg is inserted at the beginning of the list.
	 * Not only does this save searching for the tail, but in the
	 * case of a re-used memseg, it solves the problem of what
	 * happens if some process has still got a pointer to the
	 * memseg and follows the next pointer to continue traversing
	 * the memsegs list.
	 */

	hat_kpm_addmem_mseg_insert(seg);

	seg->next = memsegs;
	membar_producer();

	hat_kpm_addmem_memsegs_update(seg);

	memsegs = seg;

	build_pfn_hash();

	total_pages += npgs;

	/*
	 * Recalculate the paging parameters now total_pages has changed.
	 * This will also cause the clock hands to be reset before next use.
	 */
	setupclock(1);

	memsegs_unlock(1);

	/*
	 * Free the pages outside the lock to avoid locking loops.
	 */
	for (pp = seg->pages; pp < seg->epages; pp++) {
		page_free(pp, 1);
	}

	PLCNT_MODIFY_MAX(seg->pages_base, (long)npgs);

	/*
	 * Now that we've updated the appropriate memory lists we
	 * need to reset a number of globals, since we've increased memory.
	 * Several have already been updated for us as noted above. The
	 * globals we're interested in at this point are:
	 *	physmax - highest page frame number.
	 *	physinstalled - number of pages currently installed (done earlier)
	 *	maxmem - max free pages in the system
	 *	physmem - physical memory pages available
	 *	availrmem - real memory available
	 */

	mutex_enter(&freemem_lock);
	maxmem += npgs;
	physmem += npgs;
	availrmem += npgs;
	availrmem_initial += npgs;

	mutex_exit(&freemem_lock);

	dump_resize();

	page_freelist_coalesce_all(mnode);

	kphysm_setup_post_add(npgs);

	cmn_err(CE_CONT, "?kphysm_add_memory_dynamic: mem = %ldK "
	    "(0x%" PRIx64 ")\n",
	    physinstalled << (PAGESHIFT - 10),
	    (uint64_t)physinstalled << PAGESHIFT);

	avmem = (uint64_t)freemem << PAGESHIFT;
	cmn_err(CE_CONT, "?kphysm_add_memory_dynamic: "
	    "avail mem = %" PRId64 "\n", avmem);

	/*
	 * Update lgroup generation number on single lgroup systems
	 */
	if (nlgrps == 1)
		lgrp_config(LGRP_CONFIG_GEN_UPDATE, 0, 0);

	delspan_unreserve(pt_base, tpgs);
	return (KPHYSM_OK);		/* Successfully added system memory */
}
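/*
 * Illustrative sketch only (not part of the original file): roughly how a
 * platform DR driver might drive the add interface above.  The function
 * name dr_add_span_example and the EIO mapping are hypothetical; only
 * kphysm_add_memory_dynamic() and the KPHYSM_* return codes come from
 * this file's interface.
 */
#if 0
static int
dr_add_span_example(pfn_t base_pfn, pgcnt_t npages)
{
	int rv;

	/* Hand the span to the VM system; it carves out its own page_ts. */
	rv = kphysm_add_memory_dynamic(base_pfn, npages);

	switch (rv) {
	case KPHYSM_OK:		/* memory is now available to the system */
		return (0);
	case KPHYSM_ESPAN:	/* overlaps memory already configured */
	case KPHYSM_ERESOURCE:	/* no VA, page counter or memlist resources */
	case KPHYSM_EFAULT:	/* the metadata pages could not be accessed */
	case KPHYSM_ENOTVIABLE:	/* violates kpm granularity constraints */
	default:
		return (EIO);
	}
}
#endif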
/*
 * There are various error conditions in kphysm_add_memory_dynamic()
 * which require a rollback of already changed global state.
 */
static void
kphysm_addmem_error_undospan(pfn_t pt_base, pgcnt_t tpgs)
{
	int mlret;

	/* Unreserve memory span. */
	memlist_write_lock();

	mlret = memlist_delete_span(
	    (uint64_t)(pt_base) << PAGESHIFT,
	    (uint64_t)(tpgs) << PAGESHIFT, &phys_install);

	ASSERT(mlret == MEML_SPANOP_OK);
	phys_install_has_changed();
	installed_top_size(phys_install, &physmax, &physinstalled);

	memlist_write_unlock();
	delspan_unreserve(pt_base, tpgs);
}

/*
 * Only return an available memseg of exactly the right size.
 * When the meta data area has its own virtual address space
 * we will need to manage this more carefully and do best fit
 * allocations, possibly splitting an available area.
 */
static struct memseg *
memseg_reuse(pgcnt_t metapgs)
{
	struct memseg **segpp, *seg;

	mutex_enter(&memseg_lists_lock);

	segpp = &memseg_va_avail;
	for (; (seg = *segpp) != NULL; segpp = &seg->lnext) {
		caddr_t end;

		if (kpm_enable)
			end = hat_kpm_mseg_reuse(seg);
		else
			end = (caddr_t)seg->epages;

		if (btopr(end - (caddr_t)seg->pages) == metapgs) {
			*segpp = seg->lnext;
			seg->lnext = NULL;
			break;
		}
	}
	mutex_exit(&memseg_lists_lock);

	return (seg);
}

static uint_t handle_gen;

struct memdelspan {
	struct memdelspan *mds_next;
	pfn_t		mds_base;
	pgcnt_t		mds_npgs;
	uint_t		*mds_bitmap;
	uint_t		*mds_bitmap_retired;
};

#define	NBPBMW		(sizeof (uint_t) * NBBY)
#define	MDS_BITMAPBYTES(MDSP) \
	((((MDSP)->mds_npgs + NBPBMW - 1) / NBPBMW) * sizeof (uint_t))
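/*
 * Worked example for the bitmap sizing above (illustrative only; a
 * 32-bit uint_t is assumed, so NBPBMW == 32 bits per bitmap word).
 * A delete span of 131072 pages then needs (131072 + 31) / 32 = 4096
 * words, i.e. MDS_BITMAPBYTES() == 16384 bytes.  The delete thread
 * later records a finished page with the usual word/bit split:
 *
 *	bit = pfn - mdsp->mds_base;
 *	mdsp->mds_bitmap[bit / NBPBMW] |= (1 << (bit % NBPBMW));
 */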
struct transit_list {
	struct transit_list	*trl_next;
	struct memdelspan	*trl_spans;
	int			trl_collect;
};

struct transit_list_head {
	kmutex_t		trh_lock;
	struct transit_list	*trh_head;
};

static struct transit_list_head transit_list_head;

struct mem_handle;
static void transit_list_collect(struct mem_handle *, int);
static void transit_list_insert(struct transit_list *);
static void transit_list_remove(struct transit_list *);

#ifdef DEBUG
#define	MEM_DEL_STATS
#endif /* DEBUG */

#ifdef MEM_DEL_STATS
static int mem_del_stat_print = 0;
struct mem_del_stat {
	uint_t	nloop;
	uint_t	need_free;
	uint_t	free_loop;
	uint_t	free_low;
	uint_t	free_failed;
	uint_t	ncheck;
	uint_t	nopaget;
	uint_t	lockfail;
	uint_t	nfree;
	uint_t	nreloc;
	uint_t	nrelocfail;
	uint_t	already_done;
	uint_t	first_notfree;
	uint_t	npplocked;
	uint_t	nlockreloc;
	uint_t	nnorepl;
	uint_t	nmodreloc;
	uint_t	ndestroy;
	uint_t	nputpage;
	uint_t	nnoreclaim;
	uint_t	ndelay;
	uint_t	demotefail;
	uint64_t nticks_total;
	uint64_t nticks_pgrp;
	uint_t	retired;
	uint_t	toxic;
	uint_t	failing;
	uint_t	modtoxic;
	uint_t	npplkdtoxic;
	uint_t	gptlmodfail;
	uint_t	gptllckfail;
};
/*
 * The stat values are only incremented in the delete thread
 * so no locking or atomic required.
 */
#define	MDSTAT_INCR(MHP, FLD)	(MHP)->mh_delstat.FLD++
#define	MDSTAT_TOTAL(MHP, ntck)	((MHP)->mh_delstat.nticks_total += (ntck))
#define	MDSTAT_PGRP(MHP, ntck)	((MHP)->mh_delstat.nticks_pgrp += (ntck))
static void mem_del_stat_print_func(struct mem_handle *);
#define	MDSTAT_PRINT(MHP)	mem_del_stat_print_func((MHP))
#else /* MEM_DEL_STATS */
#define	MDSTAT_INCR(MHP, FLD)
#define	MDSTAT_TOTAL(MHP, ntck)
#define	MDSTAT_PGRP(MHP, ntck)
#define	MDSTAT_PRINT(MHP)
#endif /* MEM_DEL_STATS */

typedef enum mhnd_state {MHND_FREE = 0, MHND_INIT, MHND_STARTING,
	MHND_RUNNING, MHND_DONE, MHND_RELEASE} mhnd_state_t;

/*
 * mh_mutex must be taken to examine or change mh_exthandle and mh_state.
 * The mutex may not be required for other fields, dependent on mh_state.
639 */ 640 struct mem_handle { 641 kmutex_t mh_mutex; 642 struct mem_handle *mh_next; 643 memhandle_t mh_exthandle; 644 mhnd_state_t mh_state; 645 struct transit_list mh_transit; 646 pgcnt_t mh_phys_pages; 647 pgcnt_t mh_vm_pages; 648 pgcnt_t mh_hold_todo; 649 void (*mh_delete_complete)(void *, int error); 650 void *mh_delete_complete_arg; 651 volatile uint_t mh_cancel; 652 volatile uint_t mh_dr_aio_cleanup_cancel; 653 volatile uint_t mh_aio_cleanup_done; 654 kcondvar_t mh_cv; 655 kthread_id_t mh_thread_id; 656 page_t *mh_deleted; /* link through p_next */ 657 #ifdef MEM_DEL_STATS 658 struct mem_del_stat mh_delstat; 659 #endif /* MEM_DEL_STATS */ 660 }; 661 662 static struct mem_handle *mem_handle_head; 663 static kmutex_t mem_handle_list_mutex; 664 665 static struct mem_handle * 666 kphysm_allocate_mem_handle() 667 { 668 struct mem_handle *mhp; 669 670 mhp = kmem_zalloc(sizeof (struct mem_handle), KM_SLEEP); 671 mutex_init(&mhp->mh_mutex, NULL, MUTEX_DEFAULT, NULL); 672 mutex_enter(&mem_handle_list_mutex); 673 mutex_enter(&mhp->mh_mutex); 674 /* handle_gen is protected by list mutex. */ 675 mhp->mh_exthandle = (memhandle_t)(uintptr_t)(++handle_gen); 676 mhp->mh_next = mem_handle_head; 677 mem_handle_head = mhp; 678 mutex_exit(&mem_handle_list_mutex); 679 680 return (mhp); 681 } 682 683 static void 684 kphysm_free_mem_handle(struct mem_handle *mhp) 685 { 686 struct mem_handle **mhpp; 687 688 ASSERT(mutex_owned(&mhp->mh_mutex)); 689 ASSERT(mhp->mh_state == MHND_FREE); 690 /* 691 * Exit the mutex to preserve locking order. This is OK 692 * here as once in the FREE state, the handle cannot 693 * be found by a lookup. 694 */ 695 mutex_exit(&mhp->mh_mutex); 696 697 mutex_enter(&mem_handle_list_mutex); 698 mhpp = &mem_handle_head; 699 while (*mhpp != NULL && *mhpp != mhp) 700 mhpp = &(*mhpp)->mh_next; 701 ASSERT(*mhpp == mhp); 702 /* 703 * No need to lock the handle (mh_mutex) as only 704 * mh_next changing and this is the only thread that 705 * can be referncing mhp. 706 */ 707 *mhpp = mhp->mh_next; 708 mutex_exit(&mem_handle_list_mutex); 709 710 mutex_destroy(&mhp->mh_mutex); 711 kmem_free(mhp, sizeof (struct mem_handle)); 712 } 713 714 /* 715 * This function finds the internal mem_handle corresponding to an 716 * external handle and returns it with the mh_mutex held. 717 */ 718 static struct mem_handle * 719 kphysm_lookup_mem_handle(memhandle_t handle) 720 { 721 struct mem_handle *mhp; 722 723 mutex_enter(&mem_handle_list_mutex); 724 for (mhp = mem_handle_head; mhp != NULL; mhp = mhp->mh_next) { 725 if (mhp->mh_exthandle == handle) { 726 mutex_enter(&mhp->mh_mutex); 727 /* 728 * The state of the handle could have been changed 729 * by kphysm_del_release() while waiting for mh_mutex. 730 */ 731 if (mhp->mh_state == MHND_FREE) { 732 mutex_exit(&mhp->mh_mutex); 733 continue; 734 } 735 break; 736 } 737 } 738 mutex_exit(&mem_handle_list_mutex); 739 return (mhp); 740 } 741 742 int 743 kphysm_del_gethandle(memhandle_t *xmhp) 744 { 745 struct mem_handle *mhp; 746 747 mhp = kphysm_allocate_mem_handle(); 748 /* 749 * The handle is allocated using KM_SLEEP, so cannot fail. 750 * If the implementation is changed, the correct error to return 751 * here would be KPHYSM_ENOHANDLES. 
752 */ 753 ASSERT(mhp->mh_state == MHND_FREE); 754 mhp->mh_state = MHND_INIT; 755 *xmhp = mhp->mh_exthandle; 756 mutex_exit(&mhp->mh_mutex); 757 return (KPHYSM_OK); 758 } 759 760 static int 761 overlapping(pfn_t b1, pgcnt_t l1, pfn_t b2, pgcnt_t l2) 762 { 763 pfn_t e1, e2; 764 765 e1 = b1 + l1; 766 e2 = b2 + l2; 767 768 return (!(b2 >= e1 || b1 >= e2)); 769 } 770 771 static int can_remove_pgs(pgcnt_t); 772 773 static struct memdelspan * 774 span_to_install(pfn_t base, pgcnt_t npgs) 775 { 776 struct memdelspan *mdsp; 777 struct memdelspan *mdsp_new; 778 uint64_t address, size, thislen; 779 struct memlist *mlp; 780 781 mdsp_new = NULL; 782 783 address = (uint64_t)base << PAGESHIFT; 784 size = (uint64_t)npgs << PAGESHIFT; 785 while (size != 0) { 786 memlist_read_lock(); 787 for (mlp = phys_install; mlp != NULL; mlp = mlp->next) { 788 if (address >= (mlp->address + mlp->size)) 789 continue; 790 if ((address + size) > mlp->address) 791 break; 792 } 793 if (mlp == NULL) { 794 address += size; 795 size = 0; 796 thislen = 0; 797 } else { 798 if (address < mlp->address) { 799 size -= (mlp->address - address); 800 address = mlp->address; 801 } 802 ASSERT(address >= mlp->address); 803 if ((address + size) > (mlp->address + mlp->size)) { 804 thislen = mlp->size - (address - mlp->address); 805 } else { 806 thislen = size; 807 } 808 } 809 memlist_read_unlock(); 810 /* TODO: phys_install could change now */ 811 if (thislen == 0) 812 continue; 813 mdsp = kmem_zalloc(sizeof (struct memdelspan), KM_SLEEP); 814 mdsp->mds_base = btop(address); 815 mdsp->mds_npgs = btop(thislen); 816 mdsp->mds_next = mdsp_new; 817 mdsp_new = mdsp; 818 address += thislen; 819 size -= thislen; 820 } 821 return (mdsp_new); 822 } 823 824 static void 825 free_delspans(struct memdelspan *mdsp) 826 { 827 struct memdelspan *amdsp; 828 829 while ((amdsp = mdsp) != NULL) { 830 mdsp = amdsp->mds_next; 831 kmem_free(amdsp, sizeof (struct memdelspan)); 832 } 833 } 834 835 /* 836 * Concatenate lists. No list ordering is required. 837 */ 838 839 static void 840 delspan_concat(struct memdelspan **mdspp, struct memdelspan *mdsp) 841 { 842 while (*mdspp != NULL) 843 mdspp = &(*mdspp)->mds_next; 844 845 *mdspp = mdsp; 846 } 847 848 /* 849 * Given a new list of delspans, check there is no overlap with 850 * all existing span activity (add or delete) and then concatenate 851 * the new spans to the given list. 852 * Return 1 for OK, 0 if overlapping. 
853 */ 854 static int 855 delspan_insert( 856 struct transit_list *my_tlp, 857 struct memdelspan *mdsp_new) 858 { 859 struct transit_list_head *trh; 860 struct transit_list *tlp; 861 int ret; 862 863 trh = &transit_list_head; 864 865 ASSERT(my_tlp != NULL); 866 ASSERT(mdsp_new != NULL); 867 868 ret = 1; 869 mutex_enter(&trh->trh_lock); 870 /* ASSERT(my_tlp->trl_spans == NULL || tlp_in_list(trh, my_tlp)); */ 871 for (tlp = trh->trh_head; tlp != NULL; tlp = tlp->trl_next) { 872 struct memdelspan *mdsp; 873 874 for (mdsp = tlp->trl_spans; mdsp != NULL; 875 mdsp = mdsp->mds_next) { 876 struct memdelspan *nmdsp; 877 878 for (nmdsp = mdsp_new; nmdsp != NULL; 879 nmdsp = nmdsp->mds_next) { 880 if (overlapping(mdsp->mds_base, mdsp->mds_npgs, 881 nmdsp->mds_base, nmdsp->mds_npgs)) { 882 ret = 0; 883 goto done; 884 } 885 } 886 } 887 } 888 done: 889 if (ret != 0) { 890 if (my_tlp->trl_spans == NULL) 891 transit_list_insert(my_tlp); 892 delspan_concat(&my_tlp->trl_spans, mdsp_new); 893 } 894 mutex_exit(&trh->trh_lock); 895 return (ret); 896 } 897 898 static void 899 delspan_remove( 900 struct transit_list *my_tlp, 901 pfn_t base, 902 pgcnt_t npgs) 903 { 904 struct transit_list_head *trh; 905 struct memdelspan *mdsp; 906 907 trh = &transit_list_head; 908 909 ASSERT(my_tlp != NULL); 910 911 mutex_enter(&trh->trh_lock); 912 if ((mdsp = my_tlp->trl_spans) != NULL) { 913 if (npgs == 0) { 914 my_tlp->trl_spans = NULL; 915 free_delspans(mdsp); 916 transit_list_remove(my_tlp); 917 } else { 918 struct memdelspan **prv; 919 920 prv = &my_tlp->trl_spans; 921 while (mdsp != NULL) { 922 pfn_t p_end; 923 924 p_end = mdsp->mds_base + mdsp->mds_npgs; 925 if (mdsp->mds_base >= base && 926 p_end <= (base + npgs)) { 927 *prv = mdsp->mds_next; 928 mdsp->mds_next = NULL; 929 free_delspans(mdsp); 930 } else { 931 prv = &mdsp->mds_next; 932 } 933 mdsp = *prv; 934 } 935 if (my_tlp->trl_spans == NULL) 936 transit_list_remove(my_tlp); 937 } 938 } 939 mutex_exit(&trh->trh_lock); 940 } 941 942 /* 943 * Reserve interface for add to stop delete before add finished. 944 * This list is only accessed through the delspan_insert/remove 945 * functions and so is fully protected by the mutex in struct transit_list. 946 */ 947 948 static struct transit_list reserve_transit; 949 950 static int 951 delspan_reserve(pfn_t base, pgcnt_t npgs) 952 { 953 struct memdelspan *mdsp; 954 int ret; 955 956 mdsp = kmem_zalloc(sizeof (struct memdelspan), KM_SLEEP); 957 mdsp->mds_base = base; 958 mdsp->mds_npgs = npgs; 959 if ((ret = delspan_insert(&reserve_transit, mdsp)) == 0) { 960 free_delspans(mdsp); 961 } 962 return (ret); 963 } 964 965 static void 966 delspan_unreserve(pfn_t base, pgcnt_t npgs) 967 { 968 delspan_remove(&reserve_transit, base, npgs); 969 } 970 971 /* 972 * Return whether memseg was created by kphysm_add_memory_dynamic(). 973 * If this is the case and startp non zero, return also the start pfn 974 * of the meta data via startp. 
975 */ 976 static int 977 memseg_is_dynamic(struct memseg *seg, pfn_t *startp) 978 { 979 pfn_t pt_start; 980 981 if ((seg->msegflags & MEMSEG_DYNAMIC) == 0) 982 return (0); 983 984 /* Meta data is required to be at the beginning */ 985 ASSERT(hat_getpfnum(kas.a_hat, (caddr_t)seg->epages) < seg->pages_base); 986 987 pt_start = hat_getpfnum(kas.a_hat, (caddr_t)seg->pages); 988 if (startp != NULL) 989 *startp = pt_start; 990 991 return (1); 992 } 993 994 int 995 kphysm_del_span( 996 memhandle_t handle, 997 pfn_t base, 998 pgcnt_t npgs) 999 { 1000 struct mem_handle *mhp; 1001 struct memseg *seg; 1002 struct memdelspan *mdsp; 1003 struct memdelspan *mdsp_new; 1004 pgcnt_t phys_pages, vm_pages; 1005 pfn_t p_end; 1006 page_t *pp; 1007 int ret; 1008 1009 mhp = kphysm_lookup_mem_handle(handle); 1010 if (mhp == NULL) { 1011 return (KPHYSM_EHANDLE); 1012 } 1013 if (mhp->mh_state != MHND_INIT) { 1014 mutex_exit(&mhp->mh_mutex); 1015 return (KPHYSM_ESEQUENCE); 1016 } 1017 1018 /* 1019 * Intersect the span with the installed memory list (phys_install). 1020 */ 1021 mdsp_new = span_to_install(base, npgs); 1022 if (mdsp_new == NULL) { 1023 /* 1024 * No physical memory in this range. Is this an 1025 * error? If an attempt to start the delete is made 1026 * for OK returns from del_span such as this, start will 1027 * return an error. 1028 * Could return KPHYSM_ENOWORK. 1029 */ 1030 /* 1031 * It is assumed that there are no error returns 1032 * from span_to_install() due to kmem_alloc failure. 1033 */ 1034 mutex_exit(&mhp->mh_mutex); 1035 return (KPHYSM_OK); 1036 } 1037 /* 1038 * Does this span overlap an existing span? 1039 */ 1040 if (delspan_insert(&mhp->mh_transit, mdsp_new) == 0) { 1041 /* 1042 * Differentiate between already on list for this handle 1043 * (KPHYSM_EDUP) and busy elsewhere (KPHYSM_EBUSY). 1044 */ 1045 ret = KPHYSM_EBUSY; 1046 for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL; 1047 mdsp = mdsp->mds_next) { 1048 if (overlapping(mdsp->mds_base, mdsp->mds_npgs, 1049 base, npgs)) { 1050 ret = KPHYSM_EDUP; 1051 break; 1052 } 1053 } 1054 mutex_exit(&mhp->mh_mutex); 1055 free_delspans(mdsp_new); 1056 return (ret); 1057 } 1058 /* 1059 * At this point the spans in mdsp_new have been inserted into the 1060 * list of spans for this handle and thereby to the global list of 1061 * spans being processed. Each of these spans must now be checked 1062 * for relocatability. As a side-effect segments in the memseg list 1063 * may be split. 1064 * 1065 * Note that mdsp_new can no longer be used as it is now part of 1066 * a larger list. Select elements of this larger list based 1067 * on base and npgs. 1068 */ 1069 restart: 1070 phys_pages = 0; 1071 vm_pages = 0; 1072 ret = KPHYSM_OK; 1073 for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL; 1074 mdsp = mdsp->mds_next) { 1075 pgcnt_t pages_checked; 1076 1077 if (!overlapping(mdsp->mds_base, mdsp->mds_npgs, base, npgs)) { 1078 continue; 1079 } 1080 p_end = mdsp->mds_base + mdsp->mds_npgs; 1081 /* 1082 * The pages_checked count is a hack. All pages should be 1083 * checked for relocatability. Those not covered by memsegs 1084 * should be tested with arch_kphysm_del_span_ok(). 1085 */ 1086 pages_checked = 0; 1087 for (seg = memsegs; seg; seg = seg->next) { 1088 pfn_t mseg_start; 1089 1090 if (seg->pages_base >= p_end || 1091 seg->pages_end <= mdsp->mds_base) { 1092 /* Span and memseg don't overlap. */ 1093 continue; 1094 } 1095 /* Check that segment is suitable for delete. 
*/ 1096 if (memseg_is_dynamic(seg, &mseg_start)) { 1097 /* 1098 * Can only delete whole added segments 1099 * for the moment. 1100 * Check that this is completely within the 1101 * span. 1102 */ 1103 if (mseg_start < mdsp->mds_base || 1104 seg->pages_end > p_end) { 1105 ret = KPHYSM_EBUSY; 1106 break; 1107 } 1108 pages_checked += seg->pages_end - mseg_start; 1109 } else { 1110 /* 1111 * Set mseg_start for accounting below. 1112 */ 1113 mseg_start = seg->pages_base; 1114 /* 1115 * If this segment is larger than the span, 1116 * try to split it. After the split, it 1117 * is necessary to restart. 1118 */ 1119 if (seg->pages_base < mdsp->mds_base || 1120 seg->pages_end > p_end) { 1121 pfn_t abase; 1122 pgcnt_t anpgs; 1123 int s_ret; 1124 1125 /* Split required. */ 1126 if (mdsp->mds_base < seg->pages_base) 1127 abase = seg->pages_base; 1128 else 1129 abase = mdsp->mds_base; 1130 if (p_end > seg->pages_end) 1131 anpgs = seg->pages_end - abase; 1132 else 1133 anpgs = p_end - abase; 1134 s_ret = kphysm_split_memseg(abase, 1135 anpgs); 1136 if (s_ret == 0) { 1137 /* Split failed. */ 1138 ret = KPHYSM_ERESOURCE; 1139 break; 1140 } 1141 goto restart; 1142 } 1143 pages_checked += 1144 seg->pages_end - seg->pages_base; 1145 } 1146 /* 1147 * The memseg is wholly within the delete span. 1148 * The individual pages can now be checked. 1149 */ 1150 /* Cage test. */ 1151 for (pp = seg->pages; pp < seg->epages; pp++) { 1152 if (PP_ISNORELOC(pp)) { 1153 ret = KPHYSM_ENONRELOC; 1154 break; 1155 } 1156 } 1157 if (ret != KPHYSM_OK) { 1158 break; 1159 } 1160 phys_pages += (seg->pages_end - mseg_start); 1161 vm_pages += MSEG_NPAGES(seg); 1162 } 1163 if (ret != KPHYSM_OK) 1164 break; 1165 if (pages_checked != mdsp->mds_npgs) { 1166 ret = KPHYSM_ENONRELOC; 1167 break; 1168 } 1169 } 1170 1171 if (ret == KPHYSM_OK) { 1172 mhp->mh_phys_pages += phys_pages; 1173 mhp->mh_vm_pages += vm_pages; 1174 } else { 1175 /* 1176 * Keep holding the mh_mutex to prevent it going away. 1177 */ 1178 delspan_remove(&mhp->mh_transit, base, npgs); 1179 } 1180 mutex_exit(&mhp->mh_mutex); 1181 return (ret); 1182 } 1183 1184 int 1185 kphysm_del_span_query( 1186 pfn_t base, 1187 pgcnt_t npgs, 1188 memquery_t *mqp) 1189 { 1190 struct memdelspan *mdsp; 1191 struct memdelspan *mdsp_new; 1192 int done_first_nonreloc; 1193 1194 mqp->phys_pages = 0; 1195 mqp->managed = 0; 1196 mqp->nonrelocatable = 0; 1197 mqp->first_nonrelocatable = 0; 1198 mqp->last_nonrelocatable = 0; 1199 1200 mdsp_new = span_to_install(base, npgs); 1201 /* 1202 * It is OK to proceed here if mdsp_new == NULL. 1203 */ 1204 done_first_nonreloc = 0; 1205 for (mdsp = mdsp_new; mdsp != NULL; mdsp = mdsp->mds_next) { 1206 pfn_t sbase; 1207 pgcnt_t snpgs; 1208 1209 mqp->phys_pages += mdsp->mds_npgs; 1210 sbase = mdsp->mds_base; 1211 snpgs = mdsp->mds_npgs; 1212 while (snpgs != 0) { 1213 struct memseg *lseg, *seg; 1214 pfn_t p_end; 1215 page_t *pp; 1216 pfn_t mseg_start; 1217 1218 p_end = sbase + snpgs; 1219 /* 1220 * Find the lowest addressed memseg that starts 1221 * after sbase and account for it. 1222 * This is to catch dynamic memsegs whose start 1223 * is hidden. 
1224 */ 1225 seg = NULL; 1226 for (lseg = memsegs; lseg != NULL; lseg = lseg->next) { 1227 if ((lseg->pages_base >= sbase) || 1228 (lseg->pages_base < p_end && 1229 lseg->pages_end > sbase)) { 1230 if (seg == NULL || 1231 seg->pages_base > lseg->pages_base) 1232 seg = lseg; 1233 } 1234 } 1235 if (seg != NULL) { 1236 if (!memseg_is_dynamic(seg, &mseg_start)) { 1237 mseg_start = seg->pages_base; 1238 } 1239 /* 1240 * Now have the full extent of the memseg so 1241 * do the range check. 1242 */ 1243 if (mseg_start >= p_end || 1244 seg->pages_end <= sbase) { 1245 /* Span does not overlap memseg. */ 1246 seg = NULL; 1247 } 1248 } 1249 /* 1250 * Account for gap either before the segment if 1251 * there is one or to the end of the span. 1252 */ 1253 if (seg == NULL || mseg_start > sbase) { 1254 pfn_t a_end; 1255 1256 a_end = (seg == NULL) ? p_end : mseg_start; 1257 /* 1258 * Check with arch layer for relocatability. 1259 */ 1260 if (arch_kphysm_del_span_ok(sbase, 1261 (a_end - sbase))) { 1262 /* 1263 * No non-relocatble pages in this 1264 * area, avoid the fine-grained 1265 * test. 1266 */ 1267 snpgs -= (a_end - sbase); 1268 sbase = a_end; 1269 } 1270 while (sbase < a_end) { 1271 if (!arch_kphysm_del_span_ok(sbase, 1272 1)) { 1273 mqp->nonrelocatable++; 1274 if (!done_first_nonreloc) { 1275 mqp-> 1276 first_nonrelocatable 1277 = sbase; 1278 done_first_nonreloc = 1; 1279 } 1280 mqp->last_nonrelocatable = 1281 sbase; 1282 } 1283 sbase++; 1284 snpgs--; 1285 } 1286 } 1287 if (seg != NULL) { 1288 ASSERT(mseg_start <= sbase); 1289 if (seg->pages_base != mseg_start && 1290 seg->pages_base > sbase) { 1291 pgcnt_t skip_pgs; 1292 1293 /* 1294 * Skip the page_t area of a 1295 * dynamic memseg. 1296 */ 1297 skip_pgs = seg->pages_base - sbase; 1298 if (snpgs <= skip_pgs) { 1299 sbase += snpgs; 1300 snpgs = 0; 1301 continue; 1302 } 1303 snpgs -= skip_pgs; 1304 sbase += skip_pgs; 1305 } 1306 ASSERT(snpgs != 0); 1307 ASSERT(seg->pages_base <= sbase); 1308 /* 1309 * The individual pages can now be checked. 
1310 */ 1311 for (pp = seg->pages + 1312 (sbase - seg->pages_base); 1313 snpgs != 0 && pp < seg->epages; pp++) { 1314 mqp->managed++; 1315 if (PP_ISNORELOC(pp)) { 1316 mqp->nonrelocatable++; 1317 if (!done_first_nonreloc) { 1318 mqp-> 1319 first_nonrelocatable 1320 = sbase; 1321 done_first_nonreloc = 1; 1322 } 1323 mqp->last_nonrelocatable = 1324 sbase; 1325 } 1326 sbase++; 1327 snpgs--; 1328 } 1329 } 1330 } 1331 } 1332 1333 free_delspans(mdsp_new); 1334 1335 return (KPHYSM_OK); 1336 } 1337 1338 /* 1339 * This release function can be called at any stage as follows: 1340 * _gethandle only called 1341 * _span(s) only called 1342 * _start called but failed 1343 * delete thread exited 1344 */ 1345 int 1346 kphysm_del_release(memhandle_t handle) 1347 { 1348 struct mem_handle *mhp; 1349 1350 mhp = kphysm_lookup_mem_handle(handle); 1351 if (mhp == NULL) { 1352 return (KPHYSM_EHANDLE); 1353 } 1354 switch (mhp->mh_state) { 1355 case MHND_STARTING: 1356 case MHND_RUNNING: 1357 mutex_exit(&mhp->mh_mutex); 1358 return (KPHYSM_ENOTFINISHED); 1359 case MHND_FREE: 1360 ASSERT(mhp->mh_state != MHND_FREE); 1361 mutex_exit(&mhp->mh_mutex); 1362 return (KPHYSM_EHANDLE); 1363 case MHND_INIT: 1364 break; 1365 case MHND_DONE: 1366 break; 1367 case MHND_RELEASE: 1368 mutex_exit(&mhp->mh_mutex); 1369 return (KPHYSM_ESEQUENCE); 1370 default: 1371 #ifdef DEBUG 1372 cmn_err(CE_WARN, "kphysm_del_release(0x%p) state corrupt %d", 1373 (void *)mhp, mhp->mh_state); 1374 #endif /* DEBUG */ 1375 mutex_exit(&mhp->mh_mutex); 1376 return (KPHYSM_EHANDLE); 1377 } 1378 /* 1379 * Set state so that we can wait if necessary. 1380 * Also this means that we have read/write access to all 1381 * fields except mh_exthandle and mh_state. 1382 */ 1383 mhp->mh_state = MHND_RELEASE; 1384 /* 1385 * The mem_handle cannot be de-allocated by any other operation 1386 * now, so no need to hold mh_mutex. 1387 */ 1388 mutex_exit(&mhp->mh_mutex); 1389 1390 delspan_remove(&mhp->mh_transit, 0, 0); 1391 mhp->mh_phys_pages = 0; 1392 mhp->mh_vm_pages = 0; 1393 mhp->mh_hold_todo = 0; 1394 mhp->mh_delete_complete = NULL; 1395 mhp->mh_delete_complete_arg = NULL; 1396 mhp->mh_cancel = 0; 1397 1398 mutex_enter(&mhp->mh_mutex); 1399 ASSERT(mhp->mh_state == MHND_RELEASE); 1400 mhp->mh_state = MHND_FREE; 1401 1402 kphysm_free_mem_handle(mhp); 1403 1404 return (KPHYSM_OK); 1405 } 1406 1407 /* 1408 * This cancel function can only be called with the thread running. 1409 */ 1410 int 1411 kphysm_del_cancel(memhandle_t handle) 1412 { 1413 struct mem_handle *mhp; 1414 1415 mhp = kphysm_lookup_mem_handle(handle); 1416 if (mhp == NULL) { 1417 return (KPHYSM_EHANDLE); 1418 } 1419 if (mhp->mh_state != MHND_STARTING && mhp->mh_state != MHND_RUNNING) { 1420 mutex_exit(&mhp->mh_mutex); 1421 return (KPHYSM_ENOTRUNNING); 1422 } 1423 /* 1424 * Set the cancel flag and wake the delete thread up. 1425 * The thread may be waiting on I/O, so the effect of the cancel 1426 * may be delayed. 1427 */ 1428 if (mhp->mh_cancel == 0) { 1429 mhp->mh_cancel = KPHYSM_ECANCELLED; 1430 cv_signal(&mhp->mh_cv); 1431 } 1432 mutex_exit(&mhp->mh_mutex); 1433 return (KPHYSM_OK); 1434 } 1435 1436 int 1437 kphysm_del_status( 1438 memhandle_t handle, 1439 memdelstat_t *mdstp) 1440 { 1441 struct mem_handle *mhp; 1442 1443 mhp = kphysm_lookup_mem_handle(handle); 1444 if (mhp == NULL) { 1445 return (KPHYSM_EHANDLE); 1446 } 1447 /* 1448 * Calling kphysm_del_status() is allowed before the delete 1449 * is started to allow for status display. 
1450 */ 1451 if (mhp->mh_state != MHND_INIT && mhp->mh_state != MHND_STARTING && 1452 mhp->mh_state != MHND_RUNNING) { 1453 mutex_exit(&mhp->mh_mutex); 1454 return (KPHYSM_ENOTRUNNING); 1455 } 1456 mdstp->phys_pages = mhp->mh_phys_pages; 1457 mdstp->managed = mhp->mh_vm_pages; 1458 mdstp->collected = mhp->mh_vm_pages - mhp->mh_hold_todo; 1459 mutex_exit(&mhp->mh_mutex); 1460 return (KPHYSM_OK); 1461 } 1462 1463 static int mem_delete_additional_pages = 100; 1464 1465 static int 1466 can_remove_pgs(pgcnt_t npgs) 1467 { 1468 /* 1469 * If all pageable pages were paged out, freemem would 1470 * equal availrmem. There is a minimum requirement for 1471 * availrmem. 1472 */ 1473 if ((availrmem - (tune.t_minarmem + mem_delete_additional_pages)) 1474 < npgs) 1475 return (0); 1476 /* TODO: check swap space, etc. */ 1477 return (1); 1478 } 1479 1480 static int 1481 get_availrmem(pgcnt_t npgs) 1482 { 1483 int ret; 1484 1485 mutex_enter(&freemem_lock); 1486 ret = can_remove_pgs(npgs); 1487 if (ret != 0) 1488 availrmem -= npgs; 1489 mutex_exit(&freemem_lock); 1490 return (ret); 1491 } 1492 1493 static void 1494 put_availrmem(pgcnt_t npgs) 1495 { 1496 mutex_enter(&freemem_lock); 1497 availrmem += npgs; 1498 mutex_exit(&freemem_lock); 1499 } 1500 1501 #define FREEMEM_INCR 100 1502 static pgcnt_t freemem_incr = FREEMEM_INCR; 1503 #define DEL_FREE_WAIT_FRAC 4 1504 #define DEL_FREE_WAIT_TICKS ((hz+DEL_FREE_WAIT_FRAC-1)/DEL_FREE_WAIT_FRAC) 1505 1506 #define DEL_BUSY_WAIT_FRAC 20 1507 #define DEL_BUSY_WAIT_TICKS ((hz+DEL_BUSY_WAIT_FRAC-1)/DEL_BUSY_WAIT_FRAC) 1508 1509 static void kphysm_del_cleanup(struct mem_handle *); 1510 1511 static void page_delete_collect(page_t *, struct mem_handle *); 1512 1513 static pgcnt_t 1514 delthr_get_freemem(struct mem_handle *mhp) 1515 { 1516 pgcnt_t free_get; 1517 int ret; 1518 1519 ASSERT(MUTEX_HELD(&mhp->mh_mutex)); 1520 1521 MDSTAT_INCR(mhp, need_free); 1522 /* 1523 * Get up to freemem_incr pages. 1524 */ 1525 free_get = freemem_incr; 1526 if (free_get > mhp->mh_hold_todo) 1527 free_get = mhp->mh_hold_todo; 1528 /* 1529 * Take free_get pages away from freemem, 1530 * waiting if necessary. 1531 */ 1532 1533 while (!mhp->mh_cancel) { 1534 mutex_exit(&mhp->mh_mutex); 1535 MDSTAT_INCR(mhp, free_loop); 1536 /* 1537 * Duplicate test from page_create_throttle() 1538 * but don't override with !PG_WAIT. 1539 */ 1540 if (freemem < (free_get + throttlefree)) { 1541 MDSTAT_INCR(mhp, free_low); 1542 ret = 0; 1543 } else { 1544 ret = page_create_wait(free_get, 0); 1545 if (ret == 0) { 1546 /* EMPTY */ 1547 MDSTAT_INCR(mhp, free_failed); 1548 } 1549 } 1550 if (ret != 0) { 1551 mutex_enter(&mhp->mh_mutex); 1552 return (free_get); 1553 } 1554 1555 /* 1556 * Put pressure on pageout. 1557 */ 1558 page_needfree(free_get); 1559 cv_signal(&proc_pageout->p_cv); 1560 1561 mutex_enter(&mhp->mh_mutex); 1562 (void) cv_timedwait(&mhp->mh_cv, &mhp->mh_mutex, 1563 (lbolt + DEL_FREE_WAIT_TICKS)); 1564 mutex_exit(&mhp->mh_mutex); 1565 page_needfree(-(spgcnt_t)free_get); 1566 1567 mutex_enter(&mhp->mh_mutex); 1568 } 1569 return (0); 1570 } 1571 1572 #define DR_AIO_CLEANUP_DELAY 25000 /* 0.025secs, in usec */ 1573 #define DR_AIO_CLEANUP_MAXLOOPS_NODELAY 100 1574 /* 1575 * This function is run as a helper thread for delete_memory_thread. 1576 * It is needed in order to force kaio cleanup, so that pages used in kaio 1577 * will be unlocked and subsequently relocated by delete_memory_thread. 
1578 * The address of the delete_memory_threads's mem_handle is passed in to 1579 * this thread function, and is used to set the mh_aio_cleanup_done member 1580 * prior to calling thread_exit(). 1581 */ 1582 static void 1583 dr_aio_cleanup_thread(caddr_t amhp) 1584 { 1585 proc_t *procp; 1586 int (*aio_cleanup_dr_delete_memory)(proc_t *); 1587 int cleaned; 1588 int n = 0; 1589 struct mem_handle *mhp; 1590 volatile uint_t *pcancel; 1591 1592 mhp = (struct mem_handle *)amhp; 1593 ASSERT(mhp != NULL); 1594 pcancel = &mhp->mh_dr_aio_cleanup_cancel; 1595 if (modload("sys", "kaio") == -1) { 1596 mhp->mh_aio_cleanup_done = 1; 1597 cmn_err(CE_WARN, "dr_aio_cleanup_thread: cannot load kaio"); 1598 thread_exit(); 1599 } 1600 aio_cleanup_dr_delete_memory = (int (*)(proc_t *)) 1601 modgetsymvalue("aio_cleanup_dr_delete_memory", 0); 1602 if (aio_cleanup_dr_delete_memory == NULL) { 1603 mhp->mh_aio_cleanup_done = 1; 1604 cmn_err(CE_WARN, 1605 "aio_cleanup_dr_delete_memory not found in kaio"); 1606 thread_exit(); 1607 } 1608 do { 1609 cleaned = 0; 1610 mutex_enter(&pidlock); 1611 for (procp = practive; (*pcancel == 0) && (procp != NULL); 1612 procp = procp->p_next) { 1613 mutex_enter(&procp->p_lock); 1614 if (procp->p_aio != NULL) { 1615 /* cleanup proc's outstanding kaio */ 1616 cleaned += 1617 (*aio_cleanup_dr_delete_memory)(procp); 1618 } 1619 mutex_exit(&procp->p_lock); 1620 } 1621 mutex_exit(&pidlock); 1622 if ((*pcancel == 0) && 1623 (!cleaned || (++n == DR_AIO_CLEANUP_MAXLOOPS_NODELAY))) { 1624 /* delay a bit before retrying all procs again */ 1625 delay(drv_usectohz(DR_AIO_CLEANUP_DELAY)); 1626 n = 0; 1627 } 1628 } while (*pcancel == 0); 1629 mhp->mh_aio_cleanup_done = 1; 1630 thread_exit(); 1631 } 1632 1633 static void 1634 delete_memory_thread(caddr_t amhp) 1635 { 1636 struct mem_handle *mhp; 1637 struct memdelspan *mdsp; 1638 callb_cpr_t cprinfo; 1639 page_t *pp_targ; 1640 spgcnt_t freemem_left; 1641 void (*del_complete_funcp)(void *, int error); 1642 void *del_complete_arg; 1643 int comp_code; 1644 int ret; 1645 int first_scan; 1646 uint_t szc; 1647 #ifdef MEM_DEL_STATS 1648 uint64_t start_total, ntick_total; 1649 uint64_t start_pgrp, ntick_pgrp; 1650 #endif /* MEM_DEL_STATS */ 1651 1652 mhp = (struct mem_handle *)amhp; 1653 1654 #ifdef MEM_DEL_STATS 1655 start_total = ddi_get_lbolt(); 1656 #endif /* MEM_DEL_STATS */ 1657 1658 CALLB_CPR_INIT(&cprinfo, &mhp->mh_mutex, 1659 callb_generic_cpr, "memdel"); 1660 1661 mutex_enter(&mhp->mh_mutex); 1662 ASSERT(mhp->mh_state == MHND_STARTING); 1663 1664 mhp->mh_state = MHND_RUNNING; 1665 mhp->mh_thread_id = curthread; 1666 1667 mhp->mh_hold_todo = mhp->mh_vm_pages; 1668 mutex_exit(&mhp->mh_mutex); 1669 1670 /* Allocate the remap pages now, if necessary. */ 1671 memseg_remap_init(); 1672 1673 /* 1674 * Subtract from availrmem now if possible as availrmem 1675 * may not be available by the end of the delete. 
1676 */ 1677 if (!get_availrmem(mhp->mh_vm_pages)) { 1678 comp_code = KPHYSM_ENOTVIABLE; 1679 mutex_enter(&mhp->mh_mutex); 1680 goto early_exit; 1681 } 1682 1683 ret = kphysm_setup_pre_del(mhp->mh_vm_pages); 1684 1685 mutex_enter(&mhp->mh_mutex); 1686 1687 if (ret != 0) { 1688 mhp->mh_cancel = KPHYSM_EREFUSED; 1689 goto refused; 1690 } 1691 1692 transit_list_collect(mhp, 1); 1693 1694 for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL; 1695 mdsp = mdsp->mds_next) { 1696 ASSERT(mdsp->mds_bitmap == NULL); 1697 mdsp->mds_bitmap = kmem_zalloc(MDS_BITMAPBYTES(mdsp), KM_SLEEP); 1698 mdsp->mds_bitmap_retired = kmem_zalloc(MDS_BITMAPBYTES(mdsp), 1699 KM_SLEEP); 1700 } 1701 1702 first_scan = 1; 1703 freemem_left = 0; 1704 /* 1705 * Start dr_aio_cleanup_thread, which periodically iterates 1706 * through the process list and invokes aio cleanup. This 1707 * is needed in order to avoid a deadly embrace between the 1708 * delete_memory_thread (waiting on writer lock for page, with the 1709 * exclusive-wanted bit set), kaio read request threads (waiting for a 1710 * reader lock on the same page that is wanted by the 1711 * delete_memory_thread), and threads waiting for kaio completion 1712 * (blocked on spt_amp->lock). 1713 */ 1714 mhp->mh_dr_aio_cleanup_cancel = 0; 1715 mhp->mh_aio_cleanup_done = 0; 1716 (void) thread_create(NULL, 0, dr_aio_cleanup_thread, 1717 (caddr_t)mhp, 0, &p0, TS_RUN, maxclsyspri - 1); 1718 while ((mhp->mh_hold_todo != 0) && (mhp->mh_cancel == 0)) { 1719 pgcnt_t collected; 1720 1721 MDSTAT_INCR(mhp, nloop); 1722 collected = 0; 1723 for (mdsp = mhp->mh_transit.trl_spans; (mdsp != NULL) && 1724 (mhp->mh_cancel == 0); mdsp = mdsp->mds_next) { 1725 pfn_t pfn, p_end; 1726 1727 if (first_scan) { 1728 mem_node_pre_del_slice(mdsp->mds_base, 1729 mdsp->mds_base + mdsp->mds_npgs - 1); 1730 } 1731 1732 p_end = mdsp->mds_base + mdsp->mds_npgs; 1733 for (pfn = mdsp->mds_base; (pfn < p_end) && 1734 (mhp->mh_cancel == 0); pfn++) { 1735 page_t *pp, *tpp, *tpp_targ; 1736 pgcnt_t bit; 1737 struct vnode *vp; 1738 u_offset_t offset; 1739 int mod, result; 1740 spgcnt_t pgcnt; 1741 1742 bit = pfn - mdsp->mds_base; 1743 if ((mdsp->mds_bitmap[bit / NBPBMW] & 1744 (1 << (bit % NBPBMW))) != 0) { 1745 MDSTAT_INCR(mhp, already_done); 1746 continue; 1747 } 1748 if (freemem_left == 0) { 1749 freemem_left += delthr_get_freemem(mhp); 1750 if (freemem_left == 0) 1751 break; 1752 } 1753 1754 /* 1755 * Release mh_mutex - some of this 1756 * stuff takes some time (eg PUTPAGE). 1757 */ 1758 1759 mutex_exit(&mhp->mh_mutex); 1760 MDSTAT_INCR(mhp, ncheck); 1761 1762 pp = page_numtopp_nolock(pfn); 1763 if (pp == NULL) { 1764 /* 1765 * Not covered by a page_t - will 1766 * be dealt with elsewhere. 1767 */ 1768 MDSTAT_INCR(mhp, nopaget); 1769 mutex_enter(&mhp->mh_mutex); 1770 mdsp->mds_bitmap[bit / NBPBMW] |= 1771 (1 << (bit % NBPBMW)); 1772 continue; 1773 } 1774 1775 if (!page_try_reclaim_lock(pp, SE_EXCL, 1776 SE_EXCL_WANTED | SE_RETIRED)) { 1777 /* 1778 * Page in use elsewhere. Skip it. 1779 */ 1780 MDSTAT_INCR(mhp, lockfail); 1781 mutex_enter(&mhp->mh_mutex); 1782 continue; 1783 } 1784 /* 1785 * See if the cage expanded into the delete. 1786 * This can happen as we have to allow the 1787 * cage to expand. 1788 */ 1789 if (PP_ISNORELOC(pp)) { 1790 page_unlock(pp); 1791 mutex_enter(&mhp->mh_mutex); 1792 mhp->mh_cancel = KPHYSM_ENONRELOC; 1793 break; 1794 } 1795 if (PP_RETIRED(pp)) { 1796 /* 1797 * Page has been retired and is 1798 * not part of the cage so we 1799 * can now do the accounting for 1800 * it. 
1801 */ 1802 MDSTAT_INCR(mhp, retired); 1803 mutex_enter(&mhp->mh_mutex); 1804 mdsp->mds_bitmap[bit / NBPBMW] 1805 |= (1 << (bit % NBPBMW)); 1806 mdsp->mds_bitmap_retired[bit / 1807 NBPBMW] |= 1808 (1 << (bit % NBPBMW)); 1809 mhp->mh_hold_todo--; 1810 continue; 1811 } 1812 ASSERT(freemem_left != 0); 1813 if (PP_ISFREE(pp)) { 1814 /* 1815 * Like page_reclaim() only 'freemem' 1816 * processing is already done. 1817 */ 1818 MDSTAT_INCR(mhp, nfree); 1819 free_page_collect: 1820 if (PP_ISAGED(pp)) { 1821 page_list_sub(pp, 1822 PG_FREE_LIST); 1823 } else { 1824 page_list_sub(pp, 1825 PG_CACHE_LIST); 1826 } 1827 PP_CLRFREE(pp); 1828 PP_CLRAGED(pp); 1829 collected++; 1830 mutex_enter(&mhp->mh_mutex); 1831 page_delete_collect(pp, mhp); 1832 mdsp->mds_bitmap[bit / NBPBMW] |= 1833 (1 << (bit % NBPBMW)); 1834 freemem_left--; 1835 continue; 1836 } 1837 ASSERT(pp->p_vnode != NULL); 1838 if (first_scan) { 1839 MDSTAT_INCR(mhp, first_notfree); 1840 page_unlock(pp); 1841 mutex_enter(&mhp->mh_mutex); 1842 continue; 1843 } 1844 /* 1845 * Keep stats on pages encountered that 1846 * are marked for retirement. 1847 */ 1848 if (PP_TOXIC(pp)) { 1849 MDSTAT_INCR(mhp, toxic); 1850 } else if (PP_PR_REQ(pp)) { 1851 MDSTAT_INCR(mhp, failing); 1852 } 1853 /* 1854 * In certain cases below, special exceptions 1855 * are made for pages that are toxic. This 1856 * is because the current meaning of toxic 1857 * is that an uncorrectable error has been 1858 * previously associated with the page. 1859 */ 1860 if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) { 1861 if (!PP_TOXIC(pp)) { 1862 /* 1863 * Must relocate locked in 1864 * memory pages. 1865 */ 1866 #ifdef MEM_DEL_STATS 1867 start_pgrp = ddi_get_lbolt(); 1868 #endif /* MEM_DEL_STATS */ 1869 /* 1870 * Lock all constituent pages 1871 * of a large page to ensure 1872 * that p_szc won't change. 1873 */ 1874 if (!group_page_trylock(pp, 1875 SE_EXCL)) { 1876 MDSTAT_INCR(mhp, 1877 gptllckfail); 1878 page_unlock(pp); 1879 mutex_enter( 1880 &mhp->mh_mutex); 1881 continue; 1882 } 1883 MDSTAT_INCR(mhp, npplocked); 1884 pp_targ = 1885 page_get_replacement_page( 1886 pp, NULL, 0); 1887 if (pp_targ != NULL) { 1888 #ifdef MEM_DEL_STATS 1889 ntick_pgrp = 1890 (uint64_t) 1891 ddi_get_lbolt() - 1892 start_pgrp; 1893 #endif /* MEM_DEL_STATS */ 1894 MDSTAT_PGRP(mhp, 1895 ntick_pgrp); 1896 MDSTAT_INCR(mhp, 1897 nlockreloc); 1898 goto reloc; 1899 } 1900 group_page_unlock(pp); 1901 page_unlock(pp); 1902 #ifdef MEM_DEL_STATS 1903 ntick_pgrp = 1904 (uint64_t)ddi_get_lbolt() - 1905 start_pgrp; 1906 #endif /* MEM_DEL_STATS */ 1907 MDSTAT_PGRP(mhp, ntick_pgrp); 1908 MDSTAT_INCR(mhp, nnorepl); 1909 mutex_enter(&mhp->mh_mutex); 1910 continue; 1911 } else { 1912 /* 1913 * Cannot do anything about 1914 * this page because it is 1915 * toxic. 1916 */ 1917 MDSTAT_INCR(mhp, npplkdtoxic); 1918 page_unlock(pp); 1919 mutex_enter(&mhp->mh_mutex); 1920 continue; 1921 } 1922 } 1923 /* 1924 * Unload the mappings and check if mod bit 1925 * is set. 1926 */ 1927 ASSERT(pp->p_vnode != &kvp); 1928 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); 1929 mod = hat_ismod(pp); 1930 1931 #ifdef MEM_DEL_STATS 1932 start_pgrp = ddi_get_lbolt(); 1933 #endif /* MEM_DEL_STATS */ 1934 if (mod && !PP_TOXIC(pp)) { 1935 /* 1936 * Lock all constituent pages 1937 * of a large page to ensure 1938 * that p_szc won't change. 
1939 */ 1940 if (!group_page_trylock(pp, SE_EXCL)) { 1941 MDSTAT_INCR(mhp, gptlmodfail); 1942 page_unlock(pp); 1943 mutex_enter(&mhp->mh_mutex); 1944 continue; 1945 } 1946 pp_targ = page_get_replacement_page(pp, 1947 NULL, 0); 1948 if (pp_targ != NULL) { 1949 MDSTAT_INCR(mhp, nmodreloc); 1950 #ifdef MEM_DEL_STATS 1951 ntick_pgrp = 1952 (uint64_t)ddi_get_lbolt() - 1953 start_pgrp; 1954 #endif /* MEM_DEL_STATS */ 1955 MDSTAT_PGRP(mhp, ntick_pgrp); 1956 goto reloc; 1957 } 1958 group_page_unlock(pp); 1959 } 1960 1961 if (!page_try_demote_pages(pp)) { 1962 MDSTAT_INCR(mhp, demotefail); 1963 page_unlock(pp); 1964 #ifdef MEM_DEL_STATS 1965 ntick_pgrp = (uint64_t)ddi_get_lbolt() - 1966 start_pgrp; 1967 #endif /* MEM_DEL_STATS */ 1968 MDSTAT_PGRP(mhp, ntick_pgrp); 1969 mutex_enter(&mhp->mh_mutex); 1970 continue; 1971 } 1972 1973 /* 1974 * Regular 'page-out'. 1975 */ 1976 if (!mod) { 1977 MDSTAT_INCR(mhp, ndestroy); 1978 page_destroy(pp, 1); 1979 /* 1980 * page_destroy was called with 1981 * dontfree. As long as p_lckcnt 1982 * and p_cowcnt are both zero, the 1983 * only additional action of 1984 * page_destroy with !dontfree is to 1985 * call page_free, so we can collect 1986 * the page here. 1987 */ 1988 collected++; 1989 #ifdef MEM_DEL_STATS 1990 ntick_pgrp = (uint64_t)ddi_get_lbolt() - 1991 start_pgrp; 1992 #endif /* MEM_DEL_STATS */ 1993 MDSTAT_PGRP(mhp, ntick_pgrp); 1994 mutex_enter(&mhp->mh_mutex); 1995 page_delete_collect(pp, mhp); 1996 mdsp->mds_bitmap[bit / NBPBMW] |= 1997 (1 << (bit % NBPBMW)); 1998 continue; 1999 } 2000 /* 2001 * The page is toxic and the mod bit is 2002 * set, we cannot do anything here to deal 2003 * with it. 2004 */ 2005 if (PP_TOXIC(pp)) { 2006 page_unlock(pp); 2007 #ifdef MEM_DEL_STATS 2008 ntick_pgrp = (uint64_t)ddi_get_lbolt() - 2009 start_pgrp; 2010 #endif /* MEM_DEL_STATS */ 2011 MDSTAT_PGRP(mhp, ntick_pgrp); 2012 MDSTAT_INCR(mhp, modtoxic); 2013 mutex_enter(&mhp->mh_mutex); 2014 continue; 2015 } 2016 MDSTAT_INCR(mhp, nputpage); 2017 vp = pp->p_vnode; 2018 offset = pp->p_offset; 2019 VN_HOLD(vp); 2020 page_unlock(pp); 2021 (void) VOP_PUTPAGE(vp, offset, PAGESIZE, 2022 B_INVAL|B_FORCE, kcred); 2023 VN_RELE(vp); 2024 #ifdef MEM_DEL_STATS 2025 ntick_pgrp = (uint64_t)ddi_get_lbolt() - 2026 start_pgrp; 2027 #endif /* MEM_DEL_STATS */ 2028 MDSTAT_PGRP(mhp, ntick_pgrp); 2029 /* 2030 * Try to get the page back immediately 2031 * so that it can be collected. 2032 */ 2033 pp = page_numtopp_nolock(pfn); 2034 if (pp == NULL) { 2035 MDSTAT_INCR(mhp, nnoreclaim); 2036 /* 2037 * This should not happen as this 2038 * thread is deleting the page. 2039 * If this code is generalized, this 2040 * becomes a reality. 2041 */ 2042 #ifdef DEBUG 2043 cmn_err(CE_WARN, 2044 "delete_memory_thread(0x%p) " 2045 "pfn 0x%lx has no page_t", 2046 (void *)mhp, pfn); 2047 #endif /* DEBUG */ 2048 mutex_enter(&mhp->mh_mutex); 2049 continue; 2050 } 2051 if (page_try_reclaim_lock(pp, SE_EXCL, 2052 SE_EXCL_WANTED | SE_RETIRED)) { 2053 if (PP_ISFREE(pp)) { 2054 goto free_page_collect; 2055 } 2056 page_unlock(pp); 2057 } 2058 MDSTAT_INCR(mhp, nnoreclaim); 2059 mutex_enter(&mhp->mh_mutex); 2060 continue; 2061 2062 reloc: 2063 /* 2064 * Got some freemem and a target 2065 * page, so move the data to avoid 2066 * I/O and lock problems. 2067 */ 2068 ASSERT(!page_iolock_assert(pp)); 2069 MDSTAT_INCR(mhp, nreloc); 2070 /* 2071 * page_relocate() will return pgcnt: the 2072 * number of consecutive pages relocated. 
2073 * If it is successful, pp will be a 2074 * linked list of the page structs that 2075 * were relocated. If page_relocate() is 2076 * unsuccessful, pp will be unmodified. 2077 */ 2078 #ifdef MEM_DEL_STATS 2079 start_pgrp = ddi_get_lbolt(); 2080 #endif /* MEM_DEL_STATS */ 2081 result = page_relocate(&pp, &pp_targ, 0, 0, 2082 &pgcnt, NULL); 2083 #ifdef MEM_DEL_STATS 2084 ntick_pgrp = (uint64_t)ddi_get_lbolt() - 2085 start_pgrp; 2086 #endif /* MEM_DEL_STATS */ 2087 MDSTAT_PGRP(mhp, ntick_pgrp); 2088 if (result != 0) { 2089 MDSTAT_INCR(mhp, nrelocfail); 2090 /* 2091 * We did not succeed. We need 2092 * to give the pp_targ pages back. 2093 * page_free(pp_targ, 1) without 2094 * the freemem accounting. 2095 */ 2096 group_page_unlock(pp); 2097 page_free_replacement_page(pp_targ); 2098 page_unlock(pp); 2099 mutex_enter(&mhp->mh_mutex); 2100 continue; 2101 } 2102 2103 /* 2104 * We will then collect pgcnt pages. 2105 */ 2106 ASSERT(pgcnt > 0); 2107 mutex_enter(&mhp->mh_mutex); 2108 /* 2109 * We need to make sure freemem_left is 2110 * large enough. 2111 */ 2112 while ((freemem_left < pgcnt) && 2113 (!mhp->mh_cancel)) { 2114 freemem_left += 2115 delthr_get_freemem(mhp); 2116 } 2117 2118 /* 2119 * Do not proceed if mh_cancel is set. 2120 */ 2121 if (mhp->mh_cancel) { 2122 while (pp_targ != NULL) { 2123 /* 2124 * Unlink and unlock each page. 2125 */ 2126 tpp_targ = pp_targ; 2127 page_sub(&pp_targ, tpp_targ); 2128 page_unlock(tpp_targ); 2129 } 2130 /* 2131 * We need to give the pp pages back. 2132 * page_free(pp, 1) without the 2133 * freemem accounting. 2134 */ 2135 page_free_replacement_page(pp); 2136 break; 2137 } 2138 2139 /* Now remove pgcnt from freemem_left */ 2140 freemem_left -= pgcnt; 2141 ASSERT(freemem_left >= 0); 2142 szc = pp->p_szc; 2143 while (pp != NULL) { 2144 /* 2145 * pp and pp_targ were passed back as 2146 * a linked list of pages. 2147 * Unlink and unlock each page. 2148 */ 2149 tpp_targ = pp_targ; 2150 page_sub(&pp_targ, tpp_targ); 2151 page_unlock(tpp_targ); 2152 /* 2153 * The original page is now free 2154 * so remove it from the linked 2155 * list and collect it. 2156 */ 2157 tpp = pp; 2158 page_sub(&pp, tpp); 2159 pfn = page_pptonum(tpp); 2160 collected++; 2161 ASSERT(PAGE_EXCL(tpp)); 2162 ASSERT(tpp->p_vnode == NULL); 2163 ASSERT(!hat_page_is_mapped(tpp)); 2164 ASSERT(tpp->p_szc == szc); 2165 tpp->p_szc = 0; 2166 page_delete_collect(tpp, mhp); 2167 bit = pfn - mdsp->mds_base; 2168 mdsp->mds_bitmap[bit / NBPBMW] |= 2169 (1 << (bit % NBPBMW)); 2170 } 2171 ASSERT(pp_targ == NULL); 2172 } 2173 } 2174 first_scan = 0; 2175 if ((mhp->mh_cancel == 0) && (mhp->mh_hold_todo != 0) && 2176 (collected == 0)) { 2177 /* 2178 * This code is needed as we cannot wait 2179 * for a page to be locked OR the delete to 2180 * be cancelled. Also, we must delay so 2181 * that other threads get a chance to run 2182 * on our cpu, otherwise page locks may be 2183 * held indefinitely by those threads. 2184 */ 2185 MDSTAT_INCR(mhp, ndelay); 2186 CALLB_CPR_SAFE_BEGIN(&cprinfo); 2187 (void) cv_timedwait(&mhp->mh_cv, &mhp->mh_mutex, 2188 (lbolt + DEL_BUSY_WAIT_TICKS)); 2189 CALLB_CPR_SAFE_END(&cprinfo, &mhp->mh_mutex); 2190 } 2191 } 2192 /* stop the dr aio cleanup thread */ 2193 mhp->mh_dr_aio_cleanup_cancel = 1; 2194 transit_list_collect(mhp, 0); 2195 if (freemem_left != 0) { 2196 /* Return any surplus. 
	/* stop the dr aio cleanup thread */
	mhp->mh_dr_aio_cleanup_cancel = 1;
	transit_list_collect(mhp, 0);
	if (freemem_left != 0) {
		/* Return any surplus. */
		page_create_putback(freemem_left);
		freemem_left = 0;
	}
	for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
	    mdsp = mdsp->mds_next) {
		mem_node_post_del_slice(mdsp->mds_base,
		    mdsp->mds_base + mdsp->mds_npgs - 1,
		    (mhp->mh_cancel != 0));
	}
#ifdef MEM_DEL_STATS
	ntick_total = (uint64_t)ddi_get_lbolt() - start_total;
#endif /* MEM_DEL_STATS */
	MDSTAT_TOTAL(mhp, ntick_total);
	MDSTAT_PRINT(mhp);

	/*
	 * If the memory delete was cancelled, exclusive-wanted bits must
	 * be cleared. If there are retired pages being deleted, they need
	 * to be unretired.
	 */
	for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
	    mdsp = mdsp->mds_next) {
		pfn_t pfn, p_end;

		p_end = mdsp->mds_base + mdsp->mds_npgs;
		for (pfn = mdsp->mds_base; pfn < p_end; pfn++) {
			page_t *pp;
			pgcnt_t bit;

			bit = pfn - mdsp->mds_base;
			if (mhp->mh_cancel) {
				pp = page_numtopp_nolock(pfn);
				if (pp != NULL) {
					if ((mdsp->mds_bitmap[bit / NBPBMW] &
					    (1 << (bit % NBPBMW))) == 0) {
						page_lock_clr_exclwanted(pp);
					}
				}
			} else {
				pp = NULL;
			}
			if ((mdsp->mds_bitmap_retired[bit / NBPBMW] &
			    (1 << (bit % NBPBMW))) != 0) {
				/* do we already have pp? */
				if (pp == NULL) {
					pp = page_numtopp_nolock(pfn);
				}
				ASSERT(pp != NULL);
				ASSERT(PP_RETIRED(pp));
				if (mhp->mh_cancel != 0) {
					page_unlock(pp);
					/*
					 * To satisfy ASSERT below in
					 * cancel code.
					 */
					mhp->mh_hold_todo++;
				} else {
					(void) page_unretire_pp(pp, 0);
				}
			}
		}
	}
	/*
	 * Free retired page bitmap and collected page bitmap
	 */
	for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
	    mdsp = mdsp->mds_next) {
		ASSERT(mdsp->mds_bitmap_retired != NULL);
		kmem_free(mdsp->mds_bitmap_retired, MDS_BITMAPBYTES(mdsp));
		mdsp->mds_bitmap_retired = NULL;	/* Paranoia. */
		ASSERT(mdsp->mds_bitmap != NULL);
		kmem_free(mdsp->mds_bitmap, MDS_BITMAPBYTES(mdsp));
		mdsp->mds_bitmap = NULL;	/* Paranoia. */
	}

	/* wait for our dr aio cancel thread to exit */
	while (!(mhp->mh_aio_cleanup_done)) {
		CALLB_CPR_SAFE_BEGIN(&cprinfo);
		delay(drv_usectohz(DR_AIO_CLEANUP_DELAY));
		CALLB_CPR_SAFE_END(&cprinfo, &mhp->mh_mutex);
	}
refused:
	if (mhp->mh_cancel != 0) {
		page_t *pp;

		comp_code = mhp->mh_cancel;
		/*
		 * Go through the list of deleted pages (mh_deleted),
		 * freeing them.
		 */
		while ((pp = mhp->mh_deleted) != NULL) {
			mhp->mh_deleted = pp->p_next;
			mhp->mh_hold_todo++;
			mutex_exit(&mhp->mh_mutex);
			/* Restore p_next. */
			pp->p_next = pp->p_prev;
			if (PP_ISFREE(pp)) {
				cmn_err(CE_PANIC,
				    "page %p is free",
				    (void *)pp);
			}
			page_free(pp, 1);
			mutex_enter(&mhp->mh_mutex);
		}
		ASSERT(mhp->mh_hold_todo == mhp->mh_vm_pages);

		mutex_exit(&mhp->mh_mutex);
		put_availrmem(mhp->mh_vm_pages);
		mutex_enter(&mhp->mh_mutex);

		goto t_exit;
	}

	/*
	 * All the pages are no longer in use and are exclusively locked.
	 */

	mhp->mh_deleted = NULL;

	kphysm_del_cleanup(mhp);

	comp_code = KPHYSM_OK;

t_exit:
	mutex_exit(&mhp->mh_mutex);
	kphysm_setup_post_del(mhp->mh_vm_pages,
	    (comp_code == KPHYSM_OK) ? 0 : 1);
	mutex_enter(&mhp->mh_mutex);

early_exit:
	/* mhp->mh_mutex exited by CALLB_CPR_EXIT() */
	mhp->mh_state = MHND_DONE;
	del_complete_funcp = mhp->mh_delete_complete;
	del_complete_arg = mhp->mh_delete_complete_arg;
	CALLB_CPR_EXIT(&cprinfo);
	(*del_complete_funcp)(del_complete_arg, comp_code);
	thread_exit();
	/*NOTREACHED*/
}

/*
 * Start the delete of the memory from the system.
 */
int
kphysm_del_start(
	memhandle_t handle,
	void (*complete)(void *, int),
	void *complete_arg)
{
	struct mem_handle *mhp;

	mhp = kphysm_lookup_mem_handle(handle);
	if (mhp == NULL) {
		return (KPHYSM_EHANDLE);
	}
	switch (mhp->mh_state) {
	case MHND_FREE:
		ASSERT(mhp->mh_state != MHND_FREE);
		mutex_exit(&mhp->mh_mutex);
		return (KPHYSM_EHANDLE);
	case MHND_INIT:
		break;
	case MHND_STARTING:
	case MHND_RUNNING:
		mutex_exit(&mhp->mh_mutex);
		return (KPHYSM_ESEQUENCE);
	case MHND_DONE:
		mutex_exit(&mhp->mh_mutex);
		return (KPHYSM_ESEQUENCE);
	case MHND_RELEASE:
		mutex_exit(&mhp->mh_mutex);
		return (KPHYSM_ESEQUENCE);
	default:
#ifdef DEBUG
		cmn_err(CE_WARN, "kphysm_del_start(0x%p) state corrupt %d",
		    (void *)mhp, mhp->mh_state);
#endif /* DEBUG */
		mutex_exit(&mhp->mh_mutex);
		return (KPHYSM_EHANDLE);
	}

	if (mhp->mh_transit.trl_spans == NULL) {
		mutex_exit(&mhp->mh_mutex);
		return (KPHYSM_ENOWORK);
	}

	ASSERT(complete != NULL);
	mhp->mh_delete_complete = complete;
	mhp->mh_delete_complete_arg = complete_arg;
	mhp->mh_state = MHND_STARTING;
	/*
	 * Release the mutex in case thread_create sleeps.
	 */
	mutex_exit(&mhp->mh_mutex);

	/*
	 * The "obvious" process for this thread is pageout (proc_pageout)
	 * but this gives the thread too much power over freemem
	 * which results in freemem starvation.
	 */
	(void) thread_create(NULL, 0, delete_memory_thread, mhp, 0, &p0,
	    TS_RUN, maxclsyspri - 1);

	return (KPHYSM_OK);
}
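
/*
 * Illustrative sketch of how a DR client is expected to drive the
 * delete interface exported from this file (kphysm_del_gethandle(),
 * kphysm_del_span() and kphysm_del_start() above); see sys/mem_config.h
 * for the authoritative prototypes.  Error handling is elided, and
 * my_done() and my_arg are hypothetical:
 *
 *	memhandle_t h;
 *
 *	(void) kphysm_del_gethandle(&h);
 *	(void) kphysm_del_span(h, base_pfn, npgs);
 *	(void) kphysm_del_start(h, my_done, my_arg);
 *	...
 *	(void) kphysm_del_release(h);	(once my_done() has been called)
 *
 * An in-progress delete can be aborted with kphysm_del_cancel(h);
 * completion, successful or cancelled, is reported through my_done().
 */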

static kmutex_t pp_dummy_lock;		/* Protects init. of pp_dummy. */
static caddr_t pp_dummy;
static pgcnt_t pp_dummy_npages;
static pfn_t *pp_dummy_pfn;	/* Array of dummy pfns. */

static void
memseg_remap_init_pages(page_t *pages, page_t *epages)
{
	page_t *pp;

	for (pp = pages; pp < epages; pp++) {
		pp->p_pagenum = PFN_INVALID;	/* XXXX */
		pp->p_offset = (u_offset_t)-1;
		page_iolock_init(pp);
		while (!page_lock(pp, SE_EXCL, (kmutex_t *)NULL, P_RECLAIM))
			continue;
		page_lock_delete(pp);
	}
}

void
memseg_remap_init()
{
	mutex_enter(&pp_dummy_lock);
	if (pp_dummy == NULL) {
		uint_t dpages;
		int i;

		/*
		 * dpages starts off as the size of the structure and
		 * ends up as the minimum number of pages that will
		 * hold a whole number of page_t structures.  Stripping
		 * the factors of two leaves the odd factor of
		 * sizeof (page_t); since PAGESIZE is a power of two,
		 * that many pages is the smallest whole number of pages
		 * that holds a whole number of page_t structures.
		 */
		dpages = sizeof (page_t);
		ASSERT(dpages != 0);
		ASSERT(dpages <= MMU_PAGESIZE);

		while ((dpages & 1) == 0)
			dpages >>= 1;

		pp_dummy_npages = dpages;
		/*
		 * Allocate pp_dummy pages directly from static_arena,
		 * since these are whole page allocations and are
		 * referenced by physical address.  This also has the
		 * nice fringe benefit of hiding the memory from
		 * ::findleaks since it doesn't deal well with allocated
		 * kernel heap memory that doesn't have any mappings.
		 */
		pp_dummy = vmem_xalloc(static_arena, ptob(pp_dummy_npages),
		    PAGESIZE, 0, 0, NULL, NULL, VM_SLEEP);
		bzero(pp_dummy, ptob(pp_dummy_npages));
		ASSERT(((uintptr_t)pp_dummy & MMU_PAGEOFFSET) == 0);
		pp_dummy_pfn = kmem_alloc(sizeof (*pp_dummy_pfn) *
		    pp_dummy_npages, KM_SLEEP);
		for (i = 0; i < pp_dummy_npages; i++) {
			pp_dummy_pfn[i] = hat_getpfnum(kas.a_hat,
			    &pp_dummy[MMU_PAGESIZE * i]);
			ASSERT(pp_dummy_pfn[i] != PFN_INVALID);
		}
		/*
		 * Initialize the page_t's to a known 'deleted' state
		 * that matches the state of deleted pages.
		 */
		memseg_remap_init_pages((page_t *)pp_dummy,
		    (page_t *)(pp_dummy +
		    ptob(pp_dummy_npages)));
		/* Remove kmem mappings for the pages for safety. */
		hat_unload(kas.a_hat, pp_dummy, ptob(pp_dummy_npages),
		    HAT_UNLOAD_UNLOCK);
		/* Leave pp_dummy pointer set as flag that init is done. */
	}
	mutex_exit(&pp_dummy_lock);
}

static void
memseg_remap_to_dummy(caddr_t pp, pgcnt_t metapgs)
{
	ASSERT(pp_dummy != NULL);

	while (metapgs != 0) {
		pgcnt_t n;
		int i;

		n = pp_dummy_npages;
		if (n > metapgs)
			n = metapgs;
		for (i = 0; i < n; i++) {
			hat_devload(kas.a_hat, pp, ptob(1), pp_dummy_pfn[i],
			    PROT_READ,
			    HAT_LOAD | HAT_LOAD_NOCONSIST |
			    HAT_LOAD_REMAP);
			pp += ptob(1);
		}
		metapgs -= n;
	}
}

/*
 * Transition all the deleted pages to the deleted state so that
 * page_lock will not wait. The page_lock_delete call will
 * also wake up any waiters.
 */
static void
memseg_lock_delete_all(struct memseg *seg)
{
	page_t *pp;

	for (pp = seg->pages; pp < seg->epages; pp++) {
		pp->p_pagenum = PFN_INVALID;	/* XXXX */
		page_lock_delete(pp);
	}
}
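
/*
 * Final phase of a successful delete: called from delete_memory_thread()
 * with the handle's mh_mutex held (it is dropped and re-taken internally).
 * Unlink the deleted memsegs from the global memsegs list, rebuild the
 * pfn hash, adjust total_pages, the other memory counters and the paging
 * parameters, remap the page_t metadata of dynamically added segments
 * onto the dummy pages, and remove the spans from the phys_avail and
 * phys_install memlists.
 */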
static void
kphysm_del_cleanup(struct mem_handle *mhp)
{
	struct memdelspan *mdsp;
	struct memseg *seg;
	struct memseg **segpp;
	struct memseg *seglist;
	pfn_t p_end;
	uint64_t avmem;
	pgcnt_t avpgs;
	pgcnt_t npgs;

	avpgs = mhp->mh_vm_pages;

	memsegs_lock(1);

	/*
	 * Remove from the main segment list.
	 */
	npgs = 0;
	seglist = NULL;
	for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
	    mdsp = mdsp->mds_next) {
		p_end = mdsp->mds_base + mdsp->mds_npgs;
		for (segpp = &memsegs; (seg = *segpp) != NULL; ) {
			if (seg->pages_base >= p_end ||
			    seg->pages_end <= mdsp->mds_base) {
				/* Span and memseg don't overlap. */
				segpp = &((*segpp)->next);
				continue;
			}
			ASSERT(seg->pages_base >= mdsp->mds_base);
			ASSERT(seg->pages_end <= p_end);

			PLCNT_MODIFY_MAX(seg->pages_base,
			    seg->pages_base - seg->pages_end);

			/* Hide the memseg from future scans. */
			hat_kpm_delmem_mseg_update(seg, segpp);
			*segpp = seg->next;
			membar_producer();	/* TODO: Needed? */
			npgs += MSEG_NPAGES(seg);

			/*
			 * Leave the deleted segment's next pointer intact
			 * in case a memsegs scanning loop is walking this
			 * segment concurrently.
			 */
			seg->lnext = seglist;
			seglist = seg;
		}
	}

	build_pfn_hash();

	ASSERT(npgs < total_pages);
	total_pages -= npgs;

	/*
	 * Recalculate the paging parameters now that total_pages has
	 * changed. This will also cause the clock hands to be reset
	 * before next use.
	 */
	setupclock(1);

	memsegs_unlock(1);

	mutex_exit(&mhp->mh_mutex);

	while ((seg = seglist) != NULL) {
		pfn_t mseg_start;
		pfn_t mseg_base, mseg_end;
		pgcnt_t mseg_npgs;
		page_t *pp;
		pgcnt_t metapgs;
		int dynamic;
		int mlret;

		seglist = seg->lnext;

		/*
		 * Put the page_t's into the deleted state to stop
		 * cv_wait()s on the pages. When we remap, the dummy
		 * page_t's will be in the same state.
		 */
		memseg_lock_delete_all(seg);
		/*
		 * Collect information on pages_base and pages_end up
		 * front so that we can flag as early as possible that
		 * the memseg has been deleted, by setting
		 * pages_end == pages_base.
		 */
		mseg_base = seg->pages_base;
		mseg_end = seg->pages_end;
		mseg_npgs = MSEG_NPAGES(seg);
		dynamic = memseg_is_dynamic(seg, &mseg_start);

		seg->pages_end = seg->pages_base;

		if (dynamic) {
			pp = seg->pages;
			metapgs = mseg_base - mseg_start;
			ASSERT(metapgs != 0);

			/* Remap the meta data to our special dummy area. */
			memseg_remap_to_dummy((caddr_t)pp, metapgs);

			mutex_enter(&memseg_lists_lock);
			seg->lnext = memseg_va_avail;
			memseg_va_avail = seg;
			mutex_exit(&memseg_lists_lock);
		} else {
			/*
			 * Set for clean-up below.
			 */
			mseg_start = seg->pages_base;
			/*
			 * For memory whose page_ts were allocated
			 * at boot, we need to find a new use for
			 * the page_t memory.
			 * For the moment, just leak it.
			 * (It is held in the memseg_delete_junk list.)
			 */

			mutex_enter(&memseg_lists_lock);
			seg->lnext = memseg_delete_junk;
			memseg_delete_junk = seg;
			mutex_exit(&memseg_lists_lock);
		}

		/* Must not use seg now as it could be re-used. */

		memlist_write_lock();

		mlret = memlist_delete_span(
		    (uint64_t)(mseg_base) << PAGESHIFT,
		    (uint64_t)(mseg_npgs) << PAGESHIFT,
		    &phys_avail);
		ASSERT(mlret == MEML_SPANOP_OK);

		mlret = memlist_delete_span(
		    (uint64_t)(mseg_start) << PAGESHIFT,
		    (uint64_t)(mseg_end - mseg_start) <<
		    PAGESHIFT,
		    &phys_install);
		ASSERT(mlret == MEML_SPANOP_OK);
		phys_install_has_changed();

		memlist_write_unlock();
	}

	memlist_read_lock();
	installed_top_size(phys_install, &physmax, &physinstalled);
	memlist_read_unlock();

	mutex_enter(&freemem_lock);
	maxmem -= avpgs;
	physmem -= avpgs;
	/* availrmem is adjusted during the delete. */
	availrmem_initial -= avpgs;

	mutex_exit(&freemem_lock);

	dump_resize();

	cmn_err(CE_CONT, "?kphysm_delete: mem = %ldK "
	    "(0x%" PRIx64 ")\n",
	    physinstalled << (PAGESHIFT - 10),
	    (uint64_t)physinstalled << PAGESHIFT);

	avmem = (uint64_t)freemem << PAGESHIFT;
	cmn_err(CE_CONT, "?kphysm_delete: "
	    "avail mem = %" PRId64 "\n", avmem);

	/*
	 * Update lgroup generation number on single lgroup systems
	 */
	if (nlgrps == 1)
		lgrp_config(LGRP_CONFIG_GEN_UPDATE, 0, 0);

	/* Successfully deleted system memory */
	mutex_enter(&mhp->mh_mutex);
}
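
/*
 * Accept a page that the scan loop has finished with: hash it out of its
 * vnode (or, for a vnode-less page, wake any waiters on p_cv, counted by
 * mdel_nullvp_waiter below), then thread it onto the handle's mh_deleted
 * list and debit mh_hold_todo.  Called with the handle's mh_mutex held.
 */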
static uint_t mdel_nullvp_waiter;

static void
page_delete_collect(
	page_t *pp,
	struct mem_handle *mhp)
{
	if (pp->p_vnode) {
		page_hashout(pp, (kmutex_t *)NULL);
		/* do not do PP_SETAGED(pp); */
	} else {
		kmutex_t *sep;

		sep = page_se_mutex(pp);
		mutex_enter(sep);
		if (CV_HAS_WAITERS(&pp->p_cv)) {
			mdel_nullvp_waiter++;
			cv_broadcast(&pp->p_cv);
		}
		mutex_exit(sep);
	}
	ASSERT(pp->p_next == pp->p_prev);
	ASSERT(pp->p_next == NULL || pp->p_next == pp);
	pp->p_next = mhp->mh_deleted;
	mhp->mh_deleted = pp;
	ASSERT(mhp->mh_hold_todo != 0);
	mhp->mh_hold_todo--;
}

static void
transit_list_collect(struct mem_handle *mhp, int v)
{
	struct transit_list_head *trh;

	trh = &transit_list_head;
	mutex_enter(&trh->trh_lock);
	mhp->mh_transit.trl_collect = v;
	mutex_exit(&trh->trh_lock);
}

static void
transit_list_insert(struct transit_list *tlp)
{
	struct transit_list_head *trh;

	trh = &transit_list_head;
	ASSERT(MUTEX_HELD(&trh->trh_lock));
	tlp->trl_next = trh->trh_head;
	trh->trh_head = tlp;
}

static void
transit_list_remove(struct transit_list *tlp)
{
	struct transit_list_head *trh;
	struct transit_list **tlpp;

	trh = &transit_list_head;
	tlpp = &trh->trh_head;
	ASSERT(MUTEX_HELD(&trh->trh_lock));
	while (*tlpp != NULL && *tlpp != tlp)
		tlpp = &(*tlpp)->trl_next;
	ASSERT(*tlpp != NULL);
	if (*tlpp == tlp)
		*tlpp = tlp->trl_next;
	tlp->trl_next = NULL;
}

static struct transit_list *
pfnum_to_transit_list(struct transit_list_head *trh, pfn_t pfnum)
{
	struct transit_list *tlp;

	for (tlp = trh->trh_head; tlp != NULL; tlp = tlp->trl_next) {
		struct memdelspan *mdsp;

		for (mdsp = tlp->trl_spans; mdsp != NULL;
		    mdsp = mdsp->mds_next) {
			if (pfnum >= mdsp->mds_base &&
			    pfnum < (mdsp->mds_base + mdsp->mds_npgs)) {
				return (tlp);
			}
		}
	}
	return (NULL);
}
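
/*
 * Exported check used while a delete is in flight: returns non-zero if
 * pfnum falls within a span belonging to a transit list that is marked
 * for collection, so a caller scanning physical memory can simply skip
 * such a page instead of touching it in mid-delete.
 */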
int
pfn_is_being_deleted(pfn_t pfnum)
{
	struct transit_list_head *trh;
	struct transit_list *tlp;
	int ret;

	trh = &transit_list_head;
	if (trh->trh_head == NULL)
		return (0);

	mutex_enter(&trh->trh_lock);
	tlp = pfnum_to_transit_list(trh, pfnum);
	ret = (tlp != NULL && tlp->trl_collect);
	mutex_exit(&trh->trh_lock);

	return (ret);
}

#ifdef MEM_DEL_STATS
extern int hz;
static void
mem_del_stat_print_func(struct mem_handle *mhp)
{
	uint64_t tmp;

	if (mem_del_stat_print) {
		printf("memory delete loop %x/%x, statistics%s\n",
		    (uint_t)mhp->mh_transit.trl_spans->mds_base,
		    (uint_t)mhp->mh_transit.trl_spans->mds_npgs,
		    (mhp->mh_cancel ? " (cancelled)" : ""));
		printf("\t%8u nloop\n", mhp->mh_delstat.nloop);
		printf("\t%8u need_free\n", mhp->mh_delstat.need_free);
		printf("\t%8u free_loop\n", mhp->mh_delstat.free_loop);
		printf("\t%8u free_low\n", mhp->mh_delstat.free_low);
		printf("\t%8u free_failed\n", mhp->mh_delstat.free_failed);
		printf("\t%8u ncheck\n", mhp->mh_delstat.ncheck);
		printf("\t%8u nopaget\n", mhp->mh_delstat.nopaget);
		printf("\t%8u lockfail\n", mhp->mh_delstat.lockfail);
		printf("\t%8u nfree\n", mhp->mh_delstat.nfree);
		printf("\t%8u nreloc\n", mhp->mh_delstat.nreloc);
		printf("\t%8u nrelocfail\n", mhp->mh_delstat.nrelocfail);
		printf("\t%8u already_done\n", mhp->mh_delstat.already_done);
		printf("\t%8u first_notfree\n", mhp->mh_delstat.first_notfree);
		printf("\t%8u npplocked\n", mhp->mh_delstat.npplocked);
		printf("\t%8u nlockreloc\n", mhp->mh_delstat.nlockreloc);
		printf("\t%8u nnorepl\n", mhp->mh_delstat.nnorepl);
		printf("\t%8u nmodreloc\n", mhp->mh_delstat.nmodreloc);
		printf("\t%8u ndestroy\n", mhp->mh_delstat.ndestroy);
		printf("\t%8u nputpage\n", mhp->mh_delstat.nputpage);
		printf("\t%8u nnoreclaim\n", mhp->mh_delstat.nnoreclaim);
		printf("\t%8u ndelay\n", mhp->mh_delstat.ndelay);
		printf("\t%8u demotefail\n", mhp->mh_delstat.demotefail);
		printf("\t%8u retired\n", mhp->mh_delstat.retired);
		printf("\t%8u toxic\n", mhp->mh_delstat.toxic);
		printf("\t%8u failing\n", mhp->mh_delstat.failing);
		printf("\t%8u modtoxic\n", mhp->mh_delstat.modtoxic);
		printf("\t%8u npplkdtoxic\n", mhp->mh_delstat.npplkdtoxic);
		printf("\t%8u gptlmodfail\n", mhp->mh_delstat.gptlmodfail);
		printf("\t%8u gptllckfail\n", mhp->mh_delstat.gptllckfail);
		tmp = mhp->mh_delstat.nticks_total / hz;  /* seconds */
		printf(
		    "\t%"PRIu64" nticks_total - %"PRIu64" min %"PRIu64" sec\n",
		    mhp->mh_delstat.nticks_total, tmp / 60, tmp % 60);

		tmp = mhp->mh_delstat.nticks_pgrp / hz;  /* seconds */
		printf(
		    "\t%"PRIu64" nticks_pgrp - %"PRIu64" min %"PRIu64" sec\n",
		    mhp->mh_delstat.nticks_pgrp, tmp / 60, tmp % 60);
	}
}
#endif /* MEM_DEL_STATS */

struct mem_callback {
	kphysm_setup_vector_t	*vec;
	void			*arg;
};

#define	NMEMCALLBACKS		100

static struct mem_callback mem_callbacks[NMEMCALLBACKS];
static uint_t nmemcallbacks;
static krwlock_t mem_callback_rwlock;
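
/*
 * kphysm_setup_func_register() and kphysm_setup_func_unregister() below
 * let other subsystems hear about dynamic memory configuration changes.
 * An illustrative sketch of a client (my_post_add, my_pre_del,
 * my_post_del and my_arg are hypothetical; see sys/mem_config.h for the
 * vector definition):
 *
 *	static kphysm_setup_vector_t my_vec;
 *
 *	my_vec.version = KPHYSM_SETUP_VECTOR_VERSION;
 *	my_vec.post_add = my_post_add;	(void (*)(void *, pgcnt_t))
 *	my_vec.pre_del = my_pre_del;	(int (*)(void *, pgcnt_t))
 *	my_vec.post_del = my_post_del;	(void (*)(void *, pgcnt_t, int))
 *	(void) kphysm_setup_func_register(&my_vec, my_arg);
 *	...
 *	kphysm_setup_func_unregister(&my_vec, my_arg);
 *
 * The pre_del return values are ORed together by kphysm_setup_pre_del()
 * below; a non-zero result is intended to refuse the pending delete.
 */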
int
kphysm_setup_func_register(kphysm_setup_vector_t *vec, void *arg)
{
	uint_t i, found;

	/*
	 * This test will become more complicated when the version must
	 * change.
	 */
	if (vec->version != KPHYSM_SETUP_VECTOR_VERSION)
		return (EINVAL);

	if (vec->post_add == NULL || vec->pre_del == NULL ||
	    vec->post_del == NULL)
		return (EINVAL);

	rw_enter(&mem_callback_rwlock, RW_WRITER);
	for (i = 0, found = 0; i < nmemcallbacks; i++) {
		if (mem_callbacks[i].vec == NULL && found == 0)
			found = i + 1;
		if (mem_callbacks[i].vec == vec &&
		    mem_callbacks[i].arg == arg) {
#ifdef DEBUG
			/* Catch this in DEBUG kernels. */
			cmn_err(CE_WARN, "kphysm_setup_func_register"
			    "(0x%p, 0x%p) duplicate registration from 0x%p",
			    (void *)vec, arg, (void *)caller());
#endif /* DEBUG */
			rw_exit(&mem_callback_rwlock);
			return (EEXIST);
		}
	}
	if (found != 0) {
		i = found - 1;
	} else {
		ASSERT(nmemcallbacks < NMEMCALLBACKS);
		if (nmemcallbacks == NMEMCALLBACKS) {
			rw_exit(&mem_callback_rwlock);
			return (ENOMEM);
		}
		i = nmemcallbacks++;
	}
	mem_callbacks[i].vec = vec;
	mem_callbacks[i].arg = arg;
	rw_exit(&mem_callback_rwlock);
	return (0);
}

void
kphysm_setup_func_unregister(kphysm_setup_vector_t *vec, void *arg)
{
	uint_t i;

	rw_enter(&mem_callback_rwlock, RW_WRITER);
	for (i = 0; i < nmemcallbacks; i++) {
		if (mem_callbacks[i].vec == vec &&
		    mem_callbacks[i].arg == arg) {
			mem_callbacks[i].vec = NULL;
			mem_callbacks[i].arg = NULL;
			if (i == (nmemcallbacks - 1))
				nmemcallbacks--;
			break;
		}
	}
	rw_exit(&mem_callback_rwlock);
}

static void
kphysm_setup_post_add(pgcnt_t delta_pages)
{
	uint_t i;

	rw_enter(&mem_callback_rwlock, RW_READER);
	for (i = 0; i < nmemcallbacks; i++) {
		if (mem_callbacks[i].vec != NULL) {
			(*mem_callbacks[i].vec->post_add)
			    (mem_callbacks[i].arg, delta_pages);
		}
	}
	rw_exit(&mem_callback_rwlock);
}

/*
 * Note the locking between pre_del and post_del: The reader lock is held
 * between the two calls to stop the set of functions from changing.
 */

static int
kphysm_setup_pre_del(pgcnt_t delta_pages)
{
	uint_t i;
	int ret;
	int aret;

	ret = 0;
	rw_enter(&mem_callback_rwlock, RW_READER);
	for (i = 0; i < nmemcallbacks; i++) {
		if (mem_callbacks[i].vec != NULL) {
			aret = (*mem_callbacks[i].vec->pre_del)
			    (mem_callbacks[i].arg, delta_pages);
			ret |= aret;
		}
	}

	return (ret);
}

static void
kphysm_setup_post_del(pgcnt_t delta_pages, int cancelled)
{
	uint_t i;

	for (i = 0; i < nmemcallbacks; i++) {
		if (mem_callbacks[i].vec != NULL) {
			(*mem_callbacks[i].vec->post_del)
			    (mem_callbacks[i].arg, delta_pages, cancelled);
		}
	}
	rw_exit(&mem_callback_rwlock);
}
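
/*
 * Split the boot-time (static) memseg that covers [base, base + npgs) so
 * that the range ends up with a memseg of its own: the original segment
 * is replaced in the memsegs list by up to three new ones, seg_low for
 * [pages_base, base), seg_mid for [base, base + npgs) and seg_high for
 * [base + npgs, pages_end).  Returns 1 on success and 0 if the range is
 * not wholly contained in a single static memseg or already coincides
 * with one.  The old memseg is not freed (it is parked on
 * memseg_edit_junk) as other code may still hold references to it.
 */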
static int
kphysm_split_memseg(
	pfn_t base,
	pgcnt_t npgs)
{
	struct memseg *seg;
	struct memseg **segpp;
	pgcnt_t size_low, size_high;
	struct memseg *seg_low, *seg_mid, *seg_high;

	/*
	 * Lock the memsegs list against other updates now.
	 */
	memsegs_lock(1);

	/*
	 * Find the boot-time memseg that wholly covers this area.
	 */

	/* First find the memseg with page 'base' in it. */
	for (segpp = &memsegs; (seg = *segpp) != NULL;
	    segpp = &((*segpp)->next)) {
		if (base >= seg->pages_base && base < seg->pages_end)
			break;
	}
	if (seg == NULL) {
		memsegs_unlock(1);
		return (0);
	}
	if (memseg_is_dynamic(seg, (pfn_t *)NULL)) {
		memsegs_unlock(1);
		return (0);
	}
	if ((base + npgs) > seg->pages_end) {
		memsegs_unlock(1);
		return (0);
	}

	/*
	 * Work out the size of the two segments that will
	 * surround the new segment, one for the low addresses
	 * and one for the high.
	 */
	ASSERT(base >= seg->pages_base);
	size_low = base - seg->pages_base;
	ASSERT(seg->pages_end >= (base + npgs));
	size_high = seg->pages_end - (base + npgs);

	/*
	 * Sanity check.
	 */
	if ((size_low + size_high) == 0) {
		memsegs_unlock(1);
		return (0);
	}

	/*
	 * Allocate the new structures. The old memseg will not be freed
	 * as there may be a reference to it.
	 */
	seg_low = NULL;
	seg_high = NULL;

	if (size_low != 0) {
		seg_low = kmem_cache_alloc(memseg_cache, KM_SLEEP);
		bzero(seg_low, sizeof (struct memseg));
	}

	seg_mid = kmem_cache_alloc(memseg_cache, KM_SLEEP);
	bzero(seg_mid, sizeof (struct memseg));

	if (size_high != 0) {
		seg_high = kmem_cache_alloc(memseg_cache, KM_SLEEP);
		bzero(seg_high, sizeof (struct memseg));
	}

	/*
	 * All allocation done now.
	 */
	if (size_low != 0) {
		seg_low->pages = seg->pages;
		seg_low->epages = seg_low->pages + size_low;
		seg_low->pages_base = seg->pages_base;
		seg_low->pages_end = seg_low->pages_base + size_low;
		seg_low->next = seg_mid;
	}
	if (size_high != 0) {
		seg_high->pages = seg->epages - size_high;
		seg_high->epages = seg_high->pages + size_high;
		seg_high->pages_base = seg->pages_end - size_high;
		seg_high->pages_end = seg_high->pages_base + size_high;
		seg_high->next = seg->next;
	}

	seg_mid->pages = seg->pages + size_low;
	seg_mid->pages_base = seg->pages_base + size_low;
	seg_mid->epages = seg->epages - size_high;
	seg_mid->pages_end = seg->pages_end - size_high;
	seg_mid->next = (seg_high != NULL) ? seg_high : seg->next;

	/*
	 * Update hat_kpm specific info of all involved memsegs and
	 * allow hat_kpm specific global chain updates.
	 */
	hat_kpm_split_mseg_update(seg, segpp, seg_low, seg_mid, seg_high);

	/*
	 * At this point we have two equivalent memseg sub-chains,
	 * seg and seg_low/seg_mid/seg_high, which both chain on to
	 * the same place in the global chain. By re-writing the pointer
	 * in the previous element we switch atomically from using the old
	 * (seg) to the new.
	 */
	*segpp = (seg_low != NULL) ? seg_low : seg_mid;

	membar_enter();

	build_pfn_hash();
	memsegs_unlock(1);

	/*
	 * We leave the old segment, 'seg', intact as there may be
	 * references to it. Also, as the value of total_pages has not
	 * changed and the memsegs list is effectively the same when
	 * accessed via the old or the new pointer, we do not have to
	 * cause pageout_scanner() to re-evaluate its hand pointers.
	 *
	 * We currently do not re-use or reclaim the page_t memory.
	 * If we do, then this may have to change.
	 */

	mutex_enter(&memseg_lists_lock);
	seg->lnext = memseg_edit_junk;
	memseg_edit_junk = seg;
	mutex_exit(&memseg_lists_lock);

	return (1);
}

/*
 * The memsegs lock is only taken when modifying the memsegs list
 * and rebuilding the pfn hash table (after boot).
 * No lock is needed for reads, as memseg structures are never
 * de-allocated and the pointer linkage is never updated until the
 * memseg is ready.
 */
krwlock_t memsegslock;

void
memsegs_lock(int writer)
{
	rw_enter(&memsegslock, writer ? RW_WRITER : RW_READER);
}

/*ARGSUSED*/
void
memsegs_unlock(int writer)
{
	rw_exit(&memsegslock);
}

/*
 * memlist (phys_install, phys_avail) locking.
 */

/*
 * A read/write lock might be better here.
 */
static kmutex_t memlists_mutex;

void
memlist_read_lock()
{
	mutex_enter(&memlists_mutex);
}

void
memlist_read_unlock()
{
	mutex_exit(&memlists_mutex);
}

void
memlist_write_lock()
{
	mutex_enter(&memlists_mutex);
}

void
memlist_write_unlock()
{
	mutex_exit(&memlists_mutex);
}

/*
 * The sfmmu hat layer (e.g.) accesses some parts of the memseg
 * structure using physical addresses. Therefore a kmem_cache is
 * used with KMC_NOHASH to avoid page crossings within a memseg
 * structure. KMC_NOHASH requires that no external (outside of
 * slab) information is allowed. This, in turn, implies that the
 * cache's slab size must be exactly a single page, since per-slab
 * information (e.g. the freelist for the slab) is kept at the
 * end of the slab, where it is easy to locate. This should be
 * changed when a more suitable kmem_cache interface or flag
 * becomes available.
 */
void
mem_config_init()
{
	memseg_cache = kmem_cache_create("memseg_cache", sizeof (struct memseg),
	    0, NULL, NULL, NULL, NULL, static_arena, KMC_NOHASH);
}