1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2010 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #include <sys/types.h> 27 #include <sys/cmn_err.h> 28 #include <sys/vmem.h> 29 #include <sys/kmem.h> 30 #include <sys/systm.h> 31 #include <sys/machsystm.h> /* for page_freelist_coalesce() */ 32 #include <sys/errno.h> 33 #include <sys/memnode.h> 34 #include <sys/memlist.h> 35 #include <sys/memlist_impl.h> 36 #include <sys/tuneable.h> 37 #include <sys/proc.h> 38 #include <sys/disp.h> 39 #include <sys/debug.h> 40 #include <sys/vm.h> 41 #include <sys/callb.h> 42 #include <sys/memlist_plat.h> /* for installed_top_size() */ 43 #include <sys/condvar_impl.h> /* for CV_HAS_WAITERS() */ 44 #include <sys/dumphdr.h> /* for dump_resize() */ 45 #include <sys/atomic.h> /* for use in stats collection */ 46 #include <sys/rwlock.h> 47 #include <sys/cpuvar.h> 48 #include <vm/seg_kmem.h> 49 #include <vm/seg_kpm.h> 50 #include <vm/page.h> 51 #include <vm/vm_dep.h> 52 #define SUNDDI_IMPL /* so sunddi.h will not redefine splx() et al */ 53 #include <sys/sunddi.h> 54 #include <sys/mem_config.h> 55 #include <sys/mem_cage.h> 56 #include <sys/lgrp.h> 57 #include <sys/ddi.h> 58 #include <sys/modctl.h> 59 60 extern struct memlist *phys_avail; 61 62 extern void mem_node_add(pfn_t, pfn_t); 63 extern void mem_node_del(pfn_t, pfn_t); 64 65 extern uint_t page_ctrs_adjust(int); 66 void page_ctrs_cleanup(void); 67 static void kphysm_setup_post_add(pgcnt_t); 68 static int kphysm_setup_pre_del(pgcnt_t); 69 static void kphysm_setup_post_del(pgcnt_t, int); 70 71 static int kphysm_split_memseg(pfn_t base, pgcnt_t npgs); 72 73 static int delspan_reserve(pfn_t, pgcnt_t); 74 static void delspan_unreserve(pfn_t, pgcnt_t); 75 76 kmutex_t memseg_lists_lock; 77 struct memseg *memseg_va_avail; 78 struct memseg *memseg_alloc(void); 79 static struct memseg *memseg_delete_junk; 80 static struct memseg *memseg_edit_junk; 81 void memseg_remap_init(void); 82 static void memseg_remap_to_dummy(struct memseg *); 83 static void kphysm_addmem_error_undospan(pfn_t, pgcnt_t); 84 static struct memseg *memseg_reuse(pgcnt_t); 85 86 static struct kmem_cache *memseg_cache; 87 88 /* 89 * Interfaces to manage externally allocated 90 * page_t memory (metadata) for a memseg. 
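 *
 * These are declared as weak symbols below so that this file links on
 * platforms that do not provide them.  A platform that can keep page_t
 * metadata outside the memseg supplies the four routines and is
 * expected to set meta_alloc_enable; when that flag is clear,
 * kphysm_add_memory_dynamic() instead carves the page_t's out of the
 * first pages of the incoming span.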
91 */ 92 #pragma weak memseg_alloc_meta 93 #pragma weak memseg_free_meta 94 #pragma weak memseg_get_metapfn 95 #pragma weak memseg_remap_meta 96 97 extern int ppvm_enable; 98 extern page_t *ppvm_base; 99 extern int memseg_alloc_meta(pfn_t, pgcnt_t, void **, pgcnt_t *); 100 extern void memseg_free_meta(void *, pgcnt_t); 101 extern pfn_t memseg_get_metapfn(void *, pgcnt_t); 102 extern void memseg_remap_meta(struct memseg *); 103 static int memseg_is_dynamic(struct memseg *); 104 static int memseg_includes_meta(struct memseg *); 105 pfn_t memseg_get_start(struct memseg *); 106 static void memseg_cpu_vm_flush(void); 107 108 int meta_alloc_enable; 109 110 /* 111 * Add a chunk of memory to the system. 112 * base: starting PAGESIZE page of new memory. 113 * npgs: length in PAGESIZE pages. 114 * 115 * Adding mem this way doesn't increase the size of the hash tables; 116 * growing them would be too hard. This should be OK, but adding memory 117 * dynamically most likely means more hash misses, since the tables will 118 * be smaller than they otherwise would be. 119 */ 120 #ifdef DEBUG 121 static int memseg_debug; 122 #define MEMSEG_DEBUG(args...) if (memseg_debug) printf(args) 123 #else 124 #define MEMSEG_DEBUG(...) 125 #endif 126 127 int 128 kphysm_add_memory_dynamic(pfn_t base, pgcnt_t npgs) 129 { 130 page_t *pp; 131 page_t *opp, *oepp, *segpp; 132 struct memseg *seg; 133 uint64_t avmem; 134 pfn_t pfn; 135 pfn_t pt_base = base; 136 pgcnt_t tpgs = npgs; 137 pgcnt_t metapgs = 0; 138 int exhausted; 139 pfn_t pnum; 140 int mnode; 141 caddr_t vaddr; 142 int reuse; 143 int mlret; 144 int rv; 145 int flags; 146 int meta_alloc = 0; 147 void *mapva; 148 void *metabase = (void *)base; 149 pgcnt_t nkpmpgs = 0; 150 offset_t kpm_pages_off; 151 152 cmn_err(CE_CONT, 153 "?kphysm_add_memory_dynamic: adding %ldK at 0x%" PRIx64 "\n", 154 npgs << (PAGESHIFT - 10), (uint64_t)base << PAGESHIFT); 155 156 /* 157 * Add this span in the delete list to prevent interactions. 158 */ 159 if (!delspan_reserve(base, npgs)) { 160 return (KPHYSM_ESPAN); 161 } 162 /* 163 * Check to see if any of the memory span has been added 164 * by trying an add to the installed memory list. This 165 * forms the interlocking process for add. 166 */ 167 168 memlist_write_lock(); 169 170 mlret = memlist_add_span((uint64_t)(pt_base) << PAGESHIFT, 171 (uint64_t)(tpgs) << PAGESHIFT, &phys_install); 172 173 if (mlret == MEML_SPANOP_OK) 174 installed_top_size(phys_install, &physmax, &physinstalled); 175 176 memlist_write_unlock(); 177 178 if (mlret != MEML_SPANOP_OK) { 179 if (mlret == MEML_SPANOP_EALLOC) { 180 delspan_unreserve(pt_base, tpgs); 181 return (KPHYSM_ERESOURCE); 182 } else if (mlret == MEML_SPANOP_ESPAN) { 183 delspan_unreserve(pt_base, tpgs); 184 return (KPHYSM_ESPAN); 185 } else { 186 delspan_unreserve(pt_base, tpgs); 187 return (KPHYSM_ERESOURCE); 188 } 189 } 190 191 if (meta_alloc_enable) { 192 /* 193 * Allocate the page_t's from existing memory; 194 * if that fails, allocate from the incoming memory. 195 */ 196 rv = memseg_alloc_meta(base, npgs, &metabase, &metapgs); 197 if (rv == KPHYSM_OK) { 198 ASSERT(metapgs); 199 ASSERT(btopr(npgs * sizeof (page_t)) <= metapgs); 200 meta_alloc = 1; 201 goto mapalloc; 202 } 203 } 204 205 /* 206 * We store the page_t's for this new memory in the first 207 * few pages of the chunk. Here, we go and get'em ... 
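	 *
	 * The split is computed just below: each usable page costs
	 * PAGESIZE bytes of data plus sizeof (page_t) of metadata, so
	 * roughly npgs * PAGESIZE / (PAGESIZE + sizeof (page_t)) pages
	 * stay usable and the remainder becomes metapgs.  As a rough
	 * illustration, if sizeof (page_t) were 1/32 of PAGESIZE, about
	 * npgs / 33 pages (some 3% of the span) would be given up to
	 * hold page_t's.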
208 */ 209 210 /* 211 * The expression after the '-' gives the number of pages 212 * that will fit in the new memory based on a requirement 213 * of (PAGESIZE + sizeof (page_t)) bytes per page. 214 */ 215 metapgs = npgs - (((uint64_t)(npgs) << PAGESHIFT) / 216 (PAGESIZE + sizeof (page_t))); 217 218 npgs -= metapgs; 219 base += metapgs; 220 221 ASSERT(btopr(npgs * sizeof (page_t)) <= metapgs); 222 223 exhausted = (metapgs == 0 || npgs == 0); 224 225 if (kpm_enable && !exhausted) { 226 pgcnt_t start, end, nkpmpgs_prelim; 227 size_t ptsz; 228 229 /* 230 * A viable kpm large page mapping must not overlap two 231 * dynamic memsegs. Therefore the total size is checked 232 * to be at least kpm_pgsz and also whether start and end 233 * points are at least kpm_pgsz aligned. 234 */ 235 if (ptokpmp(tpgs) < 1 || pmodkpmp(pt_base) || 236 pmodkpmp(base + npgs)) { 237 238 kphysm_addmem_error_undospan(pt_base, tpgs); 239 240 /* 241 * There is no specific error code for violating 242 * kpm granularity constraints. 243 */ 244 return (KPHYSM_ENOTVIABLE); 245 } 246 247 start = kpmptop(ptokpmp(base)); 248 end = kpmptop(ptokpmp(base + npgs)); 249 nkpmpgs_prelim = ptokpmp(end - start); 250 ptsz = npgs * sizeof (page_t); 251 metapgs = btopr(ptsz + nkpmpgs_prelim * KPMPAGE_T_SZ); 252 exhausted = (tpgs <= metapgs); 253 if (!exhausted) { 254 npgs = tpgs - metapgs; 255 base = pt_base + metapgs; 256 257 /* final nkpmpgs */ 258 start = kpmptop(ptokpmp(base)); 259 nkpmpgs = ptokpmp(end - start); 260 kpm_pages_off = ptsz + 261 (nkpmpgs_prelim - nkpmpgs) * KPMPAGE_T_SZ; 262 } 263 } 264 265 /* 266 * Is memory area supplied too small? 267 */ 268 if (exhausted) { 269 kphysm_addmem_error_undospan(pt_base, tpgs); 270 /* 271 * There is no specific error code for 'too small'. 272 */ 273 return (KPHYSM_ERESOURCE); 274 } 275 276 mapalloc: 277 /* 278 * We may re-use a previously allocated VA space for the page_ts 279 * eventually, but we need to initialize and lock the pages first. 280 */ 281 282 /* 283 * Get an address in the kernel address map, map 284 * the page_t pages and see if we can touch them. 285 */ 286 287 mapva = vmem_alloc(heap_arena, ptob(metapgs), VM_NOSLEEP); 288 if (mapva == NULL) { 289 cmn_err(CE_WARN, "kphysm_add_memory_dynamic:" 290 " Can't allocate VA for page_ts"); 291 292 if (meta_alloc) 293 memseg_free_meta(metabase, metapgs); 294 kphysm_addmem_error_undospan(pt_base, tpgs); 295 296 return (KPHYSM_ERESOURCE); 297 } 298 pp = mapva; 299 300 if (physmax < (pt_base + tpgs)) 301 physmax = (pt_base + tpgs); 302 303 /* 304 * In the remapping code we map one page at a time so we must do 305 * the same here to match mapping sizes. 
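	 * In particular, when the metadata was allocated externally
	 * (meta_alloc) the pfns returned by memseg_get_metapfn() need
	 * not be physically contiguous, so one large mapping of the
	 * whole range is not an option, and the HAT_LOAD_REMAP pass
	 * over a re-used memseg VA later on must replace mappings of
	 * exactly the same, single-page, size.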
	 */
	pfn = pt_base;
	vaddr = (caddr_t)pp;
	for (pnum = 0; pnum < metapgs; pnum++) {
		if (meta_alloc)
			pfn = memseg_get_metapfn(metabase, (pgcnt_t)pnum);
		hat_devload(kas.a_hat, vaddr, ptob(1), pfn,
		    PROT_READ | PROT_WRITE,
		    HAT_LOAD | HAT_LOAD_LOCK | HAT_LOAD_NOCONSIST);
		pfn++;
		vaddr += ptob(1);
	}

	if (ddi_peek32((dev_info_t *)NULL,
	    (int32_t *)pp, (int32_t *)0) == DDI_FAILURE) {

		cmn_err(CE_WARN, "kphysm_add_memory_dynamic:"
		    " Can't access pp array at 0x%p [phys 0x%lx]",
		    (void *)pp, pt_base);

		hat_unload(kas.a_hat, (caddr_t)pp, ptob(metapgs),
		    HAT_UNLOAD_UNMAP|HAT_UNLOAD_UNLOCK);

		vmem_free(heap_arena, mapva, ptob(metapgs));
		if (meta_alloc)
			memseg_free_meta(metabase, metapgs);
		kphysm_addmem_error_undospan(pt_base, tpgs);

		return (KPHYSM_EFAULT);
	}

	/*
	 * Add this memory slice to its memory node translation.
	 *
	 * Note that right now, each node may have only one slice;
	 * this may change with COD or in larger SSM systems with
	 * nested latency groups, so we must not assume that the
	 * node does not yet exist.
	 */
	pnum = pt_base + tpgs - 1;
	mem_node_add_range(pt_base, pnum);

	/*
	 * Allocate or resize page counters as necessary to accommodate
	 * the increase in memory pages.
	 */
	mnode = PFN_2_MEM_NODE(pnum);
	PAGE_CTRS_ADJUST(base, npgs, rv);
	if (rv) {

		mem_node_del_range(pt_base, pnum);

		/* cleanup the page counters */
		page_ctrs_cleanup();

		hat_unload(kas.a_hat, (caddr_t)pp, ptob(metapgs),
		    HAT_UNLOAD_UNMAP|HAT_UNLOAD_UNLOCK);

		vmem_free(heap_arena, mapva, ptob(metapgs));
		if (meta_alloc)
			memseg_free_meta(metabase, metapgs);
		kphysm_addmem_error_undospan(pt_base, tpgs);

		return (KPHYSM_ERESOURCE);
	}

	/*
	 * Update the phys_avail memory list.
	 * The phys_install list was done at the start.
	 */

	memlist_write_lock();

	mlret = memlist_add_span((uint64_t)(base) << PAGESHIFT,
	    (uint64_t)(npgs) << PAGESHIFT, &phys_avail);
	ASSERT(mlret == MEML_SPANOP_OK);

	memlist_write_unlock();

	/* See if we can find a memseg to re-use. */
	if (meta_alloc) {
		seg = memseg_reuse(0);
		reuse = 1;	/* force unmapping of temp mapva */
		flags = MEMSEG_DYNAMIC | MEMSEG_META_ALLOC;
		/*
		 * There is a 1:1 fixed relationship between a pfn
		 * and a page_t VA. The pfn is used as an index into
		 * the ppvm_base page_t table in order to calculate
		 * the page_t base address for a given pfn range.
		 */
		segpp = ppvm_base + base;
	} else {
		seg = memseg_reuse(metapgs);
		reuse = (seg != NULL);
		flags = MEMSEG_DYNAMIC | MEMSEG_META_INCL;
		segpp = pp;
	}

	/*
	 * Initialize the memseg structure representing this memory
	 * and add it to the existing list of memsegs. Do some basic
	 * initialization and add the memory to the system.
	 * In order to prevent lock deadlocks, the add_physmem()
	 * code is repeated here, but split into several stages.
	 *
	 * If a memseg is reused, invalidate memseg pointers in
	 * all cpu vm caches. We need to do this since the check
	 *	pp >= seg->pages && pp < seg->epages
	 * used in various places is not atomic and so the first compare
	 * can happen before reuse and the second compare after reuse.
	 * The invalidation ensures that a memseg is not dereferenced while
	 * its page/pfn pointers are changing.
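	 * For example, a thread walking the memsegs can compare pp
	 * against the old value of seg->pages, lose the cpu while the
	 * memseg is re-targeted here, and then compare pp against the
	 * new value of seg->epages, wrongly concluding that an unrelated
	 * page belongs to this memseg. Flushing the cpu vm caches
	 * (memseg_cpu_vm_flush() below) before the pointers change
	 * closes that window.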
418 */ 419 if (seg == NULL) { 420 seg = memseg_alloc(); 421 ASSERT(seg != NULL); 422 seg->msegflags = flags; 423 MEMSEG_DEBUG("memseg_get: alloc seg=0x%p, pages=0x%p", 424 (void *)seg, (void *)(seg->pages)); 425 seg->pages = segpp; 426 } else { 427 ASSERT(seg->msegflags == flags); 428 ASSERT(seg->pages_base == seg->pages_end); 429 MEMSEG_DEBUG("memseg_get: reuse seg=0x%p, pages=0x%p", 430 (void *)seg, (void *)(seg->pages)); 431 if (meta_alloc) { 432 memseg_cpu_vm_flush(); 433 seg->pages = segpp; 434 } 435 } 436 437 seg->epages = seg->pages + npgs; 438 seg->pages_base = base; 439 seg->pages_end = base + npgs; 440 441 /* 442 * Initialize metadata. The page_ts are set to locked state 443 * ready to be freed. 444 */ 445 bzero((caddr_t)pp, ptob(metapgs)); 446 447 pfn = seg->pages_base; 448 /* Save the original pp base in case we reuse a memseg. */ 449 opp = pp; 450 oepp = opp + npgs; 451 for (pp = opp; pp < oepp; pp++) { 452 pp->p_pagenum = pfn; 453 pfn++; 454 page_iolock_init(pp); 455 while (!page_lock(pp, SE_EXCL, (kmutex_t *)NULL, P_RECLAIM)) 456 continue; 457 pp->p_offset = (u_offset_t)-1; 458 } 459 460 if (reuse) { 461 /* Remap our page_ts to the re-used memseg VA space. */ 462 pfn = pt_base; 463 vaddr = (caddr_t)seg->pages; 464 for (pnum = 0; pnum < metapgs; pnum++) { 465 if (meta_alloc) 466 pfn = memseg_get_metapfn(metabase, 467 (pgcnt_t)pnum); 468 hat_devload(kas.a_hat, vaddr, ptob(1), pfn, 469 PROT_READ | PROT_WRITE, 470 HAT_LOAD_REMAP | HAT_LOAD | HAT_LOAD_NOCONSIST); 471 pfn++; 472 vaddr += ptob(1); 473 } 474 475 hat_unload(kas.a_hat, (caddr_t)opp, ptob(metapgs), 476 HAT_UNLOAD_UNMAP|HAT_UNLOAD_UNLOCK); 477 478 vmem_free(heap_arena, mapva, ptob(metapgs)); 479 } 480 481 hat_kpm_addmem_mseg_update(seg, nkpmpgs, kpm_pages_off); 482 483 memsegs_lock(1); 484 485 /* 486 * The new memseg is inserted at the beginning of the list. 487 * Not only does this save searching for the tail, but in the 488 * case of a re-used memseg, it solves the problem of what 489 * happens if some process has still got a pointer to the 490 * memseg and follows the next pointer to continue traversing 491 * the memsegs list. 492 */ 493 494 hat_kpm_addmem_mseg_insert(seg); 495 496 seg->next = memsegs; 497 membar_producer(); 498 499 hat_kpm_addmem_memsegs_update(seg); 500 501 memsegs = seg; 502 503 build_pfn_hash(); 504 505 total_pages += npgs; 506 507 /* 508 * Recalculate the paging parameters now total_pages has changed. 509 * This will also cause the clock hands to be reset before next use. 510 */ 511 setupclock(1); 512 513 memsegs_unlock(1); 514 515 PLCNT_MODIFY_MAX(seg->pages_base, (long)npgs); 516 517 /* 518 * Free the pages outside the lock to avoid locking loops. 519 */ 520 for (pp = seg->pages; pp < seg->epages; pp++) { 521 page_free(pp, 1); 522 } 523 524 /* 525 * Now that we've updated the appropriate memory lists we 526 * need to reset a number of globals, since we've increased memory. 527 * Several have already been updated for us as noted above. The 528 * globals we're interested in at this point are: 529 * physmax - highest page frame number. 
	 * physinstalled - number of pages currently installed (done earlier)
	 * maxmem - max free pages in the system
	 * physmem - physical memory pages available
	 * availrmem - real memory available
	 */

	mutex_enter(&freemem_lock);
	maxmem += npgs;
	physmem += npgs;
	availrmem += npgs;
	availrmem_initial += npgs;

	mutex_exit(&freemem_lock);

	dump_resize();

	page_freelist_coalesce_all(mnode);

	kphysm_setup_post_add(npgs);

	cmn_err(CE_CONT, "?kphysm_add_memory_dynamic: mem = %ldK "
	    "(0x%" PRIx64 ")\n",
	    physinstalled << (PAGESHIFT - 10),
	    (uint64_t)physinstalled << PAGESHIFT);

	avmem = (uint64_t)freemem << PAGESHIFT;
	cmn_err(CE_CONT, "?kphysm_add_memory_dynamic: "
	    "avail mem = %" PRId64 "\n", avmem);

	/*
	 * Update lgroup generation number on single lgroup systems
	 */
	if (nlgrps == 1)
		lgrp_config(LGRP_CONFIG_GEN_UPDATE, 0, 0);

	delspan_unreserve(pt_base, tpgs);
	return (KPHYSM_OK);	/* Successfully added system memory */

}

/*
 * There are various error conditions in kphysm_add_memory_dynamic()
 * which require a rollback of already changed global state.
 */
static void
kphysm_addmem_error_undospan(pfn_t pt_base, pgcnt_t tpgs)
{
	int mlret;

	/* Unreserve memory span. */
	memlist_write_lock();

	mlret = memlist_delete_span(
	    (uint64_t)(pt_base) << PAGESHIFT,
	    (uint64_t)(tpgs) << PAGESHIFT, &phys_install);

	ASSERT(mlret == MEML_SPANOP_OK);
	phys_install_has_changed();
	installed_top_size(phys_install, &physmax, &physinstalled);

	memlist_write_unlock();
	delspan_unreserve(pt_base, tpgs);
}

/*
 * Only return an available memseg of exactly the right size
 * if size is required.
 * When the metadata area has its own virtual address space
 * we will need to manage this more carefully and do best fit
 * allocations, possibly splitting an available area.
 */
struct memseg *
memseg_reuse(pgcnt_t metapgs)
{
	int type;
	struct memseg **segpp, *seg;

	mutex_enter(&memseg_lists_lock);

	segpp = &memseg_va_avail;
	for (; (seg = *segpp) != NULL; segpp = &seg->lnext) {
		caddr_t end;

		/*
		 * Make sure we are reusing the right segment type.
		 */
		type = metapgs ? MEMSEG_META_INCL : MEMSEG_META_ALLOC;

		if ((seg->msegflags & (MEMSEG_META_INCL | MEMSEG_META_ALLOC))
		    != type)
			continue;

		if (kpm_enable)
			end = hat_kpm_mseg_reuse(seg);
		else
			end = (caddr_t)seg->epages;

		/*
		 * Check for the right size if it is provided.
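		 * A MEMSEG_META_ALLOC caller passes metapgs == 0 and any
		 * segment of that type will do, since its page_t's live
		 * outside the segment's VA range; for MEMSEG_META_INCL
		 * the mapped VA range must match the requested size
		 * exactly.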
629 */ 630 if (!metapgs || btopr(end - (caddr_t)seg->pages) == metapgs) { 631 *segpp = seg->lnext; 632 seg->lnext = NULL; 633 break; 634 } 635 } 636 mutex_exit(&memseg_lists_lock); 637 638 return (seg); 639 } 640 641 static uint_t handle_gen; 642 643 struct memdelspan { 644 struct memdelspan *mds_next; 645 pfn_t mds_base; 646 pgcnt_t mds_npgs; 647 uint_t *mds_bitmap; 648 uint_t *mds_bitmap_retired; 649 }; 650 651 #define NBPBMW (sizeof (uint_t) * NBBY) 652 #define MDS_BITMAPBYTES(MDSP) \ 653 ((((MDSP)->mds_npgs + NBPBMW - 1) / NBPBMW) * sizeof (uint_t)) 654 655 struct transit_list { 656 struct transit_list *trl_next; 657 struct memdelspan *trl_spans; 658 int trl_collect; 659 }; 660 661 struct transit_list_head { 662 kmutex_t trh_lock; 663 struct transit_list *trh_head; 664 }; 665 666 static struct transit_list_head transit_list_head; 667 668 struct mem_handle; 669 static void transit_list_collect(struct mem_handle *, int); 670 static void transit_list_insert(struct transit_list *); 671 static void transit_list_remove(struct transit_list *); 672 673 #ifdef DEBUG 674 #define MEM_DEL_STATS 675 #endif /* DEBUG */ 676 677 #ifdef MEM_DEL_STATS 678 static int mem_del_stat_print = 0; 679 struct mem_del_stat { 680 uint_t nloop; 681 uint_t need_free; 682 uint_t free_loop; 683 uint_t free_low; 684 uint_t free_failed; 685 uint_t ncheck; 686 uint_t nopaget; 687 uint_t lockfail; 688 uint_t nfree; 689 uint_t nreloc; 690 uint_t nrelocfail; 691 uint_t already_done; 692 uint_t first_notfree; 693 uint_t npplocked; 694 uint_t nlockreloc; 695 uint_t nnorepl; 696 uint_t nmodreloc; 697 uint_t ndestroy; 698 uint_t nputpage; 699 uint_t nnoreclaim; 700 uint_t ndelay; 701 uint_t demotefail; 702 uint64_t nticks_total; 703 uint64_t nticks_pgrp; 704 uint_t retired; 705 uint_t toxic; 706 uint_t failing; 707 uint_t modtoxic; 708 uint_t npplkdtoxic; 709 uint_t gptlmodfail; 710 uint_t gptllckfail; 711 }; 712 /* 713 * The stat values are only incremented in the delete thread 714 * so no locking or atomic required. 715 */ 716 #define MDSTAT_INCR(MHP, FLD) (MHP)->mh_delstat.FLD++ 717 #define MDSTAT_TOTAL(MHP, ntck) ((MHP)->mh_delstat.nticks_total += (ntck)) 718 #define MDSTAT_PGRP(MHP, ntck) ((MHP)->mh_delstat.nticks_pgrp += (ntck)) 719 static void mem_del_stat_print_func(struct mem_handle *); 720 #define MDSTAT_PRINT(MHP) mem_del_stat_print_func((MHP)) 721 #else /* MEM_DEL_STATS */ 722 #define MDSTAT_INCR(MHP, FLD) 723 #define MDSTAT_TOTAL(MHP, ntck) 724 #define MDSTAT_PGRP(MHP, ntck) 725 #define MDSTAT_PRINT(MHP) 726 #endif /* MEM_DEL_STATS */ 727 728 typedef enum mhnd_state {MHND_FREE = 0, MHND_INIT, MHND_STARTING, 729 MHND_RUNNING, MHND_DONE, MHND_RELEASE} mhnd_state_t; 730 731 /* 732 * mh_mutex must be taken to examine or change mh_exthandle and mh_state. 733 * The mutex may not be required for other fields, dependent on mh_state. 
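 *
 * A handle normally moves through the states
 *	MHND_FREE -> MHND_INIT			(kphysm_del_gethandle)
 *	MHND_INIT -> MHND_STARTING -> MHND_RUNNING
 *			(set up before delete_memory_thread runs, then by
 *			the thread itself)
 *	MHND_RUNNING -> MHND_DONE		(delete thread finished,
 *			failed or was cancelled)
 *	MHND_INIT/MHND_DONE -> MHND_RELEASE -> MHND_FREE
 *						(kphysm_del_release)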
 */
struct mem_handle {
	kmutex_t	mh_mutex;
	struct mem_handle *mh_next;
	memhandle_t	mh_exthandle;
	mhnd_state_t	mh_state;
	struct transit_list mh_transit;
	pgcnt_t		mh_phys_pages;
	pgcnt_t		mh_vm_pages;
	pgcnt_t		mh_hold_todo;
	void		(*mh_delete_complete)(void *, int error);
	void		*mh_delete_complete_arg;
	volatile uint_t	mh_cancel;
	volatile uint_t	mh_dr_aio_cleanup_cancel;
	volatile uint_t	mh_aio_cleanup_done;
	kcondvar_t	mh_cv;
	kthread_id_t	mh_thread_id;
	page_t		*mh_deleted;	/* link through p_next */
#ifdef MEM_DEL_STATS
	struct mem_del_stat mh_delstat;
#endif /* MEM_DEL_STATS */
};

static struct mem_handle *mem_handle_head;
static kmutex_t mem_handle_list_mutex;

static struct mem_handle *
kphysm_allocate_mem_handle()
{
	struct mem_handle *mhp;

	mhp = kmem_zalloc(sizeof (struct mem_handle), KM_SLEEP);
	mutex_init(&mhp->mh_mutex, NULL, MUTEX_DEFAULT, NULL);
	mutex_enter(&mem_handle_list_mutex);
	mutex_enter(&mhp->mh_mutex);
	/* handle_gen is protected by list mutex. */
	mhp->mh_exthandle = (memhandle_t)(uintptr_t)(++handle_gen);
	mhp->mh_next = mem_handle_head;
	mem_handle_head = mhp;
	mutex_exit(&mem_handle_list_mutex);

	return (mhp);
}

static void
kphysm_free_mem_handle(struct mem_handle *mhp)
{
	struct mem_handle **mhpp;

	ASSERT(mutex_owned(&mhp->mh_mutex));
	ASSERT(mhp->mh_state == MHND_FREE);
	/*
	 * Exit the mutex to preserve locking order. This is OK
	 * here as once in the FREE state, the handle cannot
	 * be found by a lookup.
	 */
	mutex_exit(&mhp->mh_mutex);

	mutex_enter(&mem_handle_list_mutex);
	mhpp = &mem_handle_head;
	while (*mhpp != NULL && *mhpp != mhp)
		mhpp = &(*mhpp)->mh_next;
	ASSERT(*mhpp == mhp);
	/*
	 * No need to lock the handle (mh_mutex) as only
	 * mh_next is changing and this is the only thread that
	 * can be referencing mhp.
	 */
	*mhpp = mhp->mh_next;
	mutex_exit(&mem_handle_list_mutex);

	mutex_destroy(&mhp->mh_mutex);
	kmem_free(mhp, sizeof (struct mem_handle));
}

/*
 * This function finds the internal mem_handle corresponding to an
 * external handle and returns it with the mh_mutex held.
 */
static struct mem_handle *
kphysm_lookup_mem_handle(memhandle_t handle)
{
	struct mem_handle *mhp;

	mutex_enter(&mem_handle_list_mutex);
	for (mhp = mem_handle_head; mhp != NULL; mhp = mhp->mh_next) {
		if (mhp->mh_exthandle == handle) {
			mutex_enter(&mhp->mh_mutex);
			/*
			 * The state of the handle could have been changed
			 * by kphysm_del_release() while waiting for mh_mutex.
			 */
			if (mhp->mh_state == MHND_FREE) {
				mutex_exit(&mhp->mh_mutex);
				continue;
			}
			break;
		}
	}
	mutex_exit(&mem_handle_list_mutex);
	return (mhp);
}

int
kphysm_del_gethandle(memhandle_t *xmhp)
{
	struct mem_handle *mhp;

	mhp = kphysm_allocate_mem_handle();
	/*
	 * The handle is allocated using KM_SLEEP, so cannot fail.
	 * If the implementation is changed, the correct error to return
	 * here would be KPHYSM_ENOHANDLES.
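	 *
	 * The handle returned here anchors the delete interfaces below.
	 * A simplified, illustrative caller sequence is:
	 *
	 *	memhandle_t mh;
	 *
	 *	(void) kphysm_del_gethandle(&mh);
	 *	if (kphysm_del_span(mh, base, npgs) == KPHYSM_OK) {
	 *		... start the delete with a completion callback,
	 *		poll kphysm_del_status() and/or call
	 *		kphysm_del_cancel() as needed ...
	 *	}
	 *	(void) kphysm_del_release(mh);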
847 */ 848 ASSERT(mhp->mh_state == MHND_FREE); 849 mhp->mh_state = MHND_INIT; 850 *xmhp = mhp->mh_exthandle; 851 mutex_exit(&mhp->mh_mutex); 852 return (KPHYSM_OK); 853 } 854 855 static int 856 overlapping(pfn_t b1, pgcnt_t l1, pfn_t b2, pgcnt_t l2) 857 { 858 pfn_t e1, e2; 859 860 e1 = b1 + l1; 861 e2 = b2 + l2; 862 863 return (!(b2 >= e1 || b1 >= e2)); 864 } 865 866 static int can_remove_pgs(pgcnt_t); 867 868 static struct memdelspan * 869 span_to_install(pfn_t base, pgcnt_t npgs) 870 { 871 struct memdelspan *mdsp; 872 struct memdelspan *mdsp_new; 873 uint64_t address, size, thislen; 874 struct memlist *mlp; 875 876 mdsp_new = NULL; 877 878 address = (uint64_t)base << PAGESHIFT; 879 size = (uint64_t)npgs << PAGESHIFT; 880 while (size != 0) { 881 memlist_read_lock(); 882 for (mlp = phys_install; mlp != NULL; mlp = mlp->ml_next) { 883 if (address >= (mlp->ml_address + mlp->ml_size)) 884 continue; 885 if ((address + size) > mlp->ml_address) 886 break; 887 } 888 if (mlp == NULL) { 889 address += size; 890 size = 0; 891 thislen = 0; 892 } else { 893 if (address < mlp->ml_address) { 894 size -= (mlp->ml_address - address); 895 address = mlp->ml_address; 896 } 897 ASSERT(address >= mlp->ml_address); 898 if ((address + size) > 899 (mlp->ml_address + mlp->ml_size)) { 900 thislen = 901 mlp->ml_size - (address - mlp->ml_address); 902 } else { 903 thislen = size; 904 } 905 } 906 memlist_read_unlock(); 907 /* TODO: phys_install could change now */ 908 if (thislen == 0) 909 continue; 910 mdsp = kmem_zalloc(sizeof (struct memdelspan), KM_SLEEP); 911 mdsp->mds_base = btop(address); 912 mdsp->mds_npgs = btop(thislen); 913 mdsp->mds_next = mdsp_new; 914 mdsp_new = mdsp; 915 address += thislen; 916 size -= thislen; 917 } 918 return (mdsp_new); 919 } 920 921 static void 922 free_delspans(struct memdelspan *mdsp) 923 { 924 struct memdelspan *amdsp; 925 926 while ((amdsp = mdsp) != NULL) { 927 mdsp = amdsp->mds_next; 928 kmem_free(amdsp, sizeof (struct memdelspan)); 929 } 930 } 931 932 /* 933 * Concatenate lists. No list ordering is required. 934 */ 935 936 static void 937 delspan_concat(struct memdelspan **mdspp, struct memdelspan *mdsp) 938 { 939 while (*mdspp != NULL) 940 mdspp = &(*mdspp)->mds_next; 941 942 *mdspp = mdsp; 943 } 944 945 /* 946 * Given a new list of delspans, check there is no overlap with 947 * all existing span activity (add or delete) and then concatenate 948 * the new spans to the given list. 949 * Return 1 for OK, 0 if overlapping. 
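 *
 * The check below walks every transit list registered on
 * transit_list_head (i.e. every add or delete currently in flight)
 * and compares each of its spans against each new span; these lists
 * are normally very short, so the O(n * m) scan is not a concern.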
950 */ 951 static int 952 delspan_insert( 953 struct transit_list *my_tlp, 954 struct memdelspan *mdsp_new) 955 { 956 struct transit_list_head *trh; 957 struct transit_list *tlp; 958 int ret; 959 960 trh = &transit_list_head; 961 962 ASSERT(my_tlp != NULL); 963 ASSERT(mdsp_new != NULL); 964 965 ret = 1; 966 mutex_enter(&trh->trh_lock); 967 /* ASSERT(my_tlp->trl_spans == NULL || tlp_in_list(trh, my_tlp)); */ 968 for (tlp = trh->trh_head; tlp != NULL; tlp = tlp->trl_next) { 969 struct memdelspan *mdsp; 970 971 for (mdsp = tlp->trl_spans; mdsp != NULL; 972 mdsp = mdsp->mds_next) { 973 struct memdelspan *nmdsp; 974 975 for (nmdsp = mdsp_new; nmdsp != NULL; 976 nmdsp = nmdsp->mds_next) { 977 if (overlapping(mdsp->mds_base, mdsp->mds_npgs, 978 nmdsp->mds_base, nmdsp->mds_npgs)) { 979 ret = 0; 980 goto done; 981 } 982 } 983 } 984 } 985 done: 986 if (ret != 0) { 987 if (my_tlp->trl_spans == NULL) 988 transit_list_insert(my_tlp); 989 delspan_concat(&my_tlp->trl_spans, mdsp_new); 990 } 991 mutex_exit(&trh->trh_lock); 992 return (ret); 993 } 994 995 static void 996 delspan_remove( 997 struct transit_list *my_tlp, 998 pfn_t base, 999 pgcnt_t npgs) 1000 { 1001 struct transit_list_head *trh; 1002 struct memdelspan *mdsp; 1003 1004 trh = &transit_list_head; 1005 1006 ASSERT(my_tlp != NULL); 1007 1008 mutex_enter(&trh->trh_lock); 1009 if ((mdsp = my_tlp->trl_spans) != NULL) { 1010 if (npgs == 0) { 1011 my_tlp->trl_spans = NULL; 1012 free_delspans(mdsp); 1013 transit_list_remove(my_tlp); 1014 } else { 1015 struct memdelspan **prv; 1016 1017 prv = &my_tlp->trl_spans; 1018 while (mdsp != NULL) { 1019 pfn_t p_end; 1020 1021 p_end = mdsp->mds_base + mdsp->mds_npgs; 1022 if (mdsp->mds_base >= base && 1023 p_end <= (base + npgs)) { 1024 *prv = mdsp->mds_next; 1025 mdsp->mds_next = NULL; 1026 free_delspans(mdsp); 1027 } else { 1028 prv = &mdsp->mds_next; 1029 } 1030 mdsp = *prv; 1031 } 1032 if (my_tlp->trl_spans == NULL) 1033 transit_list_remove(my_tlp); 1034 } 1035 } 1036 mutex_exit(&trh->trh_lock); 1037 } 1038 1039 /* 1040 * Reserve interface for add to stop delete before add finished. 1041 * This list is only accessed through the delspan_insert/remove 1042 * functions and so is fully protected by the mutex in struct transit_list. 1043 */ 1044 1045 static struct transit_list reserve_transit; 1046 1047 static int 1048 delspan_reserve(pfn_t base, pgcnt_t npgs) 1049 { 1050 struct memdelspan *mdsp; 1051 int ret; 1052 1053 mdsp = kmem_zalloc(sizeof (struct memdelspan), KM_SLEEP); 1054 mdsp->mds_base = base; 1055 mdsp->mds_npgs = npgs; 1056 if ((ret = delspan_insert(&reserve_transit, mdsp)) == 0) { 1057 free_delspans(mdsp); 1058 } 1059 return (ret); 1060 } 1061 1062 static void 1063 delspan_unreserve(pfn_t base, pgcnt_t npgs) 1064 { 1065 delspan_remove(&reserve_transit, base, npgs); 1066 } 1067 1068 /* 1069 * Return whether memseg was created by kphysm_add_memory_dynamic(). 
1070 */ 1071 static int 1072 memseg_is_dynamic(struct memseg *seg) 1073 { 1074 return (seg->msegflags & MEMSEG_DYNAMIC); 1075 } 1076 1077 int 1078 kphysm_del_span( 1079 memhandle_t handle, 1080 pfn_t base, 1081 pgcnt_t npgs) 1082 { 1083 struct mem_handle *mhp; 1084 struct memseg *seg; 1085 struct memdelspan *mdsp; 1086 struct memdelspan *mdsp_new; 1087 pgcnt_t phys_pages, vm_pages; 1088 pfn_t p_end; 1089 page_t *pp; 1090 int ret; 1091 1092 mhp = kphysm_lookup_mem_handle(handle); 1093 if (mhp == NULL) { 1094 return (KPHYSM_EHANDLE); 1095 } 1096 if (mhp->mh_state != MHND_INIT) { 1097 mutex_exit(&mhp->mh_mutex); 1098 return (KPHYSM_ESEQUENCE); 1099 } 1100 1101 /* 1102 * Intersect the span with the installed memory list (phys_install). 1103 */ 1104 mdsp_new = span_to_install(base, npgs); 1105 if (mdsp_new == NULL) { 1106 /* 1107 * No physical memory in this range. Is this an 1108 * error? If an attempt to start the delete is made 1109 * for OK returns from del_span such as this, start will 1110 * return an error. 1111 * Could return KPHYSM_ENOWORK. 1112 */ 1113 /* 1114 * It is assumed that there are no error returns 1115 * from span_to_install() due to kmem_alloc failure. 1116 */ 1117 mutex_exit(&mhp->mh_mutex); 1118 return (KPHYSM_OK); 1119 } 1120 /* 1121 * Does this span overlap an existing span? 1122 */ 1123 if (delspan_insert(&mhp->mh_transit, mdsp_new) == 0) { 1124 /* 1125 * Differentiate between already on list for this handle 1126 * (KPHYSM_EDUP) and busy elsewhere (KPHYSM_EBUSY). 1127 */ 1128 ret = KPHYSM_EBUSY; 1129 for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL; 1130 mdsp = mdsp->mds_next) { 1131 if (overlapping(mdsp->mds_base, mdsp->mds_npgs, 1132 base, npgs)) { 1133 ret = KPHYSM_EDUP; 1134 break; 1135 } 1136 } 1137 mutex_exit(&mhp->mh_mutex); 1138 free_delspans(mdsp_new); 1139 return (ret); 1140 } 1141 /* 1142 * At this point the spans in mdsp_new have been inserted into the 1143 * list of spans for this handle and thereby to the global list of 1144 * spans being processed. Each of these spans must now be checked 1145 * for relocatability. As a side-effect segments in the memseg list 1146 * may be split. 1147 * 1148 * Note that mdsp_new can no longer be used as it is now part of 1149 * a larger list. Select elements of this larger list based 1150 * on base and npgs. 1151 */ 1152 restart: 1153 phys_pages = 0; 1154 vm_pages = 0; 1155 ret = KPHYSM_OK; 1156 for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL; 1157 mdsp = mdsp->mds_next) { 1158 pgcnt_t pages_checked; 1159 1160 if (!overlapping(mdsp->mds_base, mdsp->mds_npgs, base, npgs)) { 1161 continue; 1162 } 1163 p_end = mdsp->mds_base + mdsp->mds_npgs; 1164 /* 1165 * The pages_checked count is a hack. All pages should be 1166 * checked for relocatability. Those not covered by memsegs 1167 * should be tested with arch_kphysm_del_span_ok(). 1168 */ 1169 pages_checked = 0; 1170 for (seg = memsegs; seg; seg = seg->next) { 1171 pfn_t mseg_start; 1172 1173 if (seg->pages_base >= p_end || 1174 seg->pages_end <= mdsp->mds_base) { 1175 /* Span and memseg don't overlap. */ 1176 continue; 1177 } 1178 mseg_start = memseg_get_start(seg); 1179 /* Check that segment is suitable for delete. */ 1180 if (memseg_includes_meta(seg)) { 1181 /* 1182 * Check that this segment is completely 1183 * within the span. 
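				 * A memseg that carries its own page_t
				 * metadata (MEMSEG_META_INCL) cannot be
				 * split, since the metadata pages at
				 * mseg_start describe the whole segment,
				 * so it is only accepted when the delete
				 * span covers it all the way from
				 * mseg_start to pages_end.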
1184 */ 1185 if (mseg_start < mdsp->mds_base || 1186 seg->pages_end > p_end) { 1187 ret = KPHYSM_EBUSY; 1188 break; 1189 } 1190 pages_checked += seg->pages_end - mseg_start; 1191 } else { 1192 /* 1193 * If this segment is larger than the span, 1194 * try to split it. After the split, it 1195 * is necessary to restart. 1196 */ 1197 if (seg->pages_base < mdsp->mds_base || 1198 seg->pages_end > p_end) { 1199 pfn_t abase; 1200 pgcnt_t anpgs; 1201 int s_ret; 1202 1203 /* Split required. */ 1204 if (mdsp->mds_base < seg->pages_base) 1205 abase = seg->pages_base; 1206 else 1207 abase = mdsp->mds_base; 1208 if (p_end > seg->pages_end) 1209 anpgs = seg->pages_end - abase; 1210 else 1211 anpgs = p_end - abase; 1212 s_ret = kphysm_split_memseg(abase, 1213 anpgs); 1214 if (s_ret == 0) { 1215 /* Split failed. */ 1216 ret = KPHYSM_ERESOURCE; 1217 break; 1218 } 1219 goto restart; 1220 } 1221 pages_checked += 1222 seg->pages_end - seg->pages_base; 1223 } 1224 /* 1225 * The memseg is wholly within the delete span. 1226 * The individual pages can now be checked. 1227 */ 1228 /* Cage test. */ 1229 for (pp = seg->pages; pp < seg->epages; pp++) { 1230 if (PP_ISNORELOC(pp)) { 1231 ret = KPHYSM_ENONRELOC; 1232 break; 1233 } 1234 } 1235 if (ret != KPHYSM_OK) { 1236 break; 1237 } 1238 phys_pages += (seg->pages_end - mseg_start); 1239 vm_pages += MSEG_NPAGES(seg); 1240 } 1241 if (ret != KPHYSM_OK) 1242 break; 1243 if (pages_checked != mdsp->mds_npgs) { 1244 ret = KPHYSM_ENONRELOC; 1245 break; 1246 } 1247 } 1248 1249 if (ret == KPHYSM_OK) { 1250 mhp->mh_phys_pages += phys_pages; 1251 mhp->mh_vm_pages += vm_pages; 1252 } else { 1253 /* 1254 * Keep holding the mh_mutex to prevent it going away. 1255 */ 1256 delspan_remove(&mhp->mh_transit, base, npgs); 1257 } 1258 mutex_exit(&mhp->mh_mutex); 1259 return (ret); 1260 } 1261 1262 int 1263 kphysm_del_span_query( 1264 pfn_t base, 1265 pgcnt_t npgs, 1266 memquery_t *mqp) 1267 { 1268 struct memdelspan *mdsp; 1269 struct memdelspan *mdsp_new; 1270 int done_first_nonreloc; 1271 1272 mqp->phys_pages = 0; 1273 mqp->managed = 0; 1274 mqp->nonrelocatable = 0; 1275 mqp->first_nonrelocatable = 0; 1276 mqp->last_nonrelocatable = 0; 1277 1278 mdsp_new = span_to_install(base, npgs); 1279 /* 1280 * It is OK to proceed here if mdsp_new == NULL. 1281 */ 1282 done_first_nonreloc = 0; 1283 for (mdsp = mdsp_new; mdsp != NULL; mdsp = mdsp->mds_next) { 1284 pfn_t sbase; 1285 pgcnt_t snpgs; 1286 1287 mqp->phys_pages += mdsp->mds_npgs; 1288 sbase = mdsp->mds_base; 1289 snpgs = mdsp->mds_npgs; 1290 while (snpgs != 0) { 1291 struct memseg *lseg, *seg; 1292 pfn_t p_end; 1293 page_t *pp; 1294 pfn_t mseg_start; 1295 1296 p_end = sbase + snpgs; 1297 /* 1298 * Find the lowest addressed memseg that starts 1299 * after sbase and account for it. 1300 * This is to catch dynamic memsegs whose start 1301 * is hidden. 1302 */ 1303 seg = NULL; 1304 for (lseg = memsegs; lseg != NULL; lseg = lseg->next) { 1305 if ((lseg->pages_base >= sbase) || 1306 (lseg->pages_base < p_end && 1307 lseg->pages_end > sbase)) { 1308 if (seg == NULL || 1309 seg->pages_base > lseg->pages_base) 1310 seg = lseg; 1311 } 1312 } 1313 if (seg != NULL) { 1314 mseg_start = memseg_get_start(seg); 1315 /* 1316 * Now have the full extent of the memseg so 1317 * do the range check. 1318 */ 1319 if (mseg_start >= p_end || 1320 seg->pages_end <= sbase) { 1321 /* Span does not overlap memseg. */ 1322 seg = NULL; 1323 } 1324 } 1325 /* 1326 * Account for gap either before the segment if 1327 * there is one or to the end of the span. 
			 */
			if (seg == NULL || mseg_start > sbase) {
				pfn_t a_end;

				a_end = (seg == NULL) ? p_end : mseg_start;
				/*
				 * Check with arch layer for relocatability.
				 */
				if (arch_kphysm_del_span_ok(sbase,
				    (a_end - sbase))) {
					/*
					 * No non-relocatable pages in this
					 * area, avoid the fine-grained
					 * test.
					 */
					snpgs -= (a_end - sbase);
					sbase = a_end;
				}
				while (sbase < a_end) {
					if (!arch_kphysm_del_span_ok(sbase,
					    1)) {
						mqp->nonrelocatable++;
						if (!done_first_nonreloc) {
							mqp->
							    first_nonrelocatable
							    = sbase;
							done_first_nonreloc = 1;
						}
						mqp->last_nonrelocatable =
						    sbase;
					}
					sbase++;
					snpgs--;
				}
			}
			if (seg != NULL) {
				ASSERT(mseg_start <= sbase);
				if (seg->pages_base != mseg_start &&
				    seg->pages_base > sbase) {
					pgcnt_t skip_pgs;

					/*
					 * Skip the page_t area of a
					 * dynamic memseg.
					 */
					skip_pgs = seg->pages_base - sbase;
					if (snpgs <= skip_pgs) {
						sbase += snpgs;
						snpgs = 0;
						continue;
					}
					snpgs -= skip_pgs;
					sbase += skip_pgs;
				}
				ASSERT(snpgs != 0);
				ASSERT(seg->pages_base <= sbase);
				/*
				 * The individual pages can now be checked.
				 */
				for (pp = seg->pages +
				    (sbase - seg->pages_base);
				    snpgs != 0 && pp < seg->epages; pp++) {
					mqp->managed++;
					if (PP_ISNORELOC(pp)) {
						mqp->nonrelocatable++;
						if (!done_first_nonreloc) {
							mqp->
							    first_nonrelocatable
							    = sbase;
							done_first_nonreloc = 1;
						}
						mqp->last_nonrelocatable =
						    sbase;
					}
					sbase++;
					snpgs--;
				}
			}
		}
	}

	free_delspans(mdsp_new);

	return (KPHYSM_OK);
}

/*
 * This release function can be called at any stage as follows:
 *	_gethandle only called
 *	_span(s) only called
 *	_start called but failed
 *	delete thread exited
 */
int
kphysm_del_release(memhandle_t handle)
{
	struct mem_handle *mhp;

	mhp = kphysm_lookup_mem_handle(handle);
	if (mhp == NULL) {
		return (KPHYSM_EHANDLE);
	}
	switch (mhp->mh_state) {
	case MHND_STARTING:
	case MHND_RUNNING:
		mutex_exit(&mhp->mh_mutex);
		return (KPHYSM_ENOTFINISHED);
	case MHND_FREE:
		ASSERT(mhp->mh_state != MHND_FREE);
		mutex_exit(&mhp->mh_mutex);
		return (KPHYSM_EHANDLE);
	case MHND_INIT:
		break;
	case MHND_DONE:
		break;
	case MHND_RELEASE:
		mutex_exit(&mhp->mh_mutex);
		return (KPHYSM_ESEQUENCE);
	default:
#ifdef DEBUG
		cmn_err(CE_WARN, "kphysm_del_release(0x%p) state corrupt %d",
		    (void *)mhp, mhp->mh_state);
#endif /* DEBUG */
		mutex_exit(&mhp->mh_mutex);
		return (KPHYSM_EHANDLE);
	}
	/*
	 * Set state so that we can wait if necessary.
	 * Also this means that we have read/write access to all
	 * fields except mh_exthandle and mh_state.
	 */
	mhp->mh_state = MHND_RELEASE;
	/*
	 * The mem_handle cannot be de-allocated by any other operation
	 * now, so no need to hold mh_mutex.
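	 * Every other entry point examines mh_state under mh_mutex and
	 * backs out once it sees MHND_RELEASE, and
	 * kphysm_lookup_mem_handle() keeps returning the handle until it
	 * finally reaches MHND_FREE below, so nothing can race with the
	 * teardown that follows.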
1463 */ 1464 mutex_exit(&mhp->mh_mutex); 1465 1466 delspan_remove(&mhp->mh_transit, 0, 0); 1467 mhp->mh_phys_pages = 0; 1468 mhp->mh_vm_pages = 0; 1469 mhp->mh_hold_todo = 0; 1470 mhp->mh_delete_complete = NULL; 1471 mhp->mh_delete_complete_arg = NULL; 1472 mhp->mh_cancel = 0; 1473 1474 mutex_enter(&mhp->mh_mutex); 1475 ASSERT(mhp->mh_state == MHND_RELEASE); 1476 mhp->mh_state = MHND_FREE; 1477 1478 kphysm_free_mem_handle(mhp); 1479 1480 return (KPHYSM_OK); 1481 } 1482 1483 /* 1484 * This cancel function can only be called with the thread running. 1485 */ 1486 int 1487 kphysm_del_cancel(memhandle_t handle) 1488 { 1489 struct mem_handle *mhp; 1490 1491 mhp = kphysm_lookup_mem_handle(handle); 1492 if (mhp == NULL) { 1493 return (KPHYSM_EHANDLE); 1494 } 1495 if (mhp->mh_state != MHND_STARTING && mhp->mh_state != MHND_RUNNING) { 1496 mutex_exit(&mhp->mh_mutex); 1497 return (KPHYSM_ENOTRUNNING); 1498 } 1499 /* 1500 * Set the cancel flag and wake the delete thread up. 1501 * The thread may be waiting on I/O, so the effect of the cancel 1502 * may be delayed. 1503 */ 1504 if (mhp->mh_cancel == 0) { 1505 mhp->mh_cancel = KPHYSM_ECANCELLED; 1506 cv_signal(&mhp->mh_cv); 1507 } 1508 mutex_exit(&mhp->mh_mutex); 1509 return (KPHYSM_OK); 1510 } 1511 1512 int 1513 kphysm_del_status( 1514 memhandle_t handle, 1515 memdelstat_t *mdstp) 1516 { 1517 struct mem_handle *mhp; 1518 1519 mhp = kphysm_lookup_mem_handle(handle); 1520 if (mhp == NULL) { 1521 return (KPHYSM_EHANDLE); 1522 } 1523 /* 1524 * Calling kphysm_del_status() is allowed before the delete 1525 * is started to allow for status display. 1526 */ 1527 if (mhp->mh_state != MHND_INIT && mhp->mh_state != MHND_STARTING && 1528 mhp->mh_state != MHND_RUNNING) { 1529 mutex_exit(&mhp->mh_mutex); 1530 return (KPHYSM_ENOTRUNNING); 1531 } 1532 mdstp->phys_pages = mhp->mh_phys_pages; 1533 mdstp->managed = mhp->mh_vm_pages; 1534 mdstp->collected = mhp->mh_vm_pages - mhp->mh_hold_todo; 1535 mutex_exit(&mhp->mh_mutex); 1536 return (KPHYSM_OK); 1537 } 1538 1539 static int mem_delete_additional_pages = 100; 1540 1541 static int 1542 can_remove_pgs(pgcnt_t npgs) 1543 { 1544 /* 1545 * If all pageable pages were paged out, freemem would 1546 * equal availrmem. There is a minimum requirement for 1547 * availrmem. 1548 */ 1549 if ((availrmem - (tune.t_minarmem + mem_delete_additional_pages)) 1550 < npgs) 1551 return (0); 1552 /* TODO: check swap space, etc. */ 1553 return (1); 1554 } 1555 1556 static int 1557 get_availrmem(pgcnt_t npgs) 1558 { 1559 int ret; 1560 1561 mutex_enter(&freemem_lock); 1562 ret = can_remove_pgs(npgs); 1563 if (ret != 0) 1564 availrmem -= npgs; 1565 mutex_exit(&freemem_lock); 1566 return (ret); 1567 } 1568 1569 static void 1570 put_availrmem(pgcnt_t npgs) 1571 { 1572 mutex_enter(&freemem_lock); 1573 availrmem += npgs; 1574 mutex_exit(&freemem_lock); 1575 } 1576 1577 #define FREEMEM_INCR 100 1578 static pgcnt_t freemem_incr = FREEMEM_INCR; 1579 #define DEL_FREE_WAIT_FRAC 4 1580 #define DEL_FREE_WAIT_TICKS ((hz+DEL_FREE_WAIT_FRAC-1)/DEL_FREE_WAIT_FRAC) 1581 1582 #define DEL_BUSY_WAIT_FRAC 20 1583 #define DEL_BUSY_WAIT_TICKS ((hz+DEL_BUSY_WAIT_FRAC-1)/DEL_BUSY_WAIT_FRAC) 1584 1585 static void kphysm_del_cleanup(struct mem_handle *); 1586 1587 static void page_delete_collect(page_t *, struct mem_handle *); 1588 1589 static pgcnt_t 1590 delthr_get_freemem(struct mem_handle *mhp) 1591 { 1592 pgcnt_t free_get; 1593 int ret; 1594 1595 ASSERT(MUTEX_HELD(&mhp->mh_mutex)); 1596 1597 MDSTAT_INCR(mhp, need_free); 1598 /* 1599 * Get up to freemem_incr pages. 
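	 * freemem_incr (100 pages by default) caps how much is claimed
	 * from freemem per call so that the delete thread never takes
	 * more at a time than it is about to consume, and the amount is
	 * clamped further to mh_hold_todo, the number of pages still to
	 * be collected.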
1600 */ 1601 free_get = freemem_incr; 1602 if (free_get > mhp->mh_hold_todo) 1603 free_get = mhp->mh_hold_todo; 1604 /* 1605 * Take free_get pages away from freemem, 1606 * waiting if necessary. 1607 */ 1608 1609 while (!mhp->mh_cancel) { 1610 mutex_exit(&mhp->mh_mutex); 1611 MDSTAT_INCR(mhp, free_loop); 1612 /* 1613 * Duplicate test from page_create_throttle() 1614 * but don't override with !PG_WAIT. 1615 */ 1616 if (freemem < (free_get + throttlefree)) { 1617 MDSTAT_INCR(mhp, free_low); 1618 ret = 0; 1619 } else { 1620 ret = page_create_wait(free_get, 0); 1621 if (ret == 0) { 1622 /* EMPTY */ 1623 MDSTAT_INCR(mhp, free_failed); 1624 } 1625 } 1626 if (ret != 0) { 1627 mutex_enter(&mhp->mh_mutex); 1628 return (free_get); 1629 } 1630 1631 /* 1632 * Put pressure on pageout. 1633 */ 1634 page_needfree(free_get); 1635 cv_signal(&proc_pageout->p_cv); 1636 1637 mutex_enter(&mhp->mh_mutex); 1638 (void) cv_reltimedwait(&mhp->mh_cv, &mhp->mh_mutex, 1639 DEL_FREE_WAIT_TICKS, TR_CLOCK_TICK); 1640 mutex_exit(&mhp->mh_mutex); 1641 page_needfree(-(spgcnt_t)free_get); 1642 1643 mutex_enter(&mhp->mh_mutex); 1644 } 1645 return (0); 1646 } 1647 1648 #define DR_AIO_CLEANUP_DELAY 25000 /* 0.025secs, in usec */ 1649 #define DR_AIO_CLEANUP_MAXLOOPS_NODELAY 100 1650 /* 1651 * This function is run as a helper thread for delete_memory_thread. 1652 * It is needed in order to force kaio cleanup, so that pages used in kaio 1653 * will be unlocked and subsequently relocated by delete_memory_thread. 1654 * The address of the delete_memory_threads's mem_handle is passed in to 1655 * this thread function, and is used to set the mh_aio_cleanup_done member 1656 * prior to calling thread_exit(). 1657 */ 1658 static void 1659 dr_aio_cleanup_thread(caddr_t amhp) 1660 { 1661 proc_t *procp; 1662 int (*aio_cleanup_dr_delete_memory)(proc_t *); 1663 int cleaned; 1664 int n = 0; 1665 struct mem_handle *mhp; 1666 volatile uint_t *pcancel; 1667 1668 mhp = (struct mem_handle *)amhp; 1669 ASSERT(mhp != NULL); 1670 pcancel = &mhp->mh_dr_aio_cleanup_cancel; 1671 if (modload("sys", "kaio") == -1) { 1672 mhp->mh_aio_cleanup_done = 1; 1673 cmn_err(CE_WARN, "dr_aio_cleanup_thread: cannot load kaio"); 1674 thread_exit(); 1675 } 1676 aio_cleanup_dr_delete_memory = (int (*)(proc_t *)) 1677 modgetsymvalue("aio_cleanup_dr_delete_memory", 0); 1678 if (aio_cleanup_dr_delete_memory == NULL) { 1679 mhp->mh_aio_cleanup_done = 1; 1680 cmn_err(CE_WARN, 1681 "aio_cleanup_dr_delete_memory not found in kaio"); 1682 thread_exit(); 1683 } 1684 do { 1685 cleaned = 0; 1686 mutex_enter(&pidlock); 1687 for (procp = practive; (*pcancel == 0) && (procp != NULL); 1688 procp = procp->p_next) { 1689 mutex_enter(&procp->p_lock); 1690 if (procp->p_aio != NULL) { 1691 /* cleanup proc's outstanding kaio */ 1692 cleaned += 1693 (*aio_cleanup_dr_delete_memory)(procp); 1694 } 1695 mutex_exit(&procp->p_lock); 1696 } 1697 mutex_exit(&pidlock); 1698 if ((*pcancel == 0) && 1699 (!cleaned || (++n == DR_AIO_CLEANUP_MAXLOOPS_NODELAY))) { 1700 /* delay a bit before retrying all procs again */ 1701 delay(drv_usectohz(DR_AIO_CLEANUP_DELAY)); 1702 n = 0; 1703 } 1704 } while (*pcancel == 0); 1705 mhp->mh_aio_cleanup_done = 1; 1706 thread_exit(); 1707 } 1708 1709 static void 1710 delete_memory_thread(caddr_t amhp) 1711 { 1712 struct mem_handle *mhp; 1713 struct memdelspan *mdsp; 1714 callb_cpr_t cprinfo; 1715 page_t *pp_targ; 1716 spgcnt_t freemem_left; 1717 void (*del_complete_funcp)(void *, int error); 1718 void *del_complete_arg; 1719 int comp_code; 1720 int ret; 1721 int first_scan; 
1722 uint_t szc; 1723 #ifdef MEM_DEL_STATS 1724 uint64_t start_total, ntick_total; 1725 uint64_t start_pgrp, ntick_pgrp; 1726 #endif /* MEM_DEL_STATS */ 1727 1728 mhp = (struct mem_handle *)amhp; 1729 1730 #ifdef MEM_DEL_STATS 1731 start_total = ddi_get_lbolt(); 1732 #endif /* MEM_DEL_STATS */ 1733 1734 CALLB_CPR_INIT(&cprinfo, &mhp->mh_mutex, 1735 callb_generic_cpr, "memdel"); 1736 1737 mutex_enter(&mhp->mh_mutex); 1738 ASSERT(mhp->mh_state == MHND_STARTING); 1739 1740 mhp->mh_state = MHND_RUNNING; 1741 mhp->mh_thread_id = curthread; 1742 1743 mhp->mh_hold_todo = mhp->mh_vm_pages; 1744 mutex_exit(&mhp->mh_mutex); 1745 1746 /* Allocate the remap pages now, if necessary. */ 1747 memseg_remap_init(); 1748 1749 /* 1750 * Subtract from availrmem now if possible as availrmem 1751 * may not be available by the end of the delete. 1752 */ 1753 if (!get_availrmem(mhp->mh_vm_pages)) { 1754 comp_code = KPHYSM_ENOTVIABLE; 1755 mutex_enter(&mhp->mh_mutex); 1756 goto early_exit; 1757 } 1758 1759 ret = kphysm_setup_pre_del(mhp->mh_vm_pages); 1760 1761 mutex_enter(&mhp->mh_mutex); 1762 1763 if (ret != 0) { 1764 mhp->mh_cancel = KPHYSM_EREFUSED; 1765 goto refused; 1766 } 1767 1768 transit_list_collect(mhp, 1); 1769 1770 for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL; 1771 mdsp = mdsp->mds_next) { 1772 ASSERT(mdsp->mds_bitmap == NULL); 1773 mdsp->mds_bitmap = kmem_zalloc(MDS_BITMAPBYTES(mdsp), KM_SLEEP); 1774 mdsp->mds_bitmap_retired = kmem_zalloc(MDS_BITMAPBYTES(mdsp), 1775 KM_SLEEP); 1776 } 1777 1778 first_scan = 1; 1779 freemem_left = 0; 1780 /* 1781 * Start dr_aio_cleanup_thread, which periodically iterates 1782 * through the process list and invokes aio cleanup. This 1783 * is needed in order to avoid a deadly embrace between the 1784 * delete_memory_thread (waiting on writer lock for page, with the 1785 * exclusive-wanted bit set), kaio read request threads (waiting for a 1786 * reader lock on the same page that is wanted by the 1787 * delete_memory_thread), and threads waiting for kaio completion 1788 * (blocked on spt_amp->lock). 1789 */ 1790 mhp->mh_dr_aio_cleanup_cancel = 0; 1791 mhp->mh_aio_cleanup_done = 0; 1792 (void) thread_create(NULL, 0, dr_aio_cleanup_thread, 1793 (caddr_t)mhp, 0, &p0, TS_RUN, maxclsyspri - 1); 1794 while ((mhp->mh_hold_todo != 0) && (mhp->mh_cancel == 0)) { 1795 pgcnt_t collected; 1796 1797 MDSTAT_INCR(mhp, nloop); 1798 collected = 0; 1799 for (mdsp = mhp->mh_transit.trl_spans; (mdsp != NULL) && 1800 (mhp->mh_cancel == 0); mdsp = mdsp->mds_next) { 1801 pfn_t pfn, p_end; 1802 1803 p_end = mdsp->mds_base + mdsp->mds_npgs; 1804 for (pfn = mdsp->mds_base; (pfn < p_end) && 1805 (mhp->mh_cancel == 0); pfn++) { 1806 page_t *pp, *tpp, *tpp_targ; 1807 pgcnt_t bit; 1808 struct vnode *vp; 1809 u_offset_t offset; 1810 int mod, result; 1811 spgcnt_t pgcnt; 1812 1813 bit = pfn - mdsp->mds_base; 1814 if ((mdsp->mds_bitmap[bit / NBPBMW] & 1815 (1 << (bit % NBPBMW))) != 0) { 1816 MDSTAT_INCR(mhp, already_done); 1817 continue; 1818 } 1819 if (freemem_left == 0) { 1820 freemem_left += delthr_get_freemem(mhp); 1821 if (freemem_left == 0) 1822 break; 1823 } 1824 1825 /* 1826 * Release mh_mutex - some of this 1827 * stuff takes some time (eg PUTPAGE). 1828 */ 1829 1830 mutex_exit(&mhp->mh_mutex); 1831 MDSTAT_INCR(mhp, ncheck); 1832 1833 pp = page_numtopp_nolock(pfn); 1834 if (pp == NULL) { 1835 /* 1836 * Not covered by a page_t - will 1837 * be dealt with elsewhere. 
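					 * (For a dynamically added
					 * memseg these are typically
					 * the pages holding the page_t
					 * metadata itself, which are
					 * torn down with the memseg
					 * rather than collected here,
					 * so they are simply marked
					 * done in the bitmap.)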
1838 */ 1839 MDSTAT_INCR(mhp, nopaget); 1840 mutex_enter(&mhp->mh_mutex); 1841 mdsp->mds_bitmap[bit / NBPBMW] |= 1842 (1 << (bit % NBPBMW)); 1843 continue; 1844 } 1845 1846 if (!page_try_reclaim_lock(pp, SE_EXCL, 1847 SE_EXCL_WANTED | SE_RETIRED)) { 1848 /* 1849 * Page in use elsewhere. Skip it. 1850 */ 1851 MDSTAT_INCR(mhp, lockfail); 1852 mutex_enter(&mhp->mh_mutex); 1853 continue; 1854 } 1855 /* 1856 * See if the cage expanded into the delete. 1857 * This can happen as we have to allow the 1858 * cage to expand. 1859 */ 1860 if (PP_ISNORELOC(pp)) { 1861 page_unlock(pp); 1862 mutex_enter(&mhp->mh_mutex); 1863 mhp->mh_cancel = KPHYSM_ENONRELOC; 1864 break; 1865 } 1866 if (PP_RETIRED(pp)) { 1867 /* 1868 * Page has been retired and is 1869 * not part of the cage so we 1870 * can now do the accounting for 1871 * it. 1872 */ 1873 MDSTAT_INCR(mhp, retired); 1874 mutex_enter(&mhp->mh_mutex); 1875 mdsp->mds_bitmap[bit / NBPBMW] 1876 |= (1 << (bit % NBPBMW)); 1877 mdsp->mds_bitmap_retired[bit / 1878 NBPBMW] |= 1879 (1 << (bit % NBPBMW)); 1880 mhp->mh_hold_todo--; 1881 continue; 1882 } 1883 ASSERT(freemem_left != 0); 1884 if (PP_ISFREE(pp)) { 1885 /* 1886 * Like page_reclaim() only 'freemem' 1887 * processing is already done. 1888 */ 1889 MDSTAT_INCR(mhp, nfree); 1890 free_page_collect: 1891 if (PP_ISAGED(pp)) { 1892 page_list_sub(pp, 1893 PG_FREE_LIST); 1894 } else { 1895 page_list_sub(pp, 1896 PG_CACHE_LIST); 1897 } 1898 PP_CLRFREE(pp); 1899 PP_CLRAGED(pp); 1900 collected++; 1901 mutex_enter(&mhp->mh_mutex); 1902 page_delete_collect(pp, mhp); 1903 mdsp->mds_bitmap[bit / NBPBMW] |= 1904 (1 << (bit % NBPBMW)); 1905 freemem_left--; 1906 continue; 1907 } 1908 ASSERT(pp->p_vnode != NULL); 1909 if (first_scan) { 1910 MDSTAT_INCR(mhp, first_notfree); 1911 page_unlock(pp); 1912 mutex_enter(&mhp->mh_mutex); 1913 continue; 1914 } 1915 /* 1916 * Keep stats on pages encountered that 1917 * are marked for retirement. 1918 */ 1919 if (PP_TOXIC(pp)) { 1920 MDSTAT_INCR(mhp, toxic); 1921 } else if (PP_PR_REQ(pp)) { 1922 MDSTAT_INCR(mhp, failing); 1923 } 1924 /* 1925 * In certain cases below, special exceptions 1926 * are made for pages that are toxic. This 1927 * is because the current meaning of toxic 1928 * is that an uncorrectable error has been 1929 * previously associated with the page. 1930 */ 1931 if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) { 1932 if (!PP_TOXIC(pp)) { 1933 /* 1934 * Must relocate locked in 1935 * memory pages. 1936 */ 1937 #ifdef MEM_DEL_STATS 1938 start_pgrp = ddi_get_lbolt(); 1939 #endif /* MEM_DEL_STATS */ 1940 /* 1941 * Lock all constituent pages 1942 * of a large page to ensure 1943 * that p_szc won't change. 
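						 * The replacement page
						 * and page_relocate()
						 * below are sized from
						 * p_szc, so the large
						 * page must not be
						 * demoted while this is
						 * in progress; a trylock
						 * failure simply skips
						 * the page for this pass
						 * rather than waiting
						 * while pp is held
						 * SE_EXCL.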
1944 */ 1945 if (!group_page_trylock(pp, 1946 SE_EXCL)) { 1947 MDSTAT_INCR(mhp, 1948 gptllckfail); 1949 page_unlock(pp); 1950 mutex_enter( 1951 &mhp->mh_mutex); 1952 continue; 1953 } 1954 MDSTAT_INCR(mhp, npplocked); 1955 pp_targ = 1956 page_get_replacement_page( 1957 pp, NULL, 0); 1958 if (pp_targ != NULL) { 1959 #ifdef MEM_DEL_STATS 1960 ntick_pgrp = 1961 (uint64_t) 1962 ddi_get_lbolt() - 1963 start_pgrp; 1964 #endif /* MEM_DEL_STATS */ 1965 MDSTAT_PGRP(mhp, 1966 ntick_pgrp); 1967 MDSTAT_INCR(mhp, 1968 nlockreloc); 1969 goto reloc; 1970 } 1971 group_page_unlock(pp); 1972 page_unlock(pp); 1973 #ifdef MEM_DEL_STATS 1974 ntick_pgrp = 1975 (uint64_t)ddi_get_lbolt() - 1976 start_pgrp; 1977 #endif /* MEM_DEL_STATS */ 1978 MDSTAT_PGRP(mhp, ntick_pgrp); 1979 MDSTAT_INCR(mhp, nnorepl); 1980 mutex_enter(&mhp->mh_mutex); 1981 continue; 1982 } else { 1983 /* 1984 * Cannot do anything about 1985 * this page because it is 1986 * toxic. 1987 */ 1988 MDSTAT_INCR(mhp, npplkdtoxic); 1989 page_unlock(pp); 1990 mutex_enter(&mhp->mh_mutex); 1991 continue; 1992 } 1993 } 1994 /* 1995 * Unload the mappings and check if mod bit 1996 * is set. 1997 */ 1998 ASSERT(!PP_ISKAS(pp)); 1999 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); 2000 mod = hat_ismod(pp); 2001 2002 #ifdef MEM_DEL_STATS 2003 start_pgrp = ddi_get_lbolt(); 2004 #endif /* MEM_DEL_STATS */ 2005 if (mod && !PP_TOXIC(pp)) { 2006 /* 2007 * Lock all constituent pages 2008 * of a large page to ensure 2009 * that p_szc won't change. 2010 */ 2011 if (!group_page_trylock(pp, SE_EXCL)) { 2012 MDSTAT_INCR(mhp, gptlmodfail); 2013 page_unlock(pp); 2014 mutex_enter(&mhp->mh_mutex); 2015 continue; 2016 } 2017 pp_targ = page_get_replacement_page(pp, 2018 NULL, 0); 2019 if (pp_targ != NULL) { 2020 MDSTAT_INCR(mhp, nmodreloc); 2021 #ifdef MEM_DEL_STATS 2022 ntick_pgrp = 2023 (uint64_t)ddi_get_lbolt() - 2024 start_pgrp; 2025 #endif /* MEM_DEL_STATS */ 2026 MDSTAT_PGRP(mhp, ntick_pgrp); 2027 goto reloc; 2028 } 2029 group_page_unlock(pp); 2030 } 2031 2032 if (!page_try_demote_pages(pp)) { 2033 MDSTAT_INCR(mhp, demotefail); 2034 page_unlock(pp); 2035 #ifdef MEM_DEL_STATS 2036 ntick_pgrp = (uint64_t)ddi_get_lbolt() - 2037 start_pgrp; 2038 #endif /* MEM_DEL_STATS */ 2039 MDSTAT_PGRP(mhp, ntick_pgrp); 2040 mutex_enter(&mhp->mh_mutex); 2041 continue; 2042 } 2043 2044 /* 2045 * Regular 'page-out'. 2046 */ 2047 if (!mod) { 2048 MDSTAT_INCR(mhp, ndestroy); 2049 page_destroy(pp, 1); 2050 /* 2051 * page_destroy was called with 2052 * dontfree. As long as p_lckcnt 2053 * and p_cowcnt are both zero, the 2054 * only additional action of 2055 * page_destroy with !dontfree is to 2056 * call page_free, so we can collect 2057 * the page here. 2058 */ 2059 collected++; 2060 #ifdef MEM_DEL_STATS 2061 ntick_pgrp = (uint64_t)ddi_get_lbolt() - 2062 start_pgrp; 2063 #endif /* MEM_DEL_STATS */ 2064 MDSTAT_PGRP(mhp, ntick_pgrp); 2065 mutex_enter(&mhp->mh_mutex); 2066 page_delete_collect(pp, mhp); 2067 mdsp->mds_bitmap[bit / NBPBMW] |= 2068 (1 << (bit % NBPBMW)); 2069 continue; 2070 } 2071 /* 2072 * The page is toxic and the mod bit is 2073 * set, we cannot do anything here to deal 2074 * with it. 
2075 */ 2076 if (PP_TOXIC(pp)) { 2077 page_unlock(pp); 2078 #ifdef MEM_DEL_STATS 2079 ntick_pgrp = (uint64_t)ddi_get_lbolt() - 2080 start_pgrp; 2081 #endif /* MEM_DEL_STATS */ 2082 MDSTAT_PGRP(mhp, ntick_pgrp); 2083 MDSTAT_INCR(mhp, modtoxic); 2084 mutex_enter(&mhp->mh_mutex); 2085 continue; 2086 } 2087 MDSTAT_INCR(mhp, nputpage); 2088 vp = pp->p_vnode; 2089 offset = pp->p_offset; 2090 VN_HOLD(vp); 2091 page_unlock(pp); 2092 (void) VOP_PUTPAGE(vp, offset, PAGESIZE, 2093 B_INVAL|B_FORCE, kcred, NULL); 2094 VN_RELE(vp); 2095 #ifdef MEM_DEL_STATS 2096 ntick_pgrp = (uint64_t)ddi_get_lbolt() - 2097 start_pgrp; 2098 #endif /* MEM_DEL_STATS */ 2099 MDSTAT_PGRP(mhp, ntick_pgrp); 2100 /* 2101 * Try to get the page back immediately 2102 * so that it can be collected. 2103 */ 2104 pp = page_numtopp_nolock(pfn); 2105 if (pp == NULL) { 2106 MDSTAT_INCR(mhp, nnoreclaim); 2107 /* 2108 * This should not happen as this 2109 * thread is deleting the page. 2110 * If this code is generalized, this 2111 * becomes a reality. 2112 */ 2113 #ifdef DEBUG 2114 cmn_err(CE_WARN, 2115 "delete_memory_thread(0x%p) " 2116 "pfn 0x%lx has no page_t", 2117 (void *)mhp, pfn); 2118 #endif /* DEBUG */ 2119 mutex_enter(&mhp->mh_mutex); 2120 continue; 2121 } 2122 if (page_try_reclaim_lock(pp, SE_EXCL, 2123 SE_EXCL_WANTED | SE_RETIRED)) { 2124 if (PP_ISFREE(pp)) { 2125 goto free_page_collect; 2126 } 2127 page_unlock(pp); 2128 } 2129 MDSTAT_INCR(mhp, nnoreclaim); 2130 mutex_enter(&mhp->mh_mutex); 2131 continue; 2132 2133 reloc: 2134 /* 2135 * Got some freemem and a target 2136 * page, so move the data to avoid 2137 * I/O and lock problems. 2138 */ 2139 ASSERT(!page_iolock_assert(pp)); 2140 MDSTAT_INCR(mhp, nreloc); 2141 /* 2142 * page_relocate() will return pgcnt: the 2143 * number of consecutive pages relocated. 2144 * If it is successful, pp will be a 2145 * linked list of the page structs that 2146 * were relocated. If page_relocate() is 2147 * unsuccessful, pp will be unmodified. 2148 */ 2149 #ifdef MEM_DEL_STATS 2150 start_pgrp = ddi_get_lbolt(); 2151 #endif /* MEM_DEL_STATS */ 2152 result = page_relocate(&pp, &pp_targ, 0, 0, 2153 &pgcnt, NULL); 2154 #ifdef MEM_DEL_STATS 2155 ntick_pgrp = (uint64_t)ddi_get_lbolt() - 2156 start_pgrp; 2157 #endif /* MEM_DEL_STATS */ 2158 MDSTAT_PGRP(mhp, ntick_pgrp); 2159 if (result != 0) { 2160 MDSTAT_INCR(mhp, nrelocfail); 2161 /* 2162 * We did not succeed. We need 2163 * to give the pp_targ pages back. 2164 * page_free(pp_targ, 1) without 2165 * the freemem accounting. 2166 */ 2167 group_page_unlock(pp); 2168 page_free_replacement_page(pp_targ); 2169 page_unlock(pp); 2170 mutex_enter(&mhp->mh_mutex); 2171 continue; 2172 } 2173 2174 /* 2175 * We will then collect pgcnt pages. 2176 */ 2177 ASSERT(pgcnt > 0); 2178 mutex_enter(&mhp->mh_mutex); 2179 /* 2180 * We need to make sure freemem_left is 2181 * large enough. 2182 */ 2183 while ((freemem_left < pgcnt) && 2184 (!mhp->mh_cancel)) { 2185 freemem_left += 2186 delthr_get_freemem(mhp); 2187 } 2188 2189 /* 2190 * Do not proceed if mh_cancel is set. 2191 */ 2192 if (mhp->mh_cancel) { 2193 while (pp_targ != NULL) { 2194 /* 2195 * Unlink and unlock each page. 2196 */ 2197 tpp_targ = pp_targ; 2198 page_sub(&pp_targ, tpp_targ); 2199 page_unlock(tpp_targ); 2200 } 2201 /* 2202 * We need to give the pp pages back. 2203 * page_free(pp, 1) without the 2204 * freemem accounting. 
2205 */ 2206 page_free_replacement_page(pp); 2207 break; 2208 } 2209 2210 /* Now remove pgcnt from freemem_left */ 2211 freemem_left -= pgcnt; 2212 ASSERT(freemem_left >= 0); 2213 szc = pp->p_szc; 2214 while (pp != NULL) { 2215 /* 2216 * pp and pp_targ were passed back as 2217 * a linked list of pages. 2218 * Unlink and unlock each page. 2219 */ 2220 tpp_targ = pp_targ; 2221 page_sub(&pp_targ, tpp_targ); 2222 page_unlock(tpp_targ); 2223 /* 2224 * The original page is now free 2225 * so remove it from the linked 2226 * list and collect it. 2227 */ 2228 tpp = pp; 2229 page_sub(&pp, tpp); 2230 pfn = page_pptonum(tpp); 2231 collected++; 2232 ASSERT(PAGE_EXCL(tpp)); 2233 ASSERT(tpp->p_vnode == NULL); 2234 ASSERT(!hat_page_is_mapped(tpp)); 2235 ASSERT(tpp->p_szc == szc); 2236 tpp->p_szc = 0; 2237 page_delete_collect(tpp, mhp); 2238 bit = pfn - mdsp->mds_base; 2239 mdsp->mds_bitmap[bit / NBPBMW] |= 2240 (1 << (bit % NBPBMW)); 2241 } 2242 ASSERT(pp_targ == NULL); 2243 } 2244 } 2245 first_scan = 0; 2246 if ((mhp->mh_cancel == 0) && (mhp->mh_hold_todo != 0) && 2247 (collected == 0)) { 2248 /* 2249 * This code is needed as we cannot wait 2250 * for a page to be locked OR the delete to 2251 * be cancelled. Also, we must delay so 2252 * that other threads get a chance to run 2253 * on our cpu, otherwise page locks may be 2254 * held indefinitely by those threads. 2255 */ 2256 MDSTAT_INCR(mhp, ndelay); 2257 CALLB_CPR_SAFE_BEGIN(&cprinfo); 2258 (void) cv_reltimedwait(&mhp->mh_cv, &mhp->mh_mutex, 2259 DEL_BUSY_WAIT_TICKS, TR_CLOCK_TICK); 2260 CALLB_CPR_SAFE_END(&cprinfo, &mhp->mh_mutex); 2261 } 2262 } 2263 /* stop the dr aio cleanup thread */ 2264 mhp->mh_dr_aio_cleanup_cancel = 1; 2265 transit_list_collect(mhp, 0); 2266 if (freemem_left != 0) { 2267 /* Return any surplus. */ 2268 page_create_putback(freemem_left); 2269 freemem_left = 0; 2270 } 2271 #ifdef MEM_DEL_STATS 2272 ntick_total = (uint64_t)ddi_get_lbolt() - start_total; 2273 #endif /* MEM_DEL_STATS */ 2274 MDSTAT_TOTAL(mhp, ntick_total); 2275 MDSTAT_PRINT(mhp); 2276 2277 /* 2278 * If the memory delete was cancelled, exclusive-wanted bits must 2279 * be cleared. If there are retired pages being deleted, they need 2280 * to be unretired. 2281 */ 2282 for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL; 2283 mdsp = mdsp->mds_next) { 2284 pfn_t pfn, p_end; 2285 2286 p_end = mdsp->mds_base + mdsp->mds_npgs; 2287 for (pfn = mdsp->mds_base; pfn < p_end; pfn++) { 2288 page_t *pp; 2289 pgcnt_t bit; 2290 2291 bit = pfn - mdsp->mds_base; 2292 if (mhp->mh_cancel) { 2293 pp = page_numtopp_nolock(pfn); 2294 if (pp != NULL) { 2295 if ((mdsp->mds_bitmap[bit / NBPBMW] & 2296 (1 << (bit % NBPBMW))) == 0) { 2297 page_lock_clr_exclwanted(pp); 2298 } 2299 } 2300 } else { 2301 pp = NULL; 2302 } 2303 if ((mdsp->mds_bitmap_retired[bit / NBPBMW] & 2304 (1 << (bit % NBPBMW))) != 0) { 2305 /* do we already have pp? */ 2306 if (pp == NULL) { 2307 pp = page_numtopp_nolock(pfn); 2308 } 2309 ASSERT(pp != NULL); 2310 ASSERT(PP_RETIRED(pp)); 2311 if (mhp->mh_cancel != 0) { 2312 page_unlock(pp); 2313 /* 2314 * To satisfy ASSERT below in 2315 * cancel code. 
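 * (The cancel path asserts that mh_hold_todo has climbed back up to
 * mh_vm_pages once every deleted page has been returned, so a retired
 * page that stays locked here must be counted back in as well.)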
2316 */ 2317 mhp->mh_hold_todo++; 2318 } else { 2319 (void) page_unretire_pp(pp, 2320 PR_UNR_CLEAN); 2321 } 2322 } 2323 } 2324 } 2325 /* 2326 * Free retired page bitmap and collected page bitmap 2327 */ 2328 for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL; 2329 mdsp = mdsp->mds_next) { 2330 ASSERT(mdsp->mds_bitmap_retired != NULL); 2331 kmem_free(mdsp->mds_bitmap_retired, MDS_BITMAPBYTES(mdsp)); 2332 mdsp->mds_bitmap_retired = NULL; /* Paranoia. */ 2333 ASSERT(mdsp->mds_bitmap != NULL); 2334 kmem_free(mdsp->mds_bitmap, MDS_BITMAPBYTES(mdsp)); 2335 mdsp->mds_bitmap = NULL; /* Paranoia. */ 2336 } 2337 2338 /* wait for our dr aio cancel thread to exit */ 2339 while (!(mhp->mh_aio_cleanup_done)) { 2340 CALLB_CPR_SAFE_BEGIN(&cprinfo); 2341 delay(drv_usectohz(DR_AIO_CLEANUP_DELAY)); 2342 CALLB_CPR_SAFE_END(&cprinfo, &mhp->mh_mutex); 2343 } 2344 refused: 2345 if (mhp->mh_cancel != 0) { 2346 page_t *pp; 2347 2348 comp_code = mhp->mh_cancel; 2349 /* 2350 * Go through list of deleted pages (mh_deleted) freeing 2351 * them. 2352 */ 2353 while ((pp = mhp->mh_deleted) != NULL) { 2354 mhp->mh_deleted = pp->p_next; 2355 mhp->mh_hold_todo++; 2356 mutex_exit(&mhp->mh_mutex); 2357 /* Restore p_next. */ 2358 pp->p_next = pp->p_prev; 2359 if (PP_ISFREE(pp)) { 2360 cmn_err(CE_PANIC, 2361 "page %p is free", 2362 (void *)pp); 2363 } 2364 page_free(pp, 1); 2365 mutex_enter(&mhp->mh_mutex); 2366 } 2367 ASSERT(mhp->mh_hold_todo == mhp->mh_vm_pages); 2368 2369 mutex_exit(&mhp->mh_mutex); 2370 put_availrmem(mhp->mh_vm_pages); 2371 mutex_enter(&mhp->mh_mutex); 2372 2373 goto t_exit; 2374 } 2375 2376 /* 2377 * All the pages are no longer in use and are exclusively locked. 2378 */ 2379 2380 mhp->mh_deleted = NULL; 2381 2382 kphysm_del_cleanup(mhp); 2383 2384 /* 2385 * mem_node_del_range needs to be after kphysm_del_cleanup so 2386 * that the mem_node_config[] will remain intact for the cleanup. 2387 */ 2388 for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL; 2389 mdsp = mdsp->mds_next) { 2390 mem_node_del_range(mdsp->mds_base, 2391 mdsp->mds_base + mdsp->mds_npgs - 1); 2392 } 2393 /* cleanup the page counters */ 2394 page_ctrs_cleanup(); 2395 2396 comp_code = KPHYSM_OK; 2397 2398 t_exit: 2399 mutex_exit(&mhp->mh_mutex); 2400 kphysm_setup_post_del(mhp->mh_vm_pages, 2401 (comp_code == KPHYSM_OK) ? 0 : 1); 2402 mutex_enter(&mhp->mh_mutex); 2403 2404 early_exit: 2405 /* mhp->mh_mutex exited by CALLB_CPR_EXIT() */ 2406 mhp->mh_state = MHND_DONE; 2407 del_complete_funcp = mhp->mh_delete_complete; 2408 del_complete_arg = mhp->mh_delete_complete_arg; 2409 CALLB_CPR_EXIT(&cprinfo); 2410 (*del_complete_funcp)(del_complete_arg, comp_code); 2411 thread_exit(); 2412 /*NOTREACHED*/ 2413 } 2414 2415 /* 2416 * Start the delete of the memory from the system. 
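 *
 * Illustrative caller sketch (hypothetical client code: my_del_done and
 * my_arg are made-up names; only the kphysm_del_start() contract itself
 * comes from this file):
 *
 *	static void
 *	my_del_done(void *my_arg, int error_code)
 *	{
 *		... error_code is KPHYSM_OK on success ...
 *	}
 *
 *	ret = kphysm_del_start(handle, my_del_done, my_arg);
 *	if (ret != KPHYSM_OK)
 *		... the delete never started and no callback will occur ...
 *
 * On success the work is handed to delete_memory_thread(), which invokes
 * the completion callback with the final status once the delete finishes
 * or is cancelled.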
2417 */ 2418 int 2419 kphysm_del_start( 2420 memhandle_t handle, 2421 void (*complete)(void *, int), 2422 void *complete_arg) 2423 { 2424 struct mem_handle *mhp; 2425 2426 mhp = kphysm_lookup_mem_handle(handle); 2427 if (mhp == NULL) { 2428 return (KPHYSM_EHANDLE); 2429 } 2430 switch (mhp->mh_state) { 2431 case MHND_FREE: 2432 ASSERT(mhp->mh_state != MHND_FREE); 2433 mutex_exit(&mhp->mh_mutex); 2434 return (KPHYSM_EHANDLE); 2435 case MHND_INIT: 2436 break; 2437 case MHND_STARTING: 2438 case MHND_RUNNING: 2439 mutex_exit(&mhp->mh_mutex); 2440 return (KPHYSM_ESEQUENCE); 2441 case MHND_DONE: 2442 mutex_exit(&mhp->mh_mutex); 2443 return (KPHYSM_ESEQUENCE); 2444 case MHND_RELEASE: 2445 mutex_exit(&mhp->mh_mutex); 2446 return (KPHYSM_ESEQUENCE); 2447 default: 2448 #ifdef DEBUG 2449 cmn_err(CE_WARN, "kphysm_del_start(0x%p) state corrupt %d", 2450 (void *)mhp, mhp->mh_state); 2451 #endif /* DEBUG */ 2452 mutex_exit(&mhp->mh_mutex); 2453 return (KPHYSM_EHANDLE); 2454 } 2455 2456 if (mhp->mh_transit.trl_spans == NULL) { 2457 mutex_exit(&mhp->mh_mutex); 2458 return (KPHYSM_ENOWORK); 2459 } 2460 2461 ASSERT(complete != NULL); 2462 mhp->mh_delete_complete = complete; 2463 mhp->mh_delete_complete_arg = complete_arg; 2464 mhp->mh_state = MHND_STARTING; 2465 /* 2466 * Release the mutex in case thread_create sleeps. 2467 */ 2468 mutex_exit(&mhp->mh_mutex); 2469 2470 /* 2471 * The "obvious" process for this thread is pageout (proc_pageout) 2472 * but this gives the thread too much power over freemem 2473 * which results in freemem starvation. 2474 */ 2475 (void) thread_create(NULL, 0, delete_memory_thread, mhp, 0, &p0, 2476 TS_RUN, maxclsyspri - 1); 2477 2478 return (KPHYSM_OK); 2479 } 2480 2481 static kmutex_t pp_dummy_lock; /* Protects init. of pp_dummy. */ 2482 static caddr_t pp_dummy; 2483 static pgcnt_t pp_dummy_npages; 2484 static pfn_t *pp_dummy_pfn; /* Array of dummy pfns. */ 2485 2486 static void 2487 memseg_remap_init_pages(page_t *pages, page_t *epages) 2488 { 2489 page_t *pp; 2490 2491 for (pp = pages; pp < epages; pp++) { 2492 pp->p_pagenum = PFN_INVALID; /* XXXX */ 2493 pp->p_offset = (u_offset_t)-1; 2494 page_iolock_init(pp); 2495 while (!page_lock(pp, SE_EXCL, (kmutex_t *)NULL, P_RECLAIM)) 2496 continue; 2497 page_lock_delete(pp); 2498 } 2499 } 2500 2501 void 2502 memseg_remap_init() 2503 { 2504 mutex_enter(&pp_dummy_lock); 2505 if (pp_dummy == NULL) { 2506 uint_t dpages; 2507 int i; 2508 2509 /* 2510 * dpages starts off as the size of the structure and 2511 * ends up as the minimum number of pages that will 2512 * hold a whole number of page_t structures. 2513 */ 2514 dpages = sizeof (page_t); 2515 ASSERT(dpages != 0); 2516 ASSERT(dpages <= MMU_PAGESIZE); 2517 2518 while ((dpages & 1) == 0) 2519 dpages >>= 1; 2520 2521 pp_dummy_npages = dpages; 2522 /* 2523 * Allocate pp_dummy pages directly from static_arena, 2524 * since these are whole page allocations and are 2525 * referenced by physical address. This also has the 2526 * nice fringe benefit of hiding the memory from 2527 * ::findleaks since it doesn't deal well with allocated 2528 * kernel heap memory that doesn't have any mappings. 
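 * (To illustrate the dpages computation above: if sizeof (page_t) were,
 * say, 120 bytes, i.e. 15 * 8, the loop would strip the factor of 8 and
 * leave pp_dummy_npages = 15, the smallest number of pages whose total
 * size is an exact multiple of sizeof (page_t), so the dummy area always
 * holds a whole number of page_t structures.)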
2529 */ 2530 pp_dummy = vmem_xalloc(static_arena, ptob(pp_dummy_npages), 2531 PAGESIZE, 0, 0, NULL, NULL, VM_SLEEP); 2532 bzero(pp_dummy, ptob(pp_dummy_npages)); 2533 ASSERT(((uintptr_t)pp_dummy & MMU_PAGEOFFSET) == 0); 2534 pp_dummy_pfn = kmem_alloc(sizeof (*pp_dummy_pfn) * 2535 pp_dummy_npages, KM_SLEEP); 2536 for (i = 0; i < pp_dummy_npages; i++) { 2537 pp_dummy_pfn[i] = hat_getpfnum(kas.a_hat, 2538 &pp_dummy[MMU_PAGESIZE * i]); 2539 ASSERT(pp_dummy_pfn[i] != PFN_INVALID); 2540 } 2541 /* 2542 * Initialize the page_t's to a known 'deleted' state 2543 * that matches the state of deleted pages. 2544 */ 2545 memseg_remap_init_pages((page_t *)pp_dummy, 2546 (page_t *)(pp_dummy + ptob(pp_dummy_npages))); 2547 /* Remove kmem mappings for the pages for safety. */ 2548 hat_unload(kas.a_hat, pp_dummy, ptob(pp_dummy_npages), 2549 HAT_UNLOAD_UNLOCK); 2550 /* Leave pp_dummy pointer set as flag that init is done. */ 2551 } 2552 mutex_exit(&pp_dummy_lock); 2553 } 2554 2555 /* 2556 * Remap a page-aligned range of page_t's to dummy pages. 2557 */ 2558 void 2559 remap_to_dummy(caddr_t va, pgcnt_t metapgs) 2560 { 2561 int phase; 2562 2563 ASSERT(IS_P2ALIGNED((uint64_t)va, PAGESIZE)); 2564 2565 /* 2566 * We may start remapping at a non-zero page offset 2567 * within the dummy pages since the low/high ends 2568 * of the outgoing pp's could be shared by other 2569 * memsegs (see memseg_remap_meta). 2570 */ 2571 phase = btop((uint64_t)va) % pp_dummy_npages; 2572 ASSERT(PAGESIZE % sizeof (page_t) || phase == 0); 2573 2574 while (metapgs != 0) { 2575 pgcnt_t n; 2576 int i, j; 2577 2578 n = pp_dummy_npages; 2579 if (n > metapgs) 2580 n = metapgs; 2581 for (i = 0; i < n; i++) { 2582 j = (i + phase) % pp_dummy_npages; 2583 hat_devload(kas.a_hat, va, ptob(1), pp_dummy_pfn[j], 2584 PROT_READ, 2585 HAT_LOAD | HAT_LOAD_NOCONSIST | 2586 HAT_LOAD_REMAP); 2587 va += ptob(1); 2588 } 2589 metapgs -= n; 2590 } 2591 } 2592 2593 static void 2594 memseg_remap_to_dummy(struct memseg *seg) 2595 { 2596 caddr_t pp; 2597 pgcnt_t metapgs; 2598 2599 ASSERT(memseg_is_dynamic(seg)); 2600 ASSERT(pp_dummy != NULL); 2601 2602 2603 if (!memseg_includes_meta(seg)) { 2604 memseg_remap_meta(seg); 2605 return; 2606 } 2607 2608 pp = (caddr_t)seg->pages; 2609 metapgs = seg->pages_base - memseg_get_start(seg); 2610 ASSERT(metapgs != 0); 2611 2612 seg->pages_end = seg->pages_base; 2613 2614 remap_to_dummy(pp, metapgs); 2615 } 2616 2617 /* 2618 * Transition all the deleted pages to the deleted state so that 2619 * page_lock will not wait. The page_lock_delete call will 2620 * also wake up any waiters. 2621 */ 2622 static void 2623 memseg_lock_delete_all(struct memseg *seg) 2624 { 2625 page_t *pp; 2626 2627 for (pp = seg->pages; pp < seg->epages; pp++) { 2628 pp->p_pagenum = PFN_INVALID; /* XXXX */ 2629 page_lock_delete(pp); 2630 } 2631 } 2632 2633 static void 2634 kphysm_del_cleanup(struct mem_handle *mhp) 2635 { 2636 struct memdelspan *mdsp; 2637 struct memseg *seg; 2638 struct memseg **segpp; 2639 struct memseg *seglist; 2640 pfn_t p_end; 2641 uint64_t avmem; 2642 pgcnt_t avpgs; 2643 pgcnt_t npgs; 2644 2645 avpgs = mhp->mh_vm_pages; 2646 2647 memsegs_lock(1); 2648 2649 /* 2650 * remove from main segment list.
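 * Each memseg that falls inside a deleted span is unhooked from the
 * global memsegs chain but kept on a private list threaded through
 * lnext, so that its page_t metadata can still be remapped (dynamically
 * added memsegs) or parked on memseg_delete_junk (boot-time memsegs) in
 * the second loop below.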
2651 */ 2652 npgs = 0; 2653 seglist = NULL; 2654 for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL; 2655 mdsp = mdsp->mds_next) { 2656 p_end = mdsp->mds_base + mdsp->mds_npgs; 2657 for (segpp = &memsegs; (seg = *segpp) != NULL; ) { 2658 if (seg->pages_base >= p_end || 2659 seg->pages_end <= mdsp->mds_base) { 2660 /* Span and memseg don't overlap. */ 2661 segpp = &((*segpp)->next); 2662 continue; 2663 } 2664 ASSERT(seg->pages_base >= mdsp->mds_base); 2665 ASSERT(seg->pages_end <= p_end); 2666 2667 PLCNT_MODIFY_MAX(seg->pages_base, 2668 seg->pages_base - seg->pages_end); 2669 2670 /* Hide the memseg from future scans. */ 2671 hat_kpm_delmem_mseg_update(seg, segpp); 2672 *segpp = seg->next; 2673 membar_producer(); /* TODO: Needed? */ 2674 npgs += MSEG_NPAGES(seg); 2675 2676 /* 2677 * Leave the deleted segment's next pointer intact 2678 * in case a memsegs scanning loop is walking this 2679 * segment concurrently. 2680 */ 2681 seg->lnext = seglist; 2682 seglist = seg; 2683 } 2684 } 2685 2686 build_pfn_hash(); 2687 2688 ASSERT(npgs < total_pages); 2689 total_pages -= npgs; 2690 2691 /* 2692 * Recalculate the paging parameters now total_pages has changed. 2693 * This will also cause the clock hands to be reset before next use. 2694 */ 2695 setupclock(1); 2696 2697 memsegs_unlock(1); 2698 2699 mutex_exit(&mhp->mh_mutex); 2700 2701 while ((seg = seglist) != NULL) { 2702 pfn_t mseg_start; 2703 pfn_t mseg_base, mseg_end; 2704 pgcnt_t mseg_npgs; 2705 int mlret; 2706 2707 seglist = seg->lnext; 2708 2709 /* 2710 * Put the page_t's into the deleted state to stop 2711 * cv_wait()s on the pages. When we remap, the dummy 2712 * page_t's will be in the same state. 2713 */ 2714 memseg_lock_delete_all(seg); 2715 /* 2716 * Collect up information based on pages_base and pages_end 2717 * early so that we can flag early that the memseg has been 2718 * deleted by setting pages_end == pages_base. 2719 */ 2720 mseg_base = seg->pages_base; 2721 mseg_end = seg->pages_end; 2722 mseg_npgs = MSEG_NPAGES(seg); 2723 mseg_start = memseg_get_start(seg); 2724 2725 if (memseg_is_dynamic(seg)) { 2726 /* Remap the meta data to our special dummy area. */ 2727 memseg_remap_to_dummy(seg); 2728 2729 mutex_enter(&memseg_lists_lock); 2730 seg->lnext = memseg_va_avail; 2731 memseg_va_avail = seg; 2732 mutex_exit(&memseg_lists_lock); 2733 } else { 2734 /* 2735 * For memory whose page_ts were allocated 2736 * at boot, we need to find a new use for 2737 * the page_t memory. 2738 * For the moment, just leak it. 2739 * (It is held in the memseg_delete_junk list.) 2740 */ 2741 seg->pages_end = seg->pages_base; 2742 2743 mutex_enter(&memseg_lists_lock); 2744 seg->lnext = memseg_delete_junk; 2745 memseg_delete_junk = seg; 2746 mutex_exit(&memseg_lists_lock); 2747 } 2748 2749 /* Must not use seg now as it could be re-used. 
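 * Only the locally saved values (mseg_start, mseg_base, mseg_end and
 * mseg_npgs) are used from here on when trimming phys_avail and
 * phys_install.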
*/ 2750 2751 memlist_write_lock(); 2752 2753 mlret = memlist_delete_span( 2754 (uint64_t)(mseg_base) << PAGESHIFT, 2755 (uint64_t)(mseg_npgs) << PAGESHIFT, 2756 &phys_avail); 2757 ASSERT(mlret == MEML_SPANOP_OK); 2758 2759 mlret = memlist_delete_span( 2760 (uint64_t)(mseg_start) << PAGESHIFT, 2761 (uint64_t)(mseg_end - mseg_start) << 2762 PAGESHIFT, 2763 &phys_install); 2764 ASSERT(mlret == MEML_SPANOP_OK); 2765 phys_install_has_changed(); 2766 2767 memlist_write_unlock(); 2768 } 2769 2770 memlist_read_lock(); 2771 installed_top_size(phys_install, &physmax, &physinstalled); 2772 memlist_read_unlock(); 2773 2774 mutex_enter(&freemem_lock); 2775 maxmem -= avpgs; 2776 physmem -= avpgs; 2777 /* availrmem is adjusted during the delete. */ 2778 availrmem_initial -= avpgs; 2779 2780 mutex_exit(&freemem_lock); 2781 2782 dump_resize(); 2783 2784 cmn_err(CE_CONT, "?kphysm_delete: mem = %ldK " 2785 "(0x%" PRIx64 ")\n", 2786 physinstalled << (PAGESHIFT - 10), 2787 (uint64_t)physinstalled << PAGESHIFT); 2788 2789 avmem = (uint64_t)freemem << PAGESHIFT; 2790 cmn_err(CE_CONT, "?kphysm_delete: " 2791 "avail mem = %" PRId64 "\n", avmem); 2792 2793 /* 2794 * Update lgroup generation number on single lgroup systems 2795 */ 2796 if (nlgrps == 1) 2797 lgrp_config(LGRP_CONFIG_GEN_UPDATE, 0, 0); 2798 2799 /* Successfully deleted system memory */ 2800 mutex_enter(&mhp->mh_mutex); 2801 } 2802 2803 static uint_t mdel_nullvp_waiter; 2804 2805 static void 2806 page_delete_collect( 2807 page_t *pp, 2808 struct mem_handle *mhp) 2809 { 2810 if (pp->p_vnode) { 2811 page_hashout(pp, (kmutex_t *)NULL); 2812 /* do not do PP_SETAGED(pp); */ 2813 } else { 2814 kmutex_t *sep; 2815 2816 sep = page_se_mutex(pp); 2817 mutex_enter(sep); 2818 if (CV_HAS_WAITERS(&pp->p_cv)) { 2819 mdel_nullvp_waiter++; 2820 cv_broadcast(&pp->p_cv); 2821 } 2822 mutex_exit(sep); 2823 } 2824 ASSERT(pp->p_next == pp->p_prev); 2825 ASSERT(pp->p_next == NULL || pp->p_next == pp); 2826 pp->p_next = mhp->mh_deleted; 2827 mhp->mh_deleted = pp; 2828 ASSERT(mhp->mh_hold_todo != 0); 2829 mhp->mh_hold_todo--; 2830 } 2831 2832 static void 2833 transit_list_collect(struct mem_handle *mhp, int v) 2834 { 2835 struct transit_list_head *trh; 2836 2837 trh = &transit_list_head; 2838 mutex_enter(&trh->trh_lock); 2839 mhp->mh_transit.trl_collect = v; 2840 mutex_exit(&trh->trh_lock); 2841 } 2842 2843 static void 2844 transit_list_insert(struct transit_list *tlp) 2845 { 2846 struct transit_list_head *trh; 2847 2848 trh = &transit_list_head; 2849 ASSERT(MUTEX_HELD(&trh->trh_lock)); 2850 tlp->trl_next = trh->trh_head; 2851 trh->trh_head = tlp; 2852 } 2853 2854 static void 2855 transit_list_remove(struct transit_list *tlp) 2856 { 2857 struct transit_list_head *trh; 2858 struct transit_list **tlpp; 2859 2860 trh = &transit_list_head; 2861 tlpp = &trh->trh_head; 2862 ASSERT(MUTEX_HELD(&trh->trh_lock)); 2863 while (*tlpp != NULL && *tlpp != tlp) 2864 tlpp = &(*tlpp)->trl_next; 2865 ASSERT(*tlpp != NULL); 2866 if (*tlpp == tlp) 2867 *tlpp = tlp->trl_next; 2868 tlp->trl_next = NULL; 2869 } 2870 2871 static struct transit_list * 2872 pfnum_to_transit_list(struct transit_list_head *trh, pfn_t pfnum) 2873 { 2874 struct transit_list *tlp; 2875 2876 for (tlp = trh->trh_head; tlp != NULL; tlp = tlp->trl_next) { 2877 struct memdelspan *mdsp; 2878 2879 for (mdsp = tlp->trl_spans; mdsp != NULL; 2880 mdsp = mdsp->mds_next) { 2881 if (pfnum >= mdsp->mds_base && 2882 pfnum < (mdsp->mds_base + mdsp->mds_npgs)) { 2883 return (tlp); 2884 } 2885 } 2886 } 2887 return (NULL); 2888 } 2889 2890 int 
2891 pfn_is_being_deleted(pfn_t pfnum) 2892 { 2893 struct transit_list_head *trh; 2894 struct transit_list *tlp; 2895 int ret; 2896 2897 trh = &transit_list_head; 2898 if (trh->trh_head == NULL) 2899 return (0); 2900 2901 mutex_enter(&trh->trh_lock); 2902 tlp = pfnum_to_transit_list(trh, pfnum); 2903 ret = (tlp != NULL && tlp->trl_collect); 2904 mutex_exit(&trh->trh_lock); 2905 2906 return (ret); 2907 } 2908 2909 #ifdef MEM_DEL_STATS 2910 extern int hz; 2911 static void 2912 mem_del_stat_print_func(struct mem_handle *mhp) 2913 { 2914 uint64_t tmp; 2915 2916 if (mem_del_stat_print) { 2917 printf("memory delete loop %x/%x, statistics%s\n", 2918 (uint_t)mhp->mh_transit.trl_spans->mds_base, 2919 (uint_t)mhp->mh_transit.trl_spans->mds_npgs, 2920 (mhp->mh_cancel ? " (cancelled)" : "")); 2921 printf("\t%8u nloop\n", mhp->mh_delstat.nloop); 2922 printf("\t%8u need_free\n", mhp->mh_delstat.need_free); 2923 printf("\t%8u free_loop\n", mhp->mh_delstat.free_loop); 2924 printf("\t%8u free_low\n", mhp->mh_delstat.free_low); 2925 printf("\t%8u free_failed\n", mhp->mh_delstat.free_failed); 2926 printf("\t%8u ncheck\n", mhp->mh_delstat.ncheck); 2927 printf("\t%8u nopaget\n", mhp->mh_delstat.nopaget); 2928 printf("\t%8u lockfail\n", mhp->mh_delstat.lockfail); 2929 printf("\t%8u nfree\n", mhp->mh_delstat.nfree); 2930 printf("\t%8u nreloc\n", mhp->mh_delstat.nreloc); 2931 printf("\t%8u nrelocfail\n", mhp->mh_delstat.nrelocfail); 2932 printf("\t%8u already_done\n", mhp->mh_delstat.already_done); 2933 printf("\t%8u first_notfree\n", mhp->mh_delstat.first_notfree); 2934 printf("\t%8u npplocked\n", mhp->mh_delstat.npplocked); 2935 printf("\t%8u nlockreloc\n", mhp->mh_delstat.nlockreloc); 2936 printf("\t%8u nnorepl\n", mhp->mh_delstat.nnorepl); 2937 printf("\t%8u nmodreloc\n", mhp->mh_delstat.nmodreloc); 2938 printf("\t%8u ndestroy\n", mhp->mh_delstat.ndestroy); 2939 printf("\t%8u nputpage\n", mhp->mh_delstat.nputpage); 2940 printf("\t%8u nnoreclaim\n", mhp->mh_delstat.nnoreclaim); 2941 printf("\t%8u ndelay\n", mhp->mh_delstat.ndelay); 2942 printf("\t%8u demotefail\n", mhp->mh_delstat.demotefail); 2943 printf("\t%8u retired\n", mhp->mh_delstat.retired); 2944 printf("\t%8u toxic\n", mhp->mh_delstat.toxic); 2945 printf("\t%8u failing\n", mhp->mh_delstat.failing); 2946 printf("\t%8u modtoxic\n", mhp->mh_delstat.modtoxic); 2947 printf("\t%8u npplkdtoxic\n", mhp->mh_delstat.npplkdtoxic); 2948 printf("\t%8u gptlmodfail\n", mhp->mh_delstat.gptlmodfail); 2949 printf("\t%8u gptllckfail\n", mhp->mh_delstat.gptllckfail); 2950 tmp = mhp->mh_delstat.nticks_total / hz; /* seconds */ 2951 printf( 2952 "\t%"PRIu64" nticks_total - %"PRIu64" min %"PRIu64" sec\n", 2953 mhp->mh_delstat.nticks_total, tmp / 60, tmp % 60); 2954 2955 tmp = mhp->mh_delstat.nticks_pgrp / hz; /* seconds */ 2956 printf( 2957 "\t%"PRIu64" nticks_pgrp - %"PRIu64" min %"PRIu64" sec\n", 2958 mhp->mh_delstat.nticks_pgrp, tmp / 60, tmp % 60); 2959 } 2960 } 2961 #endif /* MEM_DEL_STATS */ 2962 2963 struct mem_callback { 2964 kphysm_setup_vector_t *vec; 2965 void *arg; 2966 }; 2967 2968 #define NMEMCALLBACKS 100 2969 2970 static struct mem_callback mem_callbacks[NMEMCALLBACKS]; 2971 static uint_t nmemcallbacks; 2972 static krwlock_t mem_callback_rwlock; 2973 2974 int 2975 kphysm_setup_func_register(kphysm_setup_vector_t *vec, void *arg) 2976 { 2977 uint_t i, found; 2978 2979 /* 2980 * This test will become more complicated when the version must 2981 * change. 
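 *
 * For reference, a caller registers roughly as follows (hypothetical
 * client code: my_vec, my_post_add, my_pre_del, my_post_del and my_arg
 * are made-up names; the field names come from the checks below):
 *
 *	static kphysm_setup_vector_t my_vec = {
 *		.version = KPHYSM_SETUP_VECTOR_VERSION,
 *		.post_add = my_post_add,
 *		.pre_del = my_pre_del,
 *		.post_del = my_post_del
 *	};
 *
 *	error = kphysm_setup_func_register(&my_vec, my_arg);
 *
 * kphysm_setup_pre_del() below ORs the pre_del return values together,
 * so any registered callback can object to a pending delete.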
2982 */ 2983 if (vec->version != KPHYSM_SETUP_VECTOR_VERSION) 2984 return (EINVAL); 2985 2986 if (vec->post_add == NULL || vec->pre_del == NULL || 2987 vec->post_del == NULL) 2988 return (EINVAL); 2989 2990 rw_enter(&mem_callback_rwlock, RW_WRITER); 2991 for (i = 0, found = 0; i < nmemcallbacks; i++) { 2992 if (mem_callbacks[i].vec == NULL && found == 0) 2993 found = i + 1; 2994 if (mem_callbacks[i].vec == vec && 2995 mem_callbacks[i].arg == arg) { 2996 #ifdef DEBUG 2997 /* Catch this in DEBUG kernels. */ 2998 cmn_err(CE_WARN, "kphysm_setup_func_register" 2999 "(0x%p, 0x%p) duplicate registration from 0x%p", 3000 (void *)vec, arg, (void *)caller()); 3001 #endif /* DEBUG */ 3002 rw_exit(&mem_callback_rwlock); 3003 return (EEXIST); 3004 } 3005 } 3006 if (found != 0) { 3007 i = found - 1; 3008 } else { 3009 ASSERT(nmemcallbacks < NMEMCALLBACKS); 3010 if (nmemcallbacks == NMEMCALLBACKS) { 3011 rw_exit(&mem_callback_rwlock); 3012 return (ENOMEM); 3013 } 3014 i = nmemcallbacks++; 3015 } 3016 mem_callbacks[i].vec = vec; 3017 mem_callbacks[i].arg = arg; 3018 rw_exit(&mem_callback_rwlock); 3019 return (0); 3020 } 3021 3022 void 3023 kphysm_setup_func_unregister(kphysm_setup_vector_t *vec, void *arg) 3024 { 3025 uint_t i; 3026 3027 rw_enter(&mem_callback_rwlock, RW_WRITER); 3028 for (i = 0; i < nmemcallbacks; i++) { 3029 if (mem_callbacks[i].vec == vec && 3030 mem_callbacks[i].arg == arg) { 3031 mem_callbacks[i].vec = NULL; 3032 mem_callbacks[i].arg = NULL; 3033 if (i == (nmemcallbacks - 1)) 3034 nmemcallbacks--; 3035 break; 3036 } 3037 } 3038 rw_exit(&mem_callback_rwlock); 3039 } 3040 3041 static void 3042 kphysm_setup_post_add(pgcnt_t delta_pages) 3043 { 3044 uint_t i; 3045 3046 rw_enter(&mem_callback_rwlock, RW_READER); 3047 for (i = 0; i < nmemcallbacks; i++) { 3048 if (mem_callbacks[i].vec != NULL) { 3049 (*mem_callbacks[i].vec->post_add) 3050 (mem_callbacks[i].arg, delta_pages); 3051 } 3052 } 3053 rw_exit(&mem_callback_rwlock); 3054 } 3055 3056 /* 3057 * Note the locking between pre_del and post_del: The reader lock is held 3058 * between the two calls to stop the set of functions from changing. 3059 */ 3060 3061 static int 3062 kphysm_setup_pre_del(pgcnt_t delta_pages) 3063 { 3064 uint_t i; 3065 int ret; 3066 int aret; 3067 3068 ret = 0; 3069 rw_enter(&mem_callback_rwlock, RW_READER); 3070 for (i = 0; i < nmemcallbacks; i++) { 3071 if (mem_callbacks[i].vec != NULL) { 3072 aret = (*mem_callbacks[i].vec->pre_del) 3073 (mem_callbacks[i].arg, delta_pages); 3074 ret |= aret; 3075 } 3076 } 3077 3078 return (ret); 3079 } 3080 3081 static void 3082 kphysm_setup_post_del(pgcnt_t delta_pages, int cancelled) 3083 { 3084 uint_t i; 3085 3086 for (i = 0; i < nmemcallbacks; i++) { 3087 if (mem_callbacks[i].vec != NULL) { 3088 (*mem_callbacks[i].vec->post_del) 3089 (mem_callbacks[i].arg, delta_pages, cancelled); 3090 } 3091 } 3092 rw_exit(&mem_callback_rwlock); 3093 } 3094 3095 static int 3096 kphysm_split_memseg( 3097 pfn_t base, 3098 pgcnt_t npgs) 3099 { 3100 struct memseg *seg; 3101 struct memseg **segpp; 3102 pgcnt_t size_low, size_high; 3103 struct memseg *seg_low, *seg_mid, *seg_high; 3104 3105 /* 3106 * Lock the memsegs list against other updates now 3107 */ 3108 memsegs_lock(1); 3109 3110 /* 3111 * Find boot time memseg that wholly covers this area. 3112 */ 3113 3114 /* First find the memseg with page 'base' in it. 
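 * The whole span base..base+npgs must fall inside a single boot-time
 * memseg whose page_t's are not embedded in the memseg itself; the
 * checks below return 0 (no split performed) for any other layout.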
*/ 3115 for (segpp = &memsegs; (seg = *segpp) != NULL; 3116 segpp = &((*segpp)->next)) { 3117 if (base >= seg->pages_base && base < seg->pages_end) 3118 break; 3119 } 3120 if (seg == NULL) { 3121 memsegs_unlock(1); 3122 return (0); 3123 } 3124 if (memseg_includes_meta(seg)) { 3125 memsegs_unlock(1); 3126 return (0); 3127 } 3128 if ((base + npgs) > seg->pages_end) { 3129 memsegs_unlock(1); 3130 return (0); 3131 } 3132 3133 /* 3134 * Work out the size of the two segments that will 3135 * surround the new segment, one for low address 3136 * and one for high. 3137 */ 3138 ASSERT(base >= seg->pages_base); 3139 size_low = base - seg->pages_base; 3140 ASSERT(seg->pages_end >= (base + npgs)); 3141 size_high = seg->pages_end - (base + npgs); 3142 3143 /* 3144 * Sanity check. 3145 */ 3146 if ((size_low + size_high) == 0) { 3147 memsegs_unlock(1); 3148 return (0); 3149 } 3150 3151 /* 3152 * Allocate the new structures. The old memseg will not be freed 3153 * as there may be a reference to it. 3154 */ 3155 seg_low = NULL; 3156 seg_high = NULL; 3157 3158 if (size_low != 0) 3159 seg_low = memseg_alloc(); 3160 3161 seg_mid = memseg_alloc(); 3162 3163 if (size_high != 0) 3164 seg_high = memseg_alloc(); 3165 3166 /* 3167 * All allocation done now. 3168 */ 3169 if (size_low != 0) { 3170 seg_low->pages = seg->pages; 3171 seg_low->epages = seg_low->pages + size_low; 3172 seg_low->pages_base = seg->pages_base; 3173 seg_low->pages_end = seg_low->pages_base + size_low; 3174 seg_low->next = seg_mid; 3175 seg_low->msegflags = seg->msegflags; 3176 } 3177 if (size_high != 0) { 3178 seg_high->pages = seg->epages - size_high; 3179 seg_high->epages = seg_high->pages + size_high; 3180 seg_high->pages_base = seg->pages_end - size_high; 3181 seg_high->pages_end = seg_high->pages_base + size_high; 3182 seg_high->next = seg->next; 3183 seg_high->msegflags = seg->msegflags; 3184 } 3185 3186 seg_mid->pages = seg->pages + size_low; 3187 seg_mid->pages_base = seg->pages_base + size_low; 3188 seg_mid->epages = seg->epages - size_high; 3189 seg_mid->pages_end = seg->pages_end - size_high; 3190 seg_mid->next = (seg_high != NULL) ? seg_high : seg->next; 3191 seg_mid->msegflags = seg->msegflags; 3192 3193 /* 3194 * Update hat_kpm specific info of all involved memsegs and 3195 * allow hat_kpm specific global chain updates. 3196 */ 3197 hat_kpm_split_mseg_update(seg, segpp, seg_low, seg_mid, seg_high); 3198 3199 /* 3200 * At this point we have two equivalent memseg sub-chains, 3201 * seg and seg_low/seg_mid/seg_high, which both chain on to 3202 * the same place in the global chain. By re-writing the pointer 3203 * in the previous element we switch atomically from using the old 3204 * (seg) to the new. 3205 */ 3206 *segpp = (seg_low != NULL) ? seg_low : seg_mid; 3207 3208 membar_enter(); 3209 3210 build_pfn_hash(); 3211 memsegs_unlock(1); 3212 3213 /* 3214 * We leave the old segment, 'seg', intact as there may be 3215 * references to it. Also, as the value of total_pages has not 3216 * changed and the memsegs list is effectively the same when 3217 * accessed via the old or the new pointer, we do not have to 3218 * cause pageout_scanner() to re-evaluate its hand pointers. 3219 * 3220 * We currently do not re-use or reclaim the page_t memory. 3221 * If we do, then this may have to change. 3222 */ 3223 3224 mutex_enter(&memseg_lists_lock); 3225 seg->lnext = memseg_edit_junk; 3226 memseg_edit_junk = seg; 3227 mutex_exit(&memseg_lists_lock); 3228 3229 return (1); 3230 } 3231 3232 /* 3233 * The sfmmu hat layer (e.g.) 
accesses some parts of the memseg 3234 * structure using physical addresses. Therefore a kmem_cache is 3235 * used with KMC_NOHASH to avoid page crossings within a memseg 3236 * structure. KMC_NOHASH requires that no external (outside of 3237 * slab) information be kept. This, in turn, implies that the 3238 * cache's slabsize must be exactly a single page, since per-slab 3239 * information (e.g. the freelist for the slab) is kept at the 3240 * end of the slab, where it is easy to locate. This should be changed 3241 * when a more obvious kmem_cache interface/flag becomes 3242 * available. 3243 */ 3244 void 3245 mem_config_init() 3246 { 3247 memseg_cache = kmem_cache_create("memseg_cache", sizeof (struct memseg), 3248 0, NULL, NULL, NULL, NULL, static_arena, KMC_NOHASH); 3249 } 3250 3251 struct memseg * 3252 memseg_alloc() 3253 { 3254 struct memseg *seg; 3255 3256 seg = kmem_cache_alloc(memseg_cache, KM_SLEEP); 3257 bzero(seg, sizeof (struct memseg)); 3258 3259 return (seg); 3260 } 3261 3262 /* 3263 * Return whether the page_t memory for this memseg 3264 * is included in the memseg itself. 3265 */ 3266 static int 3267 memseg_includes_meta(struct memseg *seg) 3268 { 3269 return (seg->msegflags & MEMSEG_META_INCL); 3270 } 3271 3272 pfn_t 3273 memseg_get_start(struct memseg *seg) 3274 { 3275 pfn_t pt_start; 3276 3277 if (memseg_includes_meta(seg)) { 3278 pt_start = hat_getpfnum(kas.a_hat, (caddr_t)seg->pages); 3279 3280 /* Meta data is required to be at the beginning */ 3281 ASSERT(pt_start < seg->pages_base); 3282 } else 3283 pt_start = seg->pages_base; 3284 3285 return (pt_start); 3286 } 3287 3288 /* 3289 * Invalidate memseg pointers in cpu private vm data caches. 3290 */ 3291 static void 3292 memseg_cpu_vm_flush() 3293 { 3294 cpu_t *cp; 3295 vm_cpu_data_t *vc; 3296 3297 mutex_enter(&cpu_lock); 3298 pause_cpus(NULL); 3299 3300 cp = cpu_list; 3301 do { 3302 vc = cp->cpu_vm_data; 3303 vc->vc_pnum_memseg = NULL; 3304 vc->vc_pnext_memseg = NULL; 3305 3306 } while ((cp = cp->cpu_next) != cpu_list); 3307 3308 start_cpus(); 3309 mutex_exit(&cpu_lock); 3310 } 3311