/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 1987, 2010, Oracle and/or its affiliates. All rights reserved.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */

/*
 * Each physical swap area has an associated bitmap representing
 * its physical storage. The bitmap records which swap slots are
 * currently allocated or freed. Allocation is done by searching
 * through the bitmap for the first free slot. Thus, there's
 * no linear relation between offset within the swap device and the
 * address (within its segment(s)) of the page that the slot backs;
 * instead, it's an arbitrary one-to-one mapping.
 *
 * Associated with each swap area is a swapinfo structure.
These 48 * structures are linked into a linear list that determines the 49 * ordering of swap areas in the logical swap device. Each contains a 50 * pointer to the corresponding bitmap, the area's size, and its 51 * associated vnode. 52 */ 53 54 #include <sys/types.h> 55 #include <sys/inttypes.h> 56 #include <sys/param.h> 57 #include <sys/t_lock.h> 58 #include <sys/sysmacros.h> 59 #include <sys/systm.h> 60 #include <sys/errno.h> 61 #include <sys/kmem.h> 62 #include <sys/vfs.h> 63 #include <sys/vnode.h> 64 #include <sys/pathname.h> 65 #include <sys/cmn_err.h> 66 #include <sys/vtrace.h> 67 #include <sys/swap.h> 68 #include <sys/dumphdr.h> 69 #include <sys/debug.h> 70 #include <sys/fs/snode.h> 71 #include <sys/fs/swapnode.h> 72 #include <sys/policy.h> 73 #include <sys/zone.h> 74 75 #include <vm/as.h> 76 #include <vm/seg.h> 77 #include <vm/page.h> 78 #include <vm/seg_vn.h> 79 #include <vm/hat.h> 80 #include <vm/anon.h> 81 #include <vm/seg_map.h> 82 83 /* 84 * To balance the load among multiple swap areas, we don't allow 85 * more than swap_maxcontig allocations to be satisfied from a 86 * single swap area before moving on to the next swap area. This 87 * effectively "interleaves" allocations among the many swap areas. 88 */ 89 int swap_maxcontig; /* set by anon_init() to 1 Mb */ 90 91 #define MINIROOTSIZE 12000 /* ~6 Meg XXX */ 92 93 /* 94 * XXX - this lock is a kludge. It serializes some aspects of swapadd() and 95 * swapdel() (namely VOP_OPEN, VOP_CLOSE, VN_RELE). It protects against 96 * somebody swapadd'ing and getting swap slots from a vnode, while someone 97 * else is in the process of closing or rele'ing it. 
98 */ 99 static kmutex_t swap_lock; 100 101 kmutex_t swapinfo_lock; 102 103 /* 104 * protected by the swapinfo_lock 105 */ 106 struct swapinfo *swapinfo; 107 108 static struct swapinfo *silast; 109 static int nswapfiles; 110 111 static u_offset_t swap_getoff(struct swapinfo *); 112 static int swapadd(struct vnode *, ulong_t, ulong_t, char *); 113 static int swapdel(struct vnode *, ulong_t); 114 static int swapslot_free(struct vnode *, u_offset_t, struct swapinfo *); 115 116 /* 117 * swap device bitmap allocation macros 118 */ 119 #define MAPSHIFT 5 120 #define NBBW (NBPW * NBBY) /* number of bits per word */ 121 #define TESTBIT(map, i) (((map)[(i) >> MAPSHIFT] & (1 << (i) % NBBW))) 122 #define SETBIT(map, i) (((map)[(i) >> MAPSHIFT] |= (1 << (i) % NBBW))) 123 #define CLEARBIT(map, i) (((map)[(i) >> MAPSHIFT] &= ~(1 << (i) % NBBW))) 124 125 int swap_debug = 0; /* set for debug printf's */ 126 int swap_verify = 0; /* set to verify slots when freeing and allocating */ 127 128 uint_t swapalloc_maxcontig; 129 130 /* 131 * Allocate a range of up to *lenp contiguous slots (page) from a physical 132 * swap device. Flags are one of: 133 * SA_NOT Must have a slot from a physical swap device other than the 134 * the one containing input (*vpp, *offp). 135 * Less slots than requested may be returned. *lenp allocated slots are 136 * returned starting at *offp on *vpp. 137 * Returns 1 for a successful allocation, 0 for couldn't allocate any slots. 138 */ 139 int 140 swap_phys_alloc( 141 struct vnode **vpp, 142 u_offset_t *offp, 143 size_t *lenp, 144 uint_t flags) 145 { 146 struct swapinfo *sip; 147 offset_t soff, noff; 148 size_t len; 149 150 mutex_enter(&swapinfo_lock); 151 sip = silast; 152 153 /* Find a desirable physical device and allocate from it. 
*/ 154 do { 155 if (sip == NULL) 156 break; 157 if (!(sip->si_flags & ST_INDEL) && 158 (spgcnt_t)sip->si_nfpgs > 0) { 159 /* Caller wants other than specified swap device */ 160 if (flags & SA_NOT) { 161 if (*vpp != sip->si_vp || 162 *offp < sip->si_soff || 163 *offp >= sip->si_eoff) 164 goto found; 165 /* Caller is loose, will take anything */ 166 } else 167 goto found; 168 } else if (sip->si_nfpgs == 0) 169 sip->si_allocs = 0; 170 if ((sip = sip->si_next) == NULL) 171 sip = swapinfo; 172 } while (sip != silast); 173 mutex_exit(&swapinfo_lock); 174 return (0); 175 found: 176 soff = swap_getoff(sip); 177 sip->si_nfpgs--; 178 if (soff == -1) 179 panic("swap_alloc: swap_getoff failed!"); 180 181 for (len = PAGESIZE; len < *lenp; len += PAGESIZE) { 182 if (sip->si_nfpgs == 0) 183 break; 184 if (swapalloc_maxcontig && len >= swapalloc_maxcontig) 185 break; 186 noff = swap_getoff(sip); 187 if (noff == -1) { 188 break; 189 } else if (noff != soff + len) { 190 CLEARBIT(sip->si_swapslots, btop(noff - sip->si_soff)); 191 break; 192 } 193 sip->si_nfpgs--; 194 } 195 *vpp = sip->si_vp; 196 *offp = soff; 197 *lenp = len; 198 ASSERT((spgcnt_t)sip->si_nfpgs >= 0); 199 sip->si_allocs += btop(len); 200 if (sip->si_allocs >= swap_maxcontig) { 201 sip->si_allocs = 0; 202 if ((silast = sip->si_next) == NULL) 203 silast = swapinfo; 204 } 205 TRACE_2(TR_FAC_VM, TR_SWAP_ALLOC, 206 "swap_alloc:sip %p offset %lx", sip, soff); 207 mutex_exit(&swapinfo_lock); 208 return (1); 209 } 210 211 int swap_backsearch = 0; 212 213 /* 214 * Get a free offset on swap device sip. 215 * Return >=0 offset if succeeded, -1 for failure. 
216 */ 217 static u_offset_t 218 swap_getoff(struct swapinfo *sip) 219 { 220 uint_t *sp, *ep; 221 size_t aoff, boff, poff, slotnumber; 222 223 ASSERT(MUTEX_HELD(&swapinfo_lock)); 224 225 sip->si_alloccnt++; 226 for (sp = &sip->si_swapslots[sip->si_hint >> MAPSHIFT], 227 ep = &sip->si_swapslots[sip->si_mapsize / NBPW]; sp < ep; sp++) { 228 if (*sp != (uint_t)0xffffffff) 229 goto foundentry; 230 else 231 sip->si_checkcnt++; 232 } 233 SWAP_PRINT(SW_ALLOC, 234 "swap_getoff: couldn't find slot from hint %ld to end\n", 235 sip->si_hint, 0, 0, 0, 0); 236 /* 237 * Go backwards? Check for faster method XXX 238 */ 239 if (swap_backsearch) { 240 for (sp = &sip->si_swapslots[sip->si_hint >> MAPSHIFT], 241 ep = sip->si_swapslots; sp > ep; sp--) { 242 if (*sp != (uint_t)0xffffffff) 243 goto foundentry; 244 else 245 sip->si_checkcnt++; 246 } 247 } else { 248 for (sp = sip->si_swapslots, 249 ep = &sip->si_swapslots[sip->si_hint >> MAPSHIFT]; 250 sp < ep; sp++) { 251 if (*sp != (uint_t)0xffffffff) 252 goto foundentry; 253 else 254 sip->si_checkcnt++; 255 } 256 } 257 if (*sp == 0xffffffff) { 258 cmn_err(CE_WARN, "No free swap slots!"); 259 return ((u_offset_t)-1); 260 } 261 262 foundentry: 263 /* 264 * aoff is the page number offset (in bytes) of the si_swapslots 265 * array element containing a free page 266 * 267 * boff is the page number offset of the free page 268 * (i.e. cleared bit) in si_swapslots[aoff]. 269 */ 270 aoff = ((char *)sp - (char *)sip->si_swapslots) * NBBY; 271 272 for (boff = (sip->si_hint % NBBW); boff < NBBW; boff++) { 273 if (!TESTBIT(sip->si_swapslots, aoff + boff)) 274 goto foundslot; 275 else 276 sip->si_checkcnt++; 277 } 278 for (boff = 0; boff < (sip->si_hint % NBBW); boff++) { 279 if (!TESTBIT(sip->si_swapslots, aoff + boff)) 280 goto foundslot; 281 else 282 sip->si_checkcnt++; 283 } 284 panic("swap_getoff: didn't find slot in word hint %ld", sip->si_hint); 285 286 foundslot: 287 /* 288 * Return the offset of the free page in swap device. 
289 * Convert page number of byte offset and add starting 290 * offset of swap device. 291 */ 292 slotnumber = aoff + boff; 293 SWAP_PRINT(SW_ALLOC, "swap_getoff: allocating slot %ld\n", 294 slotnumber, 0, 0, 0, 0); 295 poff = ptob(slotnumber); 296 if (poff + sip->si_soff >= sip->si_eoff) 297 printf("ptob(aoff(%ld) + boff(%ld))(%ld) >= eoff(%ld)\n", 298 aoff, boff, ptob(slotnumber), (long)sip->si_eoff); 299 ASSERT(poff < sip->si_eoff); 300 /* 301 * We could verify here that the slot isn't already allocated 302 * by looking through all the anon slots. 303 */ 304 SETBIT(sip->si_swapslots, slotnumber); 305 sip->si_hint = slotnumber + 1; /* hint = next slot */ 306 return (poff + sip->si_soff); 307 } 308 309 /* 310 * Free a swap page. 311 */ 312 void 313 swap_phys_free(struct vnode *vp, u_offset_t off, size_t len) 314 { 315 struct swapinfo *sip; 316 ssize_t pagenumber, npage; 317 318 mutex_enter(&swapinfo_lock); 319 sip = swapinfo; 320 321 do { 322 if (sip->si_vp == vp && 323 sip->si_soff <= off && off < sip->si_eoff) { 324 for (pagenumber = btop(off - sip->si_soff), 325 npage = btop(len) + pagenumber; 326 pagenumber < npage; pagenumber++) { 327 SWAP_PRINT(SW_ALLOC, 328 "swap_phys_free: freeing slot %ld on " 329 "sip %p\n", 330 pagenumber, sip, 0, 0, 0); 331 if (!TESTBIT(sip->si_swapslots, pagenumber)) { 332 panic( 333 "swap_phys_free: freeing free slot " 334 "%p,%lx\n", (void *)vp, 335 ptob(pagenumber) + sip->si_soff); 336 } 337 CLEARBIT(sip->si_swapslots, pagenumber); 338 sip->si_nfpgs++; 339 } 340 ASSERT(sip->si_nfpgs <= sip->si_npgs); 341 mutex_exit(&swapinfo_lock); 342 return; 343 } 344 } while ((sip = sip->si_next) != NULL); 345 panic("swap_phys_free"); 346 /*NOTREACHED*/ 347 } 348 349 /* 350 * Return the anon struct corresponding for the given 351 * <vnode, off> if it is part of the virtual swap device. 352 * Return the anon struct if found, otherwise NULL. 
353 */ 354 struct anon * 355 swap_anon(struct vnode *vp, u_offset_t off) 356 { 357 struct anon *ap; 358 359 ASSERT(MUTEX_HELD(AH_MUTEX(vp, off))); 360 361 for (ap = anon_hash[ANON_HASH(vp, off)]; ap != NULL; ap = ap->an_hash) { 362 if (ap->an_vp == vp && ap->an_off == off) 363 return (ap); 364 } 365 return (NULL); 366 } 367 368 369 /* 370 * Determine if the vp offset range overlap a swap device. 371 */ 372 int 373 swap_in_range(struct vnode *vp, u_offset_t offset, size_t len) 374 { 375 struct swapinfo *sip; 376 u_offset_t eoff; 377 378 eoff = offset + len; 379 ASSERT(eoff > offset); 380 381 mutex_enter(&swapinfo_lock); 382 sip = swapinfo; 383 if (vp && sip) { 384 do { 385 if (vp != sip->si_vp || eoff <= sip->si_soff || 386 offset >= sip->si_eoff) 387 continue; 388 mutex_exit(&swapinfo_lock); 389 return (1); 390 } while ((sip = sip->si_next) != NULL); 391 } 392 mutex_exit(&swapinfo_lock); 393 return (0); 394 } 395 396 /* 397 * See if name is one of our swap files 398 * even though lookupname failed. 399 * This can be used by swapdel to delete 400 * swap resources on remote machines 401 * where the link has gone down. 402 */ 403 static struct vnode * 404 swapdel_byname( 405 char *name, /* pathname to delete */ 406 ulong_t lowblk) /* Low block number of area to delete */ 407 { 408 struct swapinfo **sipp, *osip; 409 u_offset_t soff; 410 411 /* 412 * Find the swap file entry for the file to 413 * be deleted. Skip any entries that are in 414 * transition. 
415 */ 416 417 soff = ptob(btopr(lowblk << SCTRSHFT)); /* must be page aligned */ 418 419 mutex_enter(&swapinfo_lock); 420 for (sipp = &swapinfo; (osip = *sipp) != NULL; sipp = &osip->si_next) { 421 if ((strcmp(osip->si_pname, name) == 0) && 422 (osip->si_soff == soff) && (osip->si_flags == 0)) { 423 struct vnode *vp = osip->si_vp; 424 425 VN_HOLD(vp); 426 mutex_exit(&swapinfo_lock); 427 return (vp); 428 } 429 } 430 mutex_exit(&swapinfo_lock); 431 return (NULL); 432 } 433 434 435 /* 436 * New system call to manipulate swap files. 437 */ 438 int 439 swapctl(int sc_cmd, void *sc_arg, int *rv) 440 { 441 struct swapinfo *sip, *csip, *tsip; 442 int error = 0; 443 struct swapent st, *ust; 444 struct swapres sr; 445 struct vnode *vp; 446 int cnt = 0; 447 int tmp_nswapfiles; 448 int nswap; 449 int length, nlen; 450 int gplen = 0, plen; 451 char *swapname; 452 char *pname; 453 char *tpname; 454 struct anoninfo ai; 455 spgcnt_t avail; 456 int global = INGLOBALZONE(curproc); 457 struct zone *zp = curproc->p_zone; 458 459 /* 460 * When running in a zone we want to hide the details of the swap 461 * devices: we report there only being one swap device named "swap" 462 * having a size equal to the sum of the sizes of all real swap devices 463 * on the system. 
464 */ 465 switch (sc_cmd) { 466 case SC_GETNSWP: 467 if (global) 468 *rv = nswapfiles; 469 else 470 *rv = 1; 471 return (0); 472 473 case SC_AINFO: 474 /* 475 * Return anoninfo information with these changes: 476 * ani_max = maximum amount of swap space 477 * (including potentially available physical memory) 478 * ani_free = amount of unallocated anonymous memory 479 * (some of which might be reserved and including 480 * potentially available physical memory) 481 * ani_resv = amount of claimed (reserved) anonymous memory 482 */ 483 avail = MAX((spgcnt_t)(availrmem - swapfs_minfree), 0); 484 ai.ani_max = (k_anoninfo.ani_max + 485 k_anoninfo.ani_mem_resv) + avail; 486 487 /* Update ani_free */ 488 set_anoninfo(); 489 ai.ani_free = k_anoninfo.ani_free + avail; 490 491 ai.ani_resv = k_anoninfo.ani_phys_resv + 492 k_anoninfo.ani_mem_resv; 493 494 if (!global && zp->zone_max_swap_ctl != UINT64_MAX) { 495 /* 496 * We're in a non-global zone with a swap cap. We 497 * always report the system-wide values for the global 498 * zone, even though it too can have a swap cap. 499 */ 500 501 /* 502 * For a swap-capped zone, the numbers are contrived 503 * since we don't have a correct value of 'reserved' 504 * for the zone. 505 * 506 * The ani_max value is always the zone's swap cap. 507 * 508 * The ani_free value is always the difference between 509 * the cap and the amount of swap in use by the zone. 510 * 511 * The ani_resv value is typically set to be the amount 512 * of swap in use by the zone, but can be adjusted 513 * upwards to indicate how much swap is currently 514 * unavailable to that zone due to usage by entities 515 * outside the zone. 516 * 517 * This works as follows. 
518 * 519 * In the 'swap -s' output, the data is displayed 520 * as follows: 521 * allocated = ani_max - ani_free 522 * reserved = ani_resv - allocated 523 * available = ani_max - ani_resv 524 * 525 * Taking a contrived example, if the swap cap is 100 526 * and the amount of swap used by the zone is 75, this 527 * gives: 528 * allocated = ani_max - ani_free = 100 - 25 = 75 529 * reserved = ani_resv - allocated = 75 - 75 = 0 530 * available = ani_max - ani_resv = 100 - 75 = 25 531 * 532 * In this typical case, you can see that the 'swap -s' 533 * 'reserved' will always be 0 inside a swap capped 534 * zone. 535 * 536 * However, if the system as a whole has less free 537 * swap than the zone limits allow, then we adjust 538 * the ani_resv value up so that it is the difference 539 * between the zone cap and the amount of free system 540 * swap. Taking the above example, but when the 541 * system as a whole only has 20 of swap available, we 542 * get an ani_resv of 100 - 20 = 80. This gives: 543 * allocated = ani_max - ani_free = 100 - 25 = 75 544 * reserved = ani_resv - allocated = 80 - 75 = 5 545 * available = ani_max - ani_resv = 100 - 80 = 20 546 * 547 * In this case, you can see how the ani_resv value is 548 * tweaked up to make the 'swap -s' numbers work inside 549 * the zone. 550 */ 551 rctl_qty_t cap, used; 552 pgcnt_t pgcap, sys_avail; 553 554 mutex_enter(&zp->zone_mem_lock); 555 cap = zp->zone_max_swap_ctl; 556 used = zp->zone_max_swap; 557 mutex_exit(&zp->zone_mem_lock); 558 559 pgcap = MIN(btop(cap), ai.ani_max); 560 ai.ani_free = pgcap - btop(used); 561 562 /* Get the system-wide swap currently available. 
*/ 563 sys_avail = ai.ani_max - ai.ani_resv; 564 if (sys_avail < ai.ani_free) 565 ai.ani_resv = pgcap - sys_avail; 566 else 567 ai.ani_resv = btop(used); 568 569 ai.ani_max = pgcap; 570 } 571 572 if (copyout(&ai, sc_arg, sizeof (struct anoninfo)) != 0) 573 return (EFAULT); 574 return (0); 575 576 case SC_LIST: 577 if (copyin(sc_arg, &length, sizeof (int)) != 0) 578 return (EFAULT); 579 if (!global) { 580 struct swapent st; 581 char *swappath = "swap"; 582 583 if (length < 1) 584 return (ENOMEM); 585 ust = (swapent_t *)((swaptbl_t *)sc_arg)->swt_ent; 586 if (copyin(ust, &st, sizeof (swapent_t)) != 0) 587 return (EFAULT); 588 st.ste_start = PAGESIZE >> SCTRSHFT; 589 st.ste_length = (off_t)0; 590 st.ste_pages = 0; 591 st.ste_free = 0; 592 st.ste_flags = 0; 593 594 mutex_enter(&swapinfo_lock); 595 for (sip = swapinfo, nswap = 0; 596 sip != NULL && nswap < nswapfiles; 597 sip = sip->si_next, nswap++) { 598 st.ste_length += 599 (sip->si_eoff - sip->si_soff) >> SCTRSHFT; 600 st.ste_pages += sip->si_npgs; 601 st.ste_free += sip->si_nfpgs; 602 } 603 mutex_exit(&swapinfo_lock); 604 605 if (zp->zone_max_swap_ctl != UINT64_MAX) { 606 rctl_qty_t cap, used; 607 608 mutex_enter(&zp->zone_mem_lock); 609 cap = zp->zone_max_swap_ctl; 610 used = zp->zone_max_swap; 611 mutex_exit(&zp->zone_mem_lock); 612 613 st.ste_length = MIN(cap, st.ste_length); 614 st.ste_pages = MIN(btop(cap), st.ste_pages); 615 st.ste_free = MIN(st.ste_pages - btop(used), 616 st.ste_free); 617 } 618 619 if (copyout(&st, ust, sizeof (swapent_t)) != 0 || 620 copyout(swappath, st.ste_path, 621 strlen(swappath) + 1) != 0) { 622 return (EFAULT); 623 } 624 *rv = 1; 625 return (0); 626 } 627 beginning: 628 tmp_nswapfiles = nswapfiles; 629 /* Return an error if not enough space for the whole table. */ 630 if (length < tmp_nswapfiles) 631 return (ENOMEM); 632 /* 633 * Get memory to hold the swap entries and their names. We'll 634 * copy the real entries into these and then copy these out. 
635 * Allocating the pathname memory is only a guess so we may 636 * find that we need more and have to do it again. 637 * All this is because we have to hold the anon lock while 638 * traversing the swapinfo list, and we can't be doing copyouts 639 * and/or kmem_alloc()s during this. 640 */ 641 csip = kmem_zalloc(tmp_nswapfiles * sizeof (struct swapinfo), 642 KM_SLEEP); 643 retry: 644 nlen = tmp_nswapfiles * (gplen += 100); 645 pname = kmem_zalloc(nlen, KM_SLEEP); 646 647 mutex_enter(&swapinfo_lock); 648 649 if (tmp_nswapfiles != nswapfiles) { 650 mutex_exit(&swapinfo_lock); 651 kmem_free(pname, nlen); 652 kmem_free(csip, 653 tmp_nswapfiles * sizeof (struct swapinfo)); 654 gplen = 0; 655 goto beginning; 656 } 657 for (sip = swapinfo, tsip = csip, tpname = pname, nswap = 0; 658 sip && nswap < tmp_nswapfiles; 659 sip = sip->si_next, tsip++, tpname += plen, nswap++) { 660 plen = sip->si_pnamelen; 661 if (tpname + plen - pname > nlen) { 662 mutex_exit(&swapinfo_lock); 663 kmem_free(pname, nlen); 664 goto retry; 665 } 666 *tsip = *sip; 667 tsip->si_pname = tpname; 668 (void) strcpy(tsip->si_pname, sip->si_pname); 669 } 670 mutex_exit(&swapinfo_lock); 671 672 if (sip) { 673 error = ENOMEM; 674 goto lout; 675 } 676 ust = (swapent_t *)((swaptbl_t *)sc_arg)->swt_ent; 677 for (tsip = csip, cnt = 0; cnt < nswap; tsip++, ust++, cnt++) { 678 if (copyin(ust, &st, sizeof (swapent_t)) != 0) { 679 error = EFAULT; 680 goto lout; 681 } 682 st.ste_flags = tsip->si_flags; 683 st.ste_length = 684 (tsip->si_eoff - tsip->si_soff) >> SCTRSHFT; 685 st.ste_start = tsip->si_soff >> SCTRSHFT; 686 st.ste_pages = tsip->si_npgs; 687 st.ste_free = tsip->si_nfpgs; 688 if (copyout(&st, ust, sizeof (swapent_t)) != 0) { 689 error = EFAULT; 690 goto lout; 691 } 692 if (!tsip->si_pnamelen) 693 continue; 694 if (copyout(tsip->si_pname, st.ste_path, 695 tsip->si_pnamelen) != 0) { 696 error = EFAULT; 697 goto lout; 698 } 699 } 700 *rv = nswap; 701 lout: 702 kmem_free(csip, tmp_nswapfiles * sizeof (struct 
swapinfo)); 703 kmem_free(pname, nlen); 704 return (error); 705 706 case SC_ADD: 707 case SC_REMOVE: 708 break; 709 default: 710 return (EINVAL); 711 } 712 if ((error = secpolicy_swapctl(CRED())) != 0) 713 return (error); 714 715 if (copyin(sc_arg, &sr, sizeof (swapres_t))) 716 return (EFAULT); 717 718 /* Allocate the space to read in pathname */ 719 if ((swapname = kmem_alloc(MAXPATHLEN, KM_NOSLEEP)) == NULL) 720 return (ENOMEM); 721 722 error = copyinstr(sr.sr_name, swapname, MAXPATHLEN, 0); 723 if (error) 724 goto out; 725 726 error = lookupname(swapname, UIO_SYSSPACE, FOLLOW, NULLVPP, &vp); 727 if (error) { 728 if (sc_cmd == SC_ADD) 729 goto out; 730 /* see if we match by name */ 731 vp = swapdel_byname(swapname, (size_t)sr.sr_start); 732 if (vp == NULL) 733 goto out; 734 } 735 736 if (vp->v_flag & (VNOMAP | VNOSWAP)) { 737 VN_RELE(vp); 738 error = ENOSYS; 739 goto out; 740 } 741 switch (vp->v_type) { 742 case VBLK: 743 break; 744 745 case VREG: 746 if (vp->v_vfsp && vn_is_readonly(vp)) 747 error = EROFS; 748 else 749 error = VOP_ACCESS(vp, VREAD|VWRITE, 0, CRED(), NULL); 750 break; 751 752 case VDIR: 753 error = EISDIR; 754 break; 755 default: 756 error = ENOSYS; 757 break; 758 } 759 if (error == 0) { 760 if (sc_cmd == SC_REMOVE) 761 error = swapdel(vp, sr.sr_start); 762 else 763 error = swapadd(vp, sr.sr_start, 764 sr.sr_length, swapname); 765 } 766 VN_RELE(vp); 767 out: 768 kmem_free(swapname, MAXPATHLEN); 769 return (error); 770 } 771 772 #if defined(_LP64) && defined(_SYSCALL32) 773 774 int 775 swapctl32(int sc_cmd, void *sc_arg, int *rv) 776 { 777 struct swapinfo *sip, *csip, *tsip; 778 int error = 0; 779 struct swapent32 st, *ust; 780 struct swapres32 sr; 781 struct vnode *vp; 782 int cnt = 0; 783 int tmp_nswapfiles; 784 int nswap; 785 int length, nlen; 786 int gplen = 0, plen; 787 char *swapname; 788 char *pname; 789 char *tpname; 790 struct anoninfo32 ai; 791 size_t s; 792 spgcnt_t avail; 793 int global = INGLOBALZONE(curproc); 794 struct zone *zp = 
curproc->p_zone; 795 796 /* 797 * When running in a zone we want to hide the details of the swap 798 * devices: we report there only being one swap device named "swap" 799 * having a size equal to the sum of the sizes of all real swap devices 800 * on the system. 801 */ 802 switch (sc_cmd) { 803 case SC_GETNSWP: 804 if (global) 805 *rv = nswapfiles; 806 else 807 *rv = 1; 808 return (0); 809 810 case SC_AINFO: 811 /* 812 * Return anoninfo information with these changes: 813 * ani_max = maximum amount of swap space 814 * (including potentially available physical memory) 815 * ani_free = amount of unallocated anonymous memory 816 * (some of which might be reserved and including 817 * potentially available physical memory) 818 * ani_resv = amount of claimed (reserved) anonymous memory 819 */ 820 avail = MAX((spgcnt_t)(availrmem - swapfs_minfree), 0); 821 s = (k_anoninfo.ani_max + k_anoninfo.ani_mem_resv) + avail; 822 if (s > UINT32_MAX) 823 return (EOVERFLOW); 824 ai.ani_max = s; 825 826 /* Update ani_free */ 827 set_anoninfo(); 828 s = k_anoninfo.ani_free + avail; 829 if (s > UINT32_MAX) 830 return (EOVERFLOW); 831 ai.ani_free = s; 832 833 s = k_anoninfo.ani_phys_resv + k_anoninfo.ani_mem_resv; 834 if (s > UINT32_MAX) 835 return (EOVERFLOW); 836 ai.ani_resv = s; 837 838 if (!global && zp->zone_max_swap_ctl != UINT64_MAX) { 839 /* 840 * We're in a non-global zone with a swap cap. We 841 * always report the system-wide values for the global 842 * zone, even though it too can have a swap cap. 843 * See the comment for the SC_AINFO case in swapctl() 844 * which explains the following logic. 845 */ 846 rctl_qty_t cap, used; 847 pgcnt_t pgcap, sys_avail; 848 849 mutex_enter(&zp->zone_mem_lock); 850 cap = zp->zone_max_swap_ctl; 851 used = zp->zone_max_swap; 852 mutex_exit(&zp->zone_mem_lock); 853 854 pgcap = MIN(btop(cap), ai.ani_max); 855 ai.ani_free = pgcap - btop(used); 856 857 /* Get the system-wide swap currently available. 
*/ 858 sys_avail = ai.ani_max - ai.ani_resv; 859 if (sys_avail < ai.ani_free) 860 ai.ani_resv = pgcap - sys_avail; 861 else 862 ai.ani_resv = btop(used); 863 864 ai.ani_max = pgcap; 865 } 866 867 if (copyout(&ai, sc_arg, sizeof (ai)) != 0) 868 return (EFAULT); 869 return (0); 870 871 case SC_LIST: 872 if (copyin(sc_arg, &length, sizeof (int32_t)) != 0) 873 return (EFAULT); 874 if (!global) { 875 struct swapent32 st; 876 char *swappath = "swap"; 877 878 if (length < 1) 879 return (ENOMEM); 880 ust = (swapent32_t *)((swaptbl32_t *)sc_arg)->swt_ent; 881 if (copyin(ust, &st, sizeof (swapent32_t)) != 0) 882 return (EFAULT); 883 st.ste_start = PAGESIZE >> SCTRSHFT; 884 st.ste_length = (off_t)0; 885 st.ste_pages = 0; 886 st.ste_free = 0; 887 st.ste_flags = 0; 888 889 mutex_enter(&swapinfo_lock); 890 for (sip = swapinfo, nswap = 0; 891 sip != NULL && nswap < nswapfiles; 892 sip = sip->si_next, nswap++) { 893 st.ste_length += 894 (sip->si_eoff - sip->si_soff) >> SCTRSHFT; 895 st.ste_pages += sip->si_npgs; 896 st.ste_free += sip->si_nfpgs; 897 } 898 mutex_exit(&swapinfo_lock); 899 900 if (zp->zone_max_swap_ctl != UINT64_MAX) { 901 rctl_qty_t cap, used; 902 903 mutex_enter(&zp->zone_mem_lock); 904 cap = zp->zone_max_swap_ctl; 905 used = zp->zone_max_swap; 906 mutex_exit(&zp->zone_mem_lock); 907 908 st.ste_length = MIN(cap, st.ste_length); 909 st.ste_pages = MIN(btop(cap), st.ste_pages); 910 st.ste_free = MIN(st.ste_pages - btop(used), 911 st.ste_free); 912 } 913 914 if (copyout(&st, ust, sizeof (swapent32_t)) != 0 || 915 copyout(swappath, (caddr_t)(uintptr_t)st.ste_path, 916 strlen(swappath) + 1) != 0) { 917 return (EFAULT); 918 } 919 *rv = 1; 920 return (0); 921 } 922 beginning: 923 tmp_nswapfiles = nswapfiles; 924 /* Return an error if not enough space for the whole table. */ 925 if (length < tmp_nswapfiles) 926 return (ENOMEM); 927 /* 928 * Get memory to hold the swap entries and their names. We'll 929 * copy the real entries into these and then copy these out. 
930 * Allocating the pathname memory is only a guess so we may 931 * find that we need more and have to do it again. 932 * All this is because we have to hold the anon lock while 933 * traversing the swapinfo list, and we can't be doing copyouts 934 * and/or kmem_alloc()s during this. 935 */ 936 csip = kmem_zalloc(tmp_nswapfiles * sizeof (*csip), KM_SLEEP); 937 retry: 938 nlen = tmp_nswapfiles * (gplen += 100); 939 pname = kmem_zalloc(nlen, KM_SLEEP); 940 941 mutex_enter(&swapinfo_lock); 942 943 if (tmp_nswapfiles != nswapfiles) { 944 mutex_exit(&swapinfo_lock); 945 kmem_free(pname, nlen); 946 kmem_free(csip, tmp_nswapfiles * sizeof (*csip)); 947 gplen = 0; 948 goto beginning; 949 } 950 for (sip = swapinfo, tsip = csip, tpname = pname, nswap = 0; 951 (sip != NULL) && (nswap < tmp_nswapfiles); 952 sip = sip->si_next, tsip++, tpname += plen, nswap++) { 953 plen = sip->si_pnamelen; 954 if (tpname + plen - pname > nlen) { 955 mutex_exit(&swapinfo_lock); 956 kmem_free(pname, nlen); 957 goto retry; 958 } 959 *tsip = *sip; 960 tsip->si_pname = tpname; 961 (void) strcpy(tsip->si_pname, sip->si_pname); 962 } 963 mutex_exit(&swapinfo_lock); 964 965 if (sip != NULL) { 966 error = ENOMEM; 967 goto lout; 968 } 969 ust = (swapent32_t *)((swaptbl32_t *)sc_arg)->swt_ent; 970 for (tsip = csip, cnt = 0; cnt < nswap; tsip++, ust++, cnt++) { 971 if (copyin(ust, &st, sizeof (*ust)) != 0) { 972 error = EFAULT; 973 goto lout; 974 } 975 st.ste_flags = tsip->si_flags; 976 st.ste_length = 977 (tsip->si_eoff - tsip->si_soff) >> SCTRSHFT; 978 st.ste_start = tsip->si_soff >> SCTRSHFT; 979 st.ste_pages = tsip->si_npgs; 980 st.ste_free = tsip->si_nfpgs; 981 if (copyout(&st, ust, sizeof (st)) != 0) { 982 error = EFAULT; 983 goto lout; 984 } 985 if (!tsip->si_pnamelen) 986 continue; 987 if (copyout(tsip->si_pname, 988 (caddr_t)(uintptr_t)st.ste_path, 989 tsip->si_pnamelen) != 0) { 990 error = EFAULT; 991 goto lout; 992 } 993 } 994 *rv = nswap; 995 lout: 996 kmem_free(csip, tmp_nswapfiles * sizeof 
(*csip)); 997 kmem_free(pname, nlen); 998 return (error); 999 1000 case SC_ADD: 1001 case SC_REMOVE: 1002 break; 1003 default: 1004 return (EINVAL); 1005 } 1006 if ((error = secpolicy_swapctl(CRED())) != 0) 1007 return (error); 1008 1009 if (copyin(sc_arg, &sr, sizeof (sr))) 1010 return (EFAULT); 1011 1012 /* Allocate the space to read in pathname */ 1013 if ((swapname = kmem_alloc(MAXPATHLEN, KM_NOSLEEP)) == NULL) 1014 return (ENOMEM); 1015 1016 error = copyinstr((caddr_t)(uintptr_t)sr.sr_name, 1017 swapname, MAXPATHLEN, NULL); 1018 if (error) 1019 goto out; 1020 1021 error = lookupname(swapname, UIO_SYSSPACE, FOLLOW, NULLVPP, &vp); 1022 if (error) { 1023 if (sc_cmd == SC_ADD) 1024 goto out; 1025 /* see if we match by name */ 1026 vp = swapdel_byname(swapname, (uint_t)sr.sr_start); 1027 if (vp == NULL) 1028 goto out; 1029 } 1030 1031 if (vp->v_flag & (VNOMAP | VNOSWAP)) { 1032 VN_RELE(vp); 1033 error = ENOSYS; 1034 goto out; 1035 } 1036 switch (vp->v_type) { 1037 case VBLK: 1038 break; 1039 1040 case VREG: 1041 if (vp->v_vfsp && vn_is_readonly(vp)) 1042 error = EROFS; 1043 else 1044 error = VOP_ACCESS(vp, VREAD|VWRITE, 0, CRED(), NULL); 1045 break; 1046 1047 case VDIR: 1048 error = EISDIR; 1049 break; 1050 default: 1051 error = ENOSYS; 1052 break; 1053 } 1054 if (error == 0) { 1055 if (sc_cmd == SC_REMOVE) 1056 error = swapdel(vp, sr.sr_start); 1057 else 1058 error = swapadd(vp, sr.sr_start, sr.sr_length, 1059 swapname); 1060 } 1061 VN_RELE(vp); 1062 out: 1063 kmem_free(swapname, MAXPATHLEN); 1064 return (error); 1065 } 1066 1067 #endif /* _LP64 && _SYSCALL32 */ 1068 1069 /* 1070 * Add a new swap file. 
1071 */ 1072 int 1073 swapadd(struct vnode *vp, ulong_t lowblk, ulong_t nblks, char *swapname) 1074 { 1075 struct swapinfo **sipp, *nsip = NULL, *esip = NULL; 1076 struct vnode *cvp; 1077 struct vattr vattr; 1078 pgcnt_t pages; 1079 u_offset_t soff, eoff; 1080 int error; 1081 ssize_t i, start, end; 1082 ushort_t wasswap; 1083 ulong_t startblk; 1084 size_t returned_mem; 1085 1086 SWAP_PRINT(SW_CTL, "swapadd: vp %p lowblk %ld nblks %ld swapname %s\n", 1087 vp, lowblk, nblks, swapname, 0); 1088 /* 1089 * Get the real vnode. (If vp is not a specnode it just returns vp, so 1090 * it does the right thing, but having this code know about specnodes 1091 * violates the spirit of having it be indepedent of vnode type.) 1092 */ 1093 cvp = common_specvp(vp); 1094 1095 /* 1096 * Or in VISSWAP so file system has chance to deny swap-ons during open. 1097 */ 1098 mutex_enter(&cvp->v_lock); 1099 wasswap = cvp->v_flag & VISSWAP; 1100 cvp->v_flag |= VISSWAP; 1101 mutex_exit(&cvp->v_lock); 1102 1103 mutex_enter(&swap_lock); 1104 if (error = VOP_OPEN(&cvp, FREAD|FWRITE, CRED(), NULL)) { 1105 mutex_exit(&swap_lock); 1106 /* restore state of v_flag */ 1107 if (!wasswap) { 1108 mutex_enter(&cvp->v_lock); 1109 cvp->v_flag &= ~VISSWAP; 1110 mutex_exit(&cvp->v_lock); 1111 } 1112 return (error); 1113 } 1114 mutex_exit(&swap_lock); 1115 1116 /* 1117 * Get partition size. Return error if empty partition, 1118 * or if request does not fit within the partition. 1119 * If this is the first swap device, we can reduce 1120 * the size of the swap area to match what is 1121 * available. This can happen if the system was built 1122 * on a machine with a different size swap partition. 1123 */ 1124 vattr.va_mask = AT_SIZE; 1125 if (error = VOP_GETATTR(cvp, &vattr, ATTR_COMM, CRED(), NULL)) 1126 goto out; 1127 1128 /* 1129 * Specfs returns a va_size of MAXOFFSET_T (UNKNOWN_SIZE) when the 1130 * size of the device can't be determined. 
1131 */ 1132 if ((vattr.va_size == 0) || (vattr.va_size == MAXOFFSET_T)) { 1133 error = EINVAL; 1134 goto out; 1135 } 1136 1137 #ifdef _ILP32 1138 /* 1139 * No support for large swap in 32-bit OS, if the size of the swap is 1140 * bigger than MAXOFF32_T then the size used by swapfs must be limited. 1141 * This limitation is imposed by the swap subsystem itself, a D_64BIT 1142 * driver as the target of swap operation should be able to field 1143 * the IO. 1144 */ 1145 if (vattr.va_size > MAXOFF32_T) { 1146 cmn_err(CE_NOTE, 1147 "!swap device %s truncated from 0x%llx to 0x%x bytes", 1148 swapname, vattr.va_size, MAXOFF32_T); 1149 vattr.va_size = MAXOFF32_T; 1150 } 1151 #endif /* _ILP32 */ 1152 1153 /* Fail if file not writeable (try to set size to current size) */ 1154 vattr.va_mask = AT_SIZE; 1155 if (error = VOP_SETATTR(cvp, &vattr, 0, CRED(), NULL)) 1156 goto out; 1157 1158 /* Fail if fs does not support VOP_PAGEIO */ 1159 error = VOP_PAGEIO(cvp, (page_t *)NULL, (u_offset_t)0, 0, 0, CRED(), 1160 NULL); 1161 1162 if (error == ENOSYS) 1163 goto out; 1164 else 1165 error = 0; 1166 /* 1167 * If swapping on the root filesystem don't put swap blocks that 1168 * correspond to the miniroot filesystem on the swap free list. 1169 */ 1170 if (cvp == rootdir) 1171 startblk = roundup(MINIROOTSIZE<<SCTRSHFT, klustsize)>>SCTRSHFT; 1172 else /* Skip 1st page (disk label) */ 1173 startblk = (ulong_t)(lowblk ? lowblk : 1); 1174 1175 soff = startblk << SCTRSHFT; 1176 if (soff >= vattr.va_size) { 1177 error = EINVAL; 1178 goto out; 1179 } 1180 1181 /* 1182 * If user specified 0 blks, use the size of the device 1183 */ 1184 eoff = nblks ? soff + (nblks - (startblk - lowblk) << SCTRSHFT) : 1185 vattr.va_size; 1186 1187 SWAP_PRINT(SW_CTL, "swapadd: va_size %ld soff %ld eoff %ld\n", 1188 vattr.va_size, soff, eoff, 0, 0); 1189 1190 if (eoff > vattr.va_size) { 1191 error = EINVAL; 1192 goto out; 1193 } 1194 1195 /* 1196 * The starting and ending offsets must be page aligned. 
1197 * Round soff up to next page boundary, round eoff 1198 * down to previous page boundary. 1199 */ 1200 soff = ptob(btopr(soff)); 1201 eoff = ptob(btop(eoff)); 1202 if (soff >= eoff) { 1203 SWAP_PRINT(SW_CTL, "swapadd: soff %ld >= eoff %ld\n", 1204 soff, eoff, 0, 0, 0); 1205 error = EINVAL; 1206 goto out; 1207 } 1208 1209 pages = btop(eoff - soff); 1210 1211 /* Allocate and partially set up the new swapinfo */ 1212 nsip = kmem_zalloc(sizeof (struct swapinfo), KM_SLEEP); 1213 nsip->si_vp = cvp; 1214 1215 nsip->si_soff = soff; 1216 nsip->si_eoff = eoff; 1217 nsip->si_hint = 0; 1218 nsip->si_checkcnt = nsip->si_alloccnt = 0; 1219 1220 nsip->si_pnamelen = (int)strlen(swapname) + 1; 1221 nsip->si_pname = (char *)kmem_zalloc(nsip->si_pnamelen, KM_SLEEP); 1222 bcopy(swapname, nsip->si_pname, nsip->si_pnamelen - 1); 1223 SWAP_PRINT(SW_CTL, "swapadd: allocating swapinfo for %s, %ld pages\n", 1224 swapname, pages, 0, 0, 0); 1225 /* 1226 * Size of swapslots map in bytes 1227 */ 1228 nsip->si_mapsize = P2ROUNDUP(pages, NBBW) / NBBY; 1229 nsip->si_swapslots = kmem_zalloc(nsip->si_mapsize, KM_SLEEP); 1230 1231 /* 1232 * Permanently set the bits that can't ever be allocated, 1233 * i.e. those from the ending offset to the round up slot for the 1234 * swapslots bit map. 1235 */ 1236 start = pages; 1237 end = P2ROUNDUP(pages, NBBW); 1238 for (i = start; i < end; i++) { 1239 SWAP_PRINT(SW_CTL, "swapadd: set bit for page %ld\n", i, 1240 0, 0, 0, 0); 1241 SETBIT(nsip->si_swapslots, i); 1242 } 1243 nsip->si_npgs = nsip->si_nfpgs = pages; 1244 /* 1245 * Now check to see if we can add it. We wait til now to check because 1246 * we need the swapinfo_lock and we don't want sleep with it (e.g., 1247 * during kmem_alloc()) while we're setting up the swapinfo. 
1248 */ 1249 mutex_enter(&swapinfo_lock); 1250 for (sipp = &swapinfo; (esip = *sipp) != NULL; sipp = &esip->si_next) { 1251 if (esip->si_vp == cvp) { 1252 if (esip->si_soff == soff && esip->si_npgs == pages && 1253 (esip->si_flags & ST_DOINGDEL)) { 1254 /* 1255 * We are adding a device that we are in the 1256 * middle of deleting. Just clear the 1257 * ST_DOINGDEL flag to signal this and 1258 * the deletion routine will eventually notice 1259 * it and add it back. 1260 */ 1261 esip->si_flags &= ~ST_DOINGDEL; 1262 mutex_exit(&swapinfo_lock); 1263 goto out; 1264 } 1265 /* disallow overlapping swap files */ 1266 if ((soff < esip->si_eoff) && (eoff > esip->si_soff)) { 1267 error = EEXIST; 1268 mutex_exit(&swapinfo_lock); 1269 goto out; 1270 } 1271 } 1272 } 1273 1274 nswapfiles++; 1275 1276 /* 1277 * add new swap device to list and shift allocations to it 1278 * before updating the anoninfo counters 1279 */ 1280 *sipp = nsip; 1281 silast = nsip; 1282 1283 /* 1284 * Update the total amount of reservable swap space 1285 * accounting properly for swap space from physical memory 1286 */ 1287 /* New swap device soaks up currently reserved memory swap */ 1288 mutex_enter(&anoninfo_lock); 1289 1290 ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap); 1291 ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); 1292 1293 k_anoninfo.ani_max += pages; 1294 ANI_ADD(pages); 1295 if (k_anoninfo.ani_mem_resv > k_anoninfo.ani_locked_swap) { 1296 returned_mem = MIN(k_anoninfo.ani_mem_resv - 1297 k_anoninfo.ani_locked_swap, 1298 k_anoninfo.ani_max - k_anoninfo.ani_phys_resv); 1299 1300 ANI_ADD(-returned_mem); 1301 k_anoninfo.ani_free -= returned_mem; 1302 k_anoninfo.ani_mem_resv -= returned_mem; 1303 k_anoninfo.ani_phys_resv += returned_mem; 1304 1305 mutex_enter(&freemem_lock); 1306 availrmem += returned_mem; 1307 mutex_exit(&freemem_lock); 1308 } 1309 /* 1310 * At boot time, to permit booting small memory machines using 1311 * only physical memory as swap space, we allowed a 
dangerously 1312 * large amount of memory to be used as swap space; now that 1313 * more physical backing store is available bump down the amount 1314 * we can get from memory to a safer size. 1315 */ 1316 if (swapfs_minfree < swapfs_desfree) { 1317 mutex_enter(&freemem_lock); 1318 if (availrmem > swapfs_desfree || !k_anoninfo.ani_mem_resv) 1319 swapfs_minfree = swapfs_desfree; 1320 mutex_exit(&freemem_lock); 1321 } 1322 1323 SWAP_PRINT(SW_CTL, "swapadd: ani_max %ld ani_free %ld\n", 1324 k_anoninfo.ani_free, k_anoninfo.ani_free, 0, 0, 0); 1325 1326 mutex_exit(&anoninfo_lock); 1327 1328 mutex_exit(&swapinfo_lock); 1329 1330 /* Initialize the dump device */ 1331 mutex_enter(&dump_lock); 1332 if (dumpvp == NULL) 1333 (void) dumpinit(vp, swapname, 0); 1334 mutex_exit(&dump_lock); 1335 1336 VN_HOLD(cvp); 1337 out: 1338 if (error || esip) { 1339 SWAP_PRINT(SW_CTL, "swapadd: error (%d)\n", error, 0, 0, 0, 0); 1340 1341 if (!wasswap) { 1342 mutex_enter(&cvp->v_lock); 1343 cvp->v_flag &= ~VISSWAP; 1344 mutex_exit(&cvp->v_lock); 1345 } 1346 if (nsip) { 1347 kmem_free(nsip->si_swapslots, (size_t)nsip->si_mapsize); 1348 kmem_free(nsip->si_pname, nsip->si_pnamelen); 1349 kmem_free(nsip, sizeof (*nsip)); 1350 } 1351 mutex_enter(&swap_lock); 1352 (void) VOP_CLOSE(cvp, FREAD|FWRITE, 1, (offset_t)0, CRED(), 1353 NULL); 1354 mutex_exit(&swap_lock); 1355 } 1356 return (error); 1357 } 1358 1359 /* 1360 * Delete a swap file. 1361 */ 1362 static int 1363 swapdel( 1364 struct vnode *vp, 1365 ulong_t lowblk) /* Low block number of area to delete. */ 1366 { 1367 struct swapinfo **sipp, *osip = NULL; 1368 struct vnode *cvp; 1369 u_offset_t soff; 1370 int error = 0; 1371 u_offset_t toff = 0; 1372 struct vnode *tvp = NULL; 1373 spgcnt_t pages; 1374 struct anon **app, *ap; 1375 kmutex_t *ahm; 1376 pgcnt_t adjust_swap = 0; 1377 1378 /* Find the swap file entry for the file to be deleted */ 1379 cvp = common_specvp(vp); 1380 1381 1382 lowblk = lowblk ? 
lowblk : 1; /* Skip first page (disk label) */ 1383 soff = ptob(btopr(lowblk << SCTRSHFT)); /* must be page aligned */ 1384 1385 mutex_enter(&swapinfo_lock); 1386 for (sipp = &swapinfo; (osip = *sipp) != NULL; sipp = &osip->si_next) { 1387 if ((osip->si_vp == cvp) && 1388 (osip->si_soff == soff) && (osip->si_flags == 0)) 1389 break; 1390 } 1391 1392 /* If the file was not found, error. */ 1393 if (osip == NULL) { 1394 error = EINVAL; 1395 mutex_exit(&swapinfo_lock); 1396 goto out; 1397 } 1398 1399 pages = osip->si_npgs; 1400 1401 /* 1402 * Do not delete if we will be low on swap pages. 1403 */ 1404 mutex_enter(&anoninfo_lock); 1405 1406 ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap); 1407 ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); 1408 1409 mutex_enter(&freemem_lock); 1410 if (((k_anoninfo.ani_max - k_anoninfo.ani_phys_resv) + 1411 MAX((spgcnt_t)(availrmem - swapfs_minfree), 0)) < pages) { 1412 mutex_exit(&freemem_lock); 1413 mutex_exit(&anoninfo_lock); 1414 error = ENOMEM; 1415 cmn_err(CE_WARN, "swapdel - too few free pages"); 1416 mutex_exit(&swapinfo_lock); 1417 goto out; 1418 } 1419 mutex_exit(&freemem_lock); 1420 1421 k_anoninfo.ani_max -= pages; 1422 1423 /* If needed, reserve memory swap to replace old device */ 1424 if (k_anoninfo.ani_phys_resv > k_anoninfo.ani_max) { 1425 adjust_swap = k_anoninfo.ani_phys_resv - k_anoninfo.ani_max; 1426 k_anoninfo.ani_phys_resv -= adjust_swap; 1427 k_anoninfo.ani_mem_resv += adjust_swap; 1428 mutex_enter(&freemem_lock); 1429 availrmem -= adjust_swap; 1430 mutex_exit(&freemem_lock); 1431 ANI_ADD(adjust_swap); 1432 } 1433 ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap); 1434 ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); 1435 mutex_exit(&anoninfo_lock); 1436 1437 ANI_ADD(-pages); 1438 1439 /* 1440 * Set the delete flag. This prevents anyone from allocating more 1441 * pages from this file. Also set ST_DOINGDEL. 
Someone who wants to 1442 * add the file back while we're deleting it will signify by clearing 1443 * this flag. 1444 */ 1445 osip->si_flags |= ST_INDEL|ST_DOINGDEL; 1446 mutex_exit(&swapinfo_lock); 1447 1448 /* 1449 * Free all the allocated physical slots for this file. We do this 1450 * by walking through the entire anon hash array, because we need 1451 * to update all the anon slots that have physical swap slots on 1452 * this file, and this is the only way to find them all. We go back 1453 * to the beginning of a bucket after each slot is freed because the 1454 * anonhash_lock is not held during the free and thus the hash table 1455 * may change under us. 1456 */ 1457 for (app = anon_hash; app < &anon_hash[ANON_HASH_SIZE]; app++) { 1458 ahm = &anonhash_lock[(app - anon_hash) & 1459 (AH_LOCK_SIZE - 1)].pad_mutex; 1460 mutex_enter(ahm); 1461 top: 1462 for (ap = *app; ap != NULL; ap = ap->an_hash) { 1463 if (ap->an_pvp == cvp && 1464 ap->an_poff >= osip->si_soff && 1465 ap->an_poff < osip->si_eoff) { 1466 ASSERT(TESTBIT(osip->si_swapslots, 1467 btop((size_t)(ap->an_poff - 1468 osip->si_soff)))); 1469 tvp = ap->an_vp; 1470 toff = ap->an_off; 1471 VN_HOLD(tvp); 1472 mutex_exit(ahm); 1473 1474 error = swapslot_free(tvp, toff, osip); 1475 1476 VN_RELE(tvp); 1477 mutex_enter(ahm); 1478 if (!error && (osip->si_flags & ST_DOINGDEL)) { 1479 goto top; 1480 } else { 1481 if (error) { 1482 cmn_err(CE_WARN, 1483 "swapslot_free failed %d", 1484 error); 1485 } 1486 1487 /* 1488 * Add device back before making it 1489 * visible. 
1490 */ 1491 mutex_enter(&swapinfo_lock); 1492 osip->si_flags &= 1493 ~(ST_INDEL | ST_DOINGDEL); 1494 mutex_exit(&swapinfo_lock); 1495 1496 /* 1497 * Update the anon space available 1498 */ 1499 mutex_enter(&anoninfo_lock); 1500 1501 k_anoninfo.ani_phys_resv += adjust_swap; 1502 k_anoninfo.ani_mem_resv -= adjust_swap; 1503 k_anoninfo.ani_max += pages; 1504 1505 mutex_enter(&freemem_lock); 1506 availrmem += adjust_swap; 1507 mutex_exit(&freemem_lock); 1508 1509 mutex_exit(&anoninfo_lock); 1510 1511 ANI_ADD(pages); 1512 1513 mutex_exit(ahm); 1514 goto out; 1515 } 1516 } 1517 } 1518 mutex_exit(ahm); 1519 } 1520 1521 /* All done, they'd better all be free! */ 1522 mutex_enter(&swapinfo_lock); 1523 ASSERT(osip->si_nfpgs == osip->si_npgs); 1524 1525 /* Now remove it from the swapinfo list */ 1526 for (sipp = &swapinfo; *sipp != NULL; sipp = &(*sipp)->si_next) { 1527 if (*sipp == osip) 1528 break; 1529 } 1530 ASSERT(*sipp); 1531 *sipp = osip->si_next; 1532 if (silast == osip) 1533 if ((silast = osip->si_next) == NULL) 1534 silast = swapinfo; 1535 nswapfiles--; 1536 mutex_exit(&swapinfo_lock); 1537 1538 kmem_free(osip->si_swapslots, osip->si_mapsize); 1539 kmem_free(osip->si_pname, osip->si_pnamelen); 1540 kmem_free(osip, sizeof (*osip)); 1541 1542 mutex_enter(&dump_lock); 1543 if (cvp == dumpvp) 1544 dumpfini(); 1545 mutex_exit(&dump_lock); 1546 1547 /* Release the vnode */ 1548 1549 mutex_enter(&swap_lock); 1550 (void) VOP_CLOSE(cvp, FREAD|FWRITE, 1, (offset_t)0, CRED(), NULL); 1551 mutex_enter(&cvp->v_lock); 1552 cvp->v_flag &= ~VISSWAP; 1553 mutex_exit(&cvp->v_lock); 1554 VN_RELE(cvp); 1555 mutex_exit(&swap_lock); 1556 out: 1557 return (error); 1558 } 1559 1560 /* 1561 * Free up a physical swap slot on swapinfo sip, currently in use by the 1562 * anonymous page whose name is (vp, off). 
 */
static int
swapslot_free(
	struct vnode *vp,
	u_offset_t off,
	struct swapinfo *sip)
{
	struct page *pp = NULL;
	struct anon *ap = NULL;
	int error = 0;
	kmutex_t *ahm;
	struct vnode *pvp = NULL;
	u_offset_t poff;
	int alloc_pg = 0;	/* nonzero if we created the page ourselves */

	ASSERT(sip->si_vp != NULL);
	/*
	 * Get the page for the old swap slot if exists or create a new one.
	 */
again:
	if ((pp = page_lookup(vp, off, SE_SHARED)) == NULL) {
		pp = page_create_va(vp, off, PAGESIZE, PG_WAIT | PG_EXCL,
		    segkmap, NULL);
		/*
		 * PG_WAIT should make failure rare; retry the lookup in case
		 * someone else created the page while we raced.
		 */
		if (pp == NULL)
			goto again;
		alloc_pg = 1;

		error = swap_getphysname(vp, off, &pvp, &poff);
		if (error || pvp != sip->si_vp || poff < sip->si_soff ||
		    poff >= sip->si_eoff) {
			/*
			 * The anon slot is gone or its backing store no
			 * longer lives on this swap device: nothing to do.
			 * (pp came back from page_create_va io-locked —
			 * evidenced by the unlock without a prior lock.)
			 */
			page_io_unlock(pp);
			/*LINTED: constant in conditional context*/
			VN_DISPOSE(pp, B_INVAL, 0, kcred);
			return (0);
		}

		/* Read the slot's contents in so nothing is lost on free. */
		error = VOP_PAGEIO(pvp, pp, poff, PAGESIZE, B_READ,
		    CRED(), NULL);
		if (error) {
			page_io_unlock(pp);
			/*
			 * EFAULT is treated as success: swapdel's caller
			 * behavior suggests an unreadable slot is simply
			 * discarded rather than failing the whole delete.
			 */
			if (error == EFAULT)
				error = 0;
			/*LINTED: constant in conditional context*/
			VN_DISPOSE(pp, B_INVAL, 0, kcred);
			return (error);
		}
	}

	/*
	 * The anon could have been removed by anon_decref* and/or reallocated
	 * by anon layer (an_pvp == NULL) with the same vp, off.
	 * In this case the page which has been allocated needs to
	 * be freed.
	 */
	if (!alloc_pg)
		page_io_lock(pp);
	ahm = AH_MUTEX(vp, off);
	mutex_enter(ahm);
	ap = swap_anon(vp, off);
	if ((ap == NULL || ap->an_pvp == NULL) && alloc_pg) {
		mutex_exit(ahm);
		page_io_unlock(pp);
		/*LINTED: constant in conditional context*/
		VN_DISPOSE(pp, B_INVAL, 0, kcred);
		return (0);
	}

	/*
	 * Free the physical slot. It may have been freed up and replaced with
	 * another one while we were getting the page so we have to re-verify
	 * that this is really one we want. If we do free the slot we have
	 * to mark the page modified, as its backing store is now gone.
	 */
	if ((ap != NULL) && (ap->an_pvp == sip->si_vp && ap->an_poff >=
	    sip->si_soff && ap->an_poff < sip->si_eoff)) {
		swap_phys_free(ap->an_pvp, ap->an_poff, PAGESIZE);
		ap->an_pvp = NULL;
		ap->an_poff = 0;
		mutex_exit(ahm);
		/* Page now has no backing store; force it to be re-written. */
		hat_setmod(pp);
	} else {
		mutex_exit(ahm);
	}
	page_io_unlock(pp);
	page_unlock(pp);
	return (0);
}


/*
 * Get contig physical backing store for vp, in the range
 * [*offp, *offp + *lenp), May back a subrange of this, but must
 * always include the requested offset or fail. Returns the offsets
 * backed as [*offp, *offp + *lenp) and the physical offsets used to
 * back them from *pvpp in the range [*pstartp, *pstartp + *lenp).
 * Returns	0 for success
 *		SE_NOANON -- no anon slot for requested paged
 *		SE_NOSWAP -- no physical swap space available
 */
int
swap_newphysname(
	struct vnode *vp,
	u_offset_t offset,
	u_offset_t *offp,
	size_t *lenp,
	struct vnode **pvpp,
	u_offset_t *poffp)
{
	struct anon *ap = NULL;		/* anon slot for vp, off */
	int error = 0;
	struct vnode *pvp;
	u_offset_t poff, pstart, prem;
	size_t plen;
	u_offset_t off, start;
	kmutex_t *ahm;

	ASSERT(*offp <= offset && offset < *offp + *lenp);

	/* Get new physical swap slots. */
	plen = *lenp;
	if (!swap_phys_alloc(&pvp, &pstart, &plen, 0)) {
		/*
		 * No swap available so return error unless requested
		 * offset is already backed in which case return that.
		 */
		ahm = AH_MUTEX(vp, offset);
		mutex_enter(ahm);
		if ((ap = swap_anon(vp, offset)) == NULL) {
			error = SE_NOANON;
			mutex_exit(ahm);
			return (error);
		}
		error = (ap->an_pvp ? 0 : SE_NOSWAP);
		*offp = offset;
		*lenp = PAGESIZE;
		*pvpp = ap->an_pvp;
		*poffp = ap->an_poff;
		mutex_exit(ahm);
		return (error);
	}

	/*
	 * We got plen (<= *lenp) contig slots. Use these to back a
	 * subrange of [*offp, *offp + *lenp) which includes offset.
	 * For now we just put offset at the end of the kluster.
	 * Clearly there are other possible choices - which is best?
	 */
	start = MAX(*offp,
	    (offset + PAGESIZE > plen) ? (offset + PAGESIZE - plen) : 0);
	ASSERT(start + plen <= *offp + *lenp);

	for (off = start, poff = pstart; poff < pstart + plen;
	    off += PAGESIZE, poff += PAGESIZE) {
		ahm = AH_MUTEX(vp, off);
		mutex_enter(ahm);
		if ((ap = swap_anon(vp, off)) != NULL) {
			/* Free old slot if any, and assign new one */
			if (ap->an_pvp)
				swap_phys_free(ap->an_pvp, ap->an_poff,
				    PAGESIZE);
			ap->an_pvp = pvp;
			ap->an_poff = poff;
		} else {	/* No anon slot for a klustered page, quit. */
			prem = (pstart + plen) - poff;
			/* Already did requested page, do partial kluster */
			if (off > offset) {
				plen = poff - pstart;
				error = 0;
			/* Fail on requested page, error */
			} else if (off == offset) {
				error = SE_NOANON;
			/* Fail on prior page, fail on requested page, error */
			} else if ((ap = swap_anon(vp, offset)) == NULL) {
				error = SE_NOANON;
			/* Fail on prior page, got requested page, do only it */
			} else {
				/* Free old slot if any, and assign new one */
				if (ap->an_pvp)
					swap_phys_free(ap->an_pvp, ap->an_poff,
					    PAGESIZE);
				ap->an_pvp = pvp;
				ap->an_poff = poff;
				/* One page kluster */
				start = offset;
				plen = PAGESIZE;
				pstart = poff;
				poff += PAGESIZE;
				prem -= PAGESIZE;
			}
			/* Free unassigned slots */
			swap_phys_free(pvp, poff, prem);
			mutex_exit(ahm);
			break;
		}
		mutex_exit(ahm);
	}
	ASSERT(*offp <= start && start + plen <= *offp + *lenp);
	ASSERT(start <= offset && offset < start + plen);
	*offp = start;
	*lenp = plen;
	*pvpp = pvp;
	*poffp = pstart;
	return (error);
}


/*
 * Get the physical swap backing store location for a given anonymous page
 * named (vp, off). The backing store name is returned in (*pvpp, *poffp).
 * Returns	0 		success
 *		EIDRM --	no anon slot (page is not allocated)
 */
int
swap_getphysname(
	struct vnode *vp,
	u_offset_t off,
	struct vnode **pvpp,
	u_offset_t *poffp)
{
	struct anon *ap;
	int error = 0;
	kmutex_t *ahm;

	ahm = AH_MUTEX(vp, off);
	mutex_enter(ahm);

	/* Get anon slot for vp, off */
	ap = swap_anon(vp, off);
	if (ap == NULL) {
		error = EIDRM;
		goto out;
	}
	/*
	 * Note: an_pvp may be NULL (no physical backing assigned yet);
	 * callers such as swapslot_free() check the returned vnode.
	 */
	*pvpp = ap->an_pvp;
	*poffp = ap->an_poff;
out:
	mutex_exit(ahm);
	return (error);
}