1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2015 Joyent, Inc. 24 */ 25 26 /* 27 * Copyright (c) 1987, 2010, Oracle and/or its affiliates. All rights reserved. 28 */ 29 30 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ 31 /* All Rights Reserved */ 32 33 /* 34 * University Copyright- Copyright (c) 1982, 1986, 1988 35 * The Regents of the University of California 36 * All Rights Reserved 37 * 38 * University Acknowledgment- Portions of this document are derived from 39 * software developed by the University of California, Berkeley, and its 40 * contributors. 41 */ 42 43 /* 44 * Each physical swap area has an associated bitmap representing 45 * its physical storage. The bitmap records which swap slots are 46 * currently allocated or freed. Allocation is done by searching 47 * through the bitmap for the first free slot. Thus, there's 48 * no linear relation between offset within the swap device and the 49 * address (within its segment(s)) of the page that the slot backs; 50 * instead, it's an arbitrary one-to-one mapping. 51 * 52 * Associated with each swap area is a swapinfo structure. 
These 53 * structures are linked into a linear list that determines the 54 * ordering of swap areas in the logical swap device. Each contains a 55 * pointer to the corresponding bitmap, the area's size, and its 56 * associated vnode. 57 */ 58 59 #include <sys/types.h> 60 #include <sys/inttypes.h> 61 #include <sys/param.h> 62 #include <sys/t_lock.h> 63 #include <sys/sysmacros.h> 64 #include <sys/systm.h> 65 #include <sys/errno.h> 66 #include <sys/kmem.h> 67 #include <sys/vfs.h> 68 #include <sys/vnode.h> 69 #include <sys/pathname.h> 70 #include <sys/cmn_err.h> 71 #include <sys/vtrace.h> 72 #include <sys/swap.h> 73 #include <sys/dumphdr.h> 74 #include <sys/debug.h> 75 #include <sys/fs/snode.h> 76 #include <sys/fs/swapnode.h> 77 #include <sys/policy.h> 78 #include <sys/zone.h> 79 80 #include <vm/as.h> 81 #include <vm/seg.h> 82 #include <vm/page.h> 83 #include <vm/seg_vn.h> 84 #include <vm/hat.h> 85 #include <vm/anon.h> 86 #include <vm/seg_map.h> 87 88 /* 89 * To balance the load among multiple swap areas, we don't allow 90 * more than swap_maxcontig allocations to be satisfied from a 91 * single swap area before moving on to the next swap area. This 92 * effectively "interleaves" allocations among the many swap areas. 93 */ 94 int swap_maxcontig; /* set by anon_init() to 1 Mb */ 95 96 #define MINIROOTSIZE 12000 /* ~6 Meg XXX */ 97 98 /* 99 * XXX - this lock is a kludge. It serializes some aspects of swapadd() and 100 * swapdel() (namely VOP_OPEN, VOP_CLOSE, VN_RELE). It protects against 101 * somebody swapadd'ing and getting swap slots from a vnode, while someone 102 * else is in the process of closing or rele'ing it. 
 */
static kmutex_t swap_lock;

kmutex_t swapinfo_lock;

/*
 * Head of the linked list of physical swap areas; list order defines the
 * layout of the logical swap device.
 * protected by the swapinfo_lock
 */
extern struct swapinfo *swapinfo;

/* Round-robin pointer: the swap area the next allocation will try first. */
static struct swapinfo *silast;
/* Number of entries currently on the swapinfo list. */
static int nswapfiles;

static u_offset_t swap_getoff(struct swapinfo *);
static int swapadd(struct vnode *, ulong_t, ulong_t, char *);
static int swapdel(struct vnode *, ulong_t);
static int swapslot_free(struct vnode *, u_offset_t, struct swapinfo *);

/*
 * swap device bitmap allocation macros.
 * The bitmap is an array of 32-bit words (si_swapslots), one bit per
 * page-sized swap slot; a set bit means "allocated".  MAPSHIFT is
 * log2(bits per word).
 */
#define	MAPSHIFT	5
#define	NBBW	(NBPW * NBBY)	/* number of bits per word */
#define	TESTBIT(map, i)	(((map)[(i) >> MAPSHIFT] & (1 << (i) % NBBW)))
#define	SETBIT(map, i)	(((map)[(i) >> MAPSHIFT] |= (1 << (i) % NBBW)))
#define	CLEARBIT(map, i)	(((map)[(i) >> MAPSHIFT] &= ~(1 << (i) % NBBW)))

int swap_debug = 0;	/* set for debug printf's */
int swap_verify = 0;	/* set to verify slots when freeing and allocating */

uint_t swapalloc_maxcontig;

/*
 * Allocate a range of up to *lenp contiguous slots (page) from a physical
 * swap device. Flags are one of:
 *	SA_NOT	Must have a slot from a physical swap device other than the
 *		the one containing input (*vpp, *offp).
 * Fewer slots than requested may be returned. *lenp allocated slots are
 * returned starting at *offp on *vpp.
 * Returns 1 for a successful allocation, 0 for couldn't allocate any slots.
 *
 * swapinfo_lock is held across the whole allocation, protecting both the
 * swapinfo list and each area's bitmap/counters.
 */
int
swap_phys_alloc(
	struct vnode **vpp,
	u_offset_t *offp,
	size_t *lenp,
	uint_t flags)
{
	struct swapinfo *sip;
	offset_t soff, noff;
	size_t len;

	mutex_enter(&swapinfo_lock);
	sip = silast;

	/*
	 * Find a desirable physical device and allocate from it.
	 * Walk the list round-robin starting at silast, skipping areas
	 * that are being deleted (ST_INDEL) or have no free pages.
	 */
	do {
		if (sip == NULL)
			break;
		if (!(sip->si_flags & ST_INDEL) &&
		    (spgcnt_t)sip->si_nfpgs > 0) {
			/* Caller wants other than specified swap device */
			if (flags & SA_NOT) {
				if (*vpp != sip->si_vp ||
				    *offp < sip->si_soff ||
				    *offp >= sip->si_eoff)
					goto found;
			/* Caller is loose, will take anything */
			} else
				goto found;
		} else if (sip->si_nfpgs == 0)
			sip->si_allocs = 0;
		if ((sip = sip->si_next) == NULL)
			sip = swapinfo;
	} while (sip != silast);
	mutex_exit(&swapinfo_lock);
	return (0);
found:
	/*
	 * Note: si_nfpgs is decremented before the failure check; since a
	 * failed swap_getoff() panics here anyway, the counter can't leak.
	 */
	soff = swap_getoff(sip);
	sip->si_nfpgs--;
	if (soff == -1)
		panic("swap_alloc: swap_getoff failed!");

	/*
	 * Try to extend the allocation with physically contiguous slots,
	 * up to the caller's request and the swapalloc_maxcontig limit.
	 * A non-adjacent slot is given back immediately.
	 */
	for (len = PAGESIZE; len < *lenp; len += PAGESIZE) {
		if (sip->si_nfpgs == 0)
			break;
		if (swapalloc_maxcontig && len >= swapalloc_maxcontig)
			break;
		noff = swap_getoff(sip);
		if (noff == -1) {
			break;
		} else if (noff != soff + len) {
			CLEARBIT(sip->si_swapslots, btop(noff - sip->si_soff));
			break;
		}
		sip->si_nfpgs--;
	}
	*vpp = sip->si_vp;
	*offp = soff;
	*lenp = len;
	ASSERT((spgcnt_t)sip->si_nfpgs >= 0);
	/*
	 * After swap_maxcontig pages have been handed out from this area,
	 * advance silast so the next allocation interleaves onto the next
	 * swap area (load balancing).
	 */
	sip->si_allocs += btop(len);
	if (sip->si_allocs >= swap_maxcontig) {
		sip->si_allocs = 0;
		if ((silast = sip->si_next) == NULL)
			silast = swapinfo;
	}
	TRACE_2(TR_FAC_VM, TR_SWAP_ALLOC,
	    "swap_alloc:sip %p offset %lx", sip, soff);
	mutex_exit(&swapinfo_lock);
	return (1);
}

/* Tunable: scan the bitmap backwards from the hint on wraparound. */
int swap_backsearch = 0;

/*
 * Get a free offset on swap device sip.
 * Return >=0 offset if succeeded, -1 for failure.
 */
static u_offset_t
swap_getoff(struct swapinfo *sip)
{
	uint_t *sp, *ep;
	size_t aoff, boff, poff, slotnumber;

	ASSERT(MUTEX_HELD(&swapinfo_lock));

	sip->si_alloccnt++;
	/*
	 * First-level scan: look for a bitmap word with at least one clear
	 * bit, starting at the word containing si_hint and running to the
	 * end of the map.
	 */
	for (sp = &sip->si_swapslots[sip->si_hint >> MAPSHIFT],
	    ep = &sip->si_swapslots[sip->si_mapsize / NBPW]; sp < ep; sp++) {
		if (*sp != (uint_t)0xffffffff)
			goto foundentry;
		else
			sip->si_checkcnt++;
	}
	SWAP_PRINT(SW_ALLOC,
	    "swap_getoff: couldn't find slot from hint %ld to end\n",
	    sip->si_hint, 0, 0, 0, 0);
	/*
	 * Go backwards? Check for faster method XXX
	 */
	if (swap_backsearch) {
		for (sp = &sip->si_swapslots[sip->si_hint >> MAPSHIFT],
		    ep = sip->si_swapslots; sp > ep; sp--) {
			if (*sp != (uint_t)0xffffffff)
				goto foundentry;
			else
				sip->si_checkcnt++;
		}
	} else {
		for (sp = sip->si_swapslots,
		    ep = &sip->si_swapslots[sip->si_hint >> MAPSHIFT];
		    sp < ep; sp++) {
			if (*sp != (uint_t)0xffffffff)
				goto foundentry;
			else
				sip->si_checkcnt++;
		}
	}
	/* Wrapped scan also failed: the device is completely full. */
	if (*sp == 0xffffffff) {
		cmn_err(CE_WARN, "No free swap slots!");
		return ((u_offset_t)-1);
	}

foundentry:
	/*
	 * aoff is the page number offset (in bytes) of the si_swapslots
	 * array element containing a free page
	 *
	 * boff is the page number offset of the free page
	 * (i.e. cleared bit) in si_swapslots[aoff].
	 */
	aoff = ((char *)sp - (char *)sip->si_swapslots) * NBBY;

	/*
	 * Second-level scan: find the clear bit within the word, again
	 * starting at the hint's bit position and wrapping.
	 */
	for (boff = (sip->si_hint % NBBW); boff < NBBW; boff++) {
		if (!TESTBIT(sip->si_swapslots, aoff + boff))
			goto foundslot;
		else
			sip->si_checkcnt++;
	}
	for (boff = 0; boff < (sip->si_hint % NBBW); boff++) {
		if (!TESTBIT(sip->si_swapslots, aoff + boff))
			goto foundslot;
		else
			sip->si_checkcnt++;
	}
	/* The word scan said this word has a clear bit; not finding one is fatal. */
	panic("swap_getoff: didn't find slot in word hint %ld", sip->si_hint);

foundslot:
	/*
	 * Return the offset of the free page in swap device.
	 * Convert page number to byte offset and add starting
	 * offset of swap device.
	 */
	slotnumber = aoff + boff;
	SWAP_PRINT(SW_ALLOC, "swap_getoff: allocating slot %ld\n",
	    slotnumber, 0, 0, 0, 0);
	poff = ptob(slotnumber);
	if (poff + sip->si_soff >= sip->si_eoff)
		printf("ptob(aoff(%ld) + boff(%ld))(%ld) >= eoff(%ld)\n",
		    aoff, boff, ptob(slotnumber), (long)sip->si_eoff);
	ASSERT(poff < sip->si_eoff);
	/*
	 * We could verify here that the slot isn't already allocated
	 * by looking through all the anon slots.
	 */
	SETBIT(sip->si_swapslots, slotnumber);
	sip->si_hint = slotnumber + 1;	/* hint = next slot */
	return (poff + sip->si_soff);
}

/*
 * Free a swap page.
 * Locate the swap area containing <vp, off> and clear the bitmap bits for
 * the btop(len) slots starting there.  Panics on a double free (slot
 * already clear) or if no swap area covers the given range.
 */
void
swap_phys_free(struct vnode *vp, u_offset_t off, size_t len)
{
	struct swapinfo *sip;
	ssize_t pagenumber, npage;

	mutex_enter(&swapinfo_lock);
	sip = swapinfo;

	do {
		if (sip->si_vp == vp &&
		    sip->si_soff <= off && off < sip->si_eoff) {
			for (pagenumber = btop(off - sip->si_soff),
			    npage = btop(len) + pagenumber;
			    pagenumber < npage; pagenumber++) {
				SWAP_PRINT(SW_ALLOC,
				    "swap_phys_free: freeing slot %ld on "
				    "sip %p\n",
				    pagenumber, sip, 0, 0, 0);
				/* Freeing an already-free slot is a double free. */
				if (!TESTBIT(sip->si_swapslots, pagenumber)) {
					panic(
					    "swap_phys_free: freeing free slot "
					    "%p,%lx\n", (void *)vp,
					    ptob(pagenumber) + sip->si_soff);
				}
				CLEARBIT(sip->si_swapslots, pagenumber);
				sip->si_nfpgs++;
			}
			ASSERT(sip->si_nfpgs <= sip->si_npgs);
			mutex_exit(&swapinfo_lock);
			return;
		}
	} while ((sip = sip->si_next) != NULL);
	panic("swap_phys_free");
	/*NOTREACHED*/
}

/*
 * Return the anon struct corresponding to the given
 * <vnode, off> if it is part of the virtual swap device.
 * Return the anon struct if found, otherwise NULL.
 */
struct anon *
swap_anon(struct vnode *vp, u_offset_t off)
{
	struct anon *ap;

	/* Caller must hold the anon hash chain lock for <vp, off>. */
	ASSERT(MUTEX_HELD(AH_MUTEX(vp, off)));

	for (ap = anon_hash[ANON_HASH(vp, off)]; ap != NULL; ap = ap->an_hash) {
		if (ap->an_vp == vp && ap->an_off == off)
			return (ap);
	}
	return (NULL);
}


/*
 * Determine if the vp offset range overlaps a swap device.
 * The range [offset, offset + len) is compared against each area's
 * half-open extent [si_soff, si_eoff); returns 1 on overlap, 0 otherwise.
 */
int
swap_in_range(struct vnode *vp, u_offset_t offset, size_t len)
{
	struct swapinfo *sip;
	u_offset_t eoff;

	eoff = offset + len;
	ASSERT(eoff > offset);

	mutex_enter(&swapinfo_lock);
	sip = swapinfo;
	if (vp && sip) {
		do {
			if (vp != sip->si_vp || eoff <= sip->si_soff ||
			    offset >= sip->si_eoff)
				continue;
			mutex_exit(&swapinfo_lock);
			return (1);
		} while ((sip = sip->si_next) != NULL);
	}
	mutex_exit(&swapinfo_lock);
	return (0);
}

/*
 * See if name is one of our swap files
 * even though lookupname failed.
 * This can be used by swapdel to delete
 * swap resources on remote machines
 * where the link has gone down.
 * Returns the area's vnode with a hold on it, or NULL if no match.
 */
static struct vnode *
swapdel_byname(
	char	*name,			/* pathname to delete */
	ulong_t	lowblk)	/* Low block number of area to delete */
{
	struct swapinfo **sipp, *osip;
	u_offset_t soff;

	/*
	 * Find the swap file entry for the file to
	 * be deleted. Skip any entries that are in
	 * transition.
 */

	soff = ptob(btopr(lowblk << SCTRSHFT));	/* must be page aligned */

	mutex_enter(&swapinfo_lock);
	for (sipp = &swapinfo; (osip = *sipp) != NULL; sipp = &osip->si_next) {
		if ((strcmp(osip->si_pname, name) == 0) &&
		    (osip->si_soff == soff) && (osip->si_flags == 0)) {
			struct vnode *vp = osip->si_vp;

			/* Hold the vnode so it stays valid after we drop the lock. */
			VN_HOLD(vp);
			mutex_exit(&swapinfo_lock);
			return (vp);
		}
	}
	mutex_exit(&swapinfo_lock);
	return (NULL);
}


/*
 * New system call to manipulate swap files.
 *
 * sc_cmd selects the operation (SC_GETNSWP, SC_AINFO, SC_LIST, SC_ADD,
 * SC_REMOVE); sc_arg points to the user-space argument structure for that
 * operation; *rv receives the syscall return value.  Returns 0 or errno.
 */
int
swapctl(int sc_cmd, void *sc_arg, int *rv)
{
	struct swapinfo *sip, *csip, *tsip;
	int error = 0;
	struct swapent st, *ust;
	struct swapres sr;
	struct vnode *vp;
	int cnt = 0;
	int tmp_nswapfiles;
	int nswap;
	int length, nlen;
	int gplen = 0, plen;
	char *swapname;
	char *pname;
	char *tpname;
	struct anoninfo ai;
	spgcnt_t avail;
	int global = INGLOBALZONE(curproc);
	struct zone *zp = curproc->p_zone;

	/*
	 * When running in a zone we want to hide the details of the swap
	 * devices: we report there only being one swap device named "swap"
	 * having a size equal to the sum of the sizes of all real swap devices
	 * on the system.
	 */
	switch (sc_cmd) {
	case SC_GETNSWP:
		if (global)
			*rv = nswapfiles;
		else
			*rv = 1;
		return (0);

	case SC_AINFO:
		/*
		 * Return anoninfo information with these changes:
		 * ani_max = maximum amount of swap space
		 *	(including potentially available physical memory)
		 * ani_free = amount of unallocated anonymous memory
		 *	(some of which might be reserved and including
		 *	potentially available physical memory)
		 * ani_resv = amount of claimed (reserved) anonymous memory
		 */
		avail = MAX((spgcnt_t)(availrmem - swapfs_minfree), 0);
		ai.ani_max = (k_anoninfo.ani_max +
		    k_anoninfo.ani_mem_resv) + avail;

		/* Update ani_free */
		set_anoninfo();
		ai.ani_free = k_anoninfo.ani_free + avail;

		ai.ani_resv = k_anoninfo.ani_phys_resv +
		    k_anoninfo.ani_mem_resv;

		if (!global && zp->zone_max_swap_ctl != UINT64_MAX) {
			/*
			 * We're in a non-global zone with a swap cap.  We
			 * always report the system-wide values for the global
			 * zone, even though it too can have a swap cap.
			 */

			/*
			 * For a swap-capped zone, the numbers are contrived
			 * since we don't have a correct value of 'reserved'
			 * for the zone.
			 *
			 * The ani_max value is always the zone's swap cap.
			 *
			 * The ani_free value is always the difference between
			 * the cap and the amount of swap in use by the zone.
			 *
			 * The ani_resv value is typically set to be the amount
			 * of swap in use by the zone, but can be adjusted
			 * upwards to indicate how much swap is currently
			 * unavailable to that zone due to usage by entities
			 * outside the zone.
			 *
			 * This works as follows.
			 *
			 * In the 'swap -s' output, the data is displayed
			 * as follows:
			 *	allocated = ani_max - ani_free
			 *	reserved = ani_resv - allocated
			 *	available = ani_max - ani_resv
			 *
			 * Taking a contrived example, if the swap cap is 100
			 * and the amount of swap used by the zone is 75, this
			 * gives:
			 *	allocated = ani_max - ani_free = 100 - 25 = 75
			 *	reserved = ani_resv - allocated = 75 - 75 = 0
			 *	available = ani_max - ani_resv = 100 - 75 = 25
			 *
			 * In this typical case, you can see that the 'swap -s'
			 * 'reserved' will always be 0 inside a swap capped
			 * zone.
			 *
			 * However, if the system as a whole has less free
			 * swap than the zone limits allow, then we adjust
			 * the ani_resv value up so that it is the difference
			 * between the zone cap and the amount of free system
			 * swap.  Taking the above example, but when the
			 * system as a whole only has 20 of swap available, we
			 * get an ani_resv of 100 - 20 = 80.  This gives:
			 *	allocated = ani_max - ani_free = 100 - 25 = 75
			 *	reserved = ani_resv - allocated = 80 - 75 = 5
			 *	available = ani_max - ani_resv = 100 - 80 = 20
			 *
			 * In this case, you can see how the ani_resv value is
			 * tweaked up to make the 'swap -s' numbers work inside
			 * the zone.
			 */
			rctl_qty_t cap, used;
			pgcnt_t pgcap, sys_avail;

			mutex_enter(&zp->zone_mem_lock);
			cap = zp->zone_max_swap_ctl;
			used = zp->zone_max_swap;
			mutex_exit(&zp->zone_mem_lock);

			pgcap = MIN(btop(cap), ai.ani_max);
			ai.ani_free = pgcap - btop(used);

			/* Get the system-wide swap currently available. */
			sys_avail = ai.ani_max - ai.ani_resv;
			if (sys_avail < ai.ani_free)
				ai.ani_resv = pgcap - sys_avail;
			else
				ai.ani_resv = btop(used);

			ai.ani_max = pgcap;
		}

		if (copyout(&ai, sc_arg, sizeof (struct anoninfo)) != 0)
			return (EFAULT);
		return (0);

	case SC_LIST:
		if (copyin(sc_arg, &length, sizeof (int)) != 0)
			return (EFAULT);
		if (!global) {
			/*
			 * Non-global zone: synthesize a single entry named
			 * "swap" that aggregates all real swap devices,
			 * clamped by the zone's swap cap if it has one.
			 */
			struct swapent st;
			char *swappath = "swap";

			if (length < 1)
				return (ENOMEM);
			ust = (swapent_t *)((swaptbl_t *)sc_arg)->swt_ent;
			if (copyin(ust, &st, sizeof (swapent_t)) != 0)
				return (EFAULT);
			st.ste_start = PAGESIZE >> SCTRSHFT;
			st.ste_length = (off_t)0;
			st.ste_pages = 0;
			st.ste_free = 0;
			st.ste_flags = 0;

			mutex_enter(&swapinfo_lock);
			for (sip = swapinfo, nswap = 0;
			    sip != NULL && nswap < nswapfiles;
			    sip = sip->si_next, nswap++) {
				st.ste_length +=
				    (sip->si_eoff - sip->si_soff) >> SCTRSHFT;
				st.ste_pages += sip->si_npgs;
				st.ste_free += sip->si_nfpgs;
			}
			mutex_exit(&swapinfo_lock);

			if (zp->zone_max_swap_ctl != UINT64_MAX) {
				rctl_qty_t cap, used;

				mutex_enter(&zp->zone_mem_lock);
				cap = zp->zone_max_swap_ctl;
				used = zp->zone_max_swap;
				mutex_exit(&zp->zone_mem_lock);

				st.ste_length = MIN(cap, st.ste_length);
				st.ste_pages = MIN(btop(cap), st.ste_pages);
				st.ste_free = MIN(st.ste_pages - btop(used),
				    st.ste_free);
			}

			if (copyout(&st, ust, sizeof (swapent_t)) != 0 ||
			    copyout(swappath, st.ste_path,
			    strlen(swappath) + 1) != 0) {
				return (EFAULT);
			}
			*rv = 1;
			return (0);
		}
beginning:
		mutex_enter(&swapinfo_lock);
		tmp_nswapfiles = nswapfiles;
		mutex_exit(&swapinfo_lock);

		/*
		 * Return early if there are no swap entries to report:
		 */
		if (tmp_nswapfiles < 1) {
			*rv = 0;
			return (0);
		}

		/* Return an error if not enough space for the whole table. */
		if (length < tmp_nswapfiles)
			return (ENOMEM);
		/*
		 * Get memory to hold the swap entries and their names. We'll
		 * copy the real entries into these and then copy these out.
		 * Allocating the pathname memory is only a guess so we may
		 * find that we need more and have to do it again.
		 * All this is because we have to hold the anon lock while
		 * traversing the swapinfo list, and we can't be doing copyouts
		 * and/or kmem_alloc()s during this.
		 */
		csip = kmem_zalloc(tmp_nswapfiles * sizeof (struct swapinfo),
		    KM_SLEEP);
retry:
		nlen = tmp_nswapfiles * (gplen += 100);
		pname = kmem_zalloc(nlen, KM_SLEEP);

		mutex_enter(&swapinfo_lock);

		/* The list changed while we were unlocked: start over. */
		if (tmp_nswapfiles != nswapfiles) {
			mutex_exit(&swapinfo_lock);
			kmem_free(pname, nlen);
			kmem_free(csip,
			    tmp_nswapfiles * sizeof (struct swapinfo));
			gplen = 0;
			goto beginning;
		}
		/* Snapshot the list (and pathnames) under the lock. */
		for (sip = swapinfo, tsip = csip, tpname = pname, nswap = 0;
		    sip && nswap < tmp_nswapfiles;
		    sip = sip->si_next, tsip++, tpname += plen, nswap++) {
			plen = sip->si_pnamelen;
			if (tpname + plen - pname > nlen) {
				/* Name buffer too small: grow it and retry. */
				mutex_exit(&swapinfo_lock);
				kmem_free(pname, nlen);
				goto retry;
			}
			*tsip = *sip;
			tsip->si_pname = tpname;
			(void) strcpy(tsip->si_pname, sip->si_pname);
		}
		mutex_exit(&swapinfo_lock);

		if (sip) {
			error = ENOMEM;
			goto lout;
		}
		/* Copy the snapshot out to the user's table, entry by entry. */
		ust = (swapent_t *)((swaptbl_t *)sc_arg)->swt_ent;
		for (tsip = csip, cnt = 0; cnt < nswap; tsip++, ust++, cnt++) {
			if (copyin(ust, &st, sizeof (swapent_t)) != 0) {
				error = EFAULT;
				goto lout;
			}
			st.ste_flags = tsip->si_flags;
			st.ste_length =
			    (tsip->si_eoff - tsip->si_soff) >> SCTRSHFT;
			st.ste_start = tsip->si_soff >> SCTRSHFT;
			st.ste_pages = tsip->si_npgs;
			st.ste_free = tsip->si_nfpgs;
			if (copyout(&st, ust, sizeof (swapent_t)) != 0) {
				error = EFAULT;
				goto lout;
			}
			if (!tsip->si_pnamelen)
				continue;
			if (copyout(tsip->si_pname, st.ste_path,
			    tsip->si_pnamelen) != 0) {
				error = EFAULT;
				goto lout;
			}
		}
		*rv = nswap;
lout:
		kmem_free(csip, tmp_nswapfiles * sizeof (struct swapinfo));
		kmem_free(pname, nlen);
		return (error);

	case SC_ADD:
	case SC_REMOVE:
		break;
	default:
		return (EINVAL);
	}
	/* SC_ADD / SC_REMOVE require swap administration privilege. */
	if ((error = secpolicy_swapctl(CRED())) != 0)
		return (error);

	if (copyin(sc_arg, &sr, sizeof (swapres_t)))
		return (EFAULT);

	/* Allocate the space to read in pathname */
	if ((swapname = kmem_alloc(MAXPATHLEN, KM_NOSLEEP)) == NULL)
		return (ENOMEM);

	error = copyinstr(sr.sr_name, swapname, MAXPATHLEN, 0);
	if (error)
		goto out;

	error = lookupname(swapname, UIO_SYSSPACE, FOLLOW, NULLVPP, &vp);
	if (error) {
		if (sc_cmd == SC_ADD)
			goto out;
		/* see if we match by name */
		vp = swapdel_byname(swapname, (size_t)sr.sr_start);
		if (vp == NULL)
			goto out;
	}

	if (vp->v_flag & (VNOMAP | VNOSWAP)) {
		VN_RELE(vp);
		error = ENOSYS;
		goto out;
	}
	switch (vp->v_type) {
	case VBLK:
		break;

	case VREG:
		if (vp->v_vfsp && vn_is_readonly(vp))
			error = EROFS;
		else
			error = VOP_ACCESS(vp, VREAD|VWRITE, 0, CRED(), NULL);
		break;

	case VDIR:
		error = EISDIR;
		break;
	default:
		error = ENOSYS;
		break;
	}
	if (error == 0) {
		if (sc_cmd == SC_REMOVE)
			error = swapdel(vp, sr.sr_start);
		else
			error = swapadd(vp, sr.sr_start,
			    sr.sr_length, swapname);
	}
	VN_RELE(vp);
out:
	kmem_free(swapname, MAXPATHLEN);
	return (error);
}

#if defined(_LP64) && defined(_SYSCALL32)

/*
 * 32-bit compatibility version of swapctl() for 64-bit kernels.  Identical
 * logic but uses the ILP32 structure layouts and checks that counters fit
 * in 32 bits.
 */
int
swapctl32(int sc_cmd, void *sc_arg, int *rv)
{
	struct swapinfo *sip, *csip, *tsip;
	int error = 0;
	struct swapent32 st, *ust;
	struct swapres32 sr;
	struct vnode *vp;
	int cnt = 0;
	int tmp_nswapfiles;
	int nswap;
	int length, nlen;
	int
gplen = 0, plen;
	char *swapname;
	char *pname;
	char *tpname;
	struct anoninfo32 ai;
	size_t s;
	spgcnt_t avail;
	int global = INGLOBALZONE(curproc);
	struct zone *zp = curproc->p_zone;

	/*
	 * When running in a zone we want to hide the details of the swap
	 * devices: we report there only being one swap device named "swap"
	 * having a size equal to the sum of the sizes of all real swap devices
	 * on the system.
	 */
	switch (sc_cmd) {
	case SC_GETNSWP:
		if (global)
			*rv = nswapfiles;
		else
			*rv = 1;
		return (0);

	case SC_AINFO:
		/*
		 * Return anoninfo information with these changes:
		 * ani_max = maximum amount of swap space
		 *	(including potentially available physical memory)
		 * ani_free = amount of unallocated anonymous memory
		 *	(some of which might be reserved and including
		 *	potentially available physical memory)
		 * ani_resv = amount of claimed (reserved) anonymous memory
		 *
		 * Each value must fit in the 32-bit anoninfo32 fields;
		 * otherwise fail with EOVERFLOW.
		 */
		avail = MAX((spgcnt_t)(availrmem - swapfs_minfree), 0);
		s = (k_anoninfo.ani_max + k_anoninfo.ani_mem_resv) + avail;
		if (s > UINT32_MAX)
			return (EOVERFLOW);
		ai.ani_max = s;

		/* Update ani_free */
		set_anoninfo();
		s = k_anoninfo.ani_free + avail;
		if (s > UINT32_MAX)
			return (EOVERFLOW);
		ai.ani_free = s;

		s = k_anoninfo.ani_phys_resv + k_anoninfo.ani_mem_resv;
		if (s > UINT32_MAX)
			return (EOVERFLOW);
		ai.ani_resv = s;

		if (!global && zp->zone_max_swap_ctl != UINT64_MAX) {
			/*
			 * We're in a non-global zone with a swap cap. We
			 * always report the system-wide values for the global
			 * zone, even though it too can have a swap cap.
			 * See the comment for the SC_AINFO case in swapctl()
			 * which explains the following logic.
			 */
			rctl_qty_t cap, used;
			pgcnt_t pgcap, sys_avail;

			mutex_enter(&zp->zone_mem_lock);
			cap = zp->zone_max_swap_ctl;
			used = zp->zone_max_swap;
			mutex_exit(&zp->zone_mem_lock);

			pgcap = MIN(btop(cap), ai.ani_max);
			ai.ani_free = pgcap - btop(used);

			/* Get the system-wide swap currently available. */
			sys_avail = ai.ani_max - ai.ani_resv;
			if (sys_avail < ai.ani_free)
				ai.ani_resv = pgcap - sys_avail;
			else
				ai.ani_resv = btop(used);

			ai.ani_max = pgcap;
		}

		if (copyout(&ai, sc_arg, sizeof (ai)) != 0)
			return (EFAULT);
		return (0);

	case SC_LIST:
		if (copyin(sc_arg, &length, sizeof (int32_t)) != 0)
			return (EFAULT);
		if (!global) {
			/*
			 * Non-global zone: synthesize a single aggregate
			 * entry named "swap" (see swapctl() SC_LIST).
			 */
			struct swapent32 st;
			char *swappath = "swap";

			if (length < 1)
				return (ENOMEM);
			ust = (swapent32_t *)((swaptbl32_t *)sc_arg)->swt_ent;
			if (copyin(ust, &st, sizeof (swapent32_t)) != 0)
				return (EFAULT);
			st.ste_start = PAGESIZE >> SCTRSHFT;
			st.ste_length = (off_t)0;
			st.ste_pages = 0;
			st.ste_free = 0;
			st.ste_flags = 0;

			mutex_enter(&swapinfo_lock);
			for (sip = swapinfo, nswap = 0;
			    sip != NULL && nswap < nswapfiles;
			    sip = sip->si_next, nswap++) {
				st.ste_length +=
				    (sip->si_eoff - sip->si_soff) >> SCTRSHFT;
				st.ste_pages += sip->si_npgs;
				st.ste_free += sip->si_nfpgs;
			}
			mutex_exit(&swapinfo_lock);

			if (zp->zone_max_swap_ctl != UINT64_MAX) {
				rctl_qty_t cap, used;

				mutex_enter(&zp->zone_mem_lock);
				cap = zp->zone_max_swap_ctl;
				used = zp->zone_max_swap;
				mutex_exit(&zp->zone_mem_lock);

				st.ste_length = MIN(cap, st.ste_length);
				st.ste_pages = MIN(btop(cap), st.ste_pages);
				st.ste_free = MIN(st.ste_pages - btop(used),
				    st.ste_free);
			}

			if (copyout(&st, ust, sizeof (swapent32_t)) != 0 ||
			    copyout(swappath, (caddr_t)(uintptr_t)st.ste_path,
			    strlen(swappath) + 1) != 0) {
				return (EFAULT);
			}
			*rv = 1;
			return (0);
		}
beginning:
		mutex_enter(&swapinfo_lock);
		tmp_nswapfiles = nswapfiles;
		mutex_exit(&swapinfo_lock);

		/*
		 * Return early if there are no swap entries to report:
		 */
		if (tmp_nswapfiles < 1) {
			*rv = 0;
			return (0);
		}

		/* Return an error if not enough space for the whole table. */
		if (length < tmp_nswapfiles)
			return (ENOMEM);
		/*
		 * Get memory to hold the swap entries and their names. We'll
		 * copy the real entries into these and then copy these out.
		 * Allocating the pathname memory is only a guess so we may
		 * find that we need more and have to do it again.
		 * All this is because we have to hold the anon lock while
		 * traversing the swapinfo list, and we can't be doing copyouts
		 * and/or kmem_alloc()s during this.
		 */
		csip = kmem_zalloc(tmp_nswapfiles * sizeof (*csip), KM_SLEEP);
retry:
		nlen = tmp_nswapfiles * (gplen += 100);
		pname = kmem_zalloc(nlen, KM_SLEEP);

		mutex_enter(&swapinfo_lock);

		/* The list changed while we were unlocked: start over. */
		if (tmp_nswapfiles != nswapfiles) {
			mutex_exit(&swapinfo_lock);
			kmem_free(pname, nlen);
			kmem_free(csip, tmp_nswapfiles * sizeof (*csip));
			gplen = 0;
			goto beginning;
		}
		/* Snapshot the list (and pathnames) under the lock. */
		for (sip = swapinfo, tsip = csip, tpname = pname, nswap = 0;
		    (sip != NULL) && (nswap < tmp_nswapfiles);
		    sip = sip->si_next, tsip++, tpname += plen, nswap++) {
			plen = sip->si_pnamelen;
			if (tpname + plen - pname > nlen) {
				/* Name buffer too small: grow it and retry. */
				mutex_exit(&swapinfo_lock);
				kmem_free(pname, nlen);
				goto retry;
			}
			*tsip = *sip;
			tsip->si_pname = tpname;
			(void) strcpy(tsip->si_pname, sip->si_pname);
		}
		mutex_exit(&swapinfo_lock);

		if (sip != NULL) {
			error = ENOMEM;
			goto lout;
		}
		/* Copy the snapshot out to the user's table, entry by entry. */
		ust = (swapent32_t *)((swaptbl32_t *)sc_arg)->swt_ent;
		for (tsip = csip, cnt = 0; cnt < nswap; tsip++, ust++, cnt++) {
			if (copyin(ust, &st, sizeof (*ust)) != 0) {
				error = EFAULT;
				goto lout;
			}
			st.ste_flags = tsip->si_flags;
			st.ste_length =
			    (tsip->si_eoff - tsip->si_soff) >> SCTRSHFT;
			st.ste_start = tsip->si_soff >> SCTRSHFT;
			st.ste_pages = tsip->si_npgs;
			st.ste_free = tsip->si_nfpgs;
			if (copyout(&st, ust, sizeof (st)) != 0) {
				error = EFAULT;
				goto lout;
			}
			if (!tsip->si_pnamelen)
				continue;
			if (copyout(tsip->si_pname,
			    (caddr_t)(uintptr_t)st.ste_path,
			    tsip->si_pnamelen) != 0) {
				error = EFAULT;
				goto lout;
			}
		}
		*rv = nswap;
lout:
		kmem_free(csip, tmp_nswapfiles * sizeof (*csip));
		kmem_free(pname, nlen);
		return (error);

	case SC_ADD:
	case SC_REMOVE:
		break;
	default:
		return (EINVAL);
	}
	/* SC_ADD / SC_REMOVE require swap administration privilege. */
	if ((error = secpolicy_swapctl(CRED())) != 0)
		return (error);

	if (copyin(sc_arg, &sr, sizeof (sr)))
		return (EFAULT);

	/* Allocate the space to read in pathname */
	if ((swapname = kmem_alloc(MAXPATHLEN, KM_NOSLEEP)) == NULL)
		return (ENOMEM);

	error = copyinstr((caddr_t)(uintptr_t)sr.sr_name,
	    swapname, MAXPATHLEN, NULL);
	if (error)
		goto out;

	error = lookupname(swapname, UIO_SYSSPACE, FOLLOW, NULLVPP, &vp);
	if (error) {
		if (sc_cmd == SC_ADD)
			goto out;
		/* see if we match by name */
		vp = swapdel_byname(swapname, (uint_t)sr.sr_start);
		if (vp == NULL)
			goto out;
	}

	if (vp->v_flag & (VNOMAP | VNOSWAP)) {
		VN_RELE(vp);
		error = ENOSYS;
		goto out;
	}
	switch (vp->v_type) {
	case VBLK:
		break;

	case VREG:
		if (vp->v_vfsp && vn_is_readonly(vp))
			error = EROFS;
		else
			error = VOP_ACCESS(vp, VREAD|VWRITE, 0, CRED(), NULL);
		break;

	case VDIR:
		error = EISDIR;
		break;
	default:
		error = ENOSYS;
		break;
	}
	if (error == 0) {
		if (sc_cmd == SC_REMOVE)
			error = swapdel(vp, sr.sr_start);
		else
			error = swapadd(vp, sr.sr_start,
sr.sr_length, 1086 swapname); 1087 } 1088 VN_RELE(vp); 1089 out: 1090 kmem_free(swapname, MAXPATHLEN); 1091 return (error); 1092 } 1093 1094 #endif /* _LP64 && _SYSCALL32 */ 1095 1096 /* 1097 * Add a new swap file. 1098 */ 1099 int 1100 swapadd(struct vnode *vp, ulong_t lowblk, ulong_t nblks, char *swapname) 1101 { 1102 struct swapinfo **sipp, *nsip = NULL, *esip = NULL; 1103 struct vnode *cvp; 1104 struct vattr vattr; 1105 pgcnt_t pages; 1106 u_offset_t soff, eoff; 1107 int error; 1108 ssize_t i, start, end; 1109 ushort_t wasswap; 1110 ulong_t startblk; 1111 size_t returned_mem; 1112 1113 SWAP_PRINT(SW_CTL, "swapadd: vp %p lowblk %ld nblks %ld swapname %s\n", 1114 vp, lowblk, nblks, swapname, 0); 1115 /* 1116 * Get the real vnode. (If vp is not a specnode it just returns vp, so 1117 * it does the right thing, but having this code know about specnodes 1118 * violates the spirit of having it be indepedent of vnode type.) 1119 */ 1120 cvp = common_specvp(vp); 1121 1122 /* 1123 * Or in VISSWAP so file system has chance to deny swap-ons during open. 1124 */ 1125 mutex_enter(&cvp->v_lock); 1126 wasswap = cvp->v_flag & VISSWAP; 1127 cvp->v_flag |= VISSWAP; 1128 mutex_exit(&cvp->v_lock); 1129 1130 mutex_enter(&swap_lock); 1131 if (error = VOP_OPEN(&cvp, FREAD|FWRITE, CRED(), NULL)) { 1132 mutex_exit(&swap_lock); 1133 /* restore state of v_flag */ 1134 if (!wasswap) { 1135 mutex_enter(&cvp->v_lock); 1136 cvp->v_flag &= ~VISSWAP; 1137 mutex_exit(&cvp->v_lock); 1138 } 1139 return (error); 1140 } 1141 mutex_exit(&swap_lock); 1142 1143 /* 1144 * Get partition size. Return error if empty partition, 1145 * or if request does not fit within the partition. 1146 * If this is the first swap device, we can reduce 1147 * the size of the swap area to match what is 1148 * available. This can happen if the system was built 1149 * on a machine with a different size swap partition. 
1150 */ 1151 vattr.va_mask = AT_SIZE; 1152 if (error = VOP_GETATTR(cvp, &vattr, ATTR_COMM, CRED(), NULL)) 1153 goto out; 1154 1155 /* 1156 * Specfs returns a va_size of MAXOFFSET_T (UNKNOWN_SIZE) when the 1157 * size of the device can't be determined. 1158 */ 1159 if ((vattr.va_size == 0) || (vattr.va_size == MAXOFFSET_T)) { 1160 error = EINVAL; 1161 goto out; 1162 } 1163 1164 #ifdef _ILP32 1165 /* 1166 * No support for large swap in 32-bit OS, if the size of the swap is 1167 * bigger than MAXOFF32_T then the size used by swapfs must be limited. 1168 * This limitation is imposed by the swap subsystem itself, a D_64BIT 1169 * driver as the target of swap operation should be able to field 1170 * the IO. 1171 */ 1172 if (vattr.va_size > MAXOFF32_T) { 1173 cmn_err(CE_NOTE, 1174 "!swap device %s truncated from 0x%llx to 0x%x bytes", 1175 swapname, vattr.va_size, MAXOFF32_T); 1176 vattr.va_size = MAXOFF32_T; 1177 } 1178 #endif /* _ILP32 */ 1179 1180 /* Fail if file not writeable (try to set size to current size) */ 1181 vattr.va_mask = AT_SIZE; 1182 if (error = VOP_SETATTR(cvp, &vattr, 0, CRED(), NULL)) 1183 goto out; 1184 1185 /* Fail if fs does not support VOP_PAGEIO */ 1186 error = VOP_PAGEIO(cvp, (page_t *)NULL, (u_offset_t)0, 0, 0, CRED(), 1187 NULL); 1188 1189 if (error == ENOSYS) 1190 goto out; 1191 else 1192 error = 0; 1193 /* 1194 * If swapping on the root filesystem don't put swap blocks that 1195 * correspond to the miniroot filesystem on the swap free list. 1196 */ 1197 if (cvp == rootdir) 1198 startblk = roundup(MINIROOTSIZE<<SCTRSHFT, klustsize)>>SCTRSHFT; 1199 else /* Skip 1st page (disk label) */ 1200 startblk = (ulong_t)(lowblk ? lowblk : 1); 1201 1202 soff = startblk << SCTRSHFT; 1203 if (soff >= vattr.va_size) { 1204 error = EINVAL; 1205 goto out; 1206 } 1207 1208 /* 1209 * If user specified 0 blks, use the size of the device 1210 */ 1211 eoff = nblks ? 
soff + (nblks - (startblk - lowblk) << SCTRSHFT) : 1212 vattr.va_size; 1213 1214 SWAP_PRINT(SW_CTL, "swapadd: va_size %ld soff %ld eoff %ld\n", 1215 vattr.va_size, soff, eoff, 0, 0); 1216 1217 if (eoff > vattr.va_size) { 1218 error = EINVAL; 1219 goto out; 1220 } 1221 1222 /* 1223 * The starting and ending offsets must be page aligned. 1224 * Round soff up to next page boundary, round eoff 1225 * down to previous page boundary. 1226 */ 1227 soff = ptob(btopr(soff)); 1228 eoff = ptob(btop(eoff)); 1229 if (soff >= eoff) { 1230 SWAP_PRINT(SW_CTL, "swapadd: soff %ld >= eoff %ld\n", 1231 soff, eoff, 0, 0, 0); 1232 error = EINVAL; 1233 goto out; 1234 } 1235 1236 pages = btop(eoff - soff); 1237 1238 /* Allocate and partially set up the new swapinfo */ 1239 nsip = kmem_zalloc(sizeof (struct swapinfo), KM_SLEEP); 1240 nsip->si_vp = cvp; 1241 1242 nsip->si_soff = soff; 1243 nsip->si_eoff = eoff; 1244 nsip->si_hint = 0; 1245 nsip->si_checkcnt = nsip->si_alloccnt = 0; 1246 1247 nsip->si_pnamelen = (int)strlen(swapname) + 1; 1248 nsip->si_pname = (char *)kmem_zalloc(nsip->si_pnamelen, KM_SLEEP); 1249 bcopy(swapname, nsip->si_pname, nsip->si_pnamelen - 1); 1250 SWAP_PRINT(SW_CTL, "swapadd: allocating swapinfo for %s, %ld pages\n", 1251 swapname, pages, 0, 0, 0); 1252 /* 1253 * Size of swapslots map in bytes 1254 */ 1255 nsip->si_mapsize = P2ROUNDUP(pages, NBBW) / NBBY; 1256 nsip->si_swapslots = kmem_zalloc(nsip->si_mapsize, KM_SLEEP); 1257 1258 /* 1259 * Permanently set the bits that can't ever be allocated, 1260 * i.e. those from the ending offset to the round up slot for the 1261 * swapslots bit map. 1262 */ 1263 start = pages; 1264 end = P2ROUNDUP(pages, NBBW); 1265 for (i = start; i < end; i++) { 1266 SWAP_PRINT(SW_CTL, "swapadd: set bit for page %ld\n", i, 1267 0, 0, 0, 0); 1268 SETBIT(nsip->si_swapslots, i); 1269 } 1270 nsip->si_npgs = nsip->si_nfpgs = pages; 1271 /* 1272 * Now check to see if we can add it. 
We wait til now to check because 1273 * we need the swapinfo_lock and we don't want sleep with it (e.g., 1274 * during kmem_alloc()) while we're setting up the swapinfo. 1275 */ 1276 mutex_enter(&swapinfo_lock); 1277 for (sipp = &swapinfo; (esip = *sipp) != NULL; sipp = &esip->si_next) { 1278 if (esip->si_vp == cvp) { 1279 if (esip->si_soff == soff && esip->si_npgs == pages && 1280 (esip->si_flags & ST_DOINGDEL)) { 1281 /* 1282 * We are adding a device that we are in the 1283 * middle of deleting. Just clear the 1284 * ST_DOINGDEL flag to signal this and 1285 * the deletion routine will eventually notice 1286 * it and add it back. 1287 */ 1288 esip->si_flags &= ~ST_DOINGDEL; 1289 mutex_exit(&swapinfo_lock); 1290 goto out; 1291 } 1292 /* disallow overlapping swap files */ 1293 if ((soff < esip->si_eoff) && (eoff > esip->si_soff)) { 1294 error = EEXIST; 1295 mutex_exit(&swapinfo_lock); 1296 goto out; 1297 } 1298 } 1299 } 1300 1301 nswapfiles++; 1302 1303 /* 1304 * add new swap device to list and shift allocations to it 1305 * before updating the anoninfo counters 1306 */ 1307 *sipp = nsip; 1308 silast = nsip; 1309 1310 /* 1311 * Update the total amount of reservable swap space 1312 * accounting properly for swap space from physical memory 1313 */ 1314 /* New swap device soaks up currently reserved memory swap */ 1315 mutex_enter(&anoninfo_lock); 1316 1317 ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap); 1318 ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); 1319 1320 k_anoninfo.ani_max += pages; 1321 ANI_ADD(pages); 1322 if (k_anoninfo.ani_mem_resv > k_anoninfo.ani_locked_swap) { 1323 returned_mem = MIN(k_anoninfo.ani_mem_resv - 1324 k_anoninfo.ani_locked_swap, 1325 k_anoninfo.ani_max - k_anoninfo.ani_phys_resv); 1326 1327 ANI_ADD(-returned_mem); 1328 k_anoninfo.ani_free -= returned_mem; 1329 k_anoninfo.ani_mem_resv -= returned_mem; 1330 k_anoninfo.ani_phys_resv += returned_mem; 1331 1332 mutex_enter(&freemem_lock); 1333 availrmem += returned_mem; 
1334 mutex_exit(&freemem_lock); 1335 } 1336 /* 1337 * At boot time, to permit booting small memory machines using 1338 * only physical memory as swap space, we allowed a dangerously 1339 * large amount of memory to be used as swap space; now that 1340 * more physical backing store is available bump down the amount 1341 * we can get from memory to a safer size. 1342 */ 1343 if (swapfs_minfree < swapfs_desfree) { 1344 mutex_enter(&freemem_lock); 1345 if (availrmem > swapfs_desfree || !k_anoninfo.ani_mem_resv) 1346 swapfs_minfree = swapfs_desfree; 1347 mutex_exit(&freemem_lock); 1348 } 1349 1350 SWAP_PRINT(SW_CTL, "swapadd: ani_max %ld ani_free %ld\n", 1351 k_anoninfo.ani_free, k_anoninfo.ani_free, 0, 0, 0); 1352 1353 mutex_exit(&anoninfo_lock); 1354 1355 mutex_exit(&swapinfo_lock); 1356 1357 /* Initialize the dump device */ 1358 mutex_enter(&dump_lock); 1359 if (dumpvp == NULL) 1360 (void) dumpinit(vp, swapname, 0); 1361 mutex_exit(&dump_lock); 1362 1363 VN_HOLD(cvp); 1364 out: 1365 if (error || esip) { 1366 SWAP_PRINT(SW_CTL, "swapadd: error (%d)\n", error, 0, 0, 0, 0); 1367 1368 if (!wasswap) { 1369 mutex_enter(&cvp->v_lock); 1370 cvp->v_flag &= ~VISSWAP; 1371 mutex_exit(&cvp->v_lock); 1372 } 1373 if (nsip) { 1374 kmem_free(nsip->si_swapslots, (size_t)nsip->si_mapsize); 1375 kmem_free(nsip->si_pname, nsip->si_pnamelen); 1376 kmem_free(nsip, sizeof (*nsip)); 1377 } 1378 mutex_enter(&swap_lock); 1379 (void) VOP_CLOSE(cvp, FREAD|FWRITE, 1, (offset_t)0, CRED(), 1380 NULL); 1381 mutex_exit(&swap_lock); 1382 } 1383 return (error); 1384 } 1385 1386 /* 1387 * Delete a swap file. 1388 */ 1389 static int 1390 swapdel( 1391 struct vnode *vp, 1392 ulong_t lowblk) /* Low block number of area to delete. 
*/ 1393 { 1394 struct swapinfo **sipp, *osip = NULL; 1395 struct vnode *cvp; 1396 u_offset_t soff; 1397 int error = 0; 1398 u_offset_t toff = 0; 1399 struct vnode *tvp = NULL; 1400 spgcnt_t pages; 1401 struct anon **app, *ap; 1402 kmutex_t *ahm; 1403 pgcnt_t adjust_swap = 0; 1404 1405 /* Find the swap file entry for the file to be deleted */ 1406 cvp = common_specvp(vp); 1407 1408 1409 lowblk = lowblk ? lowblk : 1; /* Skip first page (disk label) */ 1410 soff = ptob(btopr(lowblk << SCTRSHFT)); /* must be page aligned */ 1411 1412 mutex_enter(&swapinfo_lock); 1413 for (sipp = &swapinfo; (osip = *sipp) != NULL; sipp = &osip->si_next) { 1414 if ((osip->si_vp == cvp) && 1415 (osip->si_soff == soff) && (osip->si_flags == 0)) 1416 break; 1417 } 1418 1419 /* If the file was not found, error. */ 1420 if (osip == NULL) { 1421 error = EINVAL; 1422 mutex_exit(&swapinfo_lock); 1423 goto out; 1424 } 1425 1426 pages = osip->si_npgs; 1427 1428 /* 1429 * Do not delete if we will be low on swap pages. 1430 */ 1431 mutex_enter(&anoninfo_lock); 1432 1433 ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap); 1434 ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); 1435 1436 mutex_enter(&freemem_lock); 1437 if (((k_anoninfo.ani_max - k_anoninfo.ani_phys_resv) + 1438 MAX((spgcnt_t)(availrmem - swapfs_minfree), 0)) < pages) { 1439 mutex_exit(&freemem_lock); 1440 mutex_exit(&anoninfo_lock); 1441 error = ENOMEM; 1442 cmn_err(CE_WARN, "swapdel - too few free pages"); 1443 mutex_exit(&swapinfo_lock); 1444 goto out; 1445 } 1446 mutex_exit(&freemem_lock); 1447 1448 k_anoninfo.ani_max -= pages; 1449 1450 /* If needed, reserve memory swap to replace old device */ 1451 if (k_anoninfo.ani_phys_resv > k_anoninfo.ani_max) { 1452 adjust_swap = k_anoninfo.ani_phys_resv - k_anoninfo.ani_max; 1453 k_anoninfo.ani_phys_resv -= adjust_swap; 1454 k_anoninfo.ani_mem_resv += adjust_swap; 1455 mutex_enter(&freemem_lock); 1456 availrmem -= adjust_swap; 1457 mutex_exit(&freemem_lock); 1458 
ANI_ADD(adjust_swap); 1459 } 1460 ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap); 1461 ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); 1462 mutex_exit(&anoninfo_lock); 1463 1464 ANI_ADD(-pages); 1465 1466 /* 1467 * Set the delete flag. This prevents anyone from allocating more 1468 * pages from this file. Also set ST_DOINGDEL. Someone who wants to 1469 * add the file back while we're deleting it will signify by clearing 1470 * this flag. 1471 */ 1472 osip->si_flags |= ST_INDEL|ST_DOINGDEL; 1473 mutex_exit(&swapinfo_lock); 1474 1475 /* 1476 * Free all the allocated physical slots for this file. We do this 1477 * by walking through the entire anon hash array, because we need 1478 * to update all the anon slots that have physical swap slots on 1479 * this file, and this is the only way to find them all. We go back 1480 * to the beginning of a bucket after each slot is freed because the 1481 * anonhash_lock is not held during the free and thus the hash table 1482 * may change under us. 1483 */ 1484 for (app = anon_hash; app < &anon_hash[ANON_HASH_SIZE]; app++) { 1485 ahm = &anonhash_lock[(app - anon_hash) & 1486 (AH_LOCK_SIZE - 1)].pad_mutex; 1487 mutex_enter(ahm); 1488 top: 1489 for (ap = *app; ap != NULL; ap = ap->an_hash) { 1490 if (ap->an_pvp == cvp && 1491 ap->an_poff >= osip->si_soff && 1492 ap->an_poff < osip->si_eoff) { 1493 ASSERT(TESTBIT(osip->si_swapslots, 1494 btop((size_t)(ap->an_poff - 1495 osip->si_soff)))); 1496 tvp = ap->an_vp; 1497 toff = ap->an_off; 1498 VN_HOLD(tvp); 1499 mutex_exit(ahm); 1500 1501 error = swapslot_free(tvp, toff, osip); 1502 1503 VN_RELE(tvp); 1504 mutex_enter(ahm); 1505 if (!error && (osip->si_flags & ST_DOINGDEL)) { 1506 goto top; 1507 } else { 1508 if (error) { 1509 cmn_err(CE_WARN, 1510 "swapslot_free failed %d", 1511 error); 1512 } 1513 1514 /* 1515 * Add device back before making it 1516 * visible. 
1517 */ 1518 mutex_enter(&swapinfo_lock); 1519 osip->si_flags &= 1520 ~(ST_INDEL | ST_DOINGDEL); 1521 mutex_exit(&swapinfo_lock); 1522 1523 /* 1524 * Update the anon space available 1525 */ 1526 mutex_enter(&anoninfo_lock); 1527 1528 k_anoninfo.ani_phys_resv += adjust_swap; 1529 k_anoninfo.ani_mem_resv -= adjust_swap; 1530 k_anoninfo.ani_max += pages; 1531 1532 mutex_enter(&freemem_lock); 1533 availrmem += adjust_swap; 1534 mutex_exit(&freemem_lock); 1535 1536 mutex_exit(&anoninfo_lock); 1537 1538 ANI_ADD(pages); 1539 1540 mutex_exit(ahm); 1541 goto out; 1542 } 1543 } 1544 } 1545 mutex_exit(ahm); 1546 } 1547 1548 /* All done, they'd better all be free! */ 1549 mutex_enter(&swapinfo_lock); 1550 ASSERT(osip->si_nfpgs == osip->si_npgs); 1551 1552 /* Now remove it from the swapinfo list */ 1553 for (sipp = &swapinfo; *sipp != NULL; sipp = &(*sipp)->si_next) { 1554 if (*sipp == osip) 1555 break; 1556 } 1557 ASSERT(*sipp); 1558 *sipp = osip->si_next; 1559 if (silast == osip) 1560 if ((silast = osip->si_next) == NULL) 1561 silast = swapinfo; 1562 nswapfiles--; 1563 mutex_exit(&swapinfo_lock); 1564 1565 kmem_free(osip->si_swapslots, osip->si_mapsize); 1566 kmem_free(osip->si_pname, osip->si_pnamelen); 1567 kmem_free(osip, sizeof (*osip)); 1568 1569 mutex_enter(&dump_lock); 1570 if (cvp == dumpvp) 1571 dumpfini(); 1572 mutex_exit(&dump_lock); 1573 1574 /* Release the vnode */ 1575 1576 mutex_enter(&swap_lock); 1577 (void) VOP_CLOSE(cvp, FREAD|FWRITE, 1, (offset_t)0, CRED(), NULL); 1578 mutex_enter(&cvp->v_lock); 1579 cvp->v_flag &= ~VISSWAP; 1580 mutex_exit(&cvp->v_lock); 1581 VN_RELE(cvp); 1582 mutex_exit(&swap_lock); 1583 out: 1584 return (error); 1585 } 1586 1587 /* 1588 * Free up a physical swap slot on swapinfo sip, currently in use by the 1589 * anonymous page whose name is (vp, off). 
1590 */ 1591 static int 1592 swapslot_free( 1593 struct vnode *vp, 1594 u_offset_t off, 1595 struct swapinfo *sip) 1596 { 1597 struct page *pp = NULL; 1598 struct anon *ap = NULL; 1599 int error = 0; 1600 kmutex_t *ahm; 1601 struct vnode *pvp = NULL; 1602 u_offset_t poff; 1603 int alloc_pg = 0; 1604 1605 ASSERT(sip->si_vp != NULL); 1606 /* 1607 * Get the page for the old swap slot if exists or create a new one. 1608 */ 1609 again: 1610 if ((pp = page_lookup(vp, off, SE_SHARED)) == NULL) { 1611 pp = page_create_va(vp, off, PAGESIZE, PG_WAIT | PG_EXCL, 1612 segkmap, NULL); 1613 if (pp == NULL) 1614 goto again; 1615 alloc_pg = 1; 1616 1617 error = swap_getphysname(vp, off, &pvp, &poff); 1618 if (error || pvp != sip->si_vp || poff < sip->si_soff || 1619 poff >= sip->si_eoff) { 1620 page_io_unlock(pp); 1621 /*LINTED: constant in conditional context*/ 1622 VN_DISPOSE(pp, B_INVAL, 0, kcred); 1623 return (0); 1624 } 1625 1626 error = VOP_PAGEIO(pvp, pp, poff, PAGESIZE, B_READ, 1627 CRED(), NULL); 1628 if (error) { 1629 page_io_unlock(pp); 1630 if (error == EFAULT) 1631 error = 0; 1632 /*LINTED: constant in conditional context*/ 1633 VN_DISPOSE(pp, B_INVAL, 0, kcred); 1634 return (error); 1635 } 1636 } 1637 1638 /* 1639 * The anon could have been removed by anon_decref* and/or reallocated 1640 * by anon layer (an_pvp == NULL) with the same vp, off. 1641 * In this case the page which has been allocated needs to 1642 * be freed. 1643 */ 1644 if (!alloc_pg) 1645 page_io_lock(pp); 1646 ahm = AH_MUTEX(vp, off); 1647 mutex_enter(ahm); 1648 ap = swap_anon(vp, off); 1649 if ((ap == NULL || ap->an_pvp == NULL) && alloc_pg) { 1650 mutex_exit(ahm); 1651 page_io_unlock(pp); 1652 /*LINTED: constant in conditional context*/ 1653 VN_DISPOSE(pp, B_INVAL, 0, kcred); 1654 return (0); 1655 } 1656 1657 /* 1658 * Free the physical slot. It may have been freed up and replaced with 1659 * another one while we were getting the page so we have to re-verify 1660 * that this is really one we want. 
If we do free the slot we have 1661 * to mark the page modified, as its backing store is now gone. 1662 */ 1663 if ((ap != NULL) && (ap->an_pvp == sip->si_vp && ap->an_poff >= 1664 sip->si_soff && ap->an_poff < sip->si_eoff)) { 1665 swap_phys_free(ap->an_pvp, ap->an_poff, PAGESIZE); 1666 ap->an_pvp = NULL; 1667 ap->an_poff = 0; 1668 mutex_exit(ahm); 1669 hat_setmod(pp); 1670 } else { 1671 mutex_exit(ahm); 1672 } 1673 page_io_unlock(pp); 1674 page_unlock(pp); 1675 return (0); 1676 } 1677 1678 1679 /* 1680 * Get contig physical backing store for vp, in the range 1681 * [*offp, *offp + *lenp), May back a subrange of this, but must 1682 * always include the requested offset or fail. Returns the offsets 1683 * backed as [*offp, *offp + *lenp) and the physical offsets used to 1684 * back them from *pvpp in the range [*pstartp, *pstartp + *lenp). 1685 * Returns 0 for success 1686 * SE_NOANON -- no anon slot for requested paged 1687 * SE_NOSWAP -- no physical swap space available 1688 */ 1689 int 1690 swap_newphysname( 1691 struct vnode *vp, 1692 u_offset_t offset, 1693 u_offset_t *offp, 1694 size_t *lenp, 1695 struct vnode **pvpp, 1696 u_offset_t *poffp) 1697 { 1698 struct anon *ap = NULL; /* anon slot for vp, off */ 1699 int error = 0; 1700 struct vnode *pvp; 1701 u_offset_t poff, pstart, prem; 1702 size_t plen; 1703 u_offset_t off, start; 1704 kmutex_t *ahm; 1705 1706 ASSERT(*offp <= offset && offset < *offp + *lenp); 1707 1708 /* Get new physical swap slots. */ 1709 plen = *lenp; 1710 if (!swap_phys_alloc(&pvp, &pstart, &plen, 0)) { 1711 /* 1712 * No swap available so return error unless requested 1713 * offset is already backed in which case return that. 1714 */ 1715 ahm = AH_MUTEX(vp, offset); 1716 mutex_enter(ahm); 1717 if ((ap = swap_anon(vp, offset)) == NULL) { 1718 error = SE_NOANON; 1719 mutex_exit(ahm); 1720 return (error); 1721 } 1722 error = (ap->an_pvp ? 
0 : SE_NOSWAP); 1723 *offp = offset; 1724 *lenp = PAGESIZE; 1725 *pvpp = ap->an_pvp; 1726 *poffp = ap->an_poff; 1727 mutex_exit(ahm); 1728 return (error); 1729 } 1730 1731 /* 1732 * We got plen (<= *lenp) contig slots. Use these to back a 1733 * subrange of [*offp, *offp + *lenp) which includes offset. 1734 * For now we just put offset at the end of the kluster. 1735 * Clearly there are other possible choices - which is best? 1736 */ 1737 start = MAX(*offp, 1738 (offset + PAGESIZE > plen) ? (offset + PAGESIZE - plen) : 0); 1739 ASSERT(start + plen <= *offp + *lenp); 1740 1741 for (off = start, poff = pstart; poff < pstart + plen; 1742 off += PAGESIZE, poff += PAGESIZE) { 1743 ahm = AH_MUTEX(vp, off); 1744 mutex_enter(ahm); 1745 if ((ap = swap_anon(vp, off)) != NULL) { 1746 /* Free old slot if any, and assign new one */ 1747 if (ap->an_pvp) 1748 swap_phys_free(ap->an_pvp, ap->an_poff, 1749 PAGESIZE); 1750 ap->an_pvp = pvp; 1751 ap->an_poff = poff; 1752 } else { /* No anon slot for a klustered page, quit. 
*/ 1753 prem = (pstart + plen) - poff; 1754 /* Already did requested page, do partial kluster */ 1755 if (off > offset) { 1756 plen = poff - pstart; 1757 error = 0; 1758 /* Fail on requested page, error */ 1759 } else if (off == offset) { 1760 error = SE_NOANON; 1761 /* Fail on prior page, fail on requested page, error */ 1762 } else if ((ap = swap_anon(vp, offset)) == NULL) { 1763 error = SE_NOANON; 1764 /* Fail on prior page, got requested page, do only it */ 1765 } else { 1766 /* Free old slot if any, and assign new one */ 1767 if (ap->an_pvp) 1768 swap_phys_free(ap->an_pvp, ap->an_poff, 1769 PAGESIZE); 1770 ap->an_pvp = pvp; 1771 ap->an_poff = poff; 1772 /* One page kluster */ 1773 start = offset; 1774 plen = PAGESIZE; 1775 pstart = poff; 1776 poff += PAGESIZE; 1777 prem -= PAGESIZE; 1778 } 1779 /* Free unassigned slots */ 1780 swap_phys_free(pvp, poff, prem); 1781 mutex_exit(ahm); 1782 break; 1783 } 1784 mutex_exit(ahm); 1785 } 1786 ASSERT(*offp <= start && start + plen <= *offp + *lenp); 1787 ASSERT(start <= offset && offset < start + plen); 1788 *offp = start; 1789 *lenp = plen; 1790 *pvpp = pvp; 1791 *poffp = pstart; 1792 return (error); 1793 } 1794 1795 1796 /* 1797 * Get the physical swap backing store location for a given anonymous page 1798 * named (vp, off). The backing store name is returned in (*pvpp, *poffp). 1799 * Returns 0 success 1800 * EIDRM -- no anon slot (page is not allocated) 1801 */ 1802 int 1803 swap_getphysname( 1804 struct vnode *vp, 1805 u_offset_t off, 1806 struct vnode **pvpp, 1807 u_offset_t *poffp) 1808 { 1809 struct anon *ap; 1810 int error = 0; 1811 kmutex_t *ahm; 1812 1813 ahm = AH_MUTEX(vp, off); 1814 mutex_enter(ahm); 1815 1816 /* Get anon slot for vp, off */ 1817 ap = swap_anon(vp, off); 1818 if (ap == NULL) { 1819 error = EIDRM; 1820 goto out; 1821 } 1822 *pvpp = ap->an_pvp; 1823 *poffp = ap->an_poff; 1824 out: 1825 mutex_exit(ahm); 1826 return (error); 1827 } 1828