/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * VM - address spaces.
 */

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/errno.h>
#include <sys/systm.h>
#include <sys/mman.h>
#include <sys/sysmacros.h>
#include <sys/cpuvar.h>
#include <sys/sysinfo.h>
#include <sys/kmem.h>
#include <sys/vnode.h>
#include <sys/vmsystm.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/tnf_probe.h>
#include <sys/vtrace.h>

#include <vm/hat.h>
#include <vm/xhat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_vn.h>
#include <vm/seg_dev.h>
#include <vm/seg_kmem.h>
#include <vm/seg_map.h>
#include <vm/seg_spt.h>
#include <vm/page.h>

clock_t deadlk_wait = 1; /* number of ticks to wait before retrying */

static struct kmem_cache *as_cache;

static void as_setwatchprot(struct as *, caddr_t, size_t, uint_t);
static void as_clearwatchprot(struct as *, caddr_t, size_t);
int as_map_locked(struct as *, caddr_t, size_t, int ((*)()), void *);


/*
 * Verifying the segment lists is very time-consuming; it may not be
 * desirable always to define VERIFY_SEGLIST when DEBUG is set.
 */
#ifdef DEBUG
#define	VERIFY_SEGLIST
int do_as_verify = 0;
#endif

/*
 * Allocate a new callback data structure entry and fill in the events of
 * interest, the address range of interest, and the callback argument.
 * Link the entry on the as->a_callbacks list.  A callback entry for the
 * entire address space may be specified with vaddr = 0 and size = -1.
 *
 * CALLER'S RESPONSIBILITY: If not calling from within the process context for
 * the specified as, the caller must guarantee persistence of the specified as
 * for the duration of this function (e.g. pages being locked within the as
 * will guarantee persistence).
 */
int
as_add_callback(struct as *as, void (*cb_func)(), void *arg, uint_t events,
    caddr_t vaddr, size_t size, int sleepflag)
{
	struct as_callback *current_head, *cb;
	caddr_t saddr;
	size_t rsize;

	/* callback function and an event are mandatory */
	if ((cb_func == NULL) || ((events & AS_ALL_EVENT) == 0))
		return (EINVAL);

	/* Adding a callback after as_free has been called is not allowed */
	if (as == &kas)
		return (ENOMEM);

	/*
	 * vaddr = 0 and size = -1 is used to indicate that the callback range
	 * is the entire address space so no rounding is done in that case.
	 */
	if (size != -1) {
		saddr = (caddr_t)((uintptr_t)vaddr & (uintptr_t)PAGEMASK);
		rsize = (((size_t)(vaddr + size) + PAGEOFFSET) & PAGEMASK) -
		    (size_t)saddr;
		/* check for wraparound */
		if (saddr + rsize < saddr)
			return (ENOMEM);
	} else {
		if (vaddr != 0)
			return (EINVAL);
		saddr = vaddr;
		rsize = size;
	}

	/* Allocate and initialize a callback entry */
	cb = kmem_zalloc(sizeof (struct as_callback), sleepflag);
	if (cb == NULL)
		return (EAGAIN);

	cb->ascb_func = cb_func;
	cb->ascb_arg = arg;
	cb->ascb_events = events;
	cb->ascb_saddr = saddr;
	cb->ascb_len = rsize;

	/* Add the entry to the list */
	mutex_enter(&as->a_contents);
	current_head = as->a_callbacks;
	as->a_callbacks = cb;
	cb->ascb_next = current_head;

	/*
	 * The call to this function may lose in a race with
	 * a pertinent event - e.g. a thread does long term memory locking
	 * but before the callback is added another thread executes as_unmap.
	 * A broadcast here resolves that.
	 */
	if ((cb->ascb_events & AS_UNMAPWAIT_EVENT) && AS_ISUNMAPWAIT(as)) {
		AS_CLRUNMAPWAIT(as);
		cv_broadcast(&as->a_cv);
	}

	mutex_exit(&as->a_contents);
	return (0);
}
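
/*
 * Illustrative sketch (not part of the original source): a driver that
 * keeps user pages locked for a long-lived DMA binding might register
 * for unmap/setprot events and later remove the callback using the same
 * `arg' cookie.  The callback function and cookie names here are
 * hypothetical.
 *
 *	(void) as_add_callback(as, mydrv_unlock_pages, mydrv_cookie,
 *	    AS_UNMAP_EVENT | AS_SETPROT_EVENT, uaddr, len, KM_SLEEP);
 *	...
 *	(void) as_delete_callback(as, mydrv_cookie);
 */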

/*
 * Search the callback list for an entry which pertains to arg.
 *
 * This is called from within the client upon completion of the callback.
 * RETURN VALUES:
 *	AS_CALLBACK_DELETED  (callback entry found and deleted)
 *	AS_CALLBACK_NOTFOUND (no callback entry found - this is ok)
 *	AS_CALLBACK_DELETE_DEFERRED (callback is in process, delete of this
 *			entry will be made in as_do_callbacks)
 *
 * If as_delete_callback encounters a matching entry with AS_CALLBACK_CALLED
 * set, it indicates that as_do_callbacks is processing this entry.  The
 * AS_ALL_EVENT events are cleared in the entry, and a broadcast is made
 * to unblock as_do_callbacks, in case it is blocked.
 *
 * CALLER'S RESPONSIBILITY: If not calling from within the process context for
 * the specified as, the caller must guarantee persistence of the specified as
 * for the duration of this function (e.g. pages being locked within the as
 * will guarantee persistence).
 */
uint_t
as_delete_callback(struct as *as, void *arg)
{
	struct as_callback **prevcb = &as->a_callbacks;
	struct as_callback *cb;
	uint_t rc = AS_CALLBACK_NOTFOUND;

	mutex_enter(&as->a_contents);
	for (cb = as->a_callbacks; cb; prevcb = &cb->ascb_next, cb = *prevcb) {
		if (cb->ascb_arg != arg)
			continue;

		/*
		 * If the events indicate AS_CALLBACK_CALLED, just clear
		 * AS_ALL_EVENT in the events field and wakeup the thread
		 * that may be waiting in as_do_callbacks.  as_do_callbacks
		 * will take care of removing this entry from the list.  In
		 * that case, return AS_CALLBACK_DELETE_DEFERRED.  Otherwise
		 * (AS_CALLBACK_CALLED not set), just remove it from the
		 * list, return the memory and return AS_CALLBACK_DELETED.
		 */
		if ((cb->ascb_events & AS_CALLBACK_CALLED) != 0) {
			/* leave AS_CALLBACK_CALLED */
			cb->ascb_events &= ~AS_ALL_EVENT;
			rc = AS_CALLBACK_DELETE_DEFERRED;
			cv_broadcast(&as->a_cv);
		} else {
			*prevcb = cb->ascb_next;
			kmem_free(cb, sizeof (struct as_callback));
			rc = AS_CALLBACK_DELETED;
		}
		break;
	}
	mutex_exit(&as->a_contents);
	return (rc);
}

/*
 * Searches the as callback list for a matching entry.
 * Returns a pointer to the first matching callback, or NULL if
 * nothing is found.
 * This function never sleeps so it is ok to call it with locks held
 * in addition to the (required) a_contents mutex.
 *
 * See also comment on as_do_callbacks below.
 */
static struct as_callback *
as_find_callback(struct as *as, uint_t events, caddr_t event_addr,
    size_t event_len)
{
	struct as_callback *cb;

	ASSERT(MUTEX_HELD(&as->a_contents));
	for (cb = as->a_callbacks; cb != NULL; cb = cb->ascb_next) {
		/*
		 * If the callback has not already been called, then
		 * check if events or address range pertains.  An event_len
		 * of zero means do an unconditional callback.
		 */
		if (((cb->ascb_events & AS_CALLBACK_CALLED) != 0) ||
		    ((event_len != 0) && (((cb->ascb_events & events) == 0) ||
		    (event_addr + event_len < cb->ascb_saddr) ||
		    (event_addr > (cb->ascb_saddr + cb->ascb_len))))) {
			continue;
		}
		break;
	}
	return (cb);
}

/*
 * Executes a given callback and removes it from the callback list for
 * this address space.
 * This function may sleep so the caller must drop all locks except
 * a_contents before calling this func.
 *
 * See also comments on as_do_callbacks below.
 */
static void
as_execute_callback(struct as *as, struct as_callback *cb,
    uint_t events)
{
	struct as_callback **prevcb;
	void *cb_arg;

	ASSERT(MUTEX_HELD(&as->a_contents) && (cb->ascb_events & events));
	cb->ascb_events |= AS_CALLBACK_CALLED;
	mutex_exit(&as->a_contents);
	(*cb->ascb_func)(as, cb->ascb_arg, events);
	mutex_enter(&as->a_contents);
	/*
	 * the callback function is required to delete the callback
	 * when the callback function determines it is OK for
	 * this thread to continue. as_delete_callback will clear
	 * the AS_ALL_EVENT in the events field when it is deleted.
	 * If the callback function called as_delete_callback,
	 * events will already be cleared and there will be no blocking.
	 */
	while ((cb->ascb_events & events) != 0) {
		cv_wait(&as->a_cv, &as->a_contents);
	}
	/*
	 * This entry needs to be taken off the list. Normally, the
	 * callback func itself does that, but unfortunately the list
	 * may have changed while the callback was running because the
	 * a_contents mutex was dropped and someone else other than the
	 * callback func itself could have called as_delete_callback,
	 * so we have to search to find this entry again.  The entry
	 * must have AS_CALLBACK_CALLED, and have the same 'arg'.
	 */
	cb_arg = cb->ascb_arg;
	prevcb = &as->a_callbacks;
	for (cb = as->a_callbacks; cb != NULL;
	    prevcb = &cb->ascb_next, cb = *prevcb) {
		if (((cb->ascb_events & AS_CALLBACK_CALLED) == 0) ||
		    (cb_arg != cb->ascb_arg)) {
			continue;
		}
		*prevcb = cb->ascb_next;
		kmem_free(cb, sizeof (struct as_callback));
		break;
	}
}

/*
 * Check the callback list for a matching event and intersection of
 * address range. If there is a match invoke the callback.  Skip an entry if:
 *    - a callback is already in progress for this entry (AS_CALLBACK_CALLED)
 *    - not the event of interest
 *    - not the address range of interest
 *
 * An event_len of zero indicates a request for an unconditional callback
 * (regardless of event), only the AS_CALLBACK_CALLED is checked.  The
 * a_contents lock must be dropped before a callback, so only one callback
 * can be done before returning.  Return -1 (true) if a callback was
 * executed and removed from the list, else return 0 (false).
 *
 * The logically separate parts, i.e. finding a matching callback and
 * executing a given callback have been separated into two functions
 * so that they can be called with different sets of locks held beyond
 * the always-required a_contents.  as_find_callback does not sleep so
 * it is ok to call it if more locks than a_contents (i.e. the a_lock
 * rwlock) are held.  as_execute_callback on the other hand may sleep
 * so all locks beyond a_contents must be dropped by the caller if one
 * does not want to end comatose.
 */
static int
as_do_callbacks(struct as *as, uint_t events, caddr_t event_addr,
    size_t event_len)
{
	struct as_callback *cb;

	if ((cb = as_find_callback(as, events, event_addr, event_len))) {
		as_execute_callback(as, cb, events);
		return (-1);
	}
	return (0);
}

/*
 * Search for the segment containing addr. If a segment containing addr
 * exists, that segment is returned.  If no such segment exists, and
 * the list spans addresses greater than addr, then the first segment
 * whose base is greater than addr is returned; otherwise, NULL is
 * returned unless tail is true, in which case the last element of the
 * list is returned.
 *
 * a_seglast is used to cache the last found segment for repeated
 * searches to the same addr (which happens frequently).
 */
struct seg *
as_findseg(struct as *as, caddr_t addr, int tail)
{
	struct seg *seg = as->a_seglast;
	avl_index_t where;

	ASSERT(AS_LOCK_HELD(as, &as->a_lock));

	if (seg != NULL &&
	    seg->s_base <= addr &&
	    addr < seg->s_base + seg->s_size)
		return (seg);

	seg = avl_find(&as->a_segtree, &addr, &where);
	if (seg != NULL)
		return (as->a_seglast = seg);

	seg = avl_nearest(&as->a_segtree, where, AVL_AFTER);
	if (seg == NULL && tail)
		seg = avl_last(&as->a_segtree);
	return (as->a_seglast = seg);
}

#ifdef VERIFY_SEGLIST
/*
 * verify that the linked list is coherent
 */
static void
as_verify(struct as *as)
{
	struct seg *seg, *seglast, *p, *n;
	uint_t nsegs = 0;

	if (do_as_verify == 0)
		return;

	seglast = as->a_seglast;

	for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
		ASSERT(seg->s_as == as);
		p = AS_SEGPREV(as, seg);
		n = AS_SEGNEXT(as, seg);
		ASSERT(p == NULL || p->s_as == as);
		ASSERT(p == NULL || p->s_base < seg->s_base);
		ASSERT(n == NULL || n->s_base > seg->s_base);
		ASSERT(n != NULL || seg == avl_last(&as->a_segtree));
		if (seg == seglast)
			seglast = NULL;
		nsegs++;
	}
	ASSERT(seglast == NULL);
	ASSERT(avl_numnodes(&as->a_segtree) == nsegs);
}
#endif /* VERIFY_SEGLIST */

/*
 * Add a new segment to the address space. The avl_find()
 * may be expensive so we attempt to use last segment accessed
 * in as_gap() as an insertion point.
 */
int
as_addseg(struct as *as, struct seg *newseg)
{
	struct seg *seg;
	caddr_t addr;
	caddr_t eaddr;
	avl_index_t where;

	ASSERT(AS_WRITE_HELD(as, &as->a_lock));

	as->a_updatedir = 1;	/* inform /proc */
	gethrestime(&as->a_updatetime);

	if (as->a_lastgaphl != NULL) {
		struct seg *hseg = NULL;
		struct seg *lseg = NULL;

		if (as->a_lastgaphl->s_base > newseg->s_base) {
			hseg = as->a_lastgaphl;
			lseg = AVL_PREV(&as->a_segtree, hseg);
		} else {
			lseg = as->a_lastgaphl;
			hseg = AVL_NEXT(&as->a_segtree, lseg);
		}

		if (hseg && lseg && lseg->s_base < newseg->s_base &&
		    hseg->s_base > newseg->s_base) {
			avl_insert_here(&as->a_segtree, newseg, lseg,
			    AVL_AFTER);
			as->a_lastgaphl = NULL;
			as->a_seglast = newseg;
			return (0);
		}
		as->a_lastgaphl = NULL;
	}

	addr = newseg->s_base;
	eaddr = addr + newseg->s_size;
again:

	seg = avl_find(&as->a_segtree, &addr, &where);

	if (seg == NULL)
		seg = avl_nearest(&as->a_segtree, where, AVL_AFTER);

	if (seg == NULL)
		seg = avl_last(&as->a_segtree);

	if (seg != NULL) {
		caddr_t base = seg->s_base;

		/*
		 * If top of seg is below the requested address, then
		 * the insertion point is at the end of the linked list,
		 * and seg points to the tail of the list.  Otherwise,
		 * the insertion point is immediately before seg.
		 */
		if (base + seg->s_size > addr) {
			if (addr >= base || eaddr > base) {
#ifdef __sparc
				extern struct seg_ops segnf_ops;

				/*
				 * no-fault segs must disappear if overlaid.
				 * XXX need new segment type so
				 * we don't have to check s_ops
				 */
				if (seg->s_ops == &segnf_ops) {
					seg_unmap(seg);
					goto again;
				}
#endif
				return (-1);	/* overlapping segment */
			}
		}
	}
	as->a_seglast = newseg;
	avl_insert(&as->a_segtree, newseg, where);

#ifdef VERIFY_SEGLIST
	as_verify(as);
#endif
	return (0);
}

struct seg *
as_removeseg(struct as *as, struct seg *seg)
{
	avl_tree_t *t;

	ASSERT(AS_WRITE_HELD(as, &as->a_lock));

	as->a_updatedir = 1;	/* inform /proc */
	gethrestime(&as->a_updatetime);

	if (seg == NULL)
		return (NULL);

	t = &as->a_segtree;
	if (as->a_seglast == seg)
		as->a_seglast = NULL;
	as->a_lastgaphl = NULL;

	/*
	 * if this segment is at an address higher than
	 * a_lastgap, set a_lastgap to the next segment (NULL if last segment)
	 */
	if (as->a_lastgap &&
	    (seg == as->a_lastgap || seg->s_base > as->a_lastgap->s_base))
		as->a_lastgap = AVL_NEXT(t, seg);

	/*
	 * remove the segment from the seg tree
	 */
	avl_remove(t, seg);

#ifdef VERIFY_SEGLIST
	as_verify(as);
#endif
	return (seg);
}

/*
 * Find a segment containing addr.
 */
struct seg *
as_segat(struct as *as, caddr_t addr)
{
	struct seg *seg = as->a_seglast;

	ASSERT(AS_LOCK_HELD(as, &as->a_lock));

	if (seg != NULL && seg->s_base <= addr &&
	    addr < seg->s_base + seg->s_size)
		return (seg);

	seg = avl_find(&as->a_segtree, &addr, NULL);
	return (seg);
}

/*
 * Serialize all searches for holes in an address space to
 * prevent two or more threads from allocating the same virtual
 * address range.  The address space must not be "read/write"
 * locked by the caller since we may block.
 */
void
as_rangelock(struct as *as)
{
	mutex_enter(&as->a_contents);
	while (AS_ISCLAIMGAP(as))
		cv_wait(&as->a_cv, &as->a_contents);
	AS_SETCLAIMGAP(as);
	mutex_exit(&as->a_contents);
}

/*
 * Release hold on a_state & AS_CLAIMGAP and signal any other blocked threads.
 */
void
as_rangeunlock(struct as *as)
{
	mutex_enter(&as->a_contents);
	AS_CLRCLAIMGAP(as);
	cv_signal(&as->a_cv);
	mutex_exit(&as->a_contents);
}
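
/*
 * Illustrative sketch (not part of the original source): callers that
 * create mappings typically bracket address selection and as_map() with
 * the range lock so concurrent mmap()s cannot claim the same hole.
 * map_addr() and the segvn_crargs setup are assumed caller-side details,
 * not code from this file.
 *
 *	as_rangelock(as);
 *	map_addr(&addr, len, off, 1, flags);
 *	if (addr == NULL) {
 *		as_rangeunlock(as);
 *		return (ENOMEM);
 *	}
 *	error = as_map(as, addr, len, segvn_create, &crargs);
 *	as_rangeunlock(as);
 */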

/*
 * Compare segments (or just an address) by segment address range.
 */
static int
as_segcompar(const void *x, const void *y)
{
	struct seg *a = (struct seg *)x;
	struct seg *b = (struct seg *)y;

	if (a->s_base < b->s_base)
		return (-1);
	if (a->s_base >= b->s_base + b->s_size)
		return (1);
	return (0);
}


void
as_avlinit(struct as *as)
{
	avl_create(&as->a_segtree, as_segcompar, sizeof (struct seg),
	    offsetof(struct seg, s_tree));
	avl_create(&as->a_wpage, wp_compare, sizeof (struct watched_page),
	    offsetof(struct watched_page, wp_link));
}

/*ARGSUSED*/
static int
as_constructor(void *buf, void *cdrarg, int kmflags)
{
	struct as *as = buf;

	mutex_init(&as->a_contents, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&as->a_cv, NULL, CV_DEFAULT, NULL);
	rw_init(&as->a_lock, NULL, RW_DEFAULT, NULL);
	as_avlinit(as);
	return (0);
}

/*ARGSUSED1*/
static void
as_destructor(void *buf, void *cdrarg)
{
	struct as *as = buf;

	avl_destroy(&as->a_segtree);
	mutex_destroy(&as->a_contents);
	cv_destroy(&as->a_cv);
	rw_destroy(&as->a_lock);
}

void
as_init(void)
{
	as_cache = kmem_cache_create("as_cache", sizeof (struct as), 0,
	    as_constructor, as_destructor, NULL, NULL, NULL, 0);
}

/*
 * Allocate and initialize an address space data structure.
 * We call hat_alloc to allow any machine dependent
 * information in the hat structure to be initialized.
 */
struct as *
as_alloc(void)
{
	struct as *as;

	as = kmem_cache_alloc(as_cache, KM_SLEEP);

	as->a_flags = 0;
	as->a_vbits = 0;
	as->a_hrm = NULL;
	as->a_seglast = NULL;
	as->a_size = 0;
	as->a_updatedir = 0;
	gethrestime(&as->a_updatetime);
	as->a_objectdir = NULL;
	as->a_sizedir = 0;
	as->a_userlimit = (caddr_t)USERLIMIT;
	as->a_lastgap = NULL;
	as->a_lastgaphl = NULL;
	as->a_callbacks = NULL;

	AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
	as->a_hat = hat_alloc(as);	/* create hat for default system mmu */
	AS_LOCK_EXIT(as, &as->a_lock);

	as->a_xhat = NULL;

	return (as);
}
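
/*
 * Illustrative sketch (not part of the original source): the expected
 * life cycle of an address space, roughly as the process code drives it
 * (create/exec, fork, exit).  Error handling is omitted and the variable
 * names are hypothetical.
 *
 *	struct as *as = as_alloc();
 *	...
 *	error = as_dup(as, &child_as);
 *	...
 *	as_free(as);
 */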

/*
 * Free an address space data structure.
 * Need to free the hat first and then
 * all the segments on this as and finally
 * the space for the as struct itself.
 */
void
as_free(struct as *as)
{
	struct hat *hat = as->a_hat;
	struct seg *seg, *next;
	int called = 0;

top:
	/*
	 * Invoke ALL callbacks. as_do_callbacks will do one callback
	 * per call, and not return (-1) until the callback has completed.
	 * When as_do_callbacks returns zero, all callbacks have completed.
	 */
	mutex_enter(&as->a_contents);
	while (as->a_callbacks && as_do_callbacks(as, AS_ALL_EVENT, 0, 0));

	/* This will prevent new XHATs from attaching to as */
	if (!called)
		AS_SETBUSY(as);
	mutex_exit(&as->a_contents);
	AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);

	if (!called) {
		called = 1;
		hat_free_start(hat);
		if (as->a_xhat != NULL)
			xhat_free_start_all(as);
	}
	for (seg = AS_SEGFIRST(as); seg != NULL; seg = next) {
		int err;

		next = AS_SEGNEXT(as, seg);
		err = SEGOP_UNMAP(seg, seg->s_base, seg->s_size);
		if (err == EAGAIN) {
			mutex_enter(&as->a_contents);
			if (as->a_callbacks) {
				AS_LOCK_EXIT(as, &as->a_lock);
			} else {
				/*
				 * Memory is currently locked. Wait for a
				 * cv_signal that it has been unlocked, then
				 * try the operation again.
				 */
				if (AS_ISUNMAPWAIT(as) == 0)
					cv_broadcast(&as->a_cv);
				AS_SETUNMAPWAIT(as);
				AS_LOCK_EXIT(as, &as->a_lock);
				while (AS_ISUNMAPWAIT(as))
					cv_wait(&as->a_cv, &as->a_contents);
			}
			mutex_exit(&as->a_contents);
			goto top;
		} else {
			/*
			 * We do not expect any other error return at this
			 * time. This is similar to an ASSERT in seg_unmap()
			 */
			ASSERT(err == 0);
		}
	}
	hat_free_end(hat);
	if (as->a_xhat != NULL)
		xhat_free_end_all(as);
	AS_LOCK_EXIT(as, &as->a_lock);

	/* /proc stuff */
	ASSERT(avl_numnodes(&as->a_wpage) == 0);
	if (as->a_objectdir) {
		kmem_free(as->a_objectdir, as->a_sizedir * sizeof (vnode_t *));
		as->a_objectdir = NULL;
		as->a_sizedir = 0;
	}

	/*
	 * Free the struct as back to kmem.  Assert it has no segments.
	 */
	ASSERT(avl_numnodes(&as->a_segtree) == 0);
	kmem_cache_free(as_cache, as);
}

int
as_dup(struct as *as, struct as **outas)
{
	struct as *newas;
	struct seg *seg, *newseg;
	int error;

	AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
	as_clearwatch(as);
	newas = as_alloc();
	newas->a_userlimit = as->a_userlimit;
	AS_LOCK_ENTER(newas, &newas->a_lock, RW_WRITER);

	/* This will prevent new XHATs from attaching */
	mutex_enter(&as->a_contents);
	AS_SETBUSY(as);
	mutex_exit(&as->a_contents);
	mutex_enter(&newas->a_contents);
	AS_SETBUSY(newas);
	mutex_exit(&newas->a_contents);

	(void) hat_dup(as->a_hat, newas->a_hat, NULL, 0, HAT_DUP_SRD);

	for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {

		if (seg->s_flags & S_PURGE)
			continue;

		newseg = seg_alloc(newas, seg->s_base, seg->s_size);
		if (newseg == NULL) {
			AS_LOCK_EXIT(newas, &newas->a_lock);
			as_setwatch(as);
			mutex_enter(&as->a_contents);
			AS_CLRBUSY(as);
			mutex_exit(&as->a_contents);
			AS_LOCK_EXIT(as, &as->a_lock);
			as_free(newas);
			return (-1);
		}
		if ((error = SEGOP_DUP(seg, newseg)) != 0) {
			/*
			 * We call seg_free() on the new seg
			 * because the segment is not set up
			 * completely; i.e. it has no ops.
			 */
			as_setwatch(as);
			mutex_enter(&as->a_contents);
			AS_CLRBUSY(as);
			mutex_exit(&as->a_contents);
			AS_LOCK_EXIT(as, &as->a_lock);
			seg_free(newseg);
			AS_LOCK_EXIT(newas, &newas->a_lock);
			as_free(newas);
			return (error);
		}
		newas->a_size += seg->s_size;
	}

	error = hat_dup(as->a_hat, newas->a_hat, NULL, 0, HAT_DUP_ALL);
	if (as->a_xhat != NULL)
		error |= xhat_dup_all(as, newas, NULL, 0, HAT_DUP_ALL);

	mutex_enter(&newas->a_contents);
	AS_CLRBUSY(newas);
	mutex_exit(&newas->a_contents);
	AS_LOCK_EXIT(newas, &newas->a_lock);

	as_setwatch(as);
	mutex_enter(&as->a_contents);
	AS_CLRBUSY(as);
	mutex_exit(&as->a_contents);
	AS_LOCK_EXIT(as, &as->a_lock);
	if (error != 0) {
		as_free(newas);
		return (error);
	}
	*outas = newas;
	return (0);
}

/*
 * Handle a ``fault'' at addr for size bytes.
 */
faultcode_t
as_fault(struct hat *hat, struct as *as, caddr_t addr, size_t size,
    enum fault_type type, enum seg_rw rw)
{
	struct seg *seg;
	caddr_t raddr;			/* rounded down addr */
	size_t rsize;			/* rounded up size */
	size_t ssize;
	faultcode_t res = 0;
	caddr_t addrsav;
	struct seg *segsav;
	int as_lock_held;
	klwp_t *lwp = ttolwp(curthread);
	int is_xhat = 0;
	int holding_wpage = 0;
	extern struct seg_ops segdev_ops;


	if (as->a_hat != hat) {
		/* This must be an XHAT then */
		is_xhat = 1;

		if ((type != F_INVAL) || (as == &kas))
			return (FC_NOSUPPORT);
	}

retry:
	if (!is_xhat) {
		/*
		 * Indicate that the lwp is not to be stopped while waiting
		 * for a pagefault.  This is to avoid deadlock while debugging
		 * a process via /proc over NFS (in particular).
		 */
		if (lwp != NULL)
			lwp->lwp_nostop++;

		/*
		 * same length must be used when we softlock and softunlock.
		 * We don't support softunlocking lengths less than
		 * the original length when there is largepage support.
		 * See seg_dev.c for more comments.
		 */
		switch (type) {

		case F_SOFTLOCK:
			CPU_STATS_ADD_K(vm, softlock, 1);
			break;

		case F_SOFTUNLOCK:
			break;

		case F_PROT:
			CPU_STATS_ADD_K(vm, prot_fault, 1);
			break;

		case F_INVAL:
			CPU_STATS_ENTER_K();
			CPU_STATS_ADDQ(CPU, vm, as_fault, 1);
			if (as == &kas)
				CPU_STATS_ADDQ(CPU, vm, kernel_asflt, 1);
			CPU_STATS_EXIT_K();
			break;
		}
	}

	/* Kernel probe */
	TNF_PROBE_3(address_fault, "vm pagefault", /* CSTYLED */,
	    tnf_opaque, address, addr,
	    tnf_fault_type, fault_type, type,
	    tnf_seg_access, access, rw);

	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
	    (size_t)raddr;

	/*
	 * XXX -- Don't grab the as lock for segkmap. We should grab it for
	 * correctness, but then we could be stuck holding this lock for
	 * a LONG time if the fault needs to be resolved on a slow
	 * filesystem, and then no-one will be able to exec new commands,
	 * as exec'ing requires the write lock on the as.
	 */
	if (as == &kas && segkmap && segkmap->s_base <= raddr &&
	    raddr + size < segkmap->s_base + segkmap->s_size) {
		/*
		 * if (as==&kas), this can't be XHAT: we've already returned
		 * FC_NOSUPPORT.
		 */
		seg = segkmap;
		as_lock_held = 0;
	} else {
		AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
		if (is_xhat && avl_numnodes(&as->a_wpage) != 0) {
			/*
			 * Grab and hold the writers' lock on the as
			 * if the fault is to a watched page.
			 * This will keep CPUs from "peeking" at the
			 * address range while we're temporarily boosting
			 * the permissions for the XHAT device to
			 * resolve the fault in the segment layer.
			 *
			 * We could check whether faulted address
			 * is within a watched page and only then grab
			 * the writer lock, but this is simpler.
			 */
			AS_LOCK_EXIT(as, &as->a_lock);
			AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
		}

		seg = as_segat(as, raddr);
		if (seg == NULL) {
			AS_LOCK_EXIT(as, &as->a_lock);
			if ((lwp != NULL) && (!is_xhat))
				lwp->lwp_nostop--;
			return (FC_NOMAP);
		}

		as_lock_held = 1;
	}

	addrsav = raddr;
	segsav = seg;

	for (; rsize != 0; rsize -= ssize, raddr += ssize) {
		if (raddr >= seg->s_base + seg->s_size) {
			seg = AS_SEGNEXT(as, seg);
			if (seg == NULL || raddr != seg->s_base) {
				res = FC_NOMAP;
				break;
			}
		}
		if (raddr + rsize > seg->s_base + seg->s_size)
			ssize = seg->s_base + seg->s_size - raddr;
		else
			ssize = rsize;

		if (!is_xhat || (seg->s_ops != &segdev_ops)) {

			if (is_xhat && avl_numnodes(&as->a_wpage) != 0 &&
			    pr_is_watchpage_as(raddr, rw, as)) {
				/*
				 * Handle watch pages.  If we're faulting on a
				 * watched page from an X-hat, we have to
				 * restore the original permissions while we
				 * handle the fault.
				 */
				as_clearwatch(as);
				holding_wpage = 1;
			}

			res = SEGOP_FAULT(hat, seg, raddr, ssize, type, rw);

			/* Restore watchpoints */
			if (holding_wpage) {
				as_setwatch(as);
				holding_wpage = 0;
			}

			if (res != 0)
				break;
		} else {
			/* XHAT does not support seg_dev */
			res = FC_NOSUPPORT;
			break;
		}
	}

	/*
	 * If we were SOFTLOCKing and encountered a failure,
	 * we must SOFTUNLOCK the range we already did. (Maybe we
	 * should just panic if we are SOFTLOCKing or even SOFTUNLOCKing
	 * right here...)
	 */
	if (res != 0 && type == F_SOFTLOCK) {
		for (seg = segsav; addrsav < raddr; addrsav += ssize) {
			if (addrsav >= seg->s_base + seg->s_size)
				seg = AS_SEGNEXT(as, seg);
			ASSERT(seg != NULL);
			/*
			 * Now call the fault routine again to perform the
			 * unlock using S_OTHER instead of the rw variable
			 * since we never got a chance to touch the pages.
			 */
			if (raddr > seg->s_base + seg->s_size)
				ssize = seg->s_base + seg->s_size - addrsav;
			else
				ssize = raddr - addrsav;
			(void) SEGOP_FAULT(hat, seg, addrsav, ssize,
			    F_SOFTUNLOCK, S_OTHER);
		}
	}
	if (as_lock_held)
		AS_LOCK_EXIT(as, &as->a_lock);
	if ((lwp != NULL) && (!is_xhat))
		lwp->lwp_nostop--;

	/*
	 * If the lower levels returned EDEADLK for a fault,
	 * It means that we should retry the fault.  Let's wait
	 * a bit also to let the deadlock causing condition clear.
	 * This is part of a gross hack to work around a design flaw
	 * in the ufs/sds logging code and should go away when the
	 * logging code is re-designed to fix the problem. See bug
	 * 4125102 for details of the problem.
	 */
	if (FC_ERRNO(res) == EDEADLK) {
		delay(deadlk_wait);
		res = 0;
		goto retry;
	}
	return (res);
}
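
/*
 * Illustrative sketch (not part of the original source): a caller that
 * needs a user range held in memory across an operation pairs F_SOFTLOCK
 * with a matching F_SOFTUNLOCK of the same range (see the comment above
 * about using the same length for both).  `uaddr' and `len' are
 * hypothetical.
 *
 *	if (as_fault(as->a_hat, as, uaddr, len, F_SOFTLOCK, S_WRITE) == 0) {
 *		... access the pages ...
 *		(void) as_fault(as->a_hat, as, uaddr, len,
 *		    F_SOFTUNLOCK, S_WRITE);
 *	}
 */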


/*
 * Asynchronous ``fault'' at addr for size bytes.
 */
faultcode_t
as_faulta(struct as *as, caddr_t addr, size_t size)
{
	struct seg *seg;
	caddr_t raddr;			/* rounded down addr */
	size_t rsize;			/* rounded up size */
	faultcode_t res = 0;
	klwp_t *lwp = ttolwp(curthread);

retry:
	/*
	 * Indicate that the lwp is not to be stopped while waiting
	 * for a pagefault.  This is to avoid deadlock while debugging
	 * a process via /proc over NFS (in particular).
	 */
	if (lwp != NULL)
		lwp->lwp_nostop++;

	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
	    (size_t)raddr;

	AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
	seg = as_segat(as, raddr);
	if (seg == NULL) {
		AS_LOCK_EXIT(as, &as->a_lock);
		if (lwp != NULL)
			lwp->lwp_nostop--;
		return (FC_NOMAP);
	}

	for (; rsize != 0; rsize -= PAGESIZE, raddr += PAGESIZE) {
		if (raddr >= seg->s_base + seg->s_size) {
			seg = AS_SEGNEXT(as, seg);
			if (seg == NULL || raddr != seg->s_base) {
				res = FC_NOMAP;
				break;
			}
		}
		res = SEGOP_FAULTA(seg, raddr);
		if (res != 0)
			break;
	}
	AS_LOCK_EXIT(as, &as->a_lock);
	if (lwp != NULL)
		lwp->lwp_nostop--;
	/*
	 * If the lower levels returned EDEADLK for a fault,
	 * It means that we should retry the fault.  Let's wait
	 * a bit also to let the deadlock causing condition clear.
	 * This is part of a gross hack to work around a design flaw
	 * in the ufs/sds logging code and should go away when the
	 * logging code is re-designed to fix the problem. See bug
	 * 4125102 for details of the problem.
	 */
	if (FC_ERRNO(res) == EDEADLK) {
		delay(deadlk_wait);
		res = 0;
		goto retry;
	}
	return (res);
}

/*
 * Set the virtual mapping for the interval from [addr : addr + size)
 * in address space `as' to have the specified protection.
 * It is ok for the range to cross over several segments,
 * as long as they are contiguous.
 */
int
as_setprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
{
	struct seg *seg;
	struct as_callback *cb;
	size_t ssize;
	caddr_t raddr;			/* rounded down addr */
	size_t rsize;			/* rounded up size */
	int error = 0, writer = 0;
	caddr_t saveraddr;
	size_t saversize;

setprot_top:
	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
	    (size_t)raddr;

	if (raddr + rsize < raddr)		/* check for wraparound */
		return (ENOMEM);

	saveraddr = raddr;
	saversize = rsize;

	/*
	 * Normally we only lock the as as a reader. But
	 * if due to setprot the segment driver needs to split
	 * a segment it will return IE_RETRY. Therefore we re-acquire
	 * the as lock as a writer so the segment driver can change
	 * the seg list. Also the segment driver will return IE_RETRY
	 * after it has changed the segment list so we therefore keep
	 * locking as a writer. Since these operations should be rare
	 * we only want to lock as a writer when necessary.
	 */
	if (writer || avl_numnodes(&as->a_wpage) != 0) {
		AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
	} else {
		AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
	}

	as_clearwatchprot(as, raddr, rsize);
	seg = as_segat(as, raddr);
	if (seg == NULL) {
		as_setwatch(as);
		AS_LOCK_EXIT(as, &as->a_lock);
		return (ENOMEM);
	}

	for (; rsize != 0; rsize -= ssize, raddr += ssize) {
		if (raddr >= seg->s_base + seg->s_size) {
			seg = AS_SEGNEXT(as, seg);
			if (seg == NULL || raddr != seg->s_base) {
				error = ENOMEM;
				break;
			}
		}
		if ((raddr + rsize) > (seg->s_base + seg->s_size))
			ssize = seg->s_base + seg->s_size - raddr;
		else
			ssize = rsize;
		error = SEGOP_SETPROT(seg, raddr, ssize, prot);

		if (error == IE_NOMEM) {
			error = EAGAIN;
			break;
		}

		if (error == IE_RETRY) {
			AS_LOCK_EXIT(as, &as->a_lock);
			writer = 1;
			goto setprot_top;
		}

		if (error == EAGAIN) {
			/*
			 * Make sure we have a_lock as writer.
			 */
			if (writer == 0) {
				AS_LOCK_EXIT(as, &as->a_lock);
				writer = 1;
				goto setprot_top;
			}

			/*
			 * Memory is currently locked.  It must be unlocked
			 * before this operation can succeed through a retry.
			 * The possible reasons for locked memory and
			 * corresponding strategies for unlocking are:
			 * (1) Normal I/O
			 *	wait for a signal that the I/O operation
			 *	has completed and the memory is unlocked.
			 * (2) Asynchronous I/O
			 *	The aio subsystem does not unlock pages when
			 *	the I/O is completed. Those pages are unlocked
			 *	when the application calls aiowait/aioerror.
			 *	So, to prevent blocking forever, cv_broadcast()
			 *	is done to wake up aio_cleanup_thread.
			 *	Subsequently, segvn_reclaim will be called, and
			 *	that will do AS_CLRUNMAPWAIT() and wake us up.
			 * (3) Long term page locking:
			 *	Drivers intending to have pages locked for a
			 *	period considerably longer than for normal I/O
			 *	(essentially forever) may have registered for a
			 *	callback so they may unlock these pages on
			 *	request. This is needed to allow this operation
			 *	to succeed. Each entry on the callback list is
			 *	examined. If the event or address range pertains
			 *	the callback is invoked (unless it already is in
			 *	progress). The a_contents lock must be dropped
			 *	before the callback, so only one callback can
			 *	be done at a time. Go to the top and do more
			 *	until zero is returned. If zero is returned,
			 *	either there were no callbacks for this event
			 *	or they were already in progress.
			 */
			mutex_enter(&as->a_contents);
			if (as->a_callbacks &&
			    (cb = as_find_callback(as, AS_SETPROT_EVENT,
			    seg->s_base, seg->s_size))) {
				AS_LOCK_EXIT(as, &as->a_lock);
				as_execute_callback(as, cb, AS_SETPROT_EVENT);
			} else {
				if (AS_ISUNMAPWAIT(as) == 0)
					cv_broadcast(&as->a_cv);
				AS_SETUNMAPWAIT(as);
				AS_LOCK_EXIT(as, &as->a_lock);
				while (AS_ISUNMAPWAIT(as))
					cv_wait(&as->a_cv, &as->a_contents);
			}
			mutex_exit(&as->a_contents);
			goto setprot_top;
		} else if (error != 0)
			break;
	}
	if (error != 0) {
		as_setwatch(as);
	} else {
		as_setwatchprot(as, saveraddr, saversize, prot);
	}
	AS_LOCK_EXIT(as, &as->a_lock);
	return (error);
}

/*
 * Check to make sure that the interval [addr, addr + size)
 * in address space `as' has at least the specified protection.
 * It is ok for the range to cross over several segments, as long
 * as they are contiguous.
 */
int
as_checkprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
{
	struct seg *seg;
	size_t ssize;
	caddr_t raddr;			/* rounded down addr */
	size_t rsize;			/* rounded up size */
	int error = 0;

	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
	    (size_t)raddr;

	if (raddr + rsize < raddr)		/* check for wraparound */
		return (ENOMEM);

	/*
	 * This is ugly as sin...
	 * Normally, we only acquire the address space readers lock.
	 * However, if the address space has watchpoints present,
	 * we must acquire the writer lock on the address space for
	 * the benefit of as_clearwatchprot() and as_setwatchprot().
	 */
	if (avl_numnodes(&as->a_wpage) != 0)
		AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
	else
		AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
	as_clearwatchprot(as, raddr, rsize);
	seg = as_segat(as, raddr);
	if (seg == NULL) {
		as_setwatch(as);
		AS_LOCK_EXIT(as, &as->a_lock);
		return (ENOMEM);
	}

	for (; rsize != 0; rsize -= ssize, raddr += ssize) {
		if (raddr >= seg->s_base + seg->s_size) {
			seg = AS_SEGNEXT(as, seg);
			if (seg == NULL || raddr != seg->s_base) {
				error = ENOMEM;
				break;
			}
		}
		if ((raddr + rsize) > (seg->s_base + seg->s_size))
			ssize = seg->s_base + seg->s_size - raddr;
		else
			ssize = rsize;

		error = SEGOP_CHECKPROT(seg, raddr, ssize, prot);
		if (error != 0)
			break;
	}
	as_setwatch(as);
	AS_LOCK_EXIT(as, &as->a_lock);
	return (error);
}

int
as_unmap(struct as *as, caddr_t addr, size_t size)
{
	struct seg *seg, *seg_next;
	struct as_callback *cb;
	caddr_t raddr, eaddr;
	size_t ssize;
	int err;

top:
	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	eaddr = (caddr_t)(((uintptr_t)(addr + size) + PAGEOFFSET) &
	    (uintptr_t)PAGEMASK);

	AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);

	as->a_updatedir = 1;	/* inform /proc */
	gethrestime(&as->a_updatetime);

	/*
	 * Use as_findseg to find the first segment in the range, then
	 * step through the segments in order, following s_next.
	 */
	as_clearwatchprot(as, raddr, eaddr - raddr);

	for (seg = as_findseg(as, raddr, 0); seg != NULL; seg = seg_next) {
		if (eaddr <= seg->s_base)
			break;		/* eaddr was in a gap; all done */

		/* this is implied by the test above */
		ASSERT(raddr < eaddr);

		if (raddr < seg->s_base)
			raddr = seg->s_base;	/* raddr was in a gap */

		if (eaddr > (seg->s_base + seg->s_size))
			ssize = seg->s_base + seg->s_size - raddr;
		else
			ssize = eaddr - raddr;

		/*
		 * Save next segment pointer since seg can be
		 * destroyed during the segment unmap operation.
		 */
		seg_next = AS_SEGNEXT(as, seg);

		err = SEGOP_UNMAP(seg, raddr, ssize);
		if (err == EAGAIN) {
			/*
			 * Memory is currently locked.  It must be unlocked
			 * before this operation can succeed through a retry.
			 * The possible reasons for locked memory and
			 * corresponding strategies for unlocking are:
			 * (1) Normal I/O
			 *	wait for a signal that the I/O operation
			 *	has completed and the memory is unlocked.
			 * (2) Asynchronous I/O
			 *	The aio subsystem does not unlock pages when
			 *	the I/O is completed. Those pages are unlocked
			 *	when the application calls aiowait/aioerror.
			 *	So, to prevent blocking forever, cv_broadcast()
			 *	is done to wake up aio_cleanup_thread.
			 *	Subsequently, segvn_reclaim will be called, and
			 *	that will do AS_CLRUNMAPWAIT() and wake us up.
			 * (3) Long term page locking:
			 *	Drivers intending to have pages locked for a
			 *	period considerably longer than for normal I/O
			 *	(essentially forever) may have registered for a
			 *	callback so they may unlock these pages on
			 *	request. This is needed to allow this operation
			 *	to succeed. Each entry on the callback list is
			 *	examined. If the event or address range pertains
			 *	the callback is invoked (unless it already is in
			 *	progress). The a_contents lock must be dropped
			 *	before the callback, so only one callback can
			 *	be done at a time. Go to the top and do more
			 *	until zero is returned. If zero is returned,
			 *	either there were no callbacks for this event
			 *	or they were already in progress.
			 */
			as_setwatch(as);
			mutex_enter(&as->a_contents);
			if (as->a_callbacks &&
			    (cb = as_find_callback(as, AS_UNMAP_EVENT,
			    seg->s_base, seg->s_size))) {
				AS_LOCK_EXIT(as, &as->a_lock);
				as_execute_callback(as, cb, AS_UNMAP_EVENT);
			} else {
				if (AS_ISUNMAPWAIT(as) == 0)
					cv_broadcast(&as->a_cv);
				AS_SETUNMAPWAIT(as);
				AS_LOCK_EXIT(as, &as->a_lock);
				while (AS_ISUNMAPWAIT(as))
					cv_wait(&as->a_cv, &as->a_contents);
			}
			mutex_exit(&as->a_contents);
			goto top;
		} else if (err == IE_RETRY) {
			as_setwatch(as);
			AS_LOCK_EXIT(as, &as->a_lock);
			goto top;
		} else if (err) {
			as_setwatch(as);
			AS_LOCK_EXIT(as, &as->a_lock);
			return (-1);
		}

		as->a_size -= ssize;
		raddr += ssize;
	}
	AS_LOCK_EXIT(as, &as->a_lock);
	return (0);
}

static int
as_map_segvn_segs(struct as *as, caddr_t addr, size_t size, uint_t szcvec,
    int (*crfp)(), struct segvn_crargs *vn_a, int *segcreated)
{
	uint_t szc;
	uint_t nszc;
	int error;
	caddr_t a;
	caddr_t eaddr;
	size_t segsize;
	struct seg *seg;
	size_t pgsz;
	int do_off = (vn_a->vp != NULL || vn_a->amp != NULL);
	uint_t save_szcvec;

	ASSERT(AS_WRITE_HELD(as, &as->a_lock));
	ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
	ASSERT(IS_P2ALIGNED(size, PAGESIZE));
	ASSERT(vn_a->vp == NULL || vn_a->amp == NULL);
	if (!do_off) {
		vn_a->offset = 0;
	}

	if (szcvec <= 1) {
		seg = seg_alloc(as, addr, size);
		if (seg == NULL) {
			return (ENOMEM);
		}
		vn_a->szc = 0;
		error = (*crfp)(seg, vn_a);
		if (error != 0) {
			seg_free(seg);
		} else {
			as->a_size += size;
		}
		return (error);
	}

	eaddr = addr + size;
	save_szcvec = szcvec;
	szcvec >>= 1;
	szc = 0;
	nszc = 0;
	while (szcvec) {
		if ((szcvec & 0x1) == 0) {
			nszc++;
			szcvec >>= 1;
			continue;
		}
		nszc++;
		pgsz = page_get_pagesize(nszc);
		a = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
		if (a != addr) {
			ASSERT(a < eaddr);
			segsize = a - addr;
			seg = seg_alloc(as, addr, segsize);
			if (seg == NULL) {
				return (ENOMEM);
			}
			vn_a->szc = szc;
			error = (*crfp)(seg, vn_a);
			if (error != 0) {
				seg_free(seg);
				return (error);
			}
			as->a_size += segsize;
			*segcreated = 1;
			if (do_off) {
				vn_a->offset += segsize;
			}
			addr = a;
		}
		szc = nszc;
		szcvec >>= 1;
	}

	ASSERT(addr < eaddr);
	szcvec = save_szcvec | 1; /* add 8K pages */
	while (szcvec) {
		a = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
		ASSERT(a >= addr);
		if (a != addr) {
			segsize = a - addr;
			seg = seg_alloc(as, addr, segsize);
			if (seg == NULL) {
				return (ENOMEM);
			}
			vn_a->szc = szc;
			error = (*crfp)(seg, vn_a);
			if (error != 0) {
				seg_free(seg);
				return (error);
			}
			as->a_size += segsize;
			*segcreated = 1;
			if (do_off) {
				vn_a->offset += segsize;
			}
			addr = a;
		}
		szcvec &= ~(1 << szc);
		if (szcvec) {
			szc = highbit(szcvec) - 1;
			pgsz = page_get_pagesize(szc);
		}
	}
	ASSERT(addr == eaddr);

	return (0);
}

static int
as_map_vnsegs(struct as *as, caddr_t addr, size_t size,
    int (*crfp)(), struct segvn_crargs *vn_a, int *segcreated)
{
	uint_t mapflags = vn_a->flags & (MAP_TEXT | MAP_INITDATA);
	int type = (vn_a->type == MAP_SHARED) ? MAPPGSZC_SHM : MAPPGSZC_PRIVM;
	uint_t szcvec = map_pgszcvec(addr, size, (uintptr_t)addr, mapflags,
	    type, 0);
	int error;
	struct seg *seg;
	struct vattr va;
	u_offset_t eoff;
	size_t save_size = 0;
	extern size_t textrepl_size_thresh;

	ASSERT(AS_WRITE_HELD(as, &as->a_lock));
	ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
	ASSERT(IS_P2ALIGNED(size, PAGESIZE));
	ASSERT(vn_a->vp != NULL);
	ASSERT(vn_a->amp == NULL);

again:
	if (szcvec <= 1) {
		seg = seg_alloc(as, addr, size);
		if (seg == NULL) {
			return (ENOMEM);
		}
		vn_a->szc = 0;
		error = (*crfp)(seg, vn_a);
		if (error != 0) {
			seg_free(seg);
		} else {
			as->a_size += size;
		}
		return (error);
	}

	va.va_mask = AT_SIZE;
	if (VOP_GETATTR(vn_a->vp, &va, ATTR_HINT, vn_a->cred) != 0) {
		szcvec = 0;
		goto again;
	}
	eoff = vn_a->offset & PAGEMASK;
	if (eoff >= va.va_size) {
		szcvec = 0;
		goto again;
	}
	eoff += size;
	if (btopr(va.va_size) < btopr(eoff)) {
		save_size = size;
		size = va.va_size - (vn_a->offset & PAGEMASK);
		size = P2ROUNDUP_TYPED(size, PAGESIZE, size_t);
		szcvec = map_pgszcvec(addr, size, (uintptr_t)addr, mapflags,
		    type, 0);
		if (szcvec <= 1) {
			size = save_size;
			goto again;
		}
	}

	if (size > textrepl_size_thresh) {
		vn_a->flags |= _MAP_TEXTREPL;
	}
	error = as_map_segvn_segs(as, addr, size, szcvec, crfp, vn_a,
	    segcreated);
	if (error != 0) {
		return (error);
	}
	if (save_size) {
		addr += size;
		size = save_size - size;
		szcvec = 0;
		goto again;
	}
	return (0);
}

/*
 * as_map_ansegs: shared or private anonymous memory.  Note that the flags
 * passed to map_pgszcvec cannot be MAP_INITDATA, for anon.
 */
static int
as_map_ansegs(struct as *as, caddr_t addr, size_t size,
    int (*crfp)(), struct segvn_crargs *vn_a, int *segcreated)
{
	uint_t szcvec;
	uchar_t type;

	ASSERT(vn_a->type == MAP_SHARED || vn_a->type == MAP_PRIVATE);
	if (vn_a->type == MAP_SHARED) {
		type = MAPPGSZC_SHM;
	} else if (vn_a->type == MAP_PRIVATE) {
		if (vn_a->szc == AS_MAP_HEAP) {
			type = MAPPGSZC_HEAP;
		} else if (vn_a->szc == AS_MAP_STACK) {
			type = MAPPGSZC_STACK;
		} else {
			type = MAPPGSZC_PRIVM;
		}
	}
	szcvec = map_pgszcvec(addr, size, vn_a->amp == NULL ?
	    (uintptr_t)addr : (uintptr_t)P2ROUNDUP(vn_a->offset, PAGESIZE),
	    (vn_a->flags & MAP_TEXT), type, 0);
	ASSERT(AS_WRITE_HELD(as, &as->a_lock));
	ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
	ASSERT(IS_P2ALIGNED(size, PAGESIZE));
	ASSERT(vn_a->vp == NULL);

	return (as_map_segvn_segs(as, addr, size, szcvec,
	    crfp, vn_a, segcreated));
}

int
as_map(struct as *as, caddr_t addr, size_t size, int (*crfp)(), void *argsp)
{
	AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
	return (as_map_locked(as, addr, size, crfp, argsp));
}

int
as_map_locked(struct as *as, caddr_t addr, size_t size, int (*crfp)(),
    void *argsp)
{
	struct seg *seg = NULL;
	caddr_t raddr;			/* rounded down addr */
	size_t rsize;			/* rounded up size */
	int error;
	int unmap = 0;
	struct proc *p = curproc;
	struct segvn_crargs crargs;

	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
	    (size_t)raddr;

	/*
	 * check for wrap around
	 */
	if ((raddr + rsize < raddr) || (as->a_size > (ULONG_MAX - size))) {
		AS_LOCK_EXIT(as, &as->a_lock);
		return (ENOMEM);
	}

	as->a_updatedir = 1;	/* inform /proc */
	gethrestime(&as->a_updatetime);

	if (as != &kas && as->a_size + rsize > (size_t)p->p_vmem_ctl) {
		AS_LOCK_EXIT(as, &as->a_lock);

		(void) rctl_action(rctlproc_legacy[RLIMIT_VMEM], p->p_rctls, p,
		    RCA_UNSAFE_ALL);

		return (ENOMEM);
	}

	if (AS_MAP_CHECK_VNODE_LPOOB(crfp, argsp)) {
		crargs = *(struct segvn_crargs *)argsp;
		error = as_map_vnsegs(as, raddr, rsize, crfp, &crargs, &unmap);
		if (error != 0) {
			AS_LOCK_EXIT(as, &as->a_lock);
			if (unmap) {
				(void) as_unmap(as, addr, size);
			}
			return (error);
		}
	} else if (AS_MAP_CHECK_ANON_LPOOB(crfp, argsp)) {
		crargs = *(struct segvn_crargs *)argsp;
		error = as_map_ansegs(as, raddr, rsize, crfp, &crargs, &unmap);
		if (error != 0) {
			AS_LOCK_EXIT(as, &as->a_lock);
			if (unmap) {
				(void) as_unmap(as, addr, size);
			}
			return (error);
		}
	} else {
		seg = seg_alloc(as, addr, size);
		if (seg == NULL) {
			AS_LOCK_EXIT(as, &as->a_lock);
			return (ENOMEM);
		}

		error = (*crfp)(seg, argsp);
		if (error != 0) {
			seg_free(seg);
			AS_LOCK_EXIT(as, &as->a_lock);
			return (error);
		}
		/*
		 * Add size now so as_unmap will work if as_ctl fails.
		 */
		as->a_size += rsize;
	}

	as_setwatch(as);

	/*
	 * If the address space is locked,
	 * establish memory locks for the new segment.
	 */
	mutex_enter(&as->a_contents);
	if (AS_ISPGLCK(as)) {
		mutex_exit(&as->a_contents);
		AS_LOCK_EXIT(as, &as->a_lock);
		error = as_ctl(as, addr, size, MC_LOCK, 0, 0, NULL, 0);
		if (error != 0)
			(void) as_unmap(as, addr, size);
	} else {
		mutex_exit(&as->a_contents);
		AS_LOCK_EXIT(as, &as->a_lock);
	}
	return (error);
}


/*
 * Delete all segments in the address space marked with S_PURGE.
 * This is currently used for Sparc V9 nofault ASI segments (seg_nf.c).
 * These segments are deleted as a first step before calls to as_gap(), so
 * that they don't affect mmap() or shmat().
 */
void
as_purge(struct as *as)
{
	struct seg *seg;
	struct seg *next_seg;

	/*
	 * the setting of NEEDSPURGE is protected by as_rangelock(), so
	 * no need to grab a_contents mutex for this check
	 */
	if ((as->a_flags & AS_NEEDSPURGE) == 0)
		return;

	AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
	next_seg = NULL;
	seg = AS_SEGFIRST(as);
	while (seg != NULL) {
		next_seg = AS_SEGNEXT(as, seg);
		if (seg->s_flags & S_PURGE)
			SEGOP_UNMAP(seg, seg->s_base, seg->s_size);
		seg = next_seg;
	}
	AS_LOCK_EXIT(as, &as->a_lock);

	mutex_enter(&as->a_contents);
	as->a_flags &= ~AS_NEEDSPURGE;
	mutex_exit(&as->a_contents);
}

/*
 * Find a hole of at least size minlen within [base, base + len).
 *
 * If flags specifies AH_HI, the hole will have the highest possible address
 * in the range.  We use the as->a_lastgap field to figure out where to
 * start looking for a gap.
 *
 * Otherwise, the gap will have the lowest possible address.
 *
 * If flags specifies AH_CONTAIN, the hole will contain the address addr.
 *
 * If an adequate hole is found, base and len are set to reflect the part of
 * the hole that is within range, and 0 is returned, otherwise,
 * -1 is returned.
 *
 * NOTE: This routine is not correct when base+len overflows caddr_t.
 */
int
as_gap(struct as *as, size_t minlen, caddr_t *basep, size_t *lenp, uint_t flags,
    caddr_t addr)
{
	caddr_t lobound = *basep;
	caddr_t hibound = lobound + *lenp;
	struct seg *lseg, *hseg;
	caddr_t lo, hi;
	int forward;
	caddr_t save_base;
	size_t save_len;

	save_base = *basep;
	save_len = *lenp;
	AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
	if (AS_SEGFIRST(as) == NULL) {
		if (valid_va_range(basep, lenp, minlen, flags & AH_DIR)) {
			AS_LOCK_EXIT(as, &as->a_lock);
			return (0);
		} else {
			AS_LOCK_EXIT(as, &as->a_lock);
			*basep = save_base;
			*lenp = save_len;
			return (-1);
		}
	}

	/*
	 * Set up to iterate over all the inter-segment holes in the given
	 * direction.  lseg is NULL for the lowest-addressed hole and hseg is
	 * NULL for the highest-addressed hole.  If moving backwards, we reset
	 * sseg to denote the highest-addressed segment.
	 */
	forward = (flags & AH_DIR) == AH_LO;
	if (forward) {
		hseg = as_findseg(as, lobound, 1);
		lseg = AS_SEGPREV(as, hseg);
	} else {

		/*
		 * If allocating at least as much as the last allocation,
		 * use a_lastgap's base as a better estimate of hibound.
		 */
		if (as->a_lastgap &&
		    minlen >= as->a_lastgap->s_size &&
		    hibound >= as->a_lastgap->s_base)
			hibound = as->a_lastgap->s_base;

		hseg = as_findseg(as, hibound, 1);
		if (hseg->s_base + hseg->s_size < hibound) {
			lseg = hseg;
			hseg = NULL;
		} else {
			lseg = AS_SEGPREV(as, hseg);
		}
	}

	for (;;) {
		/*
		 * Set lo and hi to the hole's boundaries.  (We should really
		 * use MAXADDR in place of hibound in the expression below,
		 * but can't express it easily; using hibound in its place is
		 * harmless.)
		 */
		lo = (lseg == NULL) ? 0 : lseg->s_base + lseg->s_size;
		hi = (hseg == NULL) ? hibound : hseg->s_base;
		/*
		 * If the iteration has moved past the interval from lobound
		 * to hibound it's pointless to continue.
		 */
		if ((forward && lo > hibound) || (!forward && hi < lobound))
			break;
		else if (lo > hibound || hi < lobound)
			goto cont;
		/*
		 * Candidate hole lies at least partially within the allowable
		 * range.  Restrict it to fall completely within that range,
		 * i.e., to [max(lo, lobound), min(hi, hibound)].
		 */
		if (lo < lobound)
			lo = lobound;
		if (hi > hibound)
			hi = hibound;
		/*
		 * Verify that the candidate hole is big enough and meets
		 * hardware constraints.
		 */
		*basep = lo;
		*lenp = hi - lo;
		if (valid_va_range(basep, lenp, minlen,
		    forward ? AH_LO : AH_HI) &&
		    ((flags & AH_CONTAIN) == 0 ||
		    (*basep <= addr && *basep + *lenp > addr))) {
			if (!forward)
				as->a_lastgap = hseg;
			if (hseg != NULL)
				as->a_lastgaphl = hseg;
			else
				as->a_lastgaphl = lseg;
			AS_LOCK_EXIT(as, &as->a_lock);
			return (0);
		}
	cont:
		/*
		 * Move to the next hole.
		 */
		if (forward) {
			lseg = hseg;
			if (lseg == NULL)
				break;
			hseg = AS_SEGNEXT(as, hseg);
		} else {
			hseg = lseg;
			if (hseg == NULL)
				break;
			lseg = AS_SEGPREV(as, lseg);
		}
	}
	*basep = save_base;
	*lenp = save_len;
	AS_LOCK_EXIT(as, &as->a_lock);
	return (-1);
}
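
/*
 * Illustrative sketch (not part of the original source): finding the
 * highest-addressed hole of at least `minlen' bytes below the user
 * limit.  `base' and `len' describe the search range on entry and the
 * hole found on return; the setup shown here is an assumption.
 *
 *	base = (caddr_t)0;
 *	len = (size_t)(uintptr_t)as->a_userlimit;
 *	if (as_gap(as, minlen, &base, &len, AH_HI, NULL) == 0) {
 *		... [base, base + len) is an unmapped range ...
 *	}
 */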
2037 */
2038 if (as == NULL)
2039 return (0);
2040
2041 AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
2042
2043 /* Prevent XHATs from attaching */
2044 mutex_enter(&as->a_contents);
2045 AS_SETBUSY(as);
2046 mutex_exit(&as->a_contents);
2047
2048
2049 /*
2050 * Free all mapping resources associated with the address
2051 * space. The segment-level swapout routines capitalize
2052 * on this unmapping by scavenging pages that have become
2053 * unmapped here.
2054 */
2055 hat_swapout(as->a_hat);
2056 if (as->a_xhat != NULL)
2057 xhat_swapout_all(as);
2058
2059 mutex_enter(&as->a_contents);
2060 AS_CLRBUSY(as);
2061 mutex_exit(&as->a_contents);
2062
2063 /*
2064 * Call the swapout routines of all segments in the address
2065 * space to do the actual work, accumulating the amount of
2066 * space reclaimed.
2067 */
2068 for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
2069 struct seg_ops *ov = seg->s_ops;
2070
2071 /*
2072 * We have to check to see if the seg has
2073 * an ops vector because the seg may have
2074 * been in the middle of being set up when
2075 * the process was picked for swapout.
2076 */
2077 if ((ov != NULL) && (ov->swapout != NULL))
2078 swpcnt += SEGOP_SWAPOUT(seg);
2079 }
2080 AS_LOCK_EXIT(as, &as->a_lock);
2081 return (swpcnt);
2082 }
2083
2084 /*
2085 * Determine whether data from the mappings in interval [addr, addr + size)
2086 * are in the primary memory (core) cache.
2087 */
2088 int
2089 as_incore(struct as *as, caddr_t addr,
2090 size_t size, char *vec, size_t *sizep)
2091 {
2092 struct seg *seg;
2093 size_t ssize;
2094 caddr_t raddr; /* rounded down addr */
2095 size_t rsize; /* rounded up size */
2096 size_t isize; /* iteration size */
2097 int error = 0; /* result, assume success */
2098
2099 *sizep = 0;
2100 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2101 rsize = ((((size_t)addr + size) + PAGEOFFSET) & PAGEMASK) -
2102 (size_t)raddr;
2103
2104 if (raddr + rsize < raddr) /* check for wraparound */
2105 return (ENOMEM);
2106
2107 AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
2108 seg = as_segat(as, raddr);
2109 if (seg == NULL) {
2110 AS_LOCK_EXIT(as, &as->a_lock);
2111 return (-1);
2112 }
2113
2114 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
2115 if (raddr >= seg->s_base + seg->s_size) {
2116 seg = AS_SEGNEXT(as, seg);
2117 if (seg == NULL || raddr != seg->s_base) {
2118 error = -1;
2119 break;
2120 }
2121 }
2122 if ((raddr + rsize) > (seg->s_base + seg->s_size))
2123 ssize = seg->s_base + seg->s_size - raddr;
2124 else
2125 ssize = rsize;
2126 *sizep += isize = SEGOP_INCORE(seg, raddr, ssize, vec);
2127 if (isize != ssize) {
2128 error = -1;
2129 break;
2130 }
2131 vec += btopr(ssize);
2132 }
2133 AS_LOCK_EXIT(as, &as->a_lock);
2134 return (error);
2135 }
2136
2137 static void
2138 as_segunlock(struct seg *seg, caddr_t addr, int attr,
2139 ulong_t *bitmap, size_t position, size_t npages)
2140 {
2141 caddr_t range_start;
2142 size_t pos1 = position;
2143 size_t pos2;
2144 size_t size;
2145 size_t end_pos = npages + position;
2146
2147 while (bt_range(bitmap, &pos1, &pos2, end_pos)) {
2148 size = ptob((pos2 - pos1));
2149 range_start = (caddr_t)((uintptr_t)addr +
2150 ptob(pos1 - position));
2151
2152 (void) SEGOP_LOCKOP(seg, range_start, size, attr, MC_UNLOCK,
2153 (ulong_t *)NULL, (size_t)NULL);
2154 pos1 = pos2;
2155 }
2156 }
2157
2158 static void
2159 as_unlockerr(struct as *as, int attr, ulong_t *mlock_map,
2160 caddr_t raddr, size_t rsize)
2161 {
2162 struct seg *seg = as_segat(as, raddr);
2163 size_t ssize;
2164
2165
while (rsize != 0) { 2166 if (raddr >= seg->s_base + seg->s_size) 2167 seg = AS_SEGNEXT(as, seg); 2168 2169 if ((raddr + rsize) > (seg->s_base + seg->s_size)) 2170 ssize = seg->s_base + seg->s_size - raddr; 2171 else 2172 ssize = rsize; 2173 2174 as_segunlock(seg, raddr, attr, mlock_map, 0, btopr(ssize)); 2175 2176 rsize -= ssize; 2177 raddr += ssize; 2178 } 2179 } 2180 2181 /* 2182 * Cache control operations over the interval [addr, addr + size) in 2183 * address space "as". 2184 */ 2185 /*ARGSUSED*/ 2186 int 2187 as_ctl(struct as *as, caddr_t addr, size_t size, int func, int attr, 2188 uintptr_t arg, ulong_t *lock_map, size_t pos) 2189 { 2190 struct seg *seg; /* working segment */ 2191 caddr_t raddr; /* rounded down addr */ 2192 caddr_t initraddr; /* saved initial rounded down addr */ 2193 size_t rsize; /* rounded up size */ 2194 size_t initrsize; /* saved initial rounded up size */ 2195 size_t ssize; /* size of seg */ 2196 int error = 0; /* result */ 2197 size_t mlock_size; /* size of bitmap */ 2198 ulong_t *mlock_map; /* pointer to bitmap used */ 2199 /* to represent the locked */ 2200 /* pages. */ 2201 retry: 2202 if (error == IE_RETRY) 2203 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER); 2204 else 2205 AS_LOCK_ENTER(as, &as->a_lock, RW_READER); 2206 2207 /* 2208 * If these are address space lock/unlock operations, loop over 2209 * all segments in the address space, as appropriate. 2210 */ 2211 if (func == MC_LOCKAS) { 2212 size_t npages, idx; 2213 size_t rlen = 0; /* rounded as length */ 2214 2215 idx = pos; 2216 2217 if (arg & MCL_FUTURE) { 2218 mutex_enter(&as->a_contents); 2219 AS_SETPGLCK(as); 2220 mutex_exit(&as->a_contents); 2221 } 2222 if ((arg & MCL_CURRENT) == 0) { 2223 AS_LOCK_EXIT(as, &as->a_lock); 2224 return (0); 2225 } 2226 2227 seg = AS_SEGFIRST(as); 2228 if (seg == NULL) { 2229 AS_LOCK_EXIT(as, &as->a_lock); 2230 return (0); 2231 } 2232 2233 do { 2234 raddr = (caddr_t)((uintptr_t)seg->s_base & 2235 (uintptr_t)PAGEMASK); 2236 rlen += (((uintptr_t)(seg->s_base + seg->s_size) + 2237 PAGEOFFSET) & PAGEMASK) - (uintptr_t)raddr; 2238 } while ((seg = AS_SEGNEXT(as, seg)) != NULL); 2239 2240 mlock_size = BT_BITOUL(btopr(rlen)); 2241 if ((mlock_map = (ulong_t *)kmem_zalloc(mlock_size * 2242 sizeof (ulong_t), KM_NOSLEEP)) == NULL) { 2243 AS_LOCK_EXIT(as, &as->a_lock); 2244 return (EAGAIN); 2245 } 2246 2247 for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) { 2248 error = SEGOP_LOCKOP(seg, seg->s_base, 2249 seg->s_size, attr, MC_LOCK, mlock_map, pos); 2250 if (error != 0) 2251 break; 2252 pos += seg_pages(seg); 2253 } 2254 2255 if (error) { 2256 for (seg = AS_SEGFIRST(as); seg != NULL; 2257 seg = AS_SEGNEXT(as, seg)) { 2258 2259 raddr = (caddr_t)((uintptr_t)seg->s_base & 2260 (uintptr_t)PAGEMASK); 2261 npages = seg_pages(seg); 2262 as_segunlock(seg, raddr, attr, mlock_map, 2263 idx, npages); 2264 idx += npages; 2265 } 2266 } 2267 2268 kmem_free(mlock_map, mlock_size * sizeof (ulong_t)); 2269 AS_LOCK_EXIT(as, &as->a_lock); 2270 goto lockerr; 2271 } else if (func == MC_UNLOCKAS) { 2272 mutex_enter(&as->a_contents); 2273 AS_CLRPGLCK(as); 2274 mutex_exit(&as->a_contents); 2275 2276 for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) { 2277 error = SEGOP_LOCKOP(seg, seg->s_base, 2278 seg->s_size, attr, MC_UNLOCK, NULL, 0); 2279 if (error != 0) 2280 break; 2281 } 2282 2283 AS_LOCK_EXIT(as, &as->a_lock); 2284 goto lockerr; 2285 } 2286 2287 /* 2288 * Normalize addresses and sizes. 
2289 */ 2290 initraddr = raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); 2291 initrsize = rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) - 2292 (size_t)raddr; 2293 2294 if (raddr + rsize < raddr) { /* check for wraparound */ 2295 AS_LOCK_EXIT(as, &as->a_lock); 2296 return (ENOMEM); 2297 } 2298 2299 /* 2300 * Get initial segment. 2301 */ 2302 if ((seg = as_segat(as, raddr)) == NULL) { 2303 AS_LOCK_EXIT(as, &as->a_lock); 2304 return (ENOMEM); 2305 } 2306 2307 if (func == MC_LOCK) { 2308 mlock_size = BT_BITOUL(btopr(rsize)); 2309 if ((mlock_map = (ulong_t *)kmem_zalloc(mlock_size * 2310 sizeof (ulong_t), KM_NOSLEEP)) == NULL) { 2311 AS_LOCK_EXIT(as, &as->a_lock); 2312 return (EAGAIN); 2313 } 2314 } 2315 2316 /* 2317 * Loop over all segments. If a hole in the address range is 2318 * discovered, then fail. For each segment, perform the appropriate 2319 * control operation. 2320 */ 2321 while (rsize != 0) { 2322 2323 /* 2324 * Make sure there's no hole, calculate the portion 2325 * of the next segment to be operated over. 2326 */ 2327 if (raddr >= seg->s_base + seg->s_size) { 2328 seg = AS_SEGNEXT(as, seg); 2329 if (seg == NULL || raddr != seg->s_base) { 2330 if (func == MC_LOCK) { 2331 as_unlockerr(as, attr, mlock_map, 2332 initraddr, initrsize - rsize); 2333 kmem_free(mlock_map, 2334 mlock_size * sizeof (ulong_t)); 2335 } 2336 AS_LOCK_EXIT(as, &as->a_lock); 2337 return (ENOMEM); 2338 } 2339 } 2340 if ((raddr + rsize) > (seg->s_base + seg->s_size)) 2341 ssize = seg->s_base + seg->s_size - raddr; 2342 else 2343 ssize = rsize; 2344 2345 /* 2346 * Dispatch on specific function. 2347 */ 2348 switch (func) { 2349 2350 /* 2351 * Synchronize cached data from mappings with backing 2352 * objects. 2353 */ 2354 case MC_SYNC: 2355 if (error = SEGOP_SYNC(seg, raddr, ssize, 2356 attr, (uint_t)arg)) { 2357 AS_LOCK_EXIT(as, &as->a_lock); 2358 return (error); 2359 } 2360 break; 2361 2362 /* 2363 * Lock pages in memory. 2364 */ 2365 case MC_LOCK: 2366 if (error = SEGOP_LOCKOP(seg, raddr, ssize, 2367 attr, func, mlock_map, pos)) { 2368 as_unlockerr(as, attr, mlock_map, initraddr, 2369 initrsize - rsize + ssize); 2370 kmem_free(mlock_map, mlock_size * 2371 sizeof (ulong_t)); 2372 AS_LOCK_EXIT(as, &as->a_lock); 2373 goto lockerr; 2374 } 2375 break; 2376 2377 /* 2378 * Unlock mapped pages. 2379 */ 2380 case MC_UNLOCK: 2381 (void) SEGOP_LOCKOP(seg, raddr, ssize, attr, func, 2382 (ulong_t *)NULL, (size_t)NULL); 2383 break; 2384 2385 /* 2386 * Store VM advise for mapped pages in segment layer. 2387 */ 2388 case MC_ADVISE: 2389 error = SEGOP_ADVISE(seg, raddr, ssize, (uint_t)arg); 2390 2391 /* 2392 * Check for regular errors and special retry error 2393 */ 2394 if (error) { 2395 if (error == IE_RETRY) { 2396 /* 2397 * Need to acquire writers lock, so 2398 * have to drop readers lock and start 2399 * all over again 2400 */ 2401 AS_LOCK_EXIT(as, &as->a_lock); 2402 goto retry; 2403 } else if (error == IE_REATTACH) { 2404 /* 2405 * Find segment for current address 2406 * because current segment just got 2407 * split or concatenated 2408 */ 2409 seg = as_segat(as, raddr); 2410 if (seg == NULL) { 2411 AS_LOCK_EXIT(as, &as->a_lock); 2412 return (ENOMEM); 2413 } 2414 } else { 2415 /* 2416 * Regular error 2417 */ 2418 AS_LOCK_EXIT(as, &as->a_lock); 2419 return (error); 2420 } 2421 } 2422 break; 2423 2424 /* 2425 * Can't happen. 
2426 */ 2427 default: 2428 panic("as_ctl: bad operation %d", func); 2429 /*NOTREACHED*/ 2430 } 2431 2432 rsize -= ssize; 2433 raddr += ssize; 2434 } 2435 2436 if (func == MC_LOCK) 2437 kmem_free(mlock_map, mlock_size * sizeof (ulong_t)); 2438 AS_LOCK_EXIT(as, &as->a_lock); 2439 return (0); 2440 lockerr: 2441 2442 /* 2443 * If the lower levels returned EDEADLK for a segment lockop, 2444 * it means that we should retry the operation. Let's wait 2445 * a bit also to let the deadlock causing condition clear. 2446 * This is part of a gross hack to work around a design flaw 2447 * in the ufs/sds logging code and should go away when the 2448 * logging code is re-designed to fix the problem. See bug 2449 * 4125102 for details of the problem. 2450 */ 2451 if (error == EDEADLK) { 2452 delay(deadlk_wait); 2453 error = 0; 2454 goto retry; 2455 } 2456 return (error); 2457 } 2458 2459 /* 2460 * Special code for exec to move the stack segment from its interim 2461 * place in the old address to the right place in the new address space. 2462 */ 2463 /*ARGSUSED*/ 2464 int 2465 as_exec(struct as *oas, caddr_t ostka, size_t stksz, 2466 struct as *nas, caddr_t nstka, uint_t hatflag) 2467 { 2468 struct seg *stkseg; 2469 2470 AS_LOCK_ENTER(oas, &oas->a_lock, RW_WRITER); 2471 stkseg = as_segat(oas, ostka); 2472 stkseg = as_removeseg(oas, stkseg); 2473 ASSERT(stkseg != NULL); 2474 ASSERT(stkseg->s_base == ostka && stkseg->s_size == stksz); 2475 stkseg->s_as = nas; 2476 stkseg->s_base = nstka; 2477 2478 /* 2479 * It's ok to lock the address space we are about to exec to. 2480 */ 2481 AS_LOCK_ENTER(nas, &nas->a_lock, RW_WRITER); 2482 ASSERT(avl_numnodes(&nas->a_wpage) == 0); 2483 nas->a_size += stkseg->s_size; 2484 oas->a_size -= stkseg->s_size; 2485 (void) as_addseg(nas, stkseg); 2486 AS_LOCK_EXIT(nas, &nas->a_lock); 2487 AS_LOCK_EXIT(oas, &oas->a_lock); 2488 return (0); 2489 } 2490 2491 static int 2492 f_decode(faultcode_t fault_err) 2493 { 2494 int error = 0; 2495 2496 switch (FC_CODE(fault_err)) { 2497 case FC_OBJERR: 2498 error = FC_ERRNO(fault_err); 2499 break; 2500 case FC_PROT: 2501 error = EACCES; 2502 break; 2503 default: 2504 error = EFAULT; 2505 break; 2506 } 2507 return (error); 2508 } 2509 2510 /* 2511 * lock pages in a given address space. Return shadow list. If 2512 * the list is NULL, the MMU mapping is also locked. 2513 */ 2514 int 2515 as_pagelock(struct as *as, struct page ***ppp, caddr_t addr, 2516 size_t size, enum seg_rw rw) 2517 { 2518 size_t rsize; 2519 caddr_t base; 2520 caddr_t raddr; 2521 faultcode_t fault_err; 2522 struct seg *seg; 2523 int res; 2524 int prefaulted = 0; 2525 2526 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_AS_LOCK_START, 2527 "as_pagelock_start: addr %p size %ld", addr, size); 2528 2529 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); 2530 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) - 2531 (size_t)raddr; 2532 top: 2533 /* 2534 * if the request crosses two segments let 2535 * as_fault handle it. 
2536 */ 2537 AS_LOCK_ENTER(as, &as->a_lock, RW_READER); 2538 seg = as_findseg(as, addr, 0); 2539 if ((seg == NULL) || ((base = seg->s_base) > addr) || 2540 (addr + size) > base + seg->s_size) { 2541 AS_LOCK_EXIT(as, &as->a_lock); 2542 goto slow; 2543 } 2544 2545 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEG_LOCK_START, 2546 "seg_lock_1_start: raddr %p rsize %ld", raddr, rsize); 2547 2548 /* 2549 * try to lock pages and pass back shadow list 2550 */ 2551 res = SEGOP_PAGELOCK(seg, raddr, rsize, ppp, L_PAGELOCK, rw); 2552 2553 TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_SEG_LOCK_END, "seg_lock_1_end"); 2554 AS_LOCK_EXIT(as, &as->a_lock); 2555 if (res == 0) { 2556 return (0); 2557 } else if (res == ENOTSUP || prefaulted) { 2558 /* 2559 * (1) segment driver doesn't support PAGELOCK fastpath, or 2560 * (2) we've already tried fast path unsuccessfully after 2561 * faulting in the addr range below; system might be 2562 * thrashing or there may not be enough availrmem. 2563 */ 2564 goto slow; 2565 } 2566 2567 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_AS_FAULT_START, 2568 "as_fault_start: addr %p size %ld", addr, size); 2569 2570 /* 2571 * we might get here because of some COW fault or non 2572 * existing page. Let as_fault deal with it. Just load 2573 * the page, don't lock the MMU mapping. 2574 */ 2575 fault_err = as_fault(as->a_hat, as, addr, size, F_INVAL, rw); 2576 if (fault_err != 0) { 2577 return (f_decode(fault_err)); 2578 } 2579 2580 prefaulted = 1; 2581 2582 /* 2583 * try fast path again; since we've dropped a_lock, 2584 * we need to try the dance from the start to see if 2585 * the addr range is still valid. 2586 */ 2587 goto top; 2588 slow: 2589 /* 2590 * load the page and lock the MMU mapping. 2591 */ 2592 fault_err = as_fault(as->a_hat, as, addr, size, F_SOFTLOCK, rw); 2593 if (fault_err != 0) { 2594 return (f_decode(fault_err)); 2595 } 2596 *ppp = NULL; 2597 2598 TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_AS_LOCK_END, "as_pagelock_end"); 2599 return (0); 2600 } 2601 2602 /* 2603 * unlock pages in a given address range 2604 */ 2605 void 2606 as_pageunlock(struct as *as, struct page **pp, caddr_t addr, size_t size, 2607 enum seg_rw rw) 2608 { 2609 struct seg *seg; 2610 size_t rsize; 2611 caddr_t raddr; 2612 2613 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_AS_UNLOCK_START, 2614 "as_pageunlock_start: addr %p size %ld", addr, size); 2615 2616 /* 2617 * if the shadow list is NULL, as_pagelock was 2618 * falling back to as_fault 2619 */ 2620 if (pp == NULL) { 2621 (void) as_fault(as->a_hat, as, addr, size, F_SOFTUNLOCK, rw); 2622 return; 2623 } 2624 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); 2625 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) - 2626 (size_t)raddr; 2627 AS_LOCK_ENTER(as, &as->a_lock, RW_READER); 2628 seg = as_findseg(as, addr, 0); 2629 ASSERT(seg); 2630 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEG_UNLOCK_START, 2631 "seg_unlock_start: raddr %p rsize %ld", raddr, rsize); 2632 SEGOP_PAGELOCK(seg, raddr, rsize, &pp, L_PAGEUNLOCK, rw); 2633 AS_LOCK_EXIT(as, &as->a_lock); 2634 TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_AS_UNLOCK_END, "as_pageunlock_end"); 2635 } 2636 2637 /* 2638 * reclaim cached pages in a given address range 2639 */ 2640 void 2641 as_pagereclaim(struct as *as, struct page **pp, caddr_t addr, 2642 size_t size, enum seg_rw rw) 2643 { 2644 struct seg *seg; 2645 size_t rsize; 2646 caddr_t raddr; 2647 2648 ASSERT(AS_READ_HELD(as, &as->a_lock)); 2649 ASSERT(pp != NULL); 2650 2651 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); 2652 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) - 2653 
(size_t)raddr; 2654 seg = as_findseg(as, addr, 0); 2655 ASSERT(seg); 2656 SEGOP_PAGELOCK(seg, raddr, rsize, &pp, L_PAGERECLAIM, rw); 2657 } 2658 2659 #define MAXPAGEFLIP 4 2660 #define MAXPAGEFLIPSIZ MAXPAGEFLIP*PAGESIZE 2661 2662 int 2663 as_setpagesize(struct as *as, caddr_t addr, size_t size, uint_t szc, 2664 boolean_t wait) 2665 { 2666 struct seg *seg; 2667 size_t ssize; 2668 caddr_t raddr; /* rounded down addr */ 2669 size_t rsize; /* rounded up size */ 2670 int error = 0; 2671 size_t pgsz = page_get_pagesize(szc); 2672 2673 setpgsz_top: 2674 if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(size, pgsz)) { 2675 return (EINVAL); 2676 } 2677 2678 raddr = addr; 2679 rsize = size; 2680 2681 if (raddr + rsize < raddr) /* check for wraparound */ 2682 return (ENOMEM); 2683 2684 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER); 2685 as_clearwatchprot(as, raddr, rsize); 2686 seg = as_segat(as, raddr); 2687 if (seg == NULL) { 2688 as_setwatch(as); 2689 AS_LOCK_EXIT(as, &as->a_lock); 2690 return (ENOMEM); 2691 } 2692 2693 for (; rsize != 0; rsize -= ssize, raddr += ssize) { 2694 if (raddr >= seg->s_base + seg->s_size) { 2695 seg = AS_SEGNEXT(as, seg); 2696 if (seg == NULL || raddr != seg->s_base) { 2697 error = ENOMEM; 2698 break; 2699 } 2700 } 2701 if ((raddr + rsize) > (seg->s_base + seg->s_size)) { 2702 ssize = seg->s_base + seg->s_size - raddr; 2703 } else { 2704 ssize = rsize; 2705 } 2706 2707 error = SEGOP_SETPAGESIZE(seg, raddr, ssize, szc); 2708 2709 if (error == IE_NOMEM) { 2710 error = EAGAIN; 2711 break; 2712 } 2713 2714 if (error == IE_RETRY) { 2715 AS_LOCK_EXIT(as, &as->a_lock); 2716 goto setpgsz_top; 2717 } 2718 2719 if (error == ENOTSUP) { 2720 error = EINVAL; 2721 break; 2722 } 2723 2724 if (wait && (error == EAGAIN)) { 2725 /* 2726 * Memory is currently locked. It must be unlocked 2727 * before this operation can succeed through a retry. 2728 * The possible reasons for locked memory and 2729 * corresponding strategies for unlocking are: 2730 * (1) Normal I/O 2731 * wait for a signal that the I/O operation 2732 * has completed and the memory is unlocked. 2733 * (2) Asynchronous I/O 2734 * The aio subsystem does not unlock pages when 2735 * the I/O is completed. Those pages are unlocked 2736 * when the application calls aiowait/aioerror. 2737 * So, to prevent blocking forever, cv_broadcast() 2738 * is done to wake up aio_cleanup_thread. 2739 * Subsequently, segvn_reclaim will be called, and 2740 * that will do AS_CLRUNMAPWAIT() and wake us up. 2741 * (3) Long term page locking: 2742 * This is not relevant for as_setpagesize() 2743 * because we cannot change the page size for 2744 * driver memory. The attempt to do so will 2745 * fail with a different error than EAGAIN so 2746 * there's no need to trigger as callbacks like 2747 * as_unmap, as_setprot or as_free would do. 2748 */ 2749 mutex_enter(&as->a_contents); 2750 if (AS_ISUNMAPWAIT(as) == 0) { 2751 cv_broadcast(&as->a_cv); 2752 } 2753 AS_SETUNMAPWAIT(as); 2754 AS_LOCK_EXIT(as, &as->a_lock); 2755 while (AS_ISUNMAPWAIT(as)) { 2756 cv_wait(&as->a_cv, &as->a_contents); 2757 } 2758 mutex_exit(&as->a_contents); 2759 goto setpgsz_top; 2760 } else if (error != 0) { 2761 break; 2762 } 2763 } 2764 as_setwatch(as); 2765 AS_LOCK_EXIT(as, &as->a_lock); 2766 return (error); 2767 } 2768 2769 /* 2770 * as_iset3_default_lpsize() just calls SEGOP_SETPAGESIZE() on all segments 2771 * in its chunk where s_szc is less than the szc we want to set. 
2772 */ 2773 static int 2774 as_iset3_default_lpsize(struct as *as, caddr_t raddr, size_t rsize, uint_t szc, 2775 int *retry) 2776 { 2777 struct seg *seg; 2778 size_t ssize; 2779 int error; 2780 2781 seg = as_segat(as, raddr); 2782 if (seg == NULL) { 2783 panic("as_iset3_default_lpsize: no seg"); 2784 } 2785 2786 for (; rsize != 0; rsize -= ssize, raddr += ssize) { 2787 if (raddr >= seg->s_base + seg->s_size) { 2788 seg = AS_SEGNEXT(as, seg); 2789 if (seg == NULL || raddr != seg->s_base) { 2790 panic("as_iset3_default_lpsize: as changed"); 2791 } 2792 } 2793 if ((raddr + rsize) > (seg->s_base + seg->s_size)) { 2794 ssize = seg->s_base + seg->s_size - raddr; 2795 } else { 2796 ssize = rsize; 2797 } 2798 2799 if (szc > seg->s_szc) { 2800 error = SEGOP_SETPAGESIZE(seg, raddr, ssize, szc); 2801 /* Only retry on EINVAL segments that have no vnode. */ 2802 if (error == EINVAL) { 2803 vnode_t *vp = NULL; 2804 if ((SEGOP_GETTYPE(seg, raddr) & MAP_SHARED) && 2805 (SEGOP_GETVP(seg, raddr, &vp) != 0 || 2806 vp == NULL)) { 2807 *retry = 1; 2808 } else { 2809 *retry = 0; 2810 } 2811 } 2812 if (error) { 2813 return (error); 2814 } 2815 } 2816 } 2817 return (0); 2818 } 2819 2820 /* 2821 * as_iset2_default_lpsize() calls as_iset3_default_lpsize() to set the 2822 * pagesize on each segment in its range, but if any fails with EINVAL, 2823 * then it reduces the pagesizes to the next size in the bitmap and 2824 * retries as_iset3_default_lpsize(). The reason why the code retries 2825 * smaller allowed sizes on EINVAL is because (a) the anon offset may not 2826 * match the bigger sizes, and (b) it's hard to get this offset (to begin 2827 * with) to pass to map_pgszcvec(). 2828 */ 2829 static int 2830 as_iset2_default_lpsize(struct as *as, caddr_t addr, size_t size, uint_t szc, 2831 uint_t szcvec) 2832 { 2833 int error; 2834 int retry; 2835 2836 for (;;) { 2837 error = as_iset3_default_lpsize(as, addr, size, szc, &retry); 2838 if (error == EINVAL && retry) { 2839 szcvec &= ~(1 << szc); 2840 if (szcvec <= 1) { 2841 return (EINVAL); 2842 } 2843 szc = highbit(szcvec) - 1; 2844 } else { 2845 return (error); 2846 } 2847 } 2848 } 2849 2850 /* 2851 * as_iset1_default_lpsize() breaks its chunk into areas where existing 2852 * segments have a smaller szc than we want to set. 
For each such area, 2853 * it calls as_iset2_default_lpsize() 2854 */ 2855 static int 2856 as_iset1_default_lpsize(struct as *as, caddr_t raddr, size_t rsize, uint_t szc, 2857 uint_t szcvec) 2858 { 2859 struct seg *seg; 2860 size_t ssize; 2861 caddr_t setaddr = raddr; 2862 size_t setsize = 0; 2863 int set; 2864 int error; 2865 2866 ASSERT(AS_WRITE_HELD(as, &as->a_lock)); 2867 2868 seg = as_segat(as, raddr); 2869 if (seg == NULL) { 2870 panic("as_iset1_default_lpsize: no seg"); 2871 } 2872 if (seg->s_szc < szc) { 2873 set = 1; 2874 } else { 2875 set = 0; 2876 } 2877 2878 for (; rsize != 0; rsize -= ssize, raddr += ssize, setsize += ssize) { 2879 if (raddr >= seg->s_base + seg->s_size) { 2880 seg = AS_SEGNEXT(as, seg); 2881 if (seg == NULL || raddr != seg->s_base) { 2882 panic("as_iset1_default_lpsize: as changed"); 2883 } 2884 if (seg->s_szc >= szc && set) { 2885 ASSERT(setsize != 0); 2886 error = as_iset2_default_lpsize(as, 2887 setaddr, setsize, szc, szcvec); 2888 if (error) { 2889 return (error); 2890 } 2891 set = 0; 2892 } else if (seg->s_szc < szc && !set) { 2893 setaddr = raddr; 2894 setsize = 0; 2895 set = 1; 2896 } 2897 } 2898 if ((raddr + rsize) > (seg->s_base + seg->s_size)) { 2899 ssize = seg->s_base + seg->s_size - raddr; 2900 } else { 2901 ssize = rsize; 2902 } 2903 } 2904 error = 0; 2905 if (set) { 2906 ASSERT(setsize != 0); 2907 error = as_iset2_default_lpsize(as, setaddr, setsize, 2908 szc, szcvec); 2909 } 2910 return (error); 2911 } 2912 2913 /* 2914 * as_iset_default_lpsize() breaks its chunk according to the size code bitmap 2915 * returned by map_pgszcvec() (similar to as_map_segvn_segs()), and passes each 2916 * chunk to as_iset1_default_lpsize(). 2917 */ 2918 static int 2919 as_iset_default_lpsize(struct as *as, caddr_t addr, size_t size, int flags, 2920 int type) 2921 { 2922 int rtype = (type & MAP_SHARED) ? MAPPGSZC_SHM : MAPPGSZC_PRIVM; 2923 uint_t szcvec = map_pgszcvec(addr, size, (uintptr_t)addr, 2924 flags, rtype, 1); 2925 uint_t szc; 2926 uint_t nszc; 2927 int error; 2928 caddr_t a; 2929 caddr_t eaddr; 2930 size_t segsize; 2931 size_t pgsz; 2932 uint_t save_szcvec; 2933 2934 ASSERT(AS_WRITE_HELD(as, &as->a_lock)); 2935 ASSERT(IS_P2ALIGNED(addr, PAGESIZE)); 2936 ASSERT(IS_P2ALIGNED(size, PAGESIZE)); 2937 2938 szcvec &= ~1; 2939 if (szcvec <= 1) { /* skip if base page size */ 2940 return (0); 2941 } 2942 2943 /* Get the pagesize of the first larger page size. 
*/
2944 szc = lowbit(szcvec) - 1;
2945 pgsz = page_get_pagesize(szc);
2946 eaddr = addr + size;
2947 addr = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
2948 eaddr = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
2949
2950 save_szcvec = szcvec;
2951 szcvec >>= (szc + 1);
2952 nszc = szc;
2953 while (szcvec) {
2954 if ((szcvec & 0x1) == 0) {
2955 nszc++;
2956 szcvec >>= 1;
2957 continue;
2958 }
2959 nszc++;
2960 pgsz = page_get_pagesize(nszc);
2961 a = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
2962 if (a != addr) {
2963 ASSERT(szc > 0);
2964 ASSERT(a < eaddr);
2965 segsize = a - addr;
2966 error = as_iset1_default_lpsize(as, addr, segsize, szc,
2967 save_szcvec);
2968 if (error) {
2969 return (error);
2970 }
2971 addr = a;
2972 }
2973 szc = nszc;
2974 szcvec >>= 1;
2975 }
2976
2977 ASSERT(addr < eaddr);
2978 szcvec = save_szcvec;
2979 while (szcvec) {
2980 a = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
2981 ASSERT(a >= addr);
2982 if (a != addr) {
2983 ASSERT(szc > 0);
2984 segsize = a - addr;
2985 error = as_iset1_default_lpsize(as, addr, segsize, szc,
2986 save_szcvec);
2987 if (error) {
2988 return (error);
2989 }
2990 addr = a;
2991 }
2992 szcvec &= ~(1 << szc);
2993 if (szcvec) {
2994 szc = highbit(szcvec) - 1;
2995 pgsz = page_get_pagesize(szc);
2996 }
2997 }
2998 ASSERT(addr == eaddr);
2999
3000 return (0);
3001 }
3002
3003 /*
3004 * Set the default large page size for the range. Called via memcntl with
3005 * page size set to 0. as_set_default_lpsize breaks the range down into
3006 * chunks with the same type/flags, ignores non-segvn segments, and passes
3007 * each chunk to as_iset_default_lpsize().
3008 */
3009 int
3010 as_set_default_lpsize(struct as *as, caddr_t addr, size_t size)
3011 {
3012 struct seg *seg;
3013 caddr_t raddr;
3014 size_t rsize;
3015 size_t ssize;
3016 int rtype, rflags;
3017 int stype, sflags;
3018 int error;
3019 caddr_t setaddr;
3020 size_t setsize;
3021 int segvn;
3022
3023 if (size == 0)
3024 return (0);
3025
3026 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
3027 again:
3028 error = 0;
3029
3030 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
3031 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
3032 (size_t)raddr;
3033
3034 if (raddr + rsize < raddr) { /* check for wraparound */
3035 AS_LOCK_EXIT(as, &as->a_lock);
3036 return (ENOMEM);
3037 }
3038 as_clearwatchprot(as, raddr, rsize);
3039 seg = as_segat(as, raddr);
3040 if (seg == NULL) {
3041 as_setwatch(as);
3042 AS_LOCK_EXIT(as, &as->a_lock);
3043 return (ENOMEM);
3044 }
3045 if (seg->s_ops == &segvn_ops) {
3046 rtype = SEGOP_GETTYPE(seg, addr);
3047 rflags = rtype & (MAP_TEXT | MAP_INITDATA);
3048 rtype = rtype & (MAP_SHARED | MAP_PRIVATE);
3049 segvn = 1;
3050 } else {
3051 segvn = 0;
3052 }
3053 setaddr = raddr;
3054 setsize = 0;
3055
3056 for (; rsize != 0; rsize -= ssize, raddr += ssize, setsize += ssize) {
3057 if (raddr >= (seg->s_base + seg->s_size)) {
3058 seg = AS_SEGNEXT(as, seg);
3059 if (seg == NULL || raddr != seg->s_base) {
3060 error = ENOMEM;
3061 break;
3062 }
3063 if (seg->s_ops == &segvn_ops) {
3064 stype = SEGOP_GETTYPE(seg, raddr);
3065 sflags = stype & (MAP_TEXT | MAP_INITDATA);
3066 stype &= (MAP_SHARED | MAP_PRIVATE);
3067 if (segvn && (rflags != sflags ||
3068 rtype != stype)) {
3069 /*
3070 * The next segment is also segvn but
3071 * has different flags and/or type.
3072 */ 3073 ASSERT(setsize != 0); 3074 error = as_iset_default_lpsize(as, 3075 setaddr, setsize, rflags, rtype); 3076 if (error) { 3077 break; 3078 } 3079 rflags = sflags; 3080 rtype = stype; 3081 setaddr = raddr; 3082 setsize = 0; 3083 } else if (!segvn) { 3084 rflags = sflags; 3085 rtype = stype; 3086 setaddr = raddr; 3087 setsize = 0; 3088 segvn = 1; 3089 } 3090 } else if (segvn) { 3091 /* The next segment is not segvn. */ 3092 ASSERT(setsize != 0); 3093 error = as_iset_default_lpsize(as, 3094 setaddr, setsize, rflags, rtype); 3095 if (error) { 3096 break; 3097 } 3098 segvn = 0; 3099 } 3100 } 3101 if ((raddr + rsize) > (seg->s_base + seg->s_size)) { 3102 ssize = seg->s_base + seg->s_size - raddr; 3103 } else { 3104 ssize = rsize; 3105 } 3106 } 3107 if (error == 0 && segvn) { 3108 /* The last chunk when rsize == 0. */ 3109 ASSERT(setsize != 0); 3110 error = as_iset_default_lpsize(as, setaddr, setsize, 3111 rflags, rtype); 3112 } 3113 3114 if (error == IE_RETRY) { 3115 goto again; 3116 } else if (error == IE_NOMEM) { 3117 error = EAGAIN; 3118 } else if (error == ENOTSUP) { 3119 error = EINVAL; 3120 } else if (error == EAGAIN) { 3121 mutex_enter(&as->a_contents); 3122 if (AS_ISUNMAPWAIT(as) == 0) { 3123 cv_broadcast(&as->a_cv); 3124 } 3125 AS_SETUNMAPWAIT(as); 3126 AS_LOCK_EXIT(as, &as->a_lock); 3127 while (AS_ISUNMAPWAIT(as)) { 3128 cv_wait(&as->a_cv, &as->a_contents); 3129 } 3130 mutex_exit(&as->a_contents); 3131 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER); 3132 goto again; 3133 } 3134 3135 as_setwatch(as); 3136 AS_LOCK_EXIT(as, &as->a_lock); 3137 return (error); 3138 } 3139 3140 /* 3141 * Setup all of the uninitialized watched pages that we can. 3142 */ 3143 void 3144 as_setwatch(struct as *as) 3145 { 3146 struct watched_page *pwp; 3147 struct seg *seg; 3148 caddr_t vaddr; 3149 uint_t prot; 3150 int err, retrycnt; 3151 3152 if (avl_numnodes(&as->a_wpage) == 0) 3153 return; 3154 3155 ASSERT(AS_WRITE_HELD(as, &as->a_lock)); 3156 3157 for (pwp = avl_first(&as->a_wpage); pwp != NULL; 3158 pwp = AVL_NEXT(&as->a_wpage, pwp)) { 3159 retrycnt = 0; 3160 retry: 3161 vaddr = pwp->wp_vaddr; 3162 if (pwp->wp_oprot != 0 || /* already set up */ 3163 (seg = as_segat(as, vaddr)) == NULL || 3164 SEGOP_GETPROT(seg, vaddr, 0, &prot) != 0) 3165 continue; 3166 3167 pwp->wp_oprot = prot; 3168 if (pwp->wp_read) 3169 prot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC); 3170 if (pwp->wp_write) 3171 prot &= ~PROT_WRITE; 3172 if (pwp->wp_exec) 3173 prot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC); 3174 if (!(pwp->wp_flags & WP_NOWATCH) && prot != pwp->wp_oprot) { 3175 err = SEGOP_SETPROT(seg, vaddr, PAGESIZE, prot); 3176 if (err == IE_RETRY) { 3177 pwp->wp_oprot = 0; 3178 ASSERT(retrycnt == 0); 3179 retrycnt++; 3180 goto retry; 3181 } 3182 } 3183 pwp->wp_prot = prot; 3184 } 3185 } 3186 3187 /* 3188 * Clear all of the watched pages in the address space. 
3189 */ 3190 void 3191 as_clearwatch(struct as *as) 3192 { 3193 struct watched_page *pwp; 3194 struct seg *seg; 3195 caddr_t vaddr; 3196 uint_t prot; 3197 int err, retrycnt; 3198 3199 if (avl_numnodes(&as->a_wpage) == 0) 3200 return; 3201 3202 ASSERT(AS_WRITE_HELD(as, &as->a_lock)); 3203 3204 for (pwp = avl_first(&as->a_wpage); pwp != NULL; 3205 pwp = AVL_NEXT(&as->a_wpage, pwp)) { 3206 retrycnt = 0; 3207 retry: 3208 vaddr = pwp->wp_vaddr; 3209 if (pwp->wp_oprot == 0 || /* not set up */ 3210 (seg = as_segat(as, vaddr)) == NULL) 3211 continue; 3212 3213 if ((prot = pwp->wp_oprot) != pwp->wp_prot) { 3214 err = SEGOP_SETPROT(seg, vaddr, PAGESIZE, prot); 3215 if (err == IE_RETRY) { 3216 ASSERT(retrycnt == 0); 3217 retrycnt++; 3218 goto retry; 3219 } 3220 } 3221 pwp->wp_oprot = 0; 3222 pwp->wp_prot = 0; 3223 } 3224 } 3225 3226 /* 3227 * Force a new setup for all the watched pages in the range. 3228 */ 3229 static void 3230 as_setwatchprot(struct as *as, caddr_t addr, size_t size, uint_t prot) 3231 { 3232 struct watched_page *pwp; 3233 struct watched_page tpw; 3234 caddr_t eaddr = addr + size; 3235 caddr_t vaddr; 3236 struct seg *seg; 3237 int err, retrycnt; 3238 uint_t wprot; 3239 avl_index_t where; 3240 3241 if (avl_numnodes(&as->a_wpage) == 0) 3242 return; 3243 3244 ASSERT(AS_WRITE_HELD(as, &as->a_lock)); 3245 3246 tpw.wp_vaddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); 3247 if ((pwp = avl_find(&as->a_wpage, &tpw, &where)) == NULL) 3248 pwp = avl_nearest(&as->a_wpage, where, AVL_AFTER); 3249 3250 while (pwp != NULL && pwp->wp_vaddr < eaddr) { 3251 retrycnt = 0; 3252 vaddr = pwp->wp_vaddr; 3253 3254 wprot = prot; 3255 if (pwp->wp_read) 3256 wprot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC); 3257 if (pwp->wp_write) 3258 wprot &= ~PROT_WRITE; 3259 if (pwp->wp_exec) 3260 wprot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC); 3261 if (!(pwp->wp_flags & WP_NOWATCH) && wprot != pwp->wp_oprot) { 3262 retry: 3263 seg = as_segat(as, vaddr); 3264 if (seg == NULL) { 3265 panic("as_setwatchprot: no seg"); 3266 /*NOTREACHED*/ 3267 } 3268 err = SEGOP_SETPROT(seg, vaddr, PAGESIZE, wprot); 3269 if (err == IE_RETRY) { 3270 ASSERT(retrycnt == 0); 3271 retrycnt++; 3272 goto retry; 3273 } 3274 } 3275 pwp->wp_oprot = prot; 3276 pwp->wp_prot = wprot; 3277 3278 pwp = AVL_NEXT(&as->a_wpage, pwp); 3279 } 3280 } 3281 3282 /* 3283 * Clear all of the watched pages in the range. 
3284 */ 3285 static void 3286 as_clearwatchprot(struct as *as, caddr_t addr, size_t size) 3287 { 3288 caddr_t eaddr = addr + size; 3289 struct watched_page *pwp; 3290 struct watched_page tpw; 3291 uint_t prot; 3292 struct seg *seg; 3293 int err, retrycnt; 3294 avl_index_t where; 3295 3296 if (avl_numnodes(&as->a_wpage) == 0) 3297 return; 3298 3299 tpw.wp_vaddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); 3300 if ((pwp = avl_find(&as->a_wpage, &tpw, &where)) == NULL) 3301 pwp = avl_nearest(&as->a_wpage, where, AVL_AFTER); 3302 3303 ASSERT(AS_WRITE_HELD(as, &as->a_lock)); 3304 3305 while (pwp != NULL && pwp->wp_vaddr < eaddr) { 3306 3307 if ((prot = pwp->wp_oprot) != 0) { 3308 retrycnt = 0; 3309 3310 if (prot != pwp->wp_prot) { 3311 retry: 3312 seg = as_segat(as, pwp->wp_vaddr); 3313 if (seg == NULL) 3314 continue; 3315 err = SEGOP_SETPROT(seg, pwp->wp_vaddr, 3316 PAGESIZE, prot); 3317 if (err == IE_RETRY) { 3318 ASSERT(retrycnt == 0); 3319 retrycnt++; 3320 goto retry; 3321 3322 } 3323 } 3324 pwp->wp_oprot = 0; 3325 pwp->wp_prot = 0; 3326 } 3327 3328 pwp = AVL_NEXT(&as->a_wpage, pwp); 3329 } 3330 } 3331 3332 void 3333 as_signal_proc(struct as *as, k_siginfo_t *siginfo) 3334 { 3335 struct proc *p; 3336 3337 mutex_enter(&pidlock); 3338 for (p = practive; p; p = p->p_next) { 3339 if (p->p_as == as) { 3340 mutex_enter(&p->p_lock); 3341 if (p->p_as == as) 3342 sigaddq(p, NULL, siginfo, KM_NOSLEEP); 3343 mutex_exit(&p->p_lock); 3344 } 3345 } 3346 mutex_exit(&pidlock); 3347 } 3348 3349 /* 3350 * return memory object ID 3351 */ 3352 int 3353 as_getmemid(struct as *as, caddr_t addr, memid_t *memidp) 3354 { 3355 struct seg *seg; 3356 int sts; 3357 3358 AS_LOCK_ENTER(as, &as->a_lock, RW_READER); 3359 seg = as_segat(as, addr); 3360 if (seg == NULL) { 3361 AS_LOCK_EXIT(as, &as->a_lock); 3362 return (EFAULT); 3363 } 3364 /* 3365 * catch old drivers which may not support getmemid 3366 */ 3367 if (seg->s_ops->getmemid == NULL) { 3368 AS_LOCK_EXIT(as, &as->a_lock); 3369 return (ENODEV); 3370 } 3371 3372 sts = SEGOP_GETMEMID(seg, addr, memidp); 3373 3374 AS_LOCK_EXIT(as, &as->a_lock); 3375 return (sts); 3376 } 3377
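
/*
 * A minimal illustrative sketch of how a caller doing I/O on user memory
 * might pair as_pagelock() and as_pageunlock() above.  The identifiers
 * uaddr, ulen and do_io() are hypothetical placeholders, and the choice
 * of S_WRITE is an assumption (it presumes the transfer stores into the
 * user buffer; a transfer that only reads it would pass S_READ).  Only
 * the as_pagelock()/as_pageunlock() calls and their semantics come from
 * the routines in this file.
 *
 *	struct as *as = curproc->p_as;	// caller's address space
 *	struct page **pplist;		// shadow list from as_pagelock()
 *	int err;
 *
 *	// Lock the pages backing [uaddr, uaddr + ulen).
 *	err = as_pagelock(as, &pplist, uaddr, ulen, S_WRITE);
 *	if (err != 0)
 *		return (err);		// already an errno, see f_decode()
 *
 *	err = do_io(pplist, uaddr, ulen);	// pages remain locked here
 *
 *	// A NULL shadow list means as_pagelock() took the slow path
 *	// (e.g. the segment driver returned ENOTSUP for L_PAGELOCK) and
 *	// fell back to as_fault(F_SOFTLOCK); as_pageunlock() detects this
 *	// and undoes it with F_SOFTUNLOCK instead of
 *	// SEGOP_PAGELOCK(..., L_PAGEUNLOCK, ...).
 *	as_pageunlock(as, pplist, uaddr, ulen, S_WRITE);
 *	return (err);
 *
 * The shadow-list fast path lets such callers skip the heavier
 * as_fault(F_SOFTLOCK)/F_SOFTUNLOCK round trip whenever the segment
 * driver supports L_PAGELOCK directly.
 */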