/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */

/*
 * VM - address spaces.
 */

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/errno.h>
#include <sys/systm.h>
#include <sys/mman.h>
#include <sys/sysmacros.h>
#include <sys/cpuvar.h>
#include <sys/sysinfo.h>
#include <sys/kmem.h>
#include <sys/vnode.h>
#include <sys/vmsystm.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/tnf_probe.h>
#include <sys/vtrace.h>

#include <vm/hat.h>
#include <vm/xhat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_vn.h>
#include <vm/seg_dev.h>
#include <vm/seg_kmem.h>
#include <vm/seg_map.h>
#include <vm/seg_spt.h>
#include <vm/page.h>

clock_t deadlk_wait = 1; /* number of ticks to wait before retrying */

static struct kmem_cache *as_cache;

static void as_setwatchprot(struct as *, caddr_t, size_t, uint_t);
static void as_clearwatchprot(struct as *, caddr_t, size_t);
int as_map_locked(struct as *, caddr_t, size_t, int ((*)()), void *);


/*
 * Verifying the segment lists is very time-consuming; it may not be
 * desirable always to define VERIFY_SEGLIST when DEBUG is set.
 */
#ifdef DEBUG
#define	VERIFY_SEGLIST
int do_as_verify = 0;
#endif

/*
 * Allocate a new callback data structure entry and fill in the events of
 * interest, the address range of interest, and the callback argument.
 * Link the entry on the as->a_callbacks list.  A callback entry for the
 * entire address space may be specified with vaddr = 0 and size = -1.
 *
 * CALLER'S RESPONSIBILITY: If not calling from within the process context for
 * the specified as, the caller must guarantee persistence of the specified as
 * for the duration of this function (e.g. pages being locked within the as
 * will guarantee persistence).
 */
int
as_add_callback(struct as *as, void (*cb_func)(), void *arg, uint_t events,
    caddr_t vaddr, size_t size, int sleepflag)
{
	struct as_callback *current_head, *cb;
	caddr_t saddr;
	size_t rsize;

	/* callback function and an event are mandatory */
	if ((cb_func == NULL) || ((events & AS_ALL_EVENT) == 0))
		return (EINVAL);

	/* Adding a callback after as_free has been called is not allowed */
	if (as == &kas)
		return (ENOMEM);

	/*
	 * vaddr = 0 and size = -1 is used to indicate that the callback range
	 * is the entire address space so no rounding is done in that case.
	 */
	if (size != -1) {
		saddr = (caddr_t)((uintptr_t)vaddr & (uintptr_t)PAGEMASK);
		rsize = (((size_t)(vaddr + size) + PAGEOFFSET) & PAGEMASK) -
		    (size_t)saddr;
		/* check for wraparound */
		if (saddr + rsize < saddr)
			return (ENOMEM);
	} else {
		if (vaddr != 0)
			return (EINVAL);
		saddr = vaddr;
		rsize = size;
	}

	/* Allocate and initialize a callback entry */
	cb = kmem_zalloc(sizeof (struct as_callback), sleepflag);
	if (cb == NULL)
		return (EAGAIN);

	cb->ascb_func = cb_func;
	cb->ascb_arg = arg;
	cb->ascb_events = events;
	cb->ascb_saddr = saddr;
	cb->ascb_len = rsize;

	/* Add the entry to the list */
	mutex_enter(&as->a_contents);
	current_head = as->a_callbacks;
	as->a_callbacks = cb;
	cb->ascb_next = current_head;

	/*
	 * The call to this function may lose in a race with
	 * a pertinent event - eg. a thread does long term memory locking
	 * but before the callback is added another thread executes as_unmap.
	 * A broadcast here resolves that.
	 */
	if ((cb->ascb_events & AS_UNMAPWAIT_EVENT) && AS_ISUNMAPWAIT(as)) {
		AS_CLRUNMAPWAIT(as);
		cv_broadcast(&as->a_cv);
	}

	mutex_exit(&as->a_contents);
	return (0);
}

/*
 * Search the callback list for an entry which pertains to arg.
 *
 * This is called from within the client upon completion of the callback.
 * RETURN VALUES:
 *	AS_CALLBACK_DELETED  (callback entry found and deleted)
 *	AS_CALLBACK_NOTFOUND (no callback entry found - this is ok)
 *	AS_CALLBACK_DELETE_DEFERRED (callback is in process, delete of this
 *			entry will be made in as_do_callbacks)
 *
 * If as_delete_callback encounters a matching entry with AS_CALLBACK_CALLED
 * set, it indicates that as_do_callbacks is processing this entry.  The
 * AS_ALL_EVENT events are cleared in the entry, and a broadcast is made
 * to unblock as_do_callbacks, in case it is blocked.
 *
 * CALLER'S RESPONSIBILITY: If not calling from within the process context for
 * the specified as, the caller must guarantee persistence of the specified as
 * for the duration of this function (e.g. pages being locked within the as
 * will guarantee persistence).
 */
uint_t
as_delete_callback(struct as *as, void *arg)
{
	struct as_callback **prevcb = &as->a_callbacks;
	struct as_callback *cb;
	uint_t rc = AS_CALLBACK_NOTFOUND;

	mutex_enter(&as->a_contents);
	for (cb = as->a_callbacks; cb; prevcb = &cb->ascb_next, cb = *prevcb) {
		if (cb->ascb_arg != arg)
			continue;

		/*
		 * If the events indicate AS_CALLBACK_CALLED, just clear
		 * AS_ALL_EVENT in the events field and wakeup the thread
		 * that may be waiting in as_do_callbacks.  as_do_callbacks
		 * will take care of removing this entry from the list.  In
		 * that case, return AS_CALLBACK_DELETE_DEFERRED.  Otherwise
		 * (AS_CALLBACK_CALLED not set), just remove it from the
		 * list, return the memory and return AS_CALLBACK_DELETED.
		 */
		if ((cb->ascb_events & AS_CALLBACK_CALLED) != 0) {
			/* leave AS_CALLBACK_CALLED */
			cb->ascb_events &= ~AS_ALL_EVENT;
			rc = AS_CALLBACK_DELETE_DEFERRED;
			cv_broadcast(&as->a_cv);
		} else {
			*prevcb = cb->ascb_next;
			kmem_free(cb, sizeof (struct as_callback));
			rc = AS_CALLBACK_DELETED;
		}
		break;
	}
	mutex_exit(&as->a_contents);
	return (rc);
}

/*
 * Searches the as callback list for a matching entry.
 * Returns a pointer to the first matching callback, or NULL if
 * nothing is found.
 * This function never sleeps so it is ok to call it with locks held
 * in addition to the (required) a_contents mutex.
 *
 * See also comment on as_do_callbacks below.
 */
static struct as_callback *
as_find_callback(struct as *as, uint_t events, caddr_t event_addr,
    size_t event_len)
{
	struct as_callback *cb;

	ASSERT(MUTEX_HELD(&as->a_contents));
	for (cb = as->a_callbacks; cb != NULL; cb = cb->ascb_next) {
		/*
		 * If the callback has not already been called, then
		 * check if events or address range pertains.  An event_len
		 * of zero means do an unconditional callback.
		 */
		if (((cb->ascb_events & AS_CALLBACK_CALLED) != 0) ||
		    ((event_len != 0) && (((cb->ascb_events & events) == 0) ||
		    (event_addr + event_len < cb->ascb_saddr) ||
		    (event_addr > (cb->ascb_saddr + cb->ascb_len))))) {
			continue;
		}
		break;
	}
	return (cb);
}

/*
 * Executes a given callback and removes it from the callback list for
 * this address space.
 * This function may sleep so the caller must drop all locks except
 * a_contents before calling this func.
 *
 * See also comments on as_do_callbacks below.
 */
static void
as_execute_callback(struct as *as, struct as_callback *cb,
    uint_t events)
{
	struct as_callback **prevcb;
	void *cb_arg;

	ASSERT(MUTEX_HELD(&as->a_contents) && (cb->ascb_events & events));
	cb->ascb_events |= AS_CALLBACK_CALLED;
	mutex_exit(&as->a_contents);
	(*cb->ascb_func)(as, cb->ascb_arg, events);
	mutex_enter(&as->a_contents);
	/*
	 * the callback function is required to delete the callback
	 * when the callback function determines it is OK for
	 * this thread to continue. as_delete_callback will clear
	 * the AS_ALL_EVENT in the events field when it is deleted.
	 * If the callback function called as_delete_callback,
	 * events will already be cleared and there will be no blocking.
	 */
	while ((cb->ascb_events & events) != 0) {
		cv_wait(&as->a_cv, &as->a_contents);
	}
	/*
	 * This entry needs to be taken off the list. Normally, the
	 * callback func itself does that, but unfortunately the list
	 * may have changed while the callback was running because the
	 * a_contents mutex was dropped and someone else other than the
	 * callback func itself could have called as_delete_callback,
	 * so we have to search to find this entry again.  The entry
	 * must have AS_CALLBACK_CALLED, and have the same 'arg'.
	 */
	cb_arg = cb->ascb_arg;
	prevcb = &as->a_callbacks;
	for (cb = as->a_callbacks; cb != NULL;
	    prevcb = &cb->ascb_next, cb = *prevcb) {
		if (((cb->ascb_events & AS_CALLBACK_CALLED) == 0) ||
		    (cb_arg != cb->ascb_arg)) {
			continue;
		}
		*prevcb = cb->ascb_next;
		kmem_free(cb, sizeof (struct as_callback));
		break;
	}
}

/*
 * Check the callback list for a matching event and intersection of
 * address range. If there is a match invoke the callback.  Skip an entry if:
 *    - a callback is already in progress for this entry (AS_CALLBACK_CALLED)
 *    - no event of interest
 *    - no address range of interest
 *
 * An event_len of zero indicates a request for an unconditional callback
 * (regardless of event), only the AS_CALLBACK_CALLED is checked.  The
 * a_contents lock must be dropped before a callback, so only one callback
 * can be done before returning.  Return -1 (true) if a callback was
 * executed and removed from the list, else return 0 (false).
 *
 * The logically separate parts, i.e. finding a matching callback and
 * executing a given callback have been separated into two functions
 * so that they can be called with different sets of locks held beyond
 * the always-required a_contents.  as_find_callback does not sleep so
 * it is ok to call it if more locks than a_contents (i.e. the a_lock
 * rwlock) are held.  as_execute_callback on the other hand may sleep
 * so all locks beyond a_contents must be dropped by the caller if one
 * does not want to end up comatose.
 */
static int
as_do_callbacks(struct as *as, uint_t events, caddr_t event_addr,
    size_t event_len)
{
	struct as_callback *cb;

	if ((cb = as_find_callback(as, events, event_addr, event_len))) {
		as_execute_callback(as, cb, events);
		return (-1);
	}
	return (0);
}
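
/*
 * Illustrative sketch (not part of the original file): how a client such as
 * a driver that locks user pages long term might use the callback interface
 * above.  The xx_* names and the xx_unlock_pages() helper are hypothetical;
 * only as_add_callback()/as_delete_callback() and the AS_UNMAP_EVENT and
 * AS_SETPROT_EVENT flags are real.
 *
 *	static void
 *	xx_as_callback(struct as *as, void *arg, uint_t events)
 *	{
 *		struct xx_softc *sc = arg;
 *
 *		// Unlock the long-term locked pages so the pending
 *		// as_unmap()/as_setprot() can make progress, then delete
 *		// the callback entry (this returns
 *		// AS_CALLBACK_DELETE_DEFERRED since the callback is in
 *		// progress at this point).
 *		xx_unlock_pages(sc);
 *		(void) as_delete_callback(as, arg);
 *	}
 *
 *	// registration, e.g. at the time the pages are locked:
 *	error = as_add_callback(as, xx_as_callback, sc,
 *	    AS_UNMAP_EVENT | AS_SETPROT_EVENT, addr, len, KM_SLEEP);
 *
 *	// teardown, if the pages are released before any event fired:
 *	(void) as_delete_callback(as, sc);
 */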

/*
 * Search for the segment containing addr. If a segment containing addr
 * exists, that segment is returned.  If no such segment exists, and
 * the list spans addresses greater than addr, then the first segment
 * whose base is greater than addr is returned; otherwise, NULL is
 * returned unless tail is true, in which case the last element of the
 * list is returned.
 *
 * a_seglast is used to cache the last found segment for repeated
 * searches to the same addr (which happens frequently).
 */
struct seg *
as_findseg(struct as *as, caddr_t addr, int tail)
{
	struct seg *seg = as->a_seglast;
	avl_index_t where;

	ASSERT(AS_LOCK_HELD(as, &as->a_lock));

	if (seg != NULL &&
	    seg->s_base <= addr &&
	    addr < seg->s_base + seg->s_size)
		return (seg);

	seg = avl_find(&as->a_segtree, &addr, &where);
	if (seg != NULL)
		return (as->a_seglast = seg);

	seg = avl_nearest(&as->a_segtree, where, AVL_AFTER);
	if (seg == NULL && tail)
		seg = avl_last(&as->a_segtree);
	return (as->a_seglast = seg);
}

#ifdef VERIFY_SEGLIST
/*
 * verify that the linked list is coherent
 */
static void
as_verify(struct as *as)
{
	struct seg *seg, *seglast, *p, *n;
	uint_t nsegs = 0;

	if (do_as_verify == 0)
		return;

	seglast = as->a_seglast;

	for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
		ASSERT(seg->s_as == as);
		p = AS_SEGPREV(as, seg);
		n = AS_SEGNEXT(as, seg);
		ASSERT(p == NULL || p->s_as == as);
		ASSERT(p == NULL || p->s_base < seg->s_base);
		ASSERT(n == NULL || n->s_base > seg->s_base);
		ASSERT(n != NULL || seg == avl_last(&as->a_segtree));
		if (seg == seglast)
			seglast = NULL;
		nsegs++;
	}
	ASSERT(seglast == NULL);
	ASSERT(avl_numnodes(&as->a_segtree) == nsegs);
}
#endif /* VERIFY_SEGLIST */

/*
 * Add a new segment to the address space. The avl_find()
 * may be expensive so we attempt to use last segment accessed
 * in as_gap() as an insertion point.
 */
int
as_addseg(struct as *as, struct seg *newseg)
{
	struct seg *seg;
	caddr_t addr;
	caddr_t eaddr;
	avl_index_t where;

	ASSERT(AS_WRITE_HELD(as, &as->a_lock));

	as->a_updatedir = 1;	/* inform /proc */
	gethrestime(&as->a_updatetime);

	if (as->a_lastgaphl != NULL) {
		struct seg *hseg = NULL;
		struct seg *lseg = NULL;

		if (as->a_lastgaphl->s_base > newseg->s_base) {
			hseg = as->a_lastgaphl;
			lseg = AVL_PREV(&as->a_segtree, hseg);
		} else {
			lseg = as->a_lastgaphl;
			hseg = AVL_NEXT(&as->a_segtree, lseg);
		}

		if (hseg && lseg && lseg->s_base < newseg->s_base &&
		    hseg->s_base > newseg->s_base) {
			avl_insert_here(&as->a_segtree, newseg, lseg,
			    AVL_AFTER);
			as->a_lastgaphl = NULL;
			as->a_seglast = newseg;
			return (0);
		}
		as->a_lastgaphl = NULL;
	}

	addr = newseg->s_base;
	eaddr = addr + newseg->s_size;
again:

	seg = avl_find(&as->a_segtree, &addr, &where);

	if (seg == NULL)
		seg = avl_nearest(&as->a_segtree, where, AVL_AFTER);

	if (seg == NULL)
		seg = avl_last(&as->a_segtree);

	if (seg != NULL) {
		caddr_t base = seg->s_base;

		/*
		 * If top of seg is below the requested address, then
		 * the insertion point is at the end of the linked list,
		 * and seg points to the tail of the list.  Otherwise,
		 * the insertion point is immediately before seg.
		 */
		if (base + seg->s_size > addr) {
			if (addr >= base || eaddr > base) {
#ifdef __sparc
				extern struct seg_ops segnf_ops;

				/*
				 * no-fault segs must disappear if overlaid.
				 * XXX need new segment type so
				 * we don't have to check s_ops
				 */
				if (seg->s_ops == &segnf_ops) {
					seg_unmap(seg);
					goto again;
				}
#endif
				return (-1);	/* overlapping segment */
			}
		}
	}
	as->a_seglast = newseg;
	avl_insert(&as->a_segtree, newseg, where);

#ifdef VERIFY_SEGLIST
	as_verify(as);
#endif
	return (0);
}

struct seg *
as_removeseg(struct as *as, struct seg *seg)
{
	avl_tree_t *t;

	ASSERT(AS_WRITE_HELD(as, &as->a_lock));

	as->a_updatedir = 1;	/* inform /proc */
	gethrestime(&as->a_updatetime);

	if (seg == NULL)
		return (NULL);

	t = &as->a_segtree;
	if (as->a_seglast == seg)
		as->a_seglast = NULL;
	as->a_lastgaphl = NULL;

	/*
	 * if this segment is at an address higher than
	 * a_lastgap, set a_lastgap to the next segment (NULL if last segment)
	 */
	if (as->a_lastgap &&
	    (seg == as->a_lastgap || seg->s_base > as->a_lastgap->s_base))
		as->a_lastgap = AVL_NEXT(t, seg);

	/*
	 * remove the segment from the seg tree
	 */
	avl_remove(t, seg);

#ifdef VERIFY_SEGLIST
	as_verify(as);
#endif
	return (seg);
}

/*
 * Find a segment containing addr.
 */
struct seg *
as_segat(struct as *as, caddr_t addr)
{
	struct seg *seg = as->a_seglast;

	ASSERT(AS_LOCK_HELD(as, &as->a_lock));

	if (seg != NULL && seg->s_base <= addr &&
	    addr < seg->s_base + seg->s_size)
		return (seg);

	seg = avl_find(&as->a_segtree, &addr, NULL);
	return (seg);
}

/*
 * Serialize all searches for holes in an address space to
 * prevent two or more threads from allocating the same virtual
 * address range.  The address space must not be "read/write"
 * locked by the caller since we may block.
 */
void
as_rangelock(struct as *as)
{
	mutex_enter(&as->a_contents);
	while (AS_ISCLAIMGAP(as))
		cv_wait(&as->a_cv, &as->a_contents);
	AS_SETCLAIMGAP(as);
	mutex_exit(&as->a_contents);
}
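
/*
 * Illustrative sketch (not part of the original file): as_rangelock() above
 * and as_rangeunlock() below are typically used to bracket the "find a hole,
 * then map it" sequence so that two threads cannot claim the same range.
 * The search window and the segvn_crargs setup are assumed to be provided
 * by the caller; as_gap() and as_map() are the real interfaces defined
 * later in this file.
 *
 *	as_rangelock(as);
 *	base = <low end of search window>;
 *	len = <size of search window>;
 *	if (as_gap(as, minlen, &base, &len, AH_LO, NULL) == 0)
 *		error = as_map(as, base, minlen, segvn_create, &crargs);
 *	else
 *		error = ENOMEM;
 *	as_rangeunlock(as);
 */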

/*
 * Release hold on a_state & AS_CLAIMGAP and signal any other blocked threads.
 */
void
as_rangeunlock(struct as *as)
{
	mutex_enter(&as->a_contents);
	AS_CLRCLAIMGAP(as);
	cv_signal(&as->a_cv);
	mutex_exit(&as->a_contents);
}

/*
 * compare segments (or just an address) by segment address range
 */
static int
as_segcompar(const void *x, const void *y)
{
	struct seg *a = (struct seg *)x;
	struct seg *b = (struct seg *)y;

	if (a->s_base < b->s_base)
		return (-1);
	if (a->s_base >= b->s_base + b->s_size)
		return (1);
	return (0);
}


void
as_avlinit(struct as *as)
{
	avl_create(&as->a_segtree, as_segcompar, sizeof (struct seg),
	    offsetof(struct seg, s_tree));
	avl_create(&as->a_wpage, wp_compare, sizeof (struct watched_page),
	    offsetof(struct watched_page, wp_link));
}

/*ARGSUSED*/
static int
as_constructor(void *buf, void *cdrarg, int kmflags)
{
	struct as *as = buf;

	mutex_init(&as->a_contents, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&as->a_cv, NULL, CV_DEFAULT, NULL);
	rw_init(&as->a_lock, NULL, RW_DEFAULT, NULL);
	as_avlinit(as);
	return (0);
}

/*ARGSUSED1*/
static void
as_destructor(void *buf, void *cdrarg)
{
	struct as *as = buf;

	avl_destroy(&as->a_segtree);
	mutex_destroy(&as->a_contents);
	cv_destroy(&as->a_cv);
	rw_destroy(&as->a_lock);
}

void
as_init(void)
{
	as_cache = kmem_cache_create("as_cache", sizeof (struct as), 0,
	    as_constructor, as_destructor, NULL, NULL, NULL, 0);
}

/*
 * Allocate and initialize an address space data structure.
 * We call hat_alloc to allow any machine dependent
 * information in the hat structure to be initialized.
 */
struct as *
as_alloc(void)
{
	struct as *as;

	as = kmem_cache_alloc(as_cache, KM_SLEEP);

	as->a_flags = 0;
	as->a_vbits = 0;
	as->a_hrm = NULL;
	as->a_seglast = NULL;
	as->a_size = 0;
	as->a_resvsize = 0;
	as->a_updatedir = 0;
	gethrestime(&as->a_updatetime);
	as->a_objectdir = NULL;
	as->a_sizedir = 0;
	as->a_userlimit = (caddr_t)USERLIMIT;
	as->a_lastgap = NULL;
	as->a_lastgaphl = NULL;
	as->a_callbacks = NULL;

	AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
	as->a_hat = hat_alloc(as);	/* create hat for default system mmu */
	AS_LOCK_EXIT(as, &as->a_lock);

	as->a_xhat = NULL;

	return (as);
}

/*
 * Free an address space data structure.
 * Need to free the hat first and then
 * all the segments on this as and finally
 * the space for the as struct itself.
 */
void
as_free(struct as *as)
{
	struct hat *hat = as->a_hat;
	struct seg *seg, *next;
	int called = 0;

top:
	/*
	 * Invoke ALL callbacks. as_do_callbacks will do one callback
	 * per call, and not return (-1) until the callback has completed.
	 * When as_do_callbacks returns zero, all callbacks have completed.
	 */
	mutex_enter(&as->a_contents);
	while (as->a_callbacks && as_do_callbacks(as, AS_ALL_EVENT, 0, 0))
		;

	/* This will prevent new XHATs from attaching to as */
	if (!called)
		AS_SETBUSY(as);
	mutex_exit(&as->a_contents);
	AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);

	if (!called) {
		called = 1;
		hat_free_start(hat);
		if (as->a_xhat != NULL)
			xhat_free_start_all(as);
	}
	for (seg = AS_SEGFIRST(as); seg != NULL; seg = next) {
		int err;

		next = AS_SEGNEXT(as, seg);
retry:
		err = SEGOP_UNMAP(seg, seg->s_base, seg->s_size);
		if (err == EAGAIN) {
			mutex_enter(&as->a_contents);
			if (as->a_callbacks) {
				AS_LOCK_EXIT(as, &as->a_lock);
			} else if (!AS_ISNOUNMAPWAIT(as)) {
				/*
				 * Memory is currently locked. Wait for a
				 * cv_signal that it has been unlocked, then
				 * try the operation again.
				 */
				if (AS_ISUNMAPWAIT(as) == 0)
					cv_broadcast(&as->a_cv);
				AS_SETUNMAPWAIT(as);
				AS_LOCK_EXIT(as, &as->a_lock);
				while (AS_ISUNMAPWAIT(as))
					cv_wait(&as->a_cv, &as->a_contents);
			} else {
				/*
				 * We may have raced with
				 * segvn_reclaim()/segspt_reclaim(). In this
				 * case clean nounmapwait flag and retry since
				 * softlockcnt in this segment may be already
				 * 0.  We don't drop as writer lock so our
				 * number of retries without sleeping should
				 * be very small. See segvn_reclaim() for
				 * more comments.
				 */
				AS_CLRNOUNMAPWAIT(as);
				mutex_exit(&as->a_contents);
				goto retry;
			}
			mutex_exit(&as->a_contents);
			goto top;
		} else {
			/*
			 * We do not expect any other error return at this
			 * time. This is similar to an ASSERT in seg_unmap()
			 */
			ASSERT(err == 0);
		}
	}
	hat_free_end(hat);
	if (as->a_xhat != NULL)
		xhat_free_end_all(as);
	AS_LOCK_EXIT(as, &as->a_lock);

	/* /proc stuff */
	ASSERT(avl_numnodes(&as->a_wpage) == 0);
	if (as->a_objectdir) {
		kmem_free(as->a_objectdir, as->a_sizedir * sizeof (vnode_t *));
		as->a_objectdir = NULL;
		as->a_sizedir = 0;
	}

	/*
	 * Free the struct as back to kmem.  Assert it has no segments.
	 */
	ASSERT(avl_numnodes(&as->a_segtree) == 0);
	kmem_cache_free(as_cache, as);
}

int
as_dup(struct as *as, struct proc *forkedproc)
{
	struct as *newas;
	struct seg *seg, *newseg;
	size_t purgesize = 0;
	int error;

	AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
	as_clearwatch(as);
	newas = as_alloc();
	newas->a_userlimit = as->a_userlimit;
	newas->a_proc = forkedproc;

	AS_LOCK_ENTER(newas, &newas->a_lock, RW_WRITER);

	/* This will prevent new XHATs from attaching */
	mutex_enter(&as->a_contents);
	AS_SETBUSY(as);
	mutex_exit(&as->a_contents);
	mutex_enter(&newas->a_contents);
	AS_SETBUSY(newas);
	mutex_exit(&newas->a_contents);

	(void) hat_dup(as->a_hat, newas->a_hat, NULL, 0, HAT_DUP_SRD);

	for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {

		if (seg->s_flags & S_PURGE) {
			purgesize += seg->s_size;
			continue;
		}

		newseg = seg_alloc(newas, seg->s_base, seg->s_size);
		if (newseg == NULL) {
			AS_LOCK_EXIT(newas, &newas->a_lock);
			as_setwatch(as);
			mutex_enter(&as->a_contents);
			AS_CLRBUSY(as);
			mutex_exit(&as->a_contents);
			AS_LOCK_EXIT(as, &as->a_lock);
			as_free(newas);
			return (-1);
		}
		if ((error = SEGOP_DUP(seg, newseg)) != 0) {
			/*
			 * We call seg_free() on the new seg
			 * because the segment is not set up
			 * completely; i.e. it has no ops.
			 */
			as_setwatch(as);
			mutex_enter(&as->a_contents);
			AS_CLRBUSY(as);
			mutex_exit(&as->a_contents);
			AS_LOCK_EXIT(as, &as->a_lock);
			seg_free(newseg);
			AS_LOCK_EXIT(newas, &newas->a_lock);
			as_free(newas);
			return (error);
		}
		newas->a_size += seg->s_size;
	}
	newas->a_resvsize = as->a_resvsize - purgesize;

	error = hat_dup(as->a_hat, newas->a_hat, NULL, 0, HAT_DUP_ALL);
	if (as->a_xhat != NULL)
		error |= xhat_dup_all(as, newas, NULL, 0, HAT_DUP_ALL);

	mutex_enter(&newas->a_contents);
	AS_CLRBUSY(newas);
	mutex_exit(&newas->a_contents);
	AS_LOCK_EXIT(newas, &newas->a_lock);

	as_setwatch(as);
	mutex_enter(&as->a_contents);
	AS_CLRBUSY(as);
	mutex_exit(&as->a_contents);
	AS_LOCK_EXIT(as, &as->a_lock);
	if (error != 0) {
		as_free(newas);
		return (error);
	}
	forkedproc->p_as = newas;
	return (0);
}

/*
 * Handle a ``fault'' at addr for size bytes.
 */
faultcode_t
as_fault(struct hat *hat, struct as *as, caddr_t addr, size_t size,
    enum fault_type type, enum seg_rw rw)
{
	struct seg *seg;
	caddr_t raddr;			/* rounded down addr */
	size_t rsize;			/* rounded up size */
	size_t ssize;
	faultcode_t res = 0;
	caddr_t addrsav;
	struct seg *segsav;
	int as_lock_held;
	klwp_t *lwp = ttolwp(curthread);
	int is_xhat = 0;
	int holding_wpage = 0;
	extern struct seg_ops segdev_ops;



	if (as->a_hat != hat) {
		/* This must be an XHAT then */
		is_xhat = 1;

		if ((type != F_INVAL) || (as == &kas))
			return (FC_NOSUPPORT);
	}

retry:
	if (!is_xhat) {
		/*
		 * Indicate that the lwp is not to be stopped while waiting
		 * for a pagefault.  This is to avoid deadlock while debugging
		 * a process via /proc over NFS (in particular).
		 */
		if (lwp != NULL)
			lwp->lwp_nostop++;

		/*
		 * same length must be used when we softlock and softunlock.
		 * We don't support softunlocking lengths less than
		 * the original length when there is largepage support.
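		 *
		 * For example (illustrative sketch, not from the original
		 * file), a caller that temporarily locks down a user buffer
		 * for I/O is expected to pair the two calls with identical
		 * addr/len arguments; uaddr and ulen are assumed to be
		 * page-aligned here:
		 *
		 *	res = as_fault(as->a_hat, as, uaddr, ulen,
		 *	    F_SOFTLOCK, S_WRITE);
		 *	if (res == 0) {
		 *		... do the I/O into the locked range ...
		 *		(void) as_fault(as->a_hat, as, uaddr, ulen,
		 *		    F_SOFTUNLOCK, S_WRITE);
		 *	}
		 *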
		 * See seg_dev.c for more comments.
		 */
		switch (type) {

		case F_SOFTLOCK:
			CPU_STATS_ADD_K(vm, softlock, 1);
			break;

		case F_SOFTUNLOCK:
			break;

		case F_PROT:
			CPU_STATS_ADD_K(vm, prot_fault, 1);
			break;

		case F_INVAL:
			CPU_STATS_ENTER_K();
			CPU_STATS_ADDQ(CPU, vm, as_fault, 1);
			if (as == &kas)
				CPU_STATS_ADDQ(CPU, vm, kernel_asflt, 1);
			CPU_STATS_EXIT_K();
			break;
		}
	}

	/* Kernel probe */
	TNF_PROBE_3(address_fault, "vm pagefault", /* CSTYLED */,
	    tnf_opaque, address, addr,
	    tnf_fault_type, fault_type, type,
	    tnf_seg_access, access, rw);

	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
	    (size_t)raddr;

	/*
	 * XXX -- Don't grab the as lock for segkmap. We should grab it for
	 * correctness, but then we could be stuck holding this lock for
	 * a LONG time if the fault needs to be resolved on a slow
	 * filesystem, and then no-one will be able to exec new commands,
	 * as exec'ing requires the write lock on the as.
	 */
	if (as == &kas && segkmap && segkmap->s_base <= raddr &&
	    raddr + size < segkmap->s_base + segkmap->s_size) {
		/*
		 * if (as==&kas), this can't be XHAT: we've already returned
		 * FC_NOSUPPORT.
		 */
		seg = segkmap;
		as_lock_held = 0;
	} else {
		AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
		if (is_xhat && avl_numnodes(&as->a_wpage) != 0) {
			/*
			 * Grab and hold the writers' lock on the as
			 * if the fault is to a watched page.
			 * This will keep CPUs from "peeking" at the
			 * address range while we're temporarily boosting
			 * the permissions for the XHAT device to
			 * resolve the fault in the segment layer.
			 *
			 * We could check whether faulted address
			 * is within a watched page and only then grab
			 * the writer lock, but this is simpler.
			 */
			AS_LOCK_EXIT(as, &as->a_lock);
			AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
		}

		seg = as_segat(as, raddr);
		if (seg == NULL) {
			AS_LOCK_EXIT(as, &as->a_lock);
			if ((lwp != NULL) && (!is_xhat))
				lwp->lwp_nostop--;
			return (FC_NOMAP);
		}

		as_lock_held = 1;
	}

	addrsav = raddr;
	segsav = seg;

	for (; rsize != 0; rsize -= ssize, raddr += ssize) {
		if (raddr >= seg->s_base + seg->s_size) {
			seg = AS_SEGNEXT(as, seg);
			if (seg == NULL || raddr != seg->s_base) {
				res = FC_NOMAP;
				break;
			}
		}
		if (raddr + rsize > seg->s_base + seg->s_size)
			ssize = seg->s_base + seg->s_size - raddr;
		else
			ssize = rsize;

		if (!is_xhat || (seg->s_ops != &segdev_ops)) {

			if (is_xhat && avl_numnodes(&as->a_wpage) != 0 &&
			    pr_is_watchpage_as(raddr, rw, as)) {
				/*
				 * Handle watch pages.  If we're faulting on a
				 * watched page from an X-hat, we have to
				 * restore the original permissions while we
				 * handle the fault.
				 */
				as_clearwatch(as);
				holding_wpage = 1;
			}

			res = SEGOP_FAULT(hat, seg, raddr, ssize, type, rw);

			/* Restore watchpoints */
			if (holding_wpage) {
				as_setwatch(as);
				holding_wpage = 0;
			}

			if (res != 0)
				break;
		} else {
			/* XHAT does not support seg_dev */
			res = FC_NOSUPPORT;
			break;
		}
	}

	/*
	 * If we were SOFTLOCKing and encountered a failure,
	 * we must SOFTUNLOCK the range we already did. (Maybe we
	 * should just panic if we are SOFTLOCKing or even SOFTUNLOCKing
	 * right here...)
	 */
	if (res != 0 && type == F_SOFTLOCK) {
		for (seg = segsav; addrsav < raddr; addrsav += ssize) {
			if (addrsav >= seg->s_base + seg->s_size)
				seg = AS_SEGNEXT(as, seg);
			ASSERT(seg != NULL);
			/*
			 * Now call the fault routine again to perform the
			 * unlock using S_OTHER instead of the rw variable
			 * since we never got a chance to touch the pages.
			 */
			if (raddr > seg->s_base + seg->s_size)
				ssize = seg->s_base + seg->s_size - addrsav;
			else
				ssize = raddr - addrsav;
			(void) SEGOP_FAULT(hat, seg, addrsav, ssize,
			    F_SOFTUNLOCK, S_OTHER);
		}
	}
	if (as_lock_held)
		AS_LOCK_EXIT(as, &as->a_lock);
	if ((lwp != NULL) && (!is_xhat))
		lwp->lwp_nostop--;

	/*
	 * If the lower levels returned EDEADLK for a fault,
	 * it means that we should retry the fault.  Let's wait
	 * a bit also to let the deadlock causing condition clear.
	 * This is part of a gross hack to work around a design flaw
	 * in the ufs/sds logging code and should go away when the
	 * logging code is re-designed to fix the problem. See bug
	 * 4125102 for details of the problem.
	 */
	if (FC_ERRNO(res) == EDEADLK) {
		delay(deadlk_wait);
		res = 0;
		goto retry;
	}
	return (res);
}



/*
 * Asynchronous ``fault'' at addr for size bytes.
 */
faultcode_t
as_faulta(struct as *as, caddr_t addr, size_t size)
{
	struct seg *seg;
	caddr_t raddr;			/* rounded down addr */
	size_t rsize;			/* rounded up size */
	faultcode_t res = 0;
	klwp_t *lwp = ttolwp(curthread);

retry:
	/*
	 * Indicate that the lwp is not to be stopped while waiting
	 * for a pagefault.  This is to avoid deadlock while debugging
	 * a process via /proc over NFS (in particular).
	 */
	if (lwp != NULL)
		lwp->lwp_nostop++;

	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
	    (size_t)raddr;

	AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
	seg = as_segat(as, raddr);
	if (seg == NULL) {
		AS_LOCK_EXIT(as, &as->a_lock);
		if (lwp != NULL)
			lwp->lwp_nostop--;
		return (FC_NOMAP);
	}

	for (; rsize != 0; rsize -= PAGESIZE, raddr += PAGESIZE) {
		if (raddr >= seg->s_base + seg->s_size) {
			seg = AS_SEGNEXT(as, seg);
			if (seg == NULL || raddr != seg->s_base) {
				res = FC_NOMAP;
				break;
			}
		}
		res = SEGOP_FAULTA(seg, raddr);
		if (res != 0)
			break;
	}
	AS_LOCK_EXIT(as, &as->a_lock);
	if (lwp != NULL)
		lwp->lwp_nostop--;
	/*
	 * If the lower levels returned EDEADLK for a fault,
	 * it means that we should retry the fault.  Let's wait
	 * a bit also to let the deadlock causing condition clear.
	 * This is part of a gross hack to work around a design flaw
	 * in the ufs/sds logging code and should go away when the
	 * logging code is re-designed to fix the problem. See bug
	 * 4125102 for details of the problem.
	 */
	if (FC_ERRNO(res) == EDEADLK) {
		delay(deadlk_wait);
		res = 0;
		goto retry;
	}
	return (res);
}
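
/*
 * Illustrative sketch (not part of the original file): as_faulta() is the
 * asynchronous counterpart, used to prefetch a range that is about to be
 * touched (e.g. from an madvise(MADV_WILLNEED)-style path).  `p', `addr'
 * and `len' are assumed to come from the caller; the return value is
 * usually ignored since this is only a hint.
 *
 *	(void) as_faulta(p->p_as, addr, len);
 */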

/*
 * Set the virtual mapping for the interval from [addr : addr + size)
 * in address space `as' to have the specified protection.
 * It is ok for the range to cross over several segments,
 * as long as they are contiguous.
 */
int
as_setprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
{
	struct seg *seg;
	struct as_callback *cb;
	size_t ssize;
	caddr_t raddr;			/* rounded down addr */
	size_t rsize;			/* rounded up size */
	int error = 0, writer = 0;
	caddr_t saveraddr;
	size_t saversize;

setprot_top:
	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
	    (size_t)raddr;

	if (raddr + rsize < raddr)		/* check for wraparound */
		return (ENOMEM);

	saveraddr = raddr;
	saversize = rsize;

	/*
	 * Normally we only lock the as as a reader. But
	 * if due to setprot the segment driver needs to split
	 * a segment it will return IE_RETRY. Therefore we re-acquire
	 * the as lock as a writer so the segment driver can change
	 * the seg list. Also the segment driver will return IE_RETRY
	 * after it has changed the segment list so we therefore keep
	 * locking as a writer. Since these operations should be rare,
	 * we only want to lock as a writer when necessary.
	 */
	if (writer || avl_numnodes(&as->a_wpage) != 0) {
		AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
	} else {
		AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
	}

	as_clearwatchprot(as, raddr, rsize);
	seg = as_segat(as, raddr);
	if (seg == NULL) {
		as_setwatch(as);
		AS_LOCK_EXIT(as, &as->a_lock);
		return (ENOMEM);
	}

	for (; rsize != 0; rsize -= ssize, raddr += ssize) {
		if (raddr >= seg->s_base + seg->s_size) {
			seg = AS_SEGNEXT(as, seg);
			if (seg == NULL || raddr != seg->s_base) {
				error = ENOMEM;
				break;
			}
		}
		if ((raddr + rsize) > (seg->s_base + seg->s_size))
			ssize = seg->s_base + seg->s_size - raddr;
		else
			ssize = rsize;
retry:
		error = SEGOP_SETPROT(seg, raddr, ssize, prot);

		if (error == IE_NOMEM) {
			error = EAGAIN;
			break;
		}

		if (error == IE_RETRY) {
			AS_LOCK_EXIT(as, &as->a_lock);
			writer = 1;
			goto setprot_top;
		}

		if (error == EAGAIN) {
			/*
			 * Make sure we have a_lock as writer.
			 */
			if (writer == 0) {
				AS_LOCK_EXIT(as, &as->a_lock);
				writer = 1;
				goto setprot_top;
			}

			/*
			 * Memory is currently locked.  It must be unlocked
			 * before this operation can succeed through a retry.
			 * The possible reasons for locked memory and
			 * corresponding strategies for unlocking are:
			 * (1) Normal I/O
			 *	wait for a signal that the I/O operation
			 *	has completed and the memory is unlocked.
			 * (2) Asynchronous I/O
			 *	The aio subsystem does not unlock pages when
			 *	the I/O is completed. Those pages are unlocked
			 *	when the application calls aiowait/aioerror.
			 *	So, to prevent blocking forever, cv_broadcast()
			 *	is done to wake up aio_cleanup_thread.
			 *	Subsequently, segvn_reclaim will be called, and
			 *	that will do AS_CLRUNMAPWAIT() and wake us up.
			 * (3) Long term page locking:
			 *	Drivers intending to have pages locked for a
			 *	period considerably longer than for normal I/O
			 *	(essentially forever) may have registered for a
			 *	callback so they may unlock these pages on
			 *	request. This is needed to allow this operation
			 *	to succeed. Each entry on the callback list is
			 *	examined. If the event or address range pertains
			 *	the callback is invoked (unless it already is in
			 *	progress). The a_contents lock must be dropped
			 *	before the callback, so only one callback can
			 *	be done at a time. Go to the top and do more
			 *	until zero is returned. If zero is returned,
			 *	either there were no callbacks for this event
			 *	or they were already in progress.
			 */
			mutex_enter(&as->a_contents);
			if (as->a_callbacks &&
			    (cb = as_find_callback(as, AS_SETPROT_EVENT,
			    seg->s_base, seg->s_size))) {
				AS_LOCK_EXIT(as, &as->a_lock);
				as_execute_callback(as, cb, AS_SETPROT_EVENT);
			} else if (!AS_ISNOUNMAPWAIT(as)) {
				if (AS_ISUNMAPWAIT(as) == 0)
					cv_broadcast(&as->a_cv);
				AS_SETUNMAPWAIT(as);
				AS_LOCK_EXIT(as, &as->a_lock);
				while (AS_ISUNMAPWAIT(as))
					cv_wait(&as->a_cv, &as->a_contents);
			} else {
				/*
				 * We may have raced with
				 * segvn_reclaim()/segspt_reclaim(). In this
				 * case clean nounmapwait flag and retry since
				 * softlockcnt in this segment may be already
				 * 0.  We don't drop as writer lock so our
				 * number of retries without sleeping should
				 * be very small. See segvn_reclaim() for
				 * more comments.
				 */
				AS_CLRNOUNMAPWAIT(as);
				mutex_exit(&as->a_contents);
				goto retry;
			}
			mutex_exit(&as->a_contents);
			goto setprot_top;
		} else if (error != 0)
			break;
	}
	if (error != 0) {
		as_setwatch(as);
	} else {
		as_setwatchprot(as, saveraddr, saversize, prot);
	}
	AS_LOCK_EXIT(as, &as->a_lock);
	return (error);
}

/*
 * Check to make sure that the interval [addr, addr + size)
 * in address space `as' has at least the specified protection.
 * It is ok for the range to cross over several segments, as long
 * as they are contiguous.
 */
int
as_checkprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
{
	struct seg *seg;
	size_t ssize;
	caddr_t raddr;			/* rounded down addr */
	size_t rsize;			/* rounded up size */
	int error = 0;

	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
	    (size_t)raddr;

	if (raddr + rsize < raddr)		/* check for wraparound */
		return (ENOMEM);

	/*
	 * This is ugly as sin...
	 * Normally, we only acquire the address space readers lock.
	 * However, if the address space has watchpoints present,
	 * we must acquire the writer lock on the address space for
	 * the benefit of as_clearwatchprot() and as_setwatchprot().
	 */
	if (avl_numnodes(&as->a_wpage) != 0)
		AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
	else
		AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
	as_clearwatchprot(as, raddr, rsize);
	seg = as_segat(as, raddr);
	if (seg == NULL) {
		as_setwatch(as);
		AS_LOCK_EXIT(as, &as->a_lock);
		return (ENOMEM);
	}

	for (; rsize != 0; rsize -= ssize, raddr += ssize) {
		if (raddr >= seg->s_base + seg->s_size) {
			seg = AS_SEGNEXT(as, seg);
			if (seg == NULL || raddr != seg->s_base) {
				error = ENOMEM;
				break;
			}
		}
		if ((raddr + rsize) > (seg->s_base + seg->s_size))
			ssize = seg->s_base + seg->s_size - raddr;
		else
			ssize = rsize;

		error = SEGOP_CHECKPROT(seg, raddr, ssize, prot);
		if (error != 0)
			break;
	}
	as_setwatch(as);
	AS_LOCK_EXIT(as, &as->a_lock);
	return (error);
}

int
as_unmap(struct as *as, caddr_t addr, size_t size)
{
	struct seg *seg, *seg_next;
	struct as_callback *cb;
	caddr_t raddr, eaddr;
	size_t ssize, rsize = 0;
	int err;

top:
	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	eaddr = (caddr_t)(((uintptr_t)(addr + size) + PAGEOFFSET) &
	    (uintptr_t)PAGEMASK);

	AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);

	as->a_updatedir = 1;	/* inform /proc */
	gethrestime(&as->a_updatetime);

	/*
	 * Use as_findseg to find the first segment in the range, then
	 * step through the segments in order, following s_next.
	 */
	as_clearwatchprot(as, raddr, eaddr - raddr);

	for (seg = as_findseg(as, raddr, 0); seg != NULL; seg = seg_next) {
		if (eaddr <= seg->s_base)
			break;		/* eaddr was in a gap; all done */

		/* this is implied by the test above */
		ASSERT(raddr < eaddr);

		if (raddr < seg->s_base)
			raddr = seg->s_base;	/* raddr was in a gap */

		if (eaddr > (seg->s_base + seg->s_size))
			ssize = seg->s_base + seg->s_size - raddr;
		else
			ssize = eaddr - raddr;

		/*
		 * Save next segment pointer since seg can be
		 * destroyed during the segment unmap operation.
		 */
		seg_next = AS_SEGNEXT(as, seg);

		/*
		 * We didn't count /dev/null mappings, so ignore them here.
		 * We'll handle MAP_NORESERVE cases in segvn_unmap(). (Again,
		 * we have to do this check here while we have seg.)
		 */
		if (!SEG_IS_DEVNULL_MAPPING(seg) &&
		    !SEG_IS_PARTIAL_RESV(seg))
			rsize = ssize;

retry:
		err = SEGOP_UNMAP(seg, raddr, ssize);
		if (err == EAGAIN) {
			/*
			 * Memory is currently locked.  It must be unlocked
			 * before this operation can succeed through a retry.
			 * The possible reasons for locked memory and
			 * corresponding strategies for unlocking are:
			 * (1) Normal I/O
			 *	wait for a signal that the I/O operation
			 *	has completed and the memory is unlocked.
			 * (2) Asynchronous I/O
			 *	The aio subsystem does not unlock pages when
			 *	the I/O is completed. Those pages are unlocked
			 *	when the application calls aiowait/aioerror.
			 *	So, to prevent blocking forever, cv_broadcast()
			 *	is done to wake up aio_cleanup_thread.
			 *	Subsequently, segvn_reclaim will be called, and
			 *	that will do AS_CLRUNMAPWAIT() and wake us up.
			 * (3) Long term page locking:
			 *	Drivers intending to have pages locked for a
			 *	period considerably longer than for normal I/O
			 *	(essentially forever) may have registered for a
			 *	callback so they may unlock these pages on
			 *	request. This is needed to allow this operation
			 *	to succeed. Each entry on the callback list is
			 *	examined. If the event or address range pertains
			 *	the callback is invoked (unless it already is in
			 *	progress). The a_contents lock must be dropped
			 *	before the callback, so only one callback can
			 *	be done at a time. Go to the top and do more
			 *	until zero is returned. If zero is returned,
			 *	either there were no callbacks for this event
			 *	or they were already in progress.
			 */
			mutex_enter(&as->a_contents);
			if (as->a_callbacks &&
			    (cb = as_find_callback(as, AS_UNMAP_EVENT,
			    seg->s_base, seg->s_size))) {
				AS_LOCK_EXIT(as, &as->a_lock);
				as_execute_callback(as, cb, AS_UNMAP_EVENT);
			} else if (!AS_ISNOUNMAPWAIT(as)) {
				if (AS_ISUNMAPWAIT(as) == 0)
					cv_broadcast(&as->a_cv);
				AS_SETUNMAPWAIT(as);
				AS_LOCK_EXIT(as, &as->a_lock);
				while (AS_ISUNMAPWAIT(as))
					cv_wait(&as->a_cv, &as->a_contents);
			} else {
				/*
				 * We may have raced with
				 * segvn_reclaim()/segspt_reclaim(). In this
				 * case clean nounmapwait flag and retry since
				 * softlockcnt in this segment may be already
				 * 0.  We don't drop as writer lock so our
				 * number of retries without sleeping should
				 * be very small. See segvn_reclaim() for
				 * more comments.
				 */
				AS_CLRNOUNMAPWAIT(as);
				mutex_exit(&as->a_contents);
				goto retry;
			}
			mutex_exit(&as->a_contents);
			goto top;
		} else if (err == IE_RETRY) {
			AS_LOCK_EXIT(as, &as->a_lock);
			goto top;
		} else if (err) {
			as_setwatch(as);
			AS_LOCK_EXIT(as, &as->a_lock);
			return (-1);
		}

		as->a_size -= ssize;
		as->a_resvsize -= rsize;
		raddr += ssize;
	}
	AS_LOCK_EXIT(as, &as->a_lock);
	return (0);
}

static int
as_map_segvn_segs(struct as *as, caddr_t addr, size_t size, uint_t szcvec,
    int (*crfp)(), struct segvn_crargs *vn_a, int *segcreated)
{
	uint_t szc;
	uint_t nszc;
	int error;
	caddr_t a;
	caddr_t eaddr;
	size_t segsize;
	struct seg *seg;
	size_t pgsz;
	int do_off = (vn_a->vp != NULL || vn_a->amp != NULL);
	uint_t save_szcvec;

	ASSERT(AS_WRITE_HELD(as, &as->a_lock));
	ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
	ASSERT(IS_P2ALIGNED(size, PAGESIZE));
	ASSERT(vn_a->vp == NULL || vn_a->amp == NULL);
	if (!do_off) {
		vn_a->offset = 0;
	}

	if (szcvec <= 1) {
		seg = seg_alloc(as, addr, size);
		if (seg == NULL) {
			return (ENOMEM);
		}
		vn_a->szc = 0;
		error = (*crfp)(seg, vn_a);
		if (error != 0) {
			seg_free(seg);
		} else {
			as->a_size += size;
			/*
			 * We'll count MAP_NORESERVE mappings as we fault
			 * pages in.
			 */
			if (!SEG_IS_PARTIAL_RESV(seg))
				as->a_resvsize += size;
		}
		return (error);
	}

	eaddr = addr + size;
	save_szcvec = szcvec;
	szcvec >>= 1;
	szc = 0;
	nszc = 0;
	while (szcvec) {
		if ((szcvec & 0x1) == 0) {
			nszc++;
			szcvec >>= 1;
			continue;
		}
		nszc++;
		pgsz = page_get_pagesize(nszc);
		a = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
		if (a != addr) {
			ASSERT(a < eaddr);
			segsize = a - addr;
			seg = seg_alloc(as, addr, segsize);
			if (seg == NULL) {
				return (ENOMEM);
			}
			vn_a->szc = szc;
			error = (*crfp)(seg, vn_a);
			if (error != 0) {
				seg_free(seg);
				return (error);
			}
			as->a_size += segsize;
			/*
			 * We'll count MAP_NORESERVE mappings as we fault
			 * pages in. We don't count /dev/null mappings at all.
			 */
			if (!SEG_IS_DEVNULL_MAPPING(seg) &&
			    !SEG_IS_PARTIAL_RESV(seg))
				as->a_resvsize += segsize;

			*segcreated = 1;
			if (do_off) {
				vn_a->offset += segsize;
			}
			addr = a;
		}
		szc = nszc;
		szcvec >>= 1;
	}

	ASSERT(addr < eaddr);
	szcvec = save_szcvec | 1; /* add 8K pages */
	while (szcvec) {
		a = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
		ASSERT(a >= addr);
		if (a != addr) {
			segsize = a - addr;
			seg = seg_alloc(as, addr, segsize);
			if (seg == NULL) {
				return (ENOMEM);
			}
			vn_a->szc = szc;
			error = (*crfp)(seg, vn_a);
			if (error != 0) {
				seg_free(seg);
				return (error);
			}
			as->a_size += segsize;
			/*
			 * We'll count MAP_NORESERVE mappings as we fault
			 * pages in. We don't count /dev/null mappings at all.
			 */
			if (!SEG_IS_DEVNULL_MAPPING(seg) &&
			    !SEG_IS_PARTIAL_RESV(seg))
				as->a_resvsize += segsize;

			*segcreated = 1;
			if (do_off) {
				vn_a->offset += segsize;
			}
			addr = a;
		}
		szcvec &= ~(1 << szc);
		if (szcvec) {
			szc = highbit(szcvec) - 1;
			pgsz = page_get_pagesize(szc);
		}
	}
	ASSERT(addr == eaddr);

	return (0);
}

static int
as_map_vnsegs(struct as *as, caddr_t addr, size_t size,
    int (*crfp)(), struct segvn_crargs *vn_a, int *segcreated)
{
	uint_t mapflags = vn_a->flags & (MAP_TEXT | MAP_INITDATA);
	int type = (vn_a->type == MAP_SHARED) ? MAPPGSZC_SHM : MAPPGSZC_PRIVM;
	uint_t szcvec = map_pgszcvec(addr, size, (uintptr_t)addr, mapflags,
	    type, 0);
	int error;
	struct seg *seg;
	struct vattr va;
	u_offset_t eoff;
	size_t save_size = 0;
	extern size_t textrepl_size_thresh;

	ASSERT(AS_WRITE_HELD(as, &as->a_lock));
	ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
	ASSERT(IS_P2ALIGNED(size, PAGESIZE));
	ASSERT(vn_a->vp != NULL);
	ASSERT(vn_a->amp == NULL);

again:
	if (szcvec <= 1) {
		seg = seg_alloc(as, addr, size);
		if (seg == NULL) {
			return (ENOMEM);
		}
		vn_a->szc = 0;
		error = (*crfp)(seg, vn_a);
		if (error != 0) {
			seg_free(seg);
		} else {
			as->a_size += size;
			/*
			 * We'll count MAP_NORESERVE mappings as we fault
			 * pages in.
			 */
			if (!SEG_IS_PARTIAL_RESV(seg))
				as->a_resvsize += size;
		}
		return (error);
	}

	va.va_mask = AT_SIZE;
	if (VOP_GETATTR(vn_a->vp, &va, ATTR_HINT, vn_a->cred, NULL) != 0) {
		szcvec = 0;
		goto again;
	}
	eoff = vn_a->offset & PAGEMASK;
	if (eoff >= va.va_size) {
		szcvec = 0;
		goto again;
	}
	eoff += size;
	if (btopr(va.va_size) < btopr(eoff)) {
		save_size = size;
		size = va.va_size - (vn_a->offset & PAGEMASK);
		size = P2ROUNDUP_TYPED(size, PAGESIZE, size_t);
		szcvec = map_pgszcvec(addr, size, (uintptr_t)addr, mapflags,
		    type, 0);
		if (szcvec <= 1) {
			size = save_size;
			goto again;
		}
	}

	if (size > textrepl_size_thresh) {
		vn_a->flags |= _MAP_TEXTREPL;
	}
	error = as_map_segvn_segs(as, addr, size, szcvec, crfp, vn_a,
	    segcreated);
	if (error != 0) {
		return (error);
	}
	if (save_size) {
		addr += size;
		size = save_size - size;
		szcvec = 0;
		goto again;
	}
	return (0);
}

/*
 * as_map_ansegs: shared or private anonymous memory.  Note that the flags
 * passed to map_pgszvec cannot be MAP_INITDATA, for anon.
 */
static int
as_map_ansegs(struct as *as, caddr_t addr, size_t size,
    int (*crfp)(), struct segvn_crargs *vn_a, int *segcreated)
{
	uint_t szcvec;
	uchar_t type;

	ASSERT(vn_a->type == MAP_SHARED || vn_a->type == MAP_PRIVATE);
	if (vn_a->type == MAP_SHARED) {
		type = MAPPGSZC_SHM;
	} else if (vn_a->type == MAP_PRIVATE) {
		if (vn_a->szc == AS_MAP_HEAP) {
			type = MAPPGSZC_HEAP;
		} else if (vn_a->szc == AS_MAP_STACK) {
			type = MAPPGSZC_STACK;
		} else {
			type = MAPPGSZC_PRIVM;
		}
	}
	szcvec = map_pgszcvec(addr, size, vn_a->amp == NULL ?
	    (uintptr_t)addr : (uintptr_t)P2ROUNDUP(vn_a->offset, PAGESIZE),
	    (vn_a->flags & MAP_TEXT), type, 0);
	ASSERT(AS_WRITE_HELD(as, &as->a_lock));
	ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
	ASSERT(IS_P2ALIGNED(size, PAGESIZE));
	ASSERT(vn_a->vp == NULL);

	return (as_map_segvn_segs(as, addr, size, szcvec,
	    crfp, vn_a, segcreated));
}

int
as_map(struct as *as, caddr_t addr, size_t size, int (*crfp)(), void *argsp)
{
	AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
	return (as_map_locked(as, addr, size, crfp, argsp));
}

int
as_map_locked(struct as *as, caddr_t addr, size_t size, int (*crfp)(),
    void *argsp)
{
	struct seg *seg = NULL;
	caddr_t raddr;			/* rounded down addr */
	size_t rsize;			/* rounded up size */
	int error;
	int unmap = 0;
	struct proc *p = curproc;
	struct segvn_crargs crargs;

	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
	    (size_t)raddr;

	/*
	 * check for wrap around
	 */
	if ((raddr + rsize < raddr) || (as->a_size > (ULONG_MAX - size))) {
		AS_LOCK_EXIT(as, &as->a_lock);
		return (ENOMEM);
	}

	as->a_updatedir = 1;	/* inform /proc */
	gethrestime(&as->a_updatetime);

	if (as != &kas && as->a_size + rsize > (size_t)p->p_vmem_ctl) {
		AS_LOCK_EXIT(as, &as->a_lock);

		(void) rctl_action(rctlproc_legacy[RLIMIT_VMEM], p->p_rctls, p,
		    RCA_UNSAFE_ALL);

		return (ENOMEM);
	}

	if (AS_MAP_CHECK_VNODE_LPOOB(crfp, argsp)) {
		crargs = *(struct segvn_crargs *)argsp;
		error = as_map_vnsegs(as, raddr, rsize, crfp, &crargs, &unmap);
		if (error != 0) {
			AS_LOCK_EXIT(as, &as->a_lock);
			if (unmap) {
				(void) as_unmap(as, addr, size);
			}
			return (error);
		}
	} else if (AS_MAP_CHECK_ANON_LPOOB(crfp, argsp)) {
		crargs = *(struct segvn_crargs *)argsp;
		error = as_map_ansegs(as, raddr, rsize, crfp, &crargs, &unmap);
		if (error != 0) {
			AS_LOCK_EXIT(as, &as->a_lock);
			if (unmap) {
				(void) as_unmap(as, addr, size);
			}
			return (error);
		}
	} else {
		seg = seg_alloc(as, addr, size);
		if (seg == NULL) {
			AS_LOCK_EXIT(as, &as->a_lock);
			return (ENOMEM);
		}

		error = (*crfp)(seg, argsp);
		if (error != 0) {
			seg_free(seg);
			AS_LOCK_EXIT(as, &as->a_lock);
			return (error);
		}
		/*
		 * Add size now so as_unmap will work if as_ctl fails.
		 */
		as->a_size += rsize;
		/*
		 * We'll count MAP_NORESERVE mappings as we fault
		 * pages in. We don't count /dev/null mappings at all.
		 */
		if (!SEG_IS_DEVNULL_MAPPING(seg) &&
		    !SEG_IS_PARTIAL_RESV(seg))
			as->a_resvsize += rsize;
	}

	as_setwatch(as);

	/*
	 * If the address space is locked,
	 * establish memory locks for the new segment.
	 */
	mutex_enter(&as->a_contents);
	if (AS_ISPGLCK(as)) {
		mutex_exit(&as->a_contents);
		AS_LOCK_EXIT(as, &as->a_lock);
		error = as_ctl(as, addr, size, MC_LOCK, 0, 0, NULL, 0);
		if (error != 0)
			(void) as_unmap(as, addr, size);
	} else {
		mutex_exit(&as->a_contents);
		AS_LOCK_EXIT(as, &as->a_lock);
	}
	return (error);
}
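
/*
 * Illustrative sketch (not part of the original file): a typical in-kernel
 * caller maps anonymous zero-fill memory by handing segvn_create() and a
 * struct segvn_crargs to as_map().  The values below are only an example;
 * `as', `addr' and `len' are assumed to have been chosen under
 * as_rangelock() as described earlier.
 *
 *	struct segvn_crargs crargs = SEGVN_ZFOD_ARGS(PROT_ZFOD, PROT_ALL);
 *
 *	crargs.flags |= MAP_NORESERVE;
 *	error = as_map(as, addr, len, segvn_create, &crargs);
 */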

/*
 * Delete all segments in the address space marked with S_PURGE.
 * This is currently used for Sparc V9 nofault ASI segments (seg_nf.c).
 * These segments are deleted as a first step before calls to as_gap(), so
 * that they don't affect mmap() or shmat().
 */
void
as_purge(struct as *as)
{
	struct seg *seg;
	struct seg *next_seg;

	/*
	 * the setting of NEEDSPURGE is protected by as_rangelock(), so
	 * no need to grab a_contents mutex for this check
	 */
	if ((as->a_flags & AS_NEEDSPURGE) == 0)
		return;

	AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
	next_seg = NULL;
	seg = AS_SEGFIRST(as);
	while (seg != NULL) {
		next_seg = AS_SEGNEXT(as, seg);
		if (seg->s_flags & S_PURGE)
			SEGOP_UNMAP(seg, seg->s_base, seg->s_size);
		seg = next_seg;
	}
	AS_LOCK_EXIT(as, &as->a_lock);

	mutex_enter(&as->a_contents);
	as->a_flags &= ~AS_NEEDSPURGE;
	mutex_exit(&as->a_contents);
}

/*
 * Find a hole within [*basep, *basep + *lenp), which contains a mappable
 * range of addresses at least "minlen" long, where the base of the range is
 * at "off" phase from an "align" boundary and there is space for a
 * "redzone"-sized redzone on either side of the range.  Thus,
 * if align was 4M and off was 16k, the user wants a hole which will start
 * 16k into a 4M page.
 *
 * If flags specifies AH_HI, the hole will have the highest possible address
 * in the range.  We use the as->a_lastgap field to figure out where to
 * start looking for a gap.
 *
 * Otherwise, the gap will have the lowest possible address.
 *
 * If flags specifies AH_CONTAIN, the hole will contain the address addr.
 *
 * If an adequate hole is found, *basep and *lenp are set to reflect the part of
 * the hole that is within range, and 0 is returned.  On failure, -1 is returned.
 *
 * NOTE: This routine is not correct when base+len overflows caddr_t.
 */
int
as_gap_aligned(struct as *as, size_t minlen, caddr_t *basep, size_t *lenp,
    uint_t flags, caddr_t addr, size_t align, size_t redzone, size_t off)
{
	caddr_t lobound = *basep;
	caddr_t hibound = lobound + *lenp;
	struct seg *lseg, *hseg;
	caddr_t lo, hi;
	int forward;
	caddr_t save_base;
	size_t save_len;
	size_t save_minlen;
	size_t save_redzone;
	int fast_path = 1;

	save_base = *basep;
	save_len = *lenp;
	save_minlen = minlen;
	save_redzone = redzone;

	/*
	 * For the first pass/fast_path, just add align and redzone into
	 * minlen since if we get an allocation, we can guarantee that it
	 * will fit the alignment and redzone requested.
	 * This increases the chance that hibound will be adjusted to
	 * a_lastgap->s_base which will likely allow us to find an
	 * acceptable hole in the address space quicker.
	 * If we can't find a hole with this fast_path, then we look for
	 * smaller holes in which the alignment and offset may allow
	 * the allocation to fit.
	 */
	minlen += align;
	minlen += 2 * redzone;
	redzone = 0;

	AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
	if (AS_SEGFIRST(as) == NULL) {
		if (valid_va_range_aligned(basep, lenp, minlen, flags & AH_DIR,
		    align, redzone, off)) {
			AS_LOCK_EXIT(as, &as->a_lock);
			return (0);
		} else {
			AS_LOCK_EXIT(as, &as->a_lock);
			*basep = save_base;
			*lenp = save_len;
			return (-1);
		}
	}

retry:
	/*
	 * Set up to iterate over all the inter-segment holes in the given
lseg is NULL for the lowest-addressed hole and hseg is 1984 * NULL for the highest-addressed hole. If moving backwards, we reset 1985 * sseg to denote the highest-addressed segment. 1986 */ 1987 forward = (flags & AH_DIR) == AH_LO; 1988 if (forward) { 1989 hseg = as_findseg(as, lobound, 1); 1990 lseg = AS_SEGPREV(as, hseg); 1991 } else { 1992 1993 /* 1994 * If allocating at least as much as the last allocation, 1995 * use a_lastgap's base as a better estimate of hibound. 1996 */ 1997 if (as->a_lastgap && 1998 minlen >= as->a_lastgap->s_size && 1999 hibound >= as->a_lastgap->s_base) 2000 hibound = as->a_lastgap->s_base; 2001 2002 hseg = as_findseg(as, hibound, 1); 2003 if (hseg->s_base + hseg->s_size < hibound) { 2004 lseg = hseg; 2005 hseg = NULL; 2006 } else { 2007 lseg = AS_SEGPREV(as, hseg); 2008 } 2009 } 2010 2011 for (;;) { 2012 /* 2013 * Set lo and hi to the hole's boundaries. (We should really 2014 * use MAXADDR in place of hibound in the expression below, 2015 * but can't express it easily; using hibound in its place is 2016 * harmless.) 2017 */ 2018 lo = (lseg == NULL) ? 0 : lseg->s_base + lseg->s_size; 2019 hi = (hseg == NULL) ? hibound : hseg->s_base; 2020 /* 2021 * If the iteration has moved past the interval from lobound 2022 * to hibound it's pointless to continue. 2023 */ 2024 if ((forward && lo > hibound) || (!forward && hi < lobound)) 2025 break; 2026 else if (lo > hibound || hi < lobound) 2027 goto cont; 2028 /* 2029 * Candidate hole lies at least partially within the allowable 2030 * range. Restrict it to fall completely within that range, 2031 * i.e., to [max(lo, lobound), min(hi, hibound)]. 2032 */ 2033 if (lo < lobound) 2034 lo = lobound; 2035 if (hi > hibound) 2036 hi = hibound; 2037 /* 2038 * Verify that the candidate hole is big enough and meets 2039 * hardware constraints. If the hole is too small, no need 2040 * to do the further checks since they will fail. 2041 */ 2042 *basep = lo; 2043 *lenp = hi - lo; 2044 if (*lenp >= minlen && valid_va_range_aligned(basep, lenp, 2045 minlen, forward ? AH_LO : AH_HI, align, redzone, off) && 2046 ((flags & AH_CONTAIN) == 0 || 2047 (*basep <= addr && *basep + *lenp > addr))) { 2048 if (!forward) 2049 as->a_lastgap = hseg; 2050 if (hseg != NULL) 2051 as->a_lastgaphl = hseg; 2052 else 2053 as->a_lastgaphl = lseg; 2054 AS_LOCK_EXIT(as, &as->a_lock); 2055 return (0); 2056 } 2057 cont: 2058 /* 2059 * Move to the next hole. 2060 */ 2061 if (forward) { 2062 lseg = hseg; 2063 if (lseg == NULL) 2064 break; 2065 hseg = AS_SEGNEXT(as, hseg); 2066 } else { 2067 hseg = lseg; 2068 if (hseg == NULL) 2069 break; 2070 lseg = AS_SEGPREV(as, lseg); 2071 } 2072 } 2073 if (fast_path && (align != 0 || save_redzone != 0)) { 2074 fast_path = 0; 2075 minlen = save_minlen; 2076 redzone = save_redzone; 2077 goto retry; 2078 } 2079 *basep = save_base; 2080 *lenp = save_len; 2081 AS_LOCK_EXIT(as, &as->a_lock); 2082 return (-1); 2083 } 2084 2085 /* 2086 * Find a hole of at least size minlen within [*basep, *basep + *lenp). 2087 * 2088 * If flags specifies AH_HI, the hole will have the highest possible address 2089 * in the range. We use the as->a_lastgap field to figure out where to 2090 * start looking for a gap. 2091 * 2092 * Otherwise, the gap will have the lowest possible address. 2093 * 2094 * If flags specifies AH_CONTAIN, the hole will contain the address addr. 2095 * 2096 * If an adequate hole is found, base and len are set to reflect the part of 2097 * the hole that is within range, and 0 is returned, otherwise, 2098 * -1 is returned. 
2099 * 2100 * NOTE: This routine is not correct when base+len overflows caddr_t. 2101 */ 2102 int 2103 as_gap(struct as *as, size_t minlen, caddr_t *basep, size_t *lenp, uint_t flags, 2104 caddr_t addr) 2105 { 2106 2107 return (as_gap_aligned(as, minlen, basep, lenp, flags, addr, 0, 0, 0)); 2108 } 2109 2110 /* 2111 * Return the next range within [base, base + len) that is backed 2112 * with "real memory". Skip holes and non-seg_vn segments. 2113 * We're lazy and only return one segment at a time. 2114 */ 2115 int 2116 as_memory(struct as *as, caddr_t *basep, size_t *lenp) 2117 { 2118 extern struct seg_ops segspt_shmops; /* needs a header file */ 2119 struct seg *seg; 2120 caddr_t addr, eaddr; 2121 caddr_t segend; 2122 2123 AS_LOCK_ENTER(as, &as->a_lock, RW_READER); 2124 2125 addr = *basep; 2126 eaddr = addr + *lenp; 2127 2128 seg = as_findseg(as, addr, 0); 2129 if (seg != NULL) 2130 addr = MAX(seg->s_base, addr); 2131 2132 for (;;) { 2133 if (seg == NULL || addr >= eaddr || eaddr <= seg->s_base) { 2134 AS_LOCK_EXIT(as, &as->a_lock); 2135 return (EINVAL); 2136 } 2137 2138 if (seg->s_ops == &segvn_ops) { 2139 segend = seg->s_base + seg->s_size; 2140 break; 2141 } 2142 2143 /* 2144 * We do ISM by looking into the private data 2145 * to determine the real size of the segment. 2146 */ 2147 if (seg->s_ops == &segspt_shmops) { 2148 segend = seg->s_base + spt_realsize(seg); 2149 if (addr < segend) 2150 break; 2151 } 2152 2153 seg = AS_SEGNEXT(as, seg); 2154 2155 if (seg != NULL) 2156 addr = seg->s_base; 2157 } 2158 2159 *basep = addr; 2160 2161 if (segend > eaddr) 2162 *lenp = eaddr - addr; 2163 else 2164 *lenp = segend - addr; 2165 2166 AS_LOCK_EXIT(as, &as->a_lock); 2167 return (0); 2168 } 2169 2170 /* 2171 * Swap the pages associated with the address space as out to 2172 * secondary storage, returning the number of bytes actually 2173 * swapped. 2174 * 2175 * The value returned is intended to correlate well with the process's 2176 * memory requirements. Its usefulness for this purpose depends on 2177 * how well the segment-level routines do at returning accurate 2178 * information. 2179 */ 2180 size_t 2181 as_swapout(struct as *as) 2182 { 2183 struct seg *seg; 2184 size_t swpcnt = 0; 2185 2186 /* 2187 * Kernel-only processes have given up their address 2188 * spaces. Of course, we shouldn't be attempting to 2189 * swap out such processes in the first place... 2190 */ 2191 if (as == NULL) 2192 return (0); 2193 2194 AS_LOCK_ENTER(as, &as->a_lock, RW_READER); 2195 2196 /* Prevent XHATs from attaching */ 2197 mutex_enter(&as->a_contents); 2198 AS_SETBUSY(as); 2199 mutex_exit(&as->a_contents); 2200 2201 2202 /* 2203 * Free all mapping resources associated with the address 2204 * space. The segment-level swapout routines capitalize 2205 * on this unmapping by scavanging pages that have become 2206 * unmapped here. 2207 */ 2208 hat_swapout(as->a_hat); 2209 if (as->a_xhat != NULL) 2210 xhat_swapout_all(as); 2211 2212 mutex_enter(&as->a_contents); 2213 AS_CLRBUSY(as); 2214 mutex_exit(&as->a_contents); 2215 2216 /* 2217 * Call the swapout routines of all segments in the address 2218 * space to do the actual work, accumulating the amount of 2219 * space reclaimed. 2220 */ 2221 for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) { 2222 struct seg_ops *ov = seg->s_ops; 2223 2224 /* 2225 * We have to check to see if the seg has 2226 * an ops vector because the seg may have 2227 * been in the middle of being set up when 2228 * the process was picked for swapout. 
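 *
 * (Editorial aside, not from the original source: the same defensive
 * dispatch can be sketched in plain C with hypothetical ex_* names:
 *
 *	#include <stddef.h>
 *
 *	struct ex_ops {
 *		size_t	(*swapout)(void *);
 *	};
 *
 *	struct ex_obj {
 *		const struct ex_ops	*ops;
 *		void			*data;
 *	};
 *
 *	static size_t
 *	ex_swapout(const struct ex_obj *o)
 *	{
 *		// Tolerate a half-constructed object: dispatch only when
 *		// both the ops vector and the entry point are present.
 *		if (o->ops != NULL && o->ops->swapout != NULL)
 *			return (o->ops->swapout(o->data));
 *		return (0);
 *	}
 *
 * which mirrors the (ov != NULL) && (ov->swapout != NULL) test below.)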
2229 */ 2230 if ((ov != NULL) && (ov->swapout != NULL)) 2231 swpcnt += SEGOP_SWAPOUT(seg); 2232 } 2233 AS_LOCK_EXIT(as, &as->a_lock); 2234 return (swpcnt); 2235 } 2236 2237 /* 2238 * Determine whether data from the mappings in interval [addr, addr + size) 2239 * are in the primary memory (core) cache. 2240 */ 2241 int 2242 as_incore(struct as *as, caddr_t addr, 2243 size_t size, char *vec, size_t *sizep) 2244 { 2245 struct seg *seg; 2246 size_t ssize; 2247 caddr_t raddr; /* rounded down addr */ 2248 size_t rsize; /* rounded up size */ 2249 size_t isize; /* iteration size */ 2250 int error = 0; /* result, assume success */ 2251 2252 *sizep = 0; 2253 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); 2254 rsize = ((((size_t)addr + size) + PAGEOFFSET) & PAGEMASK) - 2255 (size_t)raddr; 2256 2257 if (raddr + rsize < raddr) /* check for wraparound */ 2258 return (ENOMEM); 2259 2260 AS_LOCK_ENTER(as, &as->a_lock, RW_READER); 2261 seg = as_segat(as, raddr); 2262 if (seg == NULL) { 2263 AS_LOCK_EXIT(as, &as->a_lock); 2264 return (-1); 2265 } 2266 2267 for (; rsize != 0; rsize -= ssize, raddr += ssize) { 2268 if (raddr >= seg->s_base + seg->s_size) { 2269 seg = AS_SEGNEXT(as, seg); 2270 if (seg == NULL || raddr != seg->s_base) { 2271 error = -1; 2272 break; 2273 } 2274 } 2275 if ((raddr + rsize) > (seg->s_base + seg->s_size)) 2276 ssize = seg->s_base + seg->s_size - raddr; 2277 else 2278 ssize = rsize; 2279 *sizep += isize = SEGOP_INCORE(seg, raddr, ssize, vec); 2280 if (isize != ssize) { 2281 error = -1; 2282 break; 2283 } 2284 vec += btopr(ssize); 2285 } 2286 AS_LOCK_EXIT(as, &as->a_lock); 2287 return (error); 2288 } 2289 2290 static void 2291 as_segunlock(struct seg *seg, caddr_t addr, int attr, 2292 ulong_t *bitmap, size_t position, size_t npages) 2293 { 2294 caddr_t range_start; 2295 size_t pos1 = position; 2296 size_t pos2; 2297 size_t size; 2298 size_t end_pos = npages + position; 2299 2300 while (bt_range(bitmap, &pos1, &pos2, end_pos)) { 2301 size = ptob((pos2 - pos1)); 2302 range_start = (caddr_t)((uintptr_t)addr + 2303 ptob(pos1 - position)); 2304 2305 (void) SEGOP_LOCKOP(seg, range_start, size, attr, MC_UNLOCK, 2306 (ulong_t *)NULL, (size_t)NULL); 2307 pos1 = pos2; 2308 } 2309 } 2310 2311 static void 2312 as_unlockerr(struct as *as, int attr, ulong_t *mlock_map, 2313 caddr_t raddr, size_t rsize) 2314 { 2315 struct seg *seg = as_segat(as, raddr); 2316 size_t ssize; 2317 2318 while (rsize != 0) { 2319 if (raddr >= seg->s_base + seg->s_size) 2320 seg = AS_SEGNEXT(as, seg); 2321 2322 if ((raddr + rsize) > (seg->s_base + seg->s_size)) 2323 ssize = seg->s_base + seg->s_size - raddr; 2324 else 2325 ssize = rsize; 2326 2327 as_segunlock(seg, raddr, attr, mlock_map, 0, btopr(ssize)); 2328 2329 rsize -= ssize; 2330 raddr += ssize; 2331 } 2332 } 2333 2334 /* 2335 * Cache control operations over the interval [addr, addr + size) in 2336 * address space "as". 2337 */ 2338 /*ARGSUSED*/ 2339 int 2340 as_ctl(struct as *as, caddr_t addr, size_t size, int func, int attr, 2341 uintptr_t arg, ulong_t *lock_map, size_t pos) 2342 { 2343 struct seg *seg; /* working segment */ 2344 caddr_t raddr; /* rounded down addr */ 2345 caddr_t initraddr; /* saved initial rounded down addr */ 2346 size_t rsize; /* rounded up size */ 2347 size_t initrsize; /* saved initial rounded up size */ 2348 size_t ssize; /* size of seg */ 2349 int error = 0; /* result */ 2350 size_t mlock_size; /* size of bitmap */ 2351 ulong_t *mlock_map; /* pointer to bitmap used */ 2352 /* to represent the locked */ 2353 /* pages. 
*/ 2354 retry: 2355 if (error == IE_RETRY) 2356 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER); 2357 else 2358 AS_LOCK_ENTER(as, &as->a_lock, RW_READER); 2359 2360 /* 2361 * If these are address space lock/unlock operations, loop over 2362 * all segments in the address space, as appropriate. 2363 */ 2364 if (func == MC_LOCKAS) { 2365 size_t npages, idx; 2366 size_t rlen = 0; /* rounded as length */ 2367 2368 idx = pos; 2369 2370 if (arg & MCL_FUTURE) { 2371 mutex_enter(&as->a_contents); 2372 AS_SETPGLCK(as); 2373 mutex_exit(&as->a_contents); 2374 } 2375 if ((arg & MCL_CURRENT) == 0) { 2376 AS_LOCK_EXIT(as, &as->a_lock); 2377 return (0); 2378 } 2379 2380 seg = AS_SEGFIRST(as); 2381 if (seg == NULL) { 2382 AS_LOCK_EXIT(as, &as->a_lock); 2383 return (0); 2384 } 2385 2386 do { 2387 raddr = (caddr_t)((uintptr_t)seg->s_base & 2388 (uintptr_t)PAGEMASK); 2389 rlen += (((uintptr_t)(seg->s_base + seg->s_size) + 2390 PAGEOFFSET) & PAGEMASK) - (uintptr_t)raddr; 2391 } while ((seg = AS_SEGNEXT(as, seg)) != NULL); 2392 2393 mlock_size = BT_BITOUL(btopr(rlen)); 2394 if ((mlock_map = (ulong_t *)kmem_zalloc(mlock_size * 2395 sizeof (ulong_t), KM_NOSLEEP)) == NULL) { 2396 AS_LOCK_EXIT(as, &as->a_lock); 2397 return (EAGAIN); 2398 } 2399 2400 for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) { 2401 error = SEGOP_LOCKOP(seg, seg->s_base, 2402 seg->s_size, attr, MC_LOCK, mlock_map, pos); 2403 if (error != 0) 2404 break; 2405 pos += seg_pages(seg); 2406 } 2407 2408 if (error) { 2409 for (seg = AS_SEGFIRST(as); seg != NULL; 2410 seg = AS_SEGNEXT(as, seg)) { 2411 2412 raddr = (caddr_t)((uintptr_t)seg->s_base & 2413 (uintptr_t)PAGEMASK); 2414 npages = seg_pages(seg); 2415 as_segunlock(seg, raddr, attr, mlock_map, 2416 idx, npages); 2417 idx += npages; 2418 } 2419 } 2420 2421 kmem_free(mlock_map, mlock_size * sizeof (ulong_t)); 2422 AS_LOCK_EXIT(as, &as->a_lock); 2423 goto lockerr; 2424 } else if (func == MC_UNLOCKAS) { 2425 mutex_enter(&as->a_contents); 2426 AS_CLRPGLCK(as); 2427 mutex_exit(&as->a_contents); 2428 2429 for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) { 2430 error = SEGOP_LOCKOP(seg, seg->s_base, 2431 seg->s_size, attr, MC_UNLOCK, NULL, 0); 2432 if (error != 0) 2433 break; 2434 } 2435 2436 AS_LOCK_EXIT(as, &as->a_lock); 2437 goto lockerr; 2438 } 2439 2440 /* 2441 * Normalize addresses and sizes. 2442 */ 2443 initraddr = raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); 2444 initrsize = rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) - 2445 (size_t)raddr; 2446 2447 if (raddr + rsize < raddr) { /* check for wraparound */ 2448 AS_LOCK_EXIT(as, &as->a_lock); 2449 return (ENOMEM); 2450 } 2451 2452 /* 2453 * Get initial segment. 2454 */ 2455 if ((seg = as_segat(as, raddr)) == NULL) { 2456 AS_LOCK_EXIT(as, &as->a_lock); 2457 return (ENOMEM); 2458 } 2459 2460 if (func == MC_LOCK) { 2461 mlock_size = BT_BITOUL(btopr(rsize)); 2462 if ((mlock_map = (ulong_t *)kmem_zalloc(mlock_size * 2463 sizeof (ulong_t), KM_NOSLEEP)) == NULL) { 2464 AS_LOCK_EXIT(as, &as->a_lock); 2465 return (EAGAIN); 2466 } 2467 } 2468 2469 /* 2470 * Loop over all segments. If a hole in the address range is 2471 * discovered, then fail. For each segment, perform the appropriate 2472 * control operation. 2473 */ 2474 while (rsize != 0) { 2475 2476 /* 2477 * Make sure there's no hole, calculate the portion 2478 * of the next segment to be operated over. 
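 *
 * (Editorial aside, not from the original source: in isolation the hole
 * check and the per-segment clamp reduce to the following sketch, with
 * hypothetical ex_* names and a flat segment descriptor:
 *
 *	#include <stddef.h>
 *	#include <stdint.h>
 *
 *	struct ex_seg {
 *		uintptr_t	base;
 *		size_t		size;
 *	};
 *
 *	// Returns how many bytes of [raddr, raddr + rsize) fall inside
 *	// *seg, or 0 if raddr is not covered at all (a hole).
 *	static size_t
 *	ex_chunk(const struct ex_seg *seg, uintptr_t raddr, size_t rsize)
 *	{
 *		if (raddr < seg->base || raddr >= seg->base + seg->size)
 *			return (0);	// hole: as_ctl() fails with ENOMEM
 *		if (raddr + rsize > seg->base + seg->size)
 *			return (seg->base + seg->size - raddr);
 *		return (rsize);
 *	}
 *
 * The loop below then advances raddr by the returned chunk size.)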
2479 */ 2480 if (raddr >= seg->s_base + seg->s_size) { 2481 seg = AS_SEGNEXT(as, seg); 2482 if (seg == NULL || raddr != seg->s_base) { 2483 if (func == MC_LOCK) { 2484 as_unlockerr(as, attr, mlock_map, 2485 initraddr, initrsize - rsize); 2486 kmem_free(mlock_map, 2487 mlock_size * sizeof (ulong_t)); 2488 } 2489 AS_LOCK_EXIT(as, &as->a_lock); 2490 return (ENOMEM); 2491 } 2492 } 2493 if ((raddr + rsize) > (seg->s_base + seg->s_size)) 2494 ssize = seg->s_base + seg->s_size - raddr; 2495 else 2496 ssize = rsize; 2497 2498 /* 2499 * Dispatch on specific function. 2500 */ 2501 switch (func) { 2502 2503 /* 2504 * Synchronize cached data from mappings with backing 2505 * objects. 2506 */ 2507 case MC_SYNC: 2508 if (error = SEGOP_SYNC(seg, raddr, ssize, 2509 attr, (uint_t)arg)) { 2510 AS_LOCK_EXIT(as, &as->a_lock); 2511 return (error); 2512 } 2513 break; 2514 2515 /* 2516 * Lock pages in memory. 2517 */ 2518 case MC_LOCK: 2519 if (error = SEGOP_LOCKOP(seg, raddr, ssize, 2520 attr, func, mlock_map, pos)) { 2521 as_unlockerr(as, attr, mlock_map, initraddr, 2522 initrsize - rsize + ssize); 2523 kmem_free(mlock_map, mlock_size * 2524 sizeof (ulong_t)); 2525 AS_LOCK_EXIT(as, &as->a_lock); 2526 goto lockerr; 2527 } 2528 break; 2529 2530 /* 2531 * Unlock mapped pages. 2532 */ 2533 case MC_UNLOCK: 2534 (void) SEGOP_LOCKOP(seg, raddr, ssize, attr, func, 2535 (ulong_t *)NULL, (size_t)NULL); 2536 break; 2537 2538 /* 2539 * Store VM advise for mapped pages in segment layer. 2540 */ 2541 case MC_ADVISE: 2542 error = SEGOP_ADVISE(seg, raddr, ssize, (uint_t)arg); 2543 2544 /* 2545 * Check for regular errors and special retry error 2546 */ 2547 if (error) { 2548 if (error == IE_RETRY) { 2549 /* 2550 * Need to acquire writers lock, so 2551 * have to drop readers lock and start 2552 * all over again 2553 */ 2554 AS_LOCK_EXIT(as, &as->a_lock); 2555 goto retry; 2556 } else if (error == IE_REATTACH) { 2557 /* 2558 * Find segment for current address 2559 * because current segment just got 2560 * split or concatenated 2561 */ 2562 seg = as_segat(as, raddr); 2563 if (seg == NULL) { 2564 AS_LOCK_EXIT(as, &as->a_lock); 2565 return (ENOMEM); 2566 } 2567 } else { 2568 /* 2569 * Regular error 2570 */ 2571 AS_LOCK_EXIT(as, &as->a_lock); 2572 return (error); 2573 } 2574 } 2575 break; 2576 2577 /* 2578 * Can't happen. 2579 */ 2580 default: 2581 panic("as_ctl: bad operation %d", func); 2582 /*NOTREACHED*/ 2583 } 2584 2585 rsize -= ssize; 2586 raddr += ssize; 2587 } 2588 2589 if (func == MC_LOCK) 2590 kmem_free(mlock_map, mlock_size * sizeof (ulong_t)); 2591 AS_LOCK_EXIT(as, &as->a_lock); 2592 return (0); 2593 lockerr: 2594 2595 /* 2596 * If the lower levels returned EDEADLK for a segment lockop, 2597 * it means that we should retry the operation. Let's wait 2598 * a bit also to let the deadlock causing condition clear. 2599 * This is part of a gross hack to work around a design flaw 2600 * in the ufs/sds logging code and should go away when the 2601 * logging code is re-designed to fix the problem. See bug 2602 * 4125102 for details of the problem. 
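 *
 * (Editorial aside, not from the original source: stripped of the VM
 * details, the policy is simply "back off briefly and retry". A userland
 * sketch with hypothetical ex_* names; the 10ms delay is an arbitrary
 * stand-in for the deadlk_wait ticks used below:
 *
 *	#include <errno.h>
 *	#include <time.h>
 *
 *	static int
 *	ex_lock_with_backoff(int (*op)(void *), void *arg)
 *	{
 *		struct timespec ts = { 0, 10 * 1000 * 1000 };	// 10 ms
 *		int err;
 *
 *		while ((err = op(arg)) == EDEADLK)
 *			(void) nanosleep(&ts, NULL);	// let it clear
 *		return (err);
 *	}
 *
 * The code below does the same thing with delay() and a goto retry.)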
2603 */ 2604 if (error == EDEADLK) { 2605 delay(deadlk_wait); 2606 error = 0; 2607 goto retry; 2608 } 2609 return (error); 2610 } 2611 2612 int 2613 fc_decode(faultcode_t fault_err) 2614 { 2615 int error = 0; 2616 2617 switch (FC_CODE(fault_err)) { 2618 case FC_OBJERR: 2619 error = FC_ERRNO(fault_err); 2620 break; 2621 case FC_PROT: 2622 error = EACCES; 2623 break; 2624 default: 2625 error = EFAULT; 2626 break; 2627 } 2628 return (error); 2629 } 2630 2631 /* 2632 * Pagelock pages from a range that spans more than 1 segment. Obtain shadow 2633 * lists from each segment and copy them to one contiguous shadow list (plist) 2634 * as expected by the caller. Save pointers to per segment shadow lists at 2635 * the tail of plist so that they can be used during as_pageunlock(). 2636 */ 2637 static int 2638 as_pagelock_segs(struct as *as, struct seg *seg, struct page ***ppp, 2639 caddr_t addr, size_t size, enum seg_rw rw) 2640 { 2641 caddr_t sv_addr = addr; 2642 size_t sv_size = size; 2643 struct seg *sv_seg = seg; 2644 ulong_t segcnt = 1; 2645 ulong_t cnt; 2646 size_t ssize; 2647 pgcnt_t npages = btop(size); 2648 page_t **plist; 2649 page_t **pl; 2650 int error; 2651 caddr_t eaddr; 2652 faultcode_t fault_err = 0; 2653 pgcnt_t pl_off; 2654 extern struct seg_ops segspt_shmops; 2655 2656 ASSERT(AS_LOCK_HELD(as, &as->a_lock)); 2657 ASSERT(seg != NULL); 2658 ASSERT(addr >= seg->s_base && addr < seg->s_base + seg->s_size); 2659 ASSERT(addr + size > seg->s_base + seg->s_size); 2660 ASSERT(IS_P2ALIGNED(size, PAGESIZE)); 2661 ASSERT(IS_P2ALIGNED(addr, PAGESIZE)); 2662 2663 /* 2664 * Count the number of segments covered by the range we are about to 2665 * lock. The segment count is used to size the shadow list we return 2666 * back to the caller. 2667 */ 2668 for (; size != 0; size -= ssize, addr += ssize) { 2669 if (addr >= seg->s_base + seg->s_size) { 2670 2671 seg = AS_SEGNEXT(as, seg); 2672 if (seg == NULL || addr != seg->s_base) { 2673 AS_LOCK_EXIT(as, &as->a_lock); 2674 return (EFAULT); 2675 } 2676 /* 2677 * Do a quick check if subsequent segments 2678 * will most likely support pagelock. 
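 *
 * (Editorial aside, not from the original source: the test below accepts
 * only anonymous segvn segments (no vnode behind them) and SPT/ISM
 * segments; everything else is sent to the slow path. As a predicate,
 * with hypothetical ex_* names:
 *
 *	#include <stdbool.h>
 *	#include <stddef.h>
 *
 *	enum ex_seg_kind { EX_SEG_VN, EX_SEG_SPT, EX_SEG_OTHER };
 *
 *	struct ex_seg {
 *		enum ex_seg_kind	kind;
 *		void			*vnode;	// non-NULL => file-backed
 *	};
 *
 *	static bool
 *	ex_likely_pagelockable(const struct ex_seg *seg)
 *	{
 *		if (seg->kind == EX_SEG_VN)
 *			return (seg->vnode == NULL);	// anonymous only
 *		return (seg->kind == EX_SEG_SPT);
 *	}
 *
 * A false result corresponds to the goto slow below.)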
2679 */ 2680 if (seg->s_ops == &segvn_ops) { 2681 vnode_t *vp; 2682 2683 if (SEGOP_GETVP(seg, addr, &vp) != 0 || 2684 vp != NULL) { 2685 AS_LOCK_EXIT(as, &as->a_lock); 2686 goto slow; 2687 } 2688 } else if (seg->s_ops != &segspt_shmops) { 2689 AS_LOCK_EXIT(as, &as->a_lock); 2690 goto slow; 2691 } 2692 segcnt++; 2693 } 2694 if (addr + size > seg->s_base + seg->s_size) { 2695 ssize = seg->s_base + seg->s_size - addr; 2696 } else { 2697 ssize = size; 2698 } 2699 } 2700 ASSERT(segcnt > 1); 2701 2702 plist = kmem_zalloc((npages + segcnt) * sizeof (page_t *), KM_SLEEP); 2703 2704 addr = sv_addr; 2705 size = sv_size; 2706 seg = sv_seg; 2707 2708 for (cnt = 0, pl_off = 0; size != 0; size -= ssize, addr += ssize) { 2709 if (addr >= seg->s_base + seg->s_size) { 2710 seg = AS_SEGNEXT(as, seg); 2711 ASSERT(seg != NULL && addr == seg->s_base); 2712 cnt++; 2713 ASSERT(cnt < segcnt); 2714 } 2715 if (addr + size > seg->s_base + seg->s_size) { 2716 ssize = seg->s_base + seg->s_size - addr; 2717 } else { 2718 ssize = size; 2719 } 2720 pl = &plist[npages + cnt]; 2721 error = SEGOP_PAGELOCK(seg, addr, ssize, (page_t ***)pl, 2722 L_PAGELOCK, rw); 2723 if (error) { 2724 break; 2725 } 2726 ASSERT(plist[npages + cnt] != NULL); 2727 ASSERT(pl_off + btop(ssize) <= npages); 2728 bcopy(plist[npages + cnt], &plist[pl_off], 2729 btop(ssize) * sizeof (page_t *)); 2730 pl_off += btop(ssize); 2731 } 2732 2733 if (size == 0) { 2734 AS_LOCK_EXIT(as, &as->a_lock); 2735 ASSERT(cnt == segcnt - 1); 2736 *ppp = plist; 2737 return (0); 2738 } 2739 2740 /* 2741 * one of pagelock calls failed. The error type is in error variable. 2742 * Unlock what we've locked so far and retry with F_SOFTLOCK if error 2743 * type is either EFAULT or ENOTSUP. Otherwise just return the error 2744 * back to the caller. 2745 */ 2746 2747 eaddr = addr; 2748 seg = sv_seg; 2749 2750 for (cnt = 0, addr = sv_addr; addr < eaddr; addr += ssize) { 2751 if (addr >= seg->s_base + seg->s_size) { 2752 seg = AS_SEGNEXT(as, seg); 2753 ASSERT(seg != NULL && addr == seg->s_base); 2754 cnt++; 2755 ASSERT(cnt < segcnt); 2756 } 2757 if (eaddr > seg->s_base + seg->s_size) { 2758 ssize = seg->s_base + seg->s_size - addr; 2759 } else { 2760 ssize = eaddr - addr; 2761 } 2762 pl = &plist[npages + cnt]; 2763 ASSERT(*pl != NULL); 2764 (void) SEGOP_PAGELOCK(seg, addr, ssize, (page_t ***)pl, 2765 L_PAGEUNLOCK, rw); 2766 } 2767 2768 AS_LOCK_EXIT(as, &as->a_lock); 2769 2770 kmem_free(plist, (npages + segcnt) * sizeof (page_t *)); 2771 2772 if (error != ENOTSUP && error != EFAULT) { 2773 return (error); 2774 } 2775 2776 slow: 2777 /* 2778 * If we are here because pagelock failed due to the need to cow fault 2779 * in the pages we want to lock F_SOFTLOCK will do this job and in 2780 * next as_pagelock() call for this address range pagelock will 2781 * hopefully succeed. 2782 */ 2783 fault_err = as_fault(as->a_hat, as, sv_addr, sv_size, F_SOFTLOCK, rw); 2784 if (fault_err != 0) { 2785 return (fc_decode(fault_err)); 2786 } 2787 *ppp = NULL; 2788 2789 return (0); 2790 } 2791 2792 /* 2793 * lock pages in a given address space. Return shadow list. If 2794 * the list is NULL, the MMU mapping is also locked. 
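 *
 * A typical caller pairs this routine with as_pageunlock(), e.g.
 * (illustrative sketch only; "uaddr", "len" and the S_WRITE direction are
 * hypothetical driver locals, not taken from this file):
 *
 *	struct page **pplist;
 *
 *	if (as_pagelock(as, &pplist, uaddr, len, S_WRITE) == 0) {
 *		// ... perform the transfer while the pages are held ...
 *		as_pageunlock(as, pplist, uaddr, len, S_WRITE);
 *	}
 *
 * A NULL shadow list is legal: it means the range was held via F_SOFTLOCK
 * and as_pageunlock() will release it with F_SOFTUNLOCK instead.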
2795 */
2796 int
2797 as_pagelock(struct as *as, struct page ***ppp, caddr_t addr,
2798 size_t size, enum seg_rw rw)
2799 {
2800 size_t rsize;
2801 caddr_t raddr;
2802 faultcode_t fault_err;
2803 struct seg *seg;
2804 int err;
2805
2806 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_AS_LOCK_START,
2807 "as_pagelock_start: addr %p size %ld", addr, size);
2808
2809 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2810 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
2811 (size_t)raddr;
2812
2813 /*
2814 * if the request crosses two segments let
2815 * as_fault handle it.
2816 */
2817 AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
2818
2819 seg = as_segat(as, raddr);
2820 if (seg == NULL) {
2821 AS_LOCK_EXIT(as, &as->a_lock);
2822 return (EFAULT);
2823 }
2824 ASSERT(raddr >= seg->s_base && raddr < seg->s_base + seg->s_size);
2825 if (raddr + rsize > seg->s_base + seg->s_size) {
2826 return (as_pagelock_segs(as, seg, ppp, raddr, rsize, rw));
2827 }
2828 if (raddr + rsize <= raddr) {
2829 AS_LOCK_EXIT(as, &as->a_lock);
2830 return (EFAULT);
2831 }
2832
2833 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEG_LOCK_START,
2834 "seg_lock_1_start: raddr %p rsize %ld", raddr, rsize);
2835
2836 /*
2837 * try to lock pages and pass back shadow list
2838 */
2839 err = SEGOP_PAGELOCK(seg, raddr, rsize, ppp, L_PAGELOCK, rw);
2840
2841 TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_SEG_LOCK_END, "seg_lock_1_end");
2842
2843 AS_LOCK_EXIT(as, &as->a_lock);
2844
2845 if (err == 0 || (err != ENOTSUP && err != EFAULT)) {
2846 return (err);
2847 }
2848
2849 /*
2850 * Use F_SOFTLOCK to lock the pages because pagelock failed either due
2851 * to no pagelock support for this segment or pages need to be cow
2852 * faulted in. If fault is needed F_SOFTLOCK will do this job for
2853 * this as_pagelock() call and in the next as_pagelock() call for the
2854 * same address range pagelock will hopefully succeed.
2855 */
2856 fault_err = as_fault(as->a_hat, as, addr, size, F_SOFTLOCK, rw);
2857 if (fault_err != 0) {
2858 return (fc_decode(fault_err));
2859 }
2860 *ppp = NULL;
2861
2862 TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_AS_LOCK_END, "as_pagelock_end");
2863 return (0);
2864 }
2865
2866 /*
2867 * unlock pages locked by as_pagelock_segs(). Retrieve per segment shadow
2868 * lists from the end of plist and call pageunlock interface for each segment.
2869 * Drop as lock and free plist.
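 *
 * For reference, the shadow list built by as_pagelock_segs() has a fixed
 * layout: the first btop(size) entries are the flattened page pointers
 * handed back to the caller, and the per-segment shadow lists are stashed
 * immediately after them, i.e. (illustrative summary, not code from this
 * file):
 *
 *	plist[0 .. npages - 1]			flattened page_t pointers
 *	plist[npages .. npages + segcnt - 1]	per-segment shadow lists
 *
 * so each segment's private list is recovered below with
 * pl = &plist[npages + cnt] before calling SEGOP_PAGELOCK(..., L_PAGEUNLOCK).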
2870 */ 2871 static void 2872 as_pageunlock_segs(struct as *as, struct seg *seg, caddr_t addr, size_t size, 2873 struct page **plist, enum seg_rw rw) 2874 { 2875 ulong_t cnt; 2876 caddr_t eaddr = addr + size; 2877 pgcnt_t npages = btop(size); 2878 size_t ssize; 2879 page_t **pl; 2880 2881 ASSERT(AS_LOCK_HELD(as, &as->a_lock)); 2882 ASSERT(seg != NULL); 2883 ASSERT(addr >= seg->s_base && addr < seg->s_base + seg->s_size); 2884 ASSERT(addr + size > seg->s_base + seg->s_size); 2885 ASSERT(IS_P2ALIGNED(size, PAGESIZE)); 2886 ASSERT(IS_P2ALIGNED(addr, PAGESIZE)); 2887 ASSERT(plist != NULL); 2888 2889 for (cnt = 0; addr < eaddr; addr += ssize) { 2890 if (addr >= seg->s_base + seg->s_size) { 2891 seg = AS_SEGNEXT(as, seg); 2892 ASSERT(seg != NULL && addr == seg->s_base); 2893 cnt++; 2894 } 2895 if (eaddr > seg->s_base + seg->s_size) { 2896 ssize = seg->s_base + seg->s_size - addr; 2897 } else { 2898 ssize = eaddr - addr; 2899 } 2900 pl = &plist[npages + cnt]; 2901 ASSERT(*pl != NULL); 2902 (void) SEGOP_PAGELOCK(seg, addr, ssize, (page_t ***)pl, 2903 L_PAGEUNLOCK, rw); 2904 } 2905 ASSERT(cnt > 0); 2906 AS_LOCK_EXIT(as, &as->a_lock); 2907 2908 cnt++; 2909 kmem_free(plist, (npages + cnt) * sizeof (page_t *)); 2910 } 2911 2912 /* 2913 * unlock pages in a given address range 2914 */ 2915 void 2916 as_pageunlock(struct as *as, struct page **pp, caddr_t addr, size_t size, 2917 enum seg_rw rw) 2918 { 2919 struct seg *seg; 2920 size_t rsize; 2921 caddr_t raddr; 2922 2923 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_AS_UNLOCK_START, 2924 "as_pageunlock_start: addr %p size %ld", addr, size); 2925 2926 /* 2927 * if the shadow list is NULL, as_pagelock was 2928 * falling back to as_fault 2929 */ 2930 if (pp == NULL) { 2931 (void) as_fault(as->a_hat, as, addr, size, F_SOFTUNLOCK, rw); 2932 return; 2933 } 2934 2935 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); 2936 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) - 2937 (size_t)raddr; 2938 2939 AS_LOCK_ENTER(as, &as->a_lock, RW_READER); 2940 seg = as_segat(as, raddr); 2941 ASSERT(seg != NULL); 2942 2943 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEG_UNLOCK_START, 2944 "seg_unlock_start: raddr %p rsize %ld", raddr, rsize); 2945 2946 ASSERT(raddr >= seg->s_base && raddr < seg->s_base + seg->s_size); 2947 if (raddr + rsize <= seg->s_base + seg->s_size) { 2948 SEGOP_PAGELOCK(seg, raddr, rsize, &pp, L_PAGEUNLOCK, rw); 2949 } else { 2950 as_pageunlock_segs(as, seg, raddr, rsize, pp, rw); 2951 return; 2952 } 2953 AS_LOCK_EXIT(as, &as->a_lock); 2954 TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_AS_UNLOCK_END, "as_pageunlock_end"); 2955 } 2956 2957 int 2958 as_setpagesize(struct as *as, caddr_t addr, size_t size, uint_t szc, 2959 boolean_t wait) 2960 { 2961 struct seg *seg; 2962 size_t ssize; 2963 caddr_t raddr; /* rounded down addr */ 2964 size_t rsize; /* rounded up size */ 2965 int error = 0; 2966 size_t pgsz = page_get_pagesize(szc); 2967 2968 setpgsz_top: 2969 if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(size, pgsz)) { 2970 return (EINVAL); 2971 } 2972 2973 raddr = addr; 2974 rsize = size; 2975 2976 if (raddr + rsize < raddr) /* check for wraparound */ 2977 return (ENOMEM); 2978 2979 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER); 2980 as_clearwatchprot(as, raddr, rsize); 2981 seg = as_segat(as, raddr); 2982 if (seg == NULL) { 2983 as_setwatch(as); 2984 AS_LOCK_EXIT(as, &as->a_lock); 2985 return (ENOMEM); 2986 } 2987 2988 for (; rsize != 0; rsize -= ssize, raddr += ssize) { 2989 if (raddr >= seg->s_base + seg->s_size) { 2990 seg = AS_SEGNEXT(as, seg); 2991 if (seg == NULL || raddr 
!= seg->s_base) { 2992 error = ENOMEM; 2993 break; 2994 } 2995 } 2996 if ((raddr + rsize) > (seg->s_base + seg->s_size)) { 2997 ssize = seg->s_base + seg->s_size - raddr; 2998 } else { 2999 ssize = rsize; 3000 } 3001 3002 retry: 3003 error = SEGOP_SETPAGESIZE(seg, raddr, ssize, szc); 3004 3005 if (error == IE_NOMEM) { 3006 error = EAGAIN; 3007 break; 3008 } 3009 3010 if (error == IE_RETRY) { 3011 AS_LOCK_EXIT(as, &as->a_lock); 3012 goto setpgsz_top; 3013 } 3014 3015 if (error == ENOTSUP) { 3016 error = EINVAL; 3017 break; 3018 } 3019 3020 if (wait && (error == EAGAIN)) { 3021 /* 3022 * Memory is currently locked. It must be unlocked 3023 * before this operation can succeed through a retry. 3024 * The possible reasons for locked memory and 3025 * corresponding strategies for unlocking are: 3026 * (1) Normal I/O 3027 * wait for a signal that the I/O operation 3028 * has completed and the memory is unlocked. 3029 * (2) Asynchronous I/O 3030 * The aio subsystem does not unlock pages when 3031 * the I/O is completed. Those pages are unlocked 3032 * when the application calls aiowait/aioerror. 3033 * So, to prevent blocking forever, cv_broadcast() 3034 * is done to wake up aio_cleanup_thread. 3035 * Subsequently, segvn_reclaim will be called, and 3036 * that will do AS_CLRUNMAPWAIT() and wake us up. 3037 * (3) Long term page locking: 3038 * This is not relevant for as_setpagesize() 3039 * because we cannot change the page size for 3040 * driver memory. The attempt to do so will 3041 * fail with a different error than EAGAIN so 3042 * there's no need to trigger as callbacks like 3043 * as_unmap, as_setprot or as_free would do. 3044 */ 3045 mutex_enter(&as->a_contents); 3046 if (!AS_ISNOUNMAPWAIT(as)) { 3047 if (AS_ISUNMAPWAIT(as) == 0) { 3048 cv_broadcast(&as->a_cv); 3049 } 3050 AS_SETUNMAPWAIT(as); 3051 AS_LOCK_EXIT(as, &as->a_lock); 3052 while (AS_ISUNMAPWAIT(as)) { 3053 cv_wait(&as->a_cv, &as->a_contents); 3054 } 3055 } else { 3056 /* 3057 * We may have raced with 3058 * segvn_reclaim()/segspt_reclaim(). In this 3059 * case clean nounmapwait flag and retry since 3060 * softlockcnt in this segment may be already 3061 * 0. We don't drop as writer lock so our 3062 * number of retries without sleeping should 3063 * be very small. See segvn_reclaim() for 3064 * more comments. 3065 */ 3066 AS_CLRNOUNMAPWAIT(as); 3067 mutex_exit(&as->a_contents); 3068 goto retry; 3069 } 3070 mutex_exit(&as->a_contents); 3071 goto setpgsz_top; 3072 } else if (error != 0) { 3073 break; 3074 } 3075 } 3076 as_setwatch(as); 3077 AS_LOCK_EXIT(as, &as->a_lock); 3078 return (error); 3079 } 3080 3081 /* 3082 * as_iset3_default_lpsize() just calls SEGOP_SETPAGESIZE() on all segments 3083 * in its chunk where s_szc is less than the szc we want to set. 
3084 */ 3085 static int 3086 as_iset3_default_lpsize(struct as *as, caddr_t raddr, size_t rsize, uint_t szc, 3087 int *retry) 3088 { 3089 struct seg *seg; 3090 size_t ssize; 3091 int error; 3092 3093 ASSERT(AS_WRITE_HELD(as, &as->a_lock)); 3094 3095 seg = as_segat(as, raddr); 3096 if (seg == NULL) { 3097 panic("as_iset3_default_lpsize: no seg"); 3098 } 3099 3100 for (; rsize != 0; rsize -= ssize, raddr += ssize) { 3101 if (raddr >= seg->s_base + seg->s_size) { 3102 seg = AS_SEGNEXT(as, seg); 3103 if (seg == NULL || raddr != seg->s_base) { 3104 panic("as_iset3_default_lpsize: as changed"); 3105 } 3106 } 3107 if ((raddr + rsize) > (seg->s_base + seg->s_size)) { 3108 ssize = seg->s_base + seg->s_size - raddr; 3109 } else { 3110 ssize = rsize; 3111 } 3112 3113 if (szc > seg->s_szc) { 3114 error = SEGOP_SETPAGESIZE(seg, raddr, ssize, szc); 3115 /* Only retry on EINVAL segments that have no vnode. */ 3116 if (error == EINVAL) { 3117 vnode_t *vp = NULL; 3118 if ((SEGOP_GETTYPE(seg, raddr) & MAP_SHARED) && 3119 (SEGOP_GETVP(seg, raddr, &vp) != 0 || 3120 vp == NULL)) { 3121 *retry = 1; 3122 } else { 3123 *retry = 0; 3124 } 3125 } 3126 if (error) { 3127 return (error); 3128 } 3129 } 3130 } 3131 return (0); 3132 } 3133 3134 /* 3135 * as_iset2_default_lpsize() calls as_iset3_default_lpsize() to set the 3136 * pagesize on each segment in its range, but if any fails with EINVAL, 3137 * then it reduces the pagesizes to the next size in the bitmap and 3138 * retries as_iset3_default_lpsize(). The reason why the code retries 3139 * smaller allowed sizes on EINVAL is because (a) the anon offset may not 3140 * match the bigger sizes, and (b) it's hard to get this offset (to begin 3141 * with) to pass to map_pgszcvec(). 3142 */ 3143 static int 3144 as_iset2_default_lpsize(struct as *as, caddr_t addr, size_t size, uint_t szc, 3145 uint_t szcvec) 3146 { 3147 int error; 3148 int retry; 3149 3150 ASSERT(AS_WRITE_HELD(as, &as->a_lock)); 3151 3152 for (;;) { 3153 error = as_iset3_default_lpsize(as, addr, size, szc, &retry); 3154 if (error == EINVAL && retry) { 3155 szcvec &= ~(1 << szc); 3156 if (szcvec <= 1) { 3157 return (EINVAL); 3158 } 3159 szc = highbit(szcvec) - 1; 3160 } else { 3161 return (error); 3162 } 3163 } 3164 } 3165 3166 /* 3167 * as_iset1_default_lpsize() breaks its chunk into areas where existing 3168 * segments have a smaller szc than we want to set. 
For each such area, 3169 * it calls as_iset2_default_lpsize() 3170 */ 3171 static int 3172 as_iset1_default_lpsize(struct as *as, caddr_t raddr, size_t rsize, uint_t szc, 3173 uint_t szcvec) 3174 { 3175 struct seg *seg; 3176 size_t ssize; 3177 caddr_t setaddr = raddr; 3178 size_t setsize = 0; 3179 int set; 3180 int error; 3181 3182 ASSERT(AS_WRITE_HELD(as, &as->a_lock)); 3183 3184 seg = as_segat(as, raddr); 3185 if (seg == NULL) { 3186 panic("as_iset1_default_lpsize: no seg"); 3187 } 3188 if (seg->s_szc < szc) { 3189 set = 1; 3190 } else { 3191 set = 0; 3192 } 3193 3194 for (; rsize != 0; rsize -= ssize, raddr += ssize, setsize += ssize) { 3195 if (raddr >= seg->s_base + seg->s_size) { 3196 seg = AS_SEGNEXT(as, seg); 3197 if (seg == NULL || raddr != seg->s_base) { 3198 panic("as_iset1_default_lpsize: as changed"); 3199 } 3200 if (seg->s_szc >= szc && set) { 3201 ASSERT(setsize != 0); 3202 error = as_iset2_default_lpsize(as, 3203 setaddr, setsize, szc, szcvec); 3204 if (error) { 3205 return (error); 3206 } 3207 set = 0; 3208 } else if (seg->s_szc < szc && !set) { 3209 setaddr = raddr; 3210 setsize = 0; 3211 set = 1; 3212 } 3213 } 3214 if ((raddr + rsize) > (seg->s_base + seg->s_size)) { 3215 ssize = seg->s_base + seg->s_size - raddr; 3216 } else { 3217 ssize = rsize; 3218 } 3219 } 3220 error = 0; 3221 if (set) { 3222 ASSERT(setsize != 0); 3223 error = as_iset2_default_lpsize(as, setaddr, setsize, 3224 szc, szcvec); 3225 } 3226 return (error); 3227 } 3228 3229 /* 3230 * as_iset_default_lpsize() breaks its chunk according to the size code bitmap 3231 * returned by map_pgszcvec() (similar to as_map_segvn_segs()), and passes each 3232 * chunk to as_iset1_default_lpsize(). 3233 */ 3234 static int 3235 as_iset_default_lpsize(struct as *as, caddr_t addr, size_t size, int flags, 3236 int type) 3237 { 3238 int rtype = (type & MAP_SHARED) ? MAPPGSZC_SHM : MAPPGSZC_PRIVM; 3239 uint_t szcvec = map_pgszcvec(addr, size, (uintptr_t)addr, 3240 flags, rtype, 1); 3241 uint_t szc; 3242 uint_t nszc; 3243 int error; 3244 caddr_t a; 3245 caddr_t eaddr; 3246 size_t segsize; 3247 size_t pgsz; 3248 uint_t save_szcvec; 3249 3250 ASSERT(AS_WRITE_HELD(as, &as->a_lock)); 3251 ASSERT(IS_P2ALIGNED(addr, PAGESIZE)); 3252 ASSERT(IS_P2ALIGNED(size, PAGESIZE)); 3253 3254 szcvec &= ~1; 3255 if (szcvec <= 1) { /* skip if base page size */ 3256 return (0); 3257 } 3258 3259 /* Get the pagesize of the first larger page size. 
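 * lowbit()/highbit() return 1-based bit positions (and 0 for a zero
 * argument), so "lowbit(szcvec) - 1" below is the smallest page size code
 * set in the vector and "highbit(szcvec) - 1" the largest. Portable
 * equivalents, for reference (illustrative only; the kernel has its own
 * versions elsewhere in the tree):
 *
 *	static int
 *	ex_lowbit(unsigned long v)
 *	{
 *		int b;
 *
 *		if (v == 0)
 *			return (0);
 *		for (b = 1; (v & 1) == 0; b++)
 *			v >>= 1;
 *		return (b);
 *	}
 *
 *	static int
 *	ex_highbit(unsigned long v)
 *	{
 *		int b = 0;
 *
 *		while (v != 0) {
 *			b++;
 *			v >>= 1;
 *		}
 *		return (b);
 *	}
 *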
*/ 3260 szc = lowbit(szcvec) - 1; 3261 pgsz = page_get_pagesize(szc); 3262 eaddr = addr + size; 3263 addr = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz); 3264 eaddr = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz); 3265 3266 save_szcvec = szcvec; 3267 szcvec >>= (szc + 1); 3268 nszc = szc; 3269 while (szcvec) { 3270 if ((szcvec & 0x1) == 0) { 3271 nszc++; 3272 szcvec >>= 1; 3273 continue; 3274 } 3275 nszc++; 3276 pgsz = page_get_pagesize(nszc); 3277 a = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz); 3278 if (a != addr) { 3279 ASSERT(szc > 0); 3280 ASSERT(a < eaddr); 3281 segsize = a - addr; 3282 error = as_iset1_default_lpsize(as, addr, segsize, szc, 3283 save_szcvec); 3284 if (error) { 3285 return (error); 3286 } 3287 addr = a; 3288 } 3289 szc = nszc; 3290 szcvec >>= 1; 3291 } 3292 3293 ASSERT(addr < eaddr); 3294 szcvec = save_szcvec; 3295 while (szcvec) { 3296 a = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz); 3297 ASSERT(a >= addr); 3298 if (a != addr) { 3299 ASSERT(szc > 0); 3300 segsize = a - addr; 3301 error = as_iset1_default_lpsize(as, addr, segsize, szc, 3302 save_szcvec); 3303 if (error) { 3304 return (error); 3305 } 3306 addr = a; 3307 } 3308 szcvec &= ~(1 << szc); 3309 if (szcvec) { 3310 szc = highbit(szcvec) - 1; 3311 pgsz = page_get_pagesize(szc); 3312 } 3313 } 3314 ASSERT(addr == eaddr); 3315 3316 return (0); 3317 } 3318 3319 /* 3320 * Set the default large page size for the range. Called via memcntl with 3321 * page size set to 0. as_set_default_lpsize breaks the range down into 3322 * chunks with the same type/flags, ignores-non segvn segments, and passes 3323 * each chunk to as_iset_default_lpsize(). 3324 */ 3325 int 3326 as_set_default_lpsize(struct as *as, caddr_t addr, size_t size) 3327 { 3328 struct seg *seg; 3329 caddr_t raddr; 3330 size_t rsize; 3331 size_t ssize; 3332 int rtype, rflags; 3333 int stype, sflags; 3334 int error; 3335 caddr_t setaddr; 3336 size_t setsize; 3337 int segvn; 3338 3339 if (size == 0) 3340 return (0); 3341 3342 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER); 3343 again: 3344 error = 0; 3345 3346 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); 3347 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) - 3348 (size_t)raddr; 3349 3350 if (raddr + rsize < raddr) { /* check for wraparound */ 3351 AS_LOCK_EXIT(as, &as->a_lock); 3352 return (ENOMEM); 3353 } 3354 as_clearwatchprot(as, raddr, rsize); 3355 seg = as_segat(as, raddr); 3356 if (seg == NULL) { 3357 as_setwatch(as); 3358 AS_LOCK_EXIT(as, &as->a_lock); 3359 return (ENOMEM); 3360 } 3361 if (seg->s_ops == &segvn_ops) { 3362 rtype = SEGOP_GETTYPE(seg, addr); 3363 rflags = rtype & (MAP_TEXT | MAP_INITDATA); 3364 rtype = rtype & (MAP_SHARED | MAP_PRIVATE); 3365 segvn = 1; 3366 } else { 3367 segvn = 0; 3368 } 3369 setaddr = raddr; 3370 setsize = 0; 3371 3372 for (; rsize != 0; rsize -= ssize, raddr += ssize, setsize += ssize) { 3373 if (raddr >= (seg->s_base + seg->s_size)) { 3374 seg = AS_SEGNEXT(as, seg); 3375 if (seg == NULL || raddr != seg->s_base) { 3376 error = ENOMEM; 3377 break; 3378 } 3379 if (seg->s_ops == &segvn_ops) { 3380 stype = SEGOP_GETTYPE(seg, raddr); 3381 sflags = stype & (MAP_TEXT | MAP_INITDATA); 3382 stype &= (MAP_SHARED | MAP_PRIVATE); 3383 if (segvn && (rflags != sflags || 3384 rtype != stype)) { 3385 /* 3386 * The next segment is also segvn but 3387 * has different flags and/or type. 
3388 */ 3389 ASSERT(setsize != 0); 3390 error = as_iset_default_lpsize(as, 3391 setaddr, setsize, rflags, rtype); 3392 if (error) { 3393 break; 3394 } 3395 rflags = sflags; 3396 rtype = stype; 3397 setaddr = raddr; 3398 setsize = 0; 3399 } else if (!segvn) { 3400 rflags = sflags; 3401 rtype = stype; 3402 setaddr = raddr; 3403 setsize = 0; 3404 segvn = 1; 3405 } 3406 } else if (segvn) { 3407 /* The next segment is not segvn. */ 3408 ASSERT(setsize != 0); 3409 error = as_iset_default_lpsize(as, 3410 setaddr, setsize, rflags, rtype); 3411 if (error) { 3412 break; 3413 } 3414 segvn = 0; 3415 } 3416 } 3417 if ((raddr + rsize) > (seg->s_base + seg->s_size)) { 3418 ssize = seg->s_base + seg->s_size - raddr; 3419 } else { 3420 ssize = rsize; 3421 } 3422 } 3423 if (error == 0 && segvn) { 3424 /* The last chunk when rsize == 0. */ 3425 ASSERT(setsize != 0); 3426 error = as_iset_default_lpsize(as, setaddr, setsize, 3427 rflags, rtype); 3428 } 3429 3430 if (error == IE_RETRY) { 3431 goto again; 3432 } else if (error == IE_NOMEM) { 3433 error = EAGAIN; 3434 } else if (error == ENOTSUP) { 3435 error = EINVAL; 3436 } else if (error == EAGAIN) { 3437 mutex_enter(&as->a_contents); 3438 if (!AS_ISNOUNMAPWAIT(as)) { 3439 if (AS_ISUNMAPWAIT(as) == 0) { 3440 cv_broadcast(&as->a_cv); 3441 } 3442 AS_SETUNMAPWAIT(as); 3443 AS_LOCK_EXIT(as, &as->a_lock); 3444 while (AS_ISUNMAPWAIT(as)) { 3445 cv_wait(&as->a_cv, &as->a_contents); 3446 } 3447 mutex_exit(&as->a_contents); 3448 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER); 3449 } else { 3450 /* 3451 * We may have raced with 3452 * segvn_reclaim()/segspt_reclaim(). In this case 3453 * clean nounmapwait flag and retry since softlockcnt 3454 * in this segment may be already 0. We don't drop as 3455 * writer lock so our number of retries without 3456 * sleeping should be very small. See segvn_reclaim() 3457 * for more comments. 3458 */ 3459 AS_CLRNOUNMAPWAIT(as); 3460 mutex_exit(&as->a_contents); 3461 } 3462 goto again; 3463 } 3464 3465 as_setwatch(as); 3466 AS_LOCK_EXIT(as, &as->a_lock); 3467 return (error); 3468 } 3469 3470 /* 3471 * Setup all of the uninitialized watched pages that we can. 3472 */ 3473 void 3474 as_setwatch(struct as *as) 3475 { 3476 struct watched_page *pwp; 3477 struct seg *seg; 3478 caddr_t vaddr; 3479 uint_t prot; 3480 int err, retrycnt; 3481 3482 if (avl_numnodes(&as->a_wpage) == 0) 3483 return; 3484 3485 ASSERT(AS_WRITE_HELD(as, &as->a_lock)); 3486 3487 for (pwp = avl_first(&as->a_wpage); pwp != NULL; 3488 pwp = AVL_NEXT(&as->a_wpage, pwp)) { 3489 retrycnt = 0; 3490 retry: 3491 vaddr = pwp->wp_vaddr; 3492 if (pwp->wp_oprot != 0 || /* already set up */ 3493 (seg = as_segat(as, vaddr)) == NULL || 3494 SEGOP_GETPROT(seg, vaddr, 0, &prot) != 0) 3495 continue; 3496 3497 pwp->wp_oprot = prot; 3498 if (pwp->wp_read) 3499 prot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC); 3500 if (pwp->wp_write) 3501 prot &= ~PROT_WRITE; 3502 if (pwp->wp_exec) 3503 prot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC); 3504 if (!(pwp->wp_flags & WP_NOWATCH) && prot != pwp->wp_oprot) { 3505 err = SEGOP_SETPROT(seg, vaddr, PAGESIZE, prot); 3506 if (err == IE_RETRY) { 3507 pwp->wp_oprot = 0; 3508 ASSERT(retrycnt == 0); 3509 retrycnt++; 3510 goto retry; 3511 } 3512 } 3513 pwp->wp_prot = prot; 3514 } 3515 } 3516 3517 /* 3518 * Clear all of the watched pages in the address space. 
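 *
 * (Editorial aside, not from the original source: the protection
 * reduction that as_setwatch() above applies, and that this routine
 * undoes, boils down to a helper like the following; ex_watch_prot() is
 * hypothetical:
 *
 *	#include <sys/mman.h>	// PROT_READ, PROT_WRITE, PROT_EXEC
 *
 *	static unsigned int
 *	ex_watch_prot(unsigned int oprot, int wp_read, int wp_write,
 *	    int wp_exec)
 *	{
 *		unsigned int prot = oprot;
 *
 *		if (wp_read)
 *			prot &= ~(PROT_READ | PROT_WRITE | PROT_EXEC);
 *		if (wp_write)
 *			prot &= ~PROT_WRITE;
 *		if (wp_exec)
 *			prot &= ~(PROT_READ | PROT_WRITE | PROT_EXEC);
 *		return (prot);
 *	}
 *
 * as_clearwatch() simply restores wp_oprot where it differs from wp_prot.)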
3519 */ 3520 void 3521 as_clearwatch(struct as *as) 3522 { 3523 struct watched_page *pwp; 3524 struct seg *seg; 3525 caddr_t vaddr; 3526 uint_t prot; 3527 int err, retrycnt; 3528 3529 if (avl_numnodes(&as->a_wpage) == 0) 3530 return; 3531 3532 ASSERT(AS_WRITE_HELD(as, &as->a_lock)); 3533 3534 for (pwp = avl_first(&as->a_wpage); pwp != NULL; 3535 pwp = AVL_NEXT(&as->a_wpage, pwp)) { 3536 retrycnt = 0; 3537 retry: 3538 vaddr = pwp->wp_vaddr; 3539 if (pwp->wp_oprot == 0 || /* not set up */ 3540 (seg = as_segat(as, vaddr)) == NULL) 3541 continue; 3542 3543 if ((prot = pwp->wp_oprot) != pwp->wp_prot) { 3544 err = SEGOP_SETPROT(seg, vaddr, PAGESIZE, prot); 3545 if (err == IE_RETRY) { 3546 ASSERT(retrycnt == 0); 3547 retrycnt++; 3548 goto retry; 3549 } 3550 } 3551 pwp->wp_oprot = 0; 3552 pwp->wp_prot = 0; 3553 } 3554 } 3555 3556 /* 3557 * Force a new setup for all the watched pages in the range. 3558 */ 3559 static void 3560 as_setwatchprot(struct as *as, caddr_t addr, size_t size, uint_t prot) 3561 { 3562 struct watched_page *pwp; 3563 struct watched_page tpw; 3564 caddr_t eaddr = addr + size; 3565 caddr_t vaddr; 3566 struct seg *seg; 3567 int err, retrycnt; 3568 uint_t wprot; 3569 avl_index_t where; 3570 3571 if (avl_numnodes(&as->a_wpage) == 0) 3572 return; 3573 3574 ASSERT(AS_WRITE_HELD(as, &as->a_lock)); 3575 3576 tpw.wp_vaddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); 3577 if ((pwp = avl_find(&as->a_wpage, &tpw, &where)) == NULL) 3578 pwp = avl_nearest(&as->a_wpage, where, AVL_AFTER); 3579 3580 while (pwp != NULL && pwp->wp_vaddr < eaddr) { 3581 retrycnt = 0; 3582 vaddr = pwp->wp_vaddr; 3583 3584 wprot = prot; 3585 if (pwp->wp_read) 3586 wprot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC); 3587 if (pwp->wp_write) 3588 wprot &= ~PROT_WRITE; 3589 if (pwp->wp_exec) 3590 wprot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC); 3591 if (!(pwp->wp_flags & WP_NOWATCH) && wprot != pwp->wp_oprot) { 3592 retry: 3593 seg = as_segat(as, vaddr); 3594 if (seg == NULL) { 3595 panic("as_setwatchprot: no seg"); 3596 /*NOTREACHED*/ 3597 } 3598 err = SEGOP_SETPROT(seg, vaddr, PAGESIZE, wprot); 3599 if (err == IE_RETRY) { 3600 ASSERT(retrycnt == 0); 3601 retrycnt++; 3602 goto retry; 3603 } 3604 } 3605 pwp->wp_oprot = prot; 3606 pwp->wp_prot = wprot; 3607 3608 pwp = AVL_NEXT(&as->a_wpage, pwp); 3609 } 3610 } 3611 3612 /* 3613 * Clear all of the watched pages in the range. 
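 *
 * (Editorial aside, not from the original source: as with
 * as_setwatchprot() above, the walk is "round the start down to a page
 * boundary, find the first watched page at or after it, then step forward
 * while the entry is below the end address". With a sorted array standing
 * in for the AVL tree and hypothetical ex_* names:
 *
 *	#include <stddef.h>
 *	#include <stdint.h>
 *
 *	struct ex_wpage {
 *		uintptr_t	vaddr;		// page-aligned, ascending
 *	};
 *
 *	static void
 *	ex_walk_range(struct ex_wpage *wp, size_t n, uintptr_t start,
 *	    uintptr_t end, void (*visit)(struct ex_wpage *))
 *	{
 *		size_t i = 0;
 *
 *		while (i < n && wp[i].vaddr < start)	// first entry >= start
 *			i++;
 *		for (; i < n && wp[i].vaddr < end; i++)
 *			visit(&wp[i]);
 *	}
 *
 * The kernel does the lookup with avl_find()/avl_nearest() and the step
 * with AVL_NEXT().)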
3614 */ 3615 static void 3616 as_clearwatchprot(struct as *as, caddr_t addr, size_t size) 3617 { 3618 caddr_t eaddr = addr + size; 3619 struct watched_page *pwp; 3620 struct watched_page tpw; 3621 uint_t prot; 3622 struct seg *seg; 3623 int err, retrycnt; 3624 avl_index_t where; 3625 3626 if (avl_numnodes(&as->a_wpage) == 0) 3627 return; 3628 3629 tpw.wp_vaddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); 3630 if ((pwp = avl_find(&as->a_wpage, &tpw, &where)) == NULL) 3631 pwp = avl_nearest(&as->a_wpage, where, AVL_AFTER); 3632 3633 ASSERT(AS_WRITE_HELD(as, &as->a_lock)); 3634 3635 while (pwp != NULL && pwp->wp_vaddr < eaddr) { 3636 3637 if ((prot = pwp->wp_oprot) != 0) { 3638 retrycnt = 0; 3639 3640 if (prot != pwp->wp_prot) { 3641 retry: 3642 seg = as_segat(as, pwp->wp_vaddr); 3643 if (seg == NULL) 3644 continue; 3645 err = SEGOP_SETPROT(seg, pwp->wp_vaddr, 3646 PAGESIZE, prot); 3647 if (err == IE_RETRY) { 3648 ASSERT(retrycnt == 0); 3649 retrycnt++; 3650 goto retry; 3651 3652 } 3653 } 3654 pwp->wp_oprot = 0; 3655 pwp->wp_prot = 0; 3656 } 3657 3658 pwp = AVL_NEXT(&as->a_wpage, pwp); 3659 } 3660 } 3661 3662 void 3663 as_signal_proc(struct as *as, k_siginfo_t *siginfo) 3664 { 3665 struct proc *p; 3666 3667 mutex_enter(&pidlock); 3668 for (p = practive; p; p = p->p_next) { 3669 if (p->p_as == as) { 3670 mutex_enter(&p->p_lock); 3671 if (p->p_as == as) 3672 sigaddq(p, NULL, siginfo, KM_NOSLEEP); 3673 mutex_exit(&p->p_lock); 3674 } 3675 } 3676 mutex_exit(&pidlock); 3677 } 3678 3679 /* 3680 * return memory object ID 3681 */ 3682 int 3683 as_getmemid(struct as *as, caddr_t addr, memid_t *memidp) 3684 { 3685 struct seg *seg; 3686 int sts; 3687 3688 AS_LOCK_ENTER(as, &as->a_lock, RW_READER); 3689 seg = as_segat(as, addr); 3690 if (seg == NULL) { 3691 AS_LOCK_EXIT(as, &as->a_lock); 3692 return (EFAULT); 3693 } 3694 /* 3695 * catch old drivers which may not support getmemid 3696 */ 3697 if (seg->s_ops->getmemid == NULL) { 3698 AS_LOCK_EXIT(as, &as->a_lock); 3699 return (ENODEV); 3700 } 3701 3702 sts = SEGOP_GETMEMID(seg, addr, memidp); 3703 3704 AS_LOCK_EXIT(as, &as->a_lock); 3705 return (sts); 3706 } 3707
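/*
 * Editorial note (not part of the original source): as_signal_proc() above
 * checks p_as == as while holding only pidlock, then re-checks after taking
 * the per-process p_lock, because the first test can go stale before the
 * finer-grained lock is held. A minimal userland sketch of that
 * check/lock/re-check pattern, using pthreads and hypothetical ex_* names:
 *
 *	#include <pthread.h>
 *
 *	struct ex_proc {
 *		pthread_mutex_t	lock;
 *		void		*as;	// may change until lock is held
 *	};
 *
 *	static void
 *	ex_signal_if_match(struct ex_proc *p, void *as,
 *	    void (*post)(struct ex_proc *))
 *	{
 *		if (p->as != as)	// cheap pre-filter under the list lock
 *			return;
 *		pthread_mutex_lock(&p->lock);
 *		if (p->as == as)	// re-validate under the process lock
 *			post(p);
 *		pthread_mutex_unlock(&p->lock);
 *	}
 */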