/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989  AT&T	*/
/*	  All Rights Reserved  	*/

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * VM - address spaces.
 */

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/errno.h>
#include <sys/systm.h>
#include <sys/mman.h>
#include <sys/sysmacros.h>
#include <sys/cpuvar.h>
#include <sys/sysinfo.h>
#include <sys/kmem.h>
#include <sys/vnode.h>
#include <sys/vmsystm.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/tnf_probe.h>
#include <sys/vtrace.h>

#include <vm/hat.h>
#include <vm/xhat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_vn.h>
#include <vm/seg_dev.h>
#include <vm/seg_kmem.h>
#include <vm/seg_map.h>
#include <vm/seg_spt.h>
#include <vm/page.h>

clock_t deadlk_wait = 1; /* number of ticks to wait before retrying */

static struct kmem_cache *as_cache;

static void as_setwatchprot(struct as *, caddr_t, size_t, uint_t);
static void as_clearwatchprot(struct as *, caddr_t, size_t);
int as_map_locked(struct as *, caddr_t, size_t, int ((*)()), void *);


/*
 * Verifying the segment lists is very time-consuming; it may not be
 * desirable always to define VERIFY_SEGLIST when DEBUG is set.
 */
#ifdef DEBUG
#define	VERIFY_SEGLIST
int do_as_verify = 0;
#endif

/*
 * Allocate a new callback data structure entry and fill in the events of
 * interest, the address range of interest, and the callback argument.
 * Link the entry on the as->a_callbacks list. A callback entry for the
 * entire address space may be specified with vaddr = 0 and size = -1.
 *
 * CALLERS RESPONSIBILITY: If not calling from within the process context for
 * the specified as, the caller must guarantee persistence of the specified as
 * for the duration of this function (eg. pages being locked within the as
 * will guarantee persistence).
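 *
 * Illustrative sketch: a driver that holds pages locked long-term might
 * register a callback covering the locked range and have the callback
 * unlock those pages and delete itself via as_delete_callback().  The
 * driver-side names (xxdrv_unlock_cb, xxdrv_unlock_pages, xxdrv_state,
 * uaddr, ulen) below are hypothetical; only the as_* calls come from
 * this file.
 *
 *	static void
 *	xxdrv_unlock_cb(struct as *as, void *arg, uint_t events)
 *	{
 *		xxdrv_unlock_pages(arg);
 *		(void) as_delete_callback(as, arg);
 *	}
 *
 *	error = as_add_callback(as, xxdrv_unlock_cb, xxdrv_state,
 *	    AS_UNMAP_EVENT | AS_SETPROT_EVENT, uaddr, ulen, KM_SLEEP);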
 */
int
as_add_callback(struct as *as, void (*cb_func)(), void *arg, uint_t events,
    caddr_t vaddr, size_t size, int sleepflag)
{
	struct as_callback	*current_head, *cb;
	caddr_t			saddr;
	size_t			rsize;

	/* callback function and an event are mandatory */
	if ((cb_func == NULL) || ((events & AS_ALL_EVENT) == 0))
		return (EINVAL);

	/* Adding a callback after as_free has been called is not allowed */
	if (as == &kas)
		return (ENOMEM);

	/*
	 * vaddr = 0 and size = -1 is used to indicate that the callback range
	 * is the entire address space so no rounding is done in that case.
	 */
	if (size != -1) {
		saddr = (caddr_t)((uintptr_t)vaddr & (uintptr_t)PAGEMASK);
		rsize = (((size_t)(vaddr + size) + PAGEOFFSET) & PAGEMASK) -
		    (size_t)saddr;
		/* check for wraparound */
		if (saddr + rsize < saddr)
			return (ENOMEM);
	} else {
		if (vaddr != 0)
			return (EINVAL);
		saddr = vaddr;
		rsize = size;
	}

	/* Allocate and initialize a callback entry */
	cb = kmem_zalloc(sizeof (struct as_callback), sleepflag);
	if (cb == NULL)
		return (EAGAIN);

	cb->ascb_func = cb_func;
	cb->ascb_arg = arg;
	cb->ascb_events = events;
	cb->ascb_saddr = saddr;
	cb->ascb_len = rsize;

	/* Add the entry to the list */
	mutex_enter(&as->a_contents);
	current_head = as->a_callbacks;
	as->a_callbacks = cb;
	cb->ascb_next = current_head;

	/*
	 * The call to this function may lose in a race with
	 * a pertinent event - eg. a thread does long term memory locking
	 * but before the callback is added another thread executes as_unmap.
	 * A broadcast here resolves that.
	 */
	if ((cb->ascb_events & AS_UNMAPWAIT_EVENT) && AS_ISUNMAPWAIT(as)) {
		AS_CLRUNMAPWAIT(as);
		cv_broadcast(&as->a_cv);
	}

	mutex_exit(&as->a_contents);
	return (0);
}

/*
 * Search the callback list for an entry which pertains to arg.
 *
 * This is called from within the client upon completion of the callback.
 * RETURN VALUES:
 *	AS_CALLBACK_DELETED  (callback entry found and deleted)
 *	AS_CALLBACK_NOTFOUND (no callback entry found - this is ok)
 *	AS_CALLBACK_DELETE_DEFERRED (callback is in process, delete of this
 *			entry will be made in as_do_callbacks)
 *
 * If as_delete_callback encounters a matching entry with AS_CALLBACK_CALLED
 * set, it indicates that as_do_callbacks is processing this entry.  The
 * AS_ALL_EVENT events are cleared in the entry, and a broadcast is made
 * to unblock as_do_callbacks, in case it is blocked.
 *
 * CALLERS RESPONSIBILITY: If not calling from within the process context for
 * the specified as, the caller must guarantee persistence of the specified as
 * for the duration of this function (eg. pages being locked within the as
 * will guarantee persistence).
 */
uint_t
as_delete_callback(struct as *as, void *arg)
{
	struct as_callback **prevcb = &as->a_callbacks;
	struct as_callback *cb;
	uint_t rc = AS_CALLBACK_NOTFOUND;

	mutex_enter(&as->a_contents);
	for (cb = as->a_callbacks; cb; prevcb = &cb->ascb_next, cb = *prevcb) {
		if (cb->ascb_arg != arg)
			continue;

		/*
		 * If the events indicate AS_CALLBACK_CALLED, just clear
		 * AS_ALL_EVENT in the events field and wakeup the thread
		 * that may be waiting in as_do_callbacks.  as_do_callbacks
		 * will take care of removing this entry from the list.
		 * In that case, return AS_CALLBACK_DELETE_DEFERRED.  Otherwise
		 * (AS_CALLBACK_CALLED not set), just remove it from the
		 * list, return the memory and return AS_CALLBACK_DELETED.
		 */
		if ((cb->ascb_events & AS_CALLBACK_CALLED) != 0) {
			/* leave AS_CALLBACK_CALLED */
			cb->ascb_events &= ~AS_ALL_EVENT;
			rc = AS_CALLBACK_DELETE_DEFERRED;
			cv_broadcast(&as->a_cv);
		} else {
			*prevcb = cb->ascb_next;
			kmem_free(cb, sizeof (struct as_callback));
			rc = AS_CALLBACK_DELETED;
		}
		break;
	}
	mutex_exit(&as->a_contents);
	return (rc);
}

/*
 * Searches the as callback list for a matching entry.
 * Returns a pointer to the first matching callback, or NULL if
 * nothing is found.
 * This function never sleeps so it is ok to call it with more
 * locks held but the (required) a_contents mutex.
 *
 * See also comment on as_do_callbacks below.
 */
static struct as_callback *
as_find_callback(struct as *as, uint_t events, caddr_t event_addr,
    size_t event_len)
{
	struct as_callback	*cb;

	ASSERT(MUTEX_HELD(&as->a_contents));
	for (cb = as->a_callbacks; cb != NULL; cb = cb->ascb_next) {
		/*
		 * If the callback has not already been called, then
		 * check if events or address range pertains.  An event_len
		 * of zero means do an unconditional callback.
		 */
		if (((cb->ascb_events & AS_CALLBACK_CALLED) != 0) ||
		    ((event_len != 0) && (((cb->ascb_events & events) == 0) ||
		    (event_addr + event_len < cb->ascb_saddr) ||
		    (event_addr > (cb->ascb_saddr + cb->ascb_len))))) {
			continue;
		}
		break;
	}
	return (cb);
}

/*
 * Executes a given callback and removes it from the callback list for
 * this address space.
 * This function may sleep so the caller must drop all locks except
 * a_contents before calling this func.
 *
 * See also comments on as_do_callbacks below.
 */
static void
as_execute_callback(struct as *as, struct as_callback *cb,
    uint_t events)
{
	struct as_callback **prevcb;
	void	*cb_arg;

	ASSERT(MUTEX_HELD(&as->a_contents) && (cb->ascb_events & events));
	cb->ascb_events |= AS_CALLBACK_CALLED;
	mutex_exit(&as->a_contents);
	(*cb->ascb_func)(as, cb->ascb_arg, events);
	mutex_enter(&as->a_contents);
	/*
	 * the callback function is required to delete the callback
	 * when the callback function determines it is OK for
	 * this thread to continue. as_delete_callback will clear
	 * the AS_ALL_EVENT in the events field when it is deleted.
	 * If the callback function called as_delete_callback,
	 * events will already be cleared and there will be no blocking.
	 */
	while ((cb->ascb_events & events) != 0) {
		cv_wait(&as->a_cv, &as->a_contents);
	}
	/*
	 * This entry needs to be taken off the list. Normally, the
	 * callback func itself does that, but unfortunately the list
	 * may have changed while the callback was running because the
	 * a_contents mutex was dropped and someone else other than the
	 * callback func itself could have called as_delete_callback,
	 * so we have to search to find this entry again.  The entry
	 * must have AS_CALLBACK_CALLED, and have the same 'arg'.
	 */
	cb_arg = cb->ascb_arg;
	prevcb = &as->a_callbacks;
	for (cb = as->a_callbacks; cb != NULL;
	    prevcb = &cb->ascb_next, cb = *prevcb) {
		if (((cb->ascb_events & AS_CALLBACK_CALLED) == 0) ||
		    (cb_arg != cb->ascb_arg)) {
			continue;
		}
		*prevcb = cb->ascb_next;
		kmem_free(cb, sizeof (struct as_callback));
		break;
	}
}

/*
 * Check the callback list for a matching event and intersection of
 * address range. If there is a match invoke the callback.  Skip an entry if:
 *    - a callback is already in progress for this entry (AS_CALLBACK_CALLED)
 *    - not event of interest
 *    - not address range of interest
 *
 * An event_len of zero indicates a request for an unconditional callback
 * (regardless of event), only the AS_CALLBACK_CALLED is checked.  The
 * a_contents lock must be dropped before a callback, so only one callback
 * can be done before returning.  Return -1 (true) if a callback was
 * executed and removed from the list, else return 0 (false).
 *
 * The logically separate parts, i.e. finding a matching callback and
 * executing a given callback have been separated into two functions
 * so that they can be called with different sets of locks held beyond
 * the always-required a_contents. as_find_callback does not sleep so
 * it is ok to call it if more locks than a_contents (i.e. the a_lock
 * rwlock) are held. as_execute_callback on the other hand may sleep
 * so all locks beyond a_contents must be dropped by the caller if one
 * does not want to end comatose.
 */
static int
as_do_callbacks(struct as *as, uint_t events, caddr_t event_addr,
    size_t event_len)
{
	struct as_callback *cb;

	if ((cb = as_find_callback(as, events, event_addr, event_len))) {
		as_execute_callback(as, cb, events);
		return (-1);
	}
	return (0);
}

/*
 * Search for the segment containing addr. If a segment containing addr
 * exists, that segment is returned.  If no such segment exists, and
 * the list spans addresses greater than addr, then the first segment
 * whose base is greater than addr is returned; otherwise, NULL is
 * returned unless tail is true, in which case the last element of the
 * list is returned.
 *
 * a_seglast is used to cache the last found segment for repeated
 * searches to the same addr (which happens frequently).
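 *
 * Worked example (hypothetical addresses, following the rules above):
 * with segments covering [0x10000, 0x20000) and [0x30000, 0x40000),
 *	as_findseg(as, (caddr_t)0x18000, 0) returns the first segment,
 *	as_findseg(as, (caddr_t)0x28000, 0) returns the second segment
 *	    (the first segment whose base is greater than addr),
 *	as_findseg(as, (caddr_t)0x50000, 0) returns NULL, and
 *	as_findseg(as, (caddr_t)0x50000, 1) returns the last segment.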
 */
struct seg *
as_findseg(struct as *as, caddr_t addr, int tail)
{
	struct seg *seg = as->a_seglast;
	avl_index_t where;

	ASSERT(AS_LOCK_HELD(as, &as->a_lock));

	if (seg != NULL &&
	    seg->s_base <= addr &&
	    addr < seg->s_base + seg->s_size)
		return (seg);

	seg = avl_find(&as->a_segtree, &addr, &where);
	if (seg != NULL)
		return (as->a_seglast = seg);

	seg = avl_nearest(&as->a_segtree, where, AVL_AFTER);
	if (seg == NULL && tail)
		seg = avl_last(&as->a_segtree);
	return (as->a_seglast = seg);
}

#ifdef VERIFY_SEGLIST
/*
 * verify that the linked list is coherent
 */
static void
as_verify(struct as *as)
{
	struct seg *seg, *seglast, *p, *n;
	uint_t nsegs = 0;

	if (do_as_verify == 0)
		return;

	seglast = as->a_seglast;

	for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
		ASSERT(seg->s_as == as);
		p = AS_SEGPREV(as, seg);
		n = AS_SEGNEXT(as, seg);
		ASSERT(p == NULL || p->s_as == as);
		ASSERT(p == NULL || p->s_base < seg->s_base);
		ASSERT(n == NULL || n->s_base > seg->s_base);
		ASSERT(n != NULL || seg == avl_last(&as->a_segtree));
		if (seg == seglast)
			seglast = NULL;
		nsegs++;
	}
	ASSERT(seglast == NULL);
	ASSERT(avl_numnodes(&as->a_segtree) == nsegs);
}
#endif /* VERIFY_SEGLIST */

/*
 * Add a new segment to the address space. The avl_find()
 * may be expensive so we attempt to use last segment accessed
 * in as_gap() as an insertion point.
 */
int
as_addseg(struct as *as, struct seg *newseg)
{
	struct seg *seg;
	caddr_t addr;
	caddr_t eaddr;
	avl_index_t where;

	ASSERT(AS_WRITE_HELD(as, &as->a_lock));

	as->a_updatedir = 1;	/* inform /proc */
	gethrestime(&as->a_updatetime);

	if (as->a_lastgaphl != NULL) {
		struct seg *hseg = NULL;
		struct seg *lseg = NULL;

		if (as->a_lastgaphl->s_base > newseg->s_base) {
			hseg = as->a_lastgaphl;
			lseg = AVL_PREV(&as->a_segtree, hseg);
		} else {
			lseg = as->a_lastgaphl;
			hseg = AVL_NEXT(&as->a_segtree, lseg);
		}

		if (hseg && lseg && lseg->s_base < newseg->s_base &&
		    hseg->s_base > newseg->s_base) {
			avl_insert_here(&as->a_segtree, newseg, lseg,
			    AVL_AFTER);
			as->a_lastgaphl = NULL;
			as->a_seglast = newseg;
			return (0);
		}
		as->a_lastgaphl = NULL;
	}

	addr = newseg->s_base;
	eaddr = addr + newseg->s_size;
again:

	seg = avl_find(&as->a_segtree, &addr, &where);

	if (seg == NULL)
		seg = avl_nearest(&as->a_segtree, where, AVL_AFTER);

	if (seg == NULL)
		seg = avl_last(&as->a_segtree);

	if (seg != NULL) {
		caddr_t base = seg->s_base;

		/*
		 * If top of seg is below the requested address, then
		 * the insertion point is at the end of the linked list,
		 * and seg points to the tail of the list.  Otherwise,
		 * the insertion point is immediately before seg.
		 */
		if (base + seg->s_size > addr) {
			if (addr >= base || eaddr > base) {
#ifdef __sparc
				extern struct seg_ops segnf_ops;

				/*
				 * no-fault segs must disappear if overlaid.
				 * XXX need new segment type so
				 * we don't have to check s_ops
				 */
				if (seg->s_ops == &segnf_ops) {
					seg_unmap(seg);
					goto again;
				}
#endif
				return (-1);	/* overlapping segment */
			}
		}
	}
	as->a_seglast = newseg;
	avl_insert(&as->a_segtree, newseg, where);

#ifdef VERIFY_SEGLIST
	as_verify(as);
#endif
	return (0);
}

struct seg *
as_removeseg(struct as *as, struct seg *seg)
{
	avl_tree_t *t;

	ASSERT(AS_WRITE_HELD(as, &as->a_lock));

	as->a_updatedir = 1;	/* inform /proc */
	gethrestime(&as->a_updatetime);

	if (seg == NULL)
		return (NULL);

	t = &as->a_segtree;
	if (as->a_seglast == seg)
		as->a_seglast = NULL;
	as->a_lastgaphl = NULL;

	/*
	 * if this segment is at an address higher than
	 * a_lastgap, set a_lastgap to the next segment (NULL if last segment)
	 */
	if (as->a_lastgap &&
	    (seg == as->a_lastgap || seg->s_base > as->a_lastgap->s_base))
		as->a_lastgap = AVL_NEXT(t, seg);

	/*
	 * remove the segment from the seg tree
	 */
	avl_remove(t, seg);

#ifdef VERIFY_SEGLIST
	as_verify(as);
#endif
	return (seg);
}

/*
 * Find a segment containing addr.
 */
struct seg *
as_segat(struct as *as, caddr_t addr)
{
	struct seg *seg = as->a_seglast;

	ASSERT(AS_LOCK_HELD(as, &as->a_lock));

	if (seg != NULL && seg->s_base <= addr &&
	    addr < seg->s_base + seg->s_size)
		return (seg);

	seg = avl_find(&as->a_segtree, &addr, NULL);
	return (seg);
}

/*
 * Serialize all searches for holes in an address space to
 * prevent two or more threads from allocating the same virtual
 * address range.  The address space must not be "read/write"
 * locked by the caller since we may block.
 */
void
as_rangelock(struct as *as)
{
	mutex_enter(&as->a_contents);
	while (AS_ISCLAIMGAP(as))
		cv_wait(&as->a_cv, &as->a_contents);
	AS_SETCLAIMGAP(as);
	mutex_exit(&as->a_contents);
}

/*
 * Release hold on a_state & AS_CLAIMGAP and signal any other blocked threads.
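 *
 * Typical usage is to bracket the gap search and the mapping that claims
 * the chosen range, roughly as in the sketch below (the segment creation
 * arguments are hypothetical):
 *
 *	as_rangelock(as);
 *	... choose an unmapped range, e.g. with as_gap() ...
 *	error = as_map(as, addr, len, segvn_create, &crargs);
 *	as_rangeunlock(as);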
 */
void
as_rangeunlock(struct as *as)
{
	mutex_enter(&as->a_contents);
	AS_CLRCLAIMGAP(as);
	cv_signal(&as->a_cv);
	mutex_exit(&as->a_contents);
}

/*
 * compare segments (or just an address) by segment address range
 */
static int
as_segcompar(const void *x, const void *y)
{
	struct seg *a = (struct seg *)x;
	struct seg *b = (struct seg *)y;

	if (a->s_base < b->s_base)
		return (-1);
	if (a->s_base >= b->s_base + b->s_size)
		return (1);
	return (0);
}


void
as_avlinit(struct as *as)
{
	avl_create(&as->a_segtree, as_segcompar, sizeof (struct seg),
	    offsetof(struct seg, s_tree));
	avl_create(&as->a_wpage, wp_compare, sizeof (struct watched_page),
	    offsetof(struct watched_page, wp_link));
}

/*ARGSUSED*/
static int
as_constructor(void *buf, void *cdrarg, int kmflags)
{
	struct as *as = buf;

	mutex_init(&as->a_contents, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&as->a_cv, NULL, CV_DEFAULT, NULL);
	rw_init(&as->a_lock, NULL, RW_DEFAULT, NULL);
	as_avlinit(as);
	return (0);
}

/*ARGSUSED1*/
static void
as_destructor(void *buf, void *cdrarg)
{
	struct as *as = buf;

	avl_destroy(&as->a_segtree);
	mutex_destroy(&as->a_contents);
	cv_destroy(&as->a_cv);
	rw_destroy(&as->a_lock);
}

void
as_init(void)
{
	as_cache = kmem_cache_create("as_cache", sizeof (struct as), 0,
	    as_constructor, as_destructor, NULL, NULL, NULL, 0);
}

/*
 * Allocate and initialize an address space data structure.
 * We call hat_alloc to allow any machine dependent
 * information in the hat structure to be initialized.
 */
struct as *
as_alloc(void)
{
	struct as *as;

	as = kmem_cache_alloc(as_cache, KM_SLEEP);

	as->a_flags = 0;
	as->a_vbits = 0;
	as->a_hrm = NULL;
	as->a_seglast = NULL;
	as->a_size = 0;
	as->a_updatedir = 0;
	gethrestime(&as->a_updatetime);
	as->a_objectdir = NULL;
	as->a_sizedir = 0;
	as->a_userlimit = (caddr_t)USERLIMIT;
	as->a_lastgap = NULL;
	as->a_lastgaphl = NULL;
	as->a_callbacks = NULL;

	AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
	as->a_hat = hat_alloc(as);	/* create hat for default system mmu */
	AS_LOCK_EXIT(as, &as->a_lock);

	as->a_xhat = NULL;

	return (as);
}

/*
 * Free an address space data structure.
 * Need to free the hat first and then
 * all the segments on this as and finally
 * the space for the as struct itself.
 */
void
as_free(struct as *as)
{
	struct hat *hat = as->a_hat;
	struct seg *seg, *next;
	int called = 0;

top:
	/*
	 * Invoke ALL callbacks. as_do_callbacks will do one callback
	 * per call, and not return (-1) until the callback has completed.
	 * When as_do_callbacks returns zero, all callbacks have completed.
	 */
	mutex_enter(&as->a_contents);
	while (as->a_callbacks && as_do_callbacks(as, AS_ALL_EVENT, 0, 0))
		;

	/* This will prevent new XHATs from attaching to as */
	if (!called)
		AS_SETBUSY(as);
	mutex_exit(&as->a_contents);
	AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);

	if (!called) {
		called = 1;
		hat_free_start(hat);
		if (as->a_xhat != NULL)
			xhat_free_start_all(as);
	}
	for (seg = AS_SEGFIRST(as); seg != NULL; seg = next) {
		int err;

		next = AS_SEGNEXT(as, seg);
retry:
		err = SEGOP_UNMAP(seg, seg->s_base, seg->s_size);
		if (err == EAGAIN) {
			mutex_enter(&as->a_contents);
			if (as->a_callbacks) {
				AS_LOCK_EXIT(as, &as->a_lock);
			} else if (!AS_ISNOUNMAPWAIT(as)) {
				/*
				 * Memory is currently locked. Wait for a
				 * cv_signal that it has been unlocked, then
				 * try the operation again.
				 */
				if (AS_ISUNMAPWAIT(as) == 0)
					cv_broadcast(&as->a_cv);
				AS_SETUNMAPWAIT(as);
				AS_LOCK_EXIT(as, &as->a_lock);
				while (AS_ISUNMAPWAIT(as))
					cv_wait(&as->a_cv, &as->a_contents);
			} else {
				/*
				 * We may have raced with
				 * segvn_reclaim()/segspt_reclaim(). In this
				 * case clean nounmapwait flag and retry since
				 * softlockcnt in this segment may be already
				 * 0.  We don't drop as writer lock so our
				 * number of retries without sleeping should
				 * be very small. See segvn_reclaim() for
				 * more comments.
				 */
				AS_CLRNOUNMAPWAIT(as);
				mutex_exit(&as->a_contents);
				goto retry;
			}
			mutex_exit(&as->a_contents);
			goto top;
		} else {
			/*
			 * We do not expect any other error return at this
			 * time. This is similar to an ASSERT in seg_unmap()
			 */
			ASSERT(err == 0);
		}
	}
	hat_free_end(hat);
	if (as->a_xhat != NULL)
		xhat_free_end_all(as);
	AS_LOCK_EXIT(as, &as->a_lock);

	/* /proc stuff */
	ASSERT(avl_numnodes(&as->a_wpage) == 0);
	if (as->a_objectdir) {
		kmem_free(as->a_objectdir, as->a_sizedir * sizeof (vnode_t *));
		as->a_objectdir = NULL;
		as->a_sizedir = 0;
	}

	/*
	 * Free the struct as back to kmem.  Assert it has no segments.
	 */
	ASSERT(avl_numnodes(&as->a_segtree) == 0);
	kmem_cache_free(as_cache, as);
}

int
as_dup(struct as *as, struct as **outas)
{
	struct as *newas;
	struct seg *seg, *newseg;
	int error;

	AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
	as_clearwatch(as);
	newas = as_alloc();
	newas->a_userlimit = as->a_userlimit;
	newas->a_proc = as->a_proc->p_child;

	AS_LOCK_ENTER(newas, &newas->a_lock, RW_WRITER);

	/* This will prevent new XHATs from attaching */
	mutex_enter(&as->a_contents);
	AS_SETBUSY(as);
	mutex_exit(&as->a_contents);
	mutex_enter(&newas->a_contents);
	AS_SETBUSY(newas);
	mutex_exit(&newas->a_contents);

	(void) hat_dup(as->a_hat, newas->a_hat, NULL, 0, HAT_DUP_SRD);

	for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {

		if (seg->s_flags & S_PURGE)
			continue;

		newseg = seg_alloc(newas, seg->s_base, seg->s_size);
		if (newseg == NULL) {
			AS_LOCK_EXIT(newas, &newas->a_lock);
			as_setwatch(as);
			mutex_enter(&as->a_contents);
			AS_CLRBUSY(as);
			mutex_exit(&as->a_contents);
			AS_LOCK_EXIT(as, &as->a_lock);
			as_free(newas);
			return (-1);
		}
		if ((error = SEGOP_DUP(seg, newseg)) != 0) {
			/*
			 * We call seg_free() on the new seg
			 * because the segment is not set up
			 * completely; i.e.
			 * it has no ops.
			 */
			as_setwatch(as);
			mutex_enter(&as->a_contents);
			AS_CLRBUSY(as);
			mutex_exit(&as->a_contents);
			AS_LOCK_EXIT(as, &as->a_lock);
			seg_free(newseg);
			AS_LOCK_EXIT(newas, &newas->a_lock);
			as_free(newas);
			return (error);
		}
		newas->a_size += seg->s_size;
	}

	error = hat_dup(as->a_hat, newas->a_hat, NULL, 0, HAT_DUP_ALL);
	if (as->a_xhat != NULL)
		error |= xhat_dup_all(as, newas, NULL, 0, HAT_DUP_ALL);

	mutex_enter(&newas->a_contents);
	AS_CLRBUSY(newas);
	mutex_exit(&newas->a_contents);
	AS_LOCK_EXIT(newas, &newas->a_lock);

	as_setwatch(as);
	mutex_enter(&as->a_contents);
	AS_CLRBUSY(as);
	mutex_exit(&as->a_contents);
	AS_LOCK_EXIT(as, &as->a_lock);
	if (error != 0) {
		as_free(newas);
		return (error);
	}
	*outas = newas;
	return (0);
}

/*
 * Handle a ``fault'' at addr for size bytes.
 */
faultcode_t
as_fault(struct hat *hat, struct as *as, caddr_t addr, size_t size,
    enum fault_type type, enum seg_rw rw)
{
	struct seg *seg;
	caddr_t raddr;			/* rounded down addr */
	size_t rsize;			/* rounded up size */
	size_t ssize;
	faultcode_t res = 0;
	caddr_t addrsav;
	struct seg *segsav;
	int as_lock_held;
	klwp_t *lwp = ttolwp(curthread);
	int is_xhat = 0;
	int holding_wpage = 0;
	extern struct seg_ops	segdev_ops;



	if (as->a_hat != hat) {
		/* This must be an XHAT then */
		is_xhat = 1;

		if ((type != F_INVAL) || (as == &kas))
			return (FC_NOSUPPORT);
	}

retry:
	if (!is_xhat) {
		/*
		 * Indicate that the lwp is not to be stopped while waiting
		 * for a pagefault.  This is to avoid deadlock while debugging
		 * a process via /proc over NFS (in particular).
		 */
		if (lwp != NULL)
			lwp->lwp_nostop++;

		/*
		 * same length must be used when we softlock and softunlock.
		 * We don't support softunlocking lengths less than
		 * the original length when there is largepage support.
		 * See seg_dev.c for more comments.
		 */
		switch (type) {

		case F_SOFTLOCK:
			CPU_STATS_ADD_K(vm, softlock, 1);
			break;

		case F_SOFTUNLOCK:
			break;

		case F_PROT:
			CPU_STATS_ADD_K(vm, prot_fault, 1);
			break;

		case F_INVAL:
			CPU_STATS_ENTER_K();
			CPU_STATS_ADDQ(CPU, vm, as_fault, 1);
			if (as == &kas)
				CPU_STATS_ADDQ(CPU, vm, kernel_asflt, 1);
			CPU_STATS_EXIT_K();
			break;
		}
	}

	/* Kernel probe */
	TNF_PROBE_3(address_fault, "vm pagefault", /* CSTYLED */,
	    tnf_opaque,		address,	addr,
	    tnf_fault_type,	fault_type,	type,
	    tnf_seg_access,	access,		rw);

	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
	    (size_t)raddr;

	/*
	 * XXX -- Don't grab the as lock for segkmap. We should grab it for
	 * correctness, but then we could be stuck holding this lock for
	 * a LONG time if the fault needs to be resolved on a slow
	 * filesystem, and then no-one will be able to exec new commands,
	 * as exec'ing requires the write lock on the as.
	 */
	if (as == &kas && segkmap && segkmap->s_base <= raddr &&
	    raddr + size < segkmap->s_base + segkmap->s_size) {
		/*
		 * if (as==&kas), this can't be XHAT: we've already returned
		 * FC_NOSUPPORT.
		 */
		seg = segkmap;
		as_lock_held = 0;
	} else {
		AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
		if (is_xhat && avl_numnodes(&as->a_wpage) != 0) {
			/*
			 * Grab and hold the writers' lock on the as
			 * if the fault is to a watched page.
			 * This will keep CPUs from "peeking" at the
			 * address range while we're temporarily boosting
			 * the permissions for the XHAT device to
			 * resolve the fault in the segment layer.
			 *
			 * We could check whether faulted address
			 * is within a watched page and only then grab
			 * the writer lock, but this is simpler.
			 */
			AS_LOCK_EXIT(as, &as->a_lock);
			AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
		}

		seg = as_segat(as, raddr);
		if (seg == NULL) {
			AS_LOCK_EXIT(as, &as->a_lock);
			if ((lwp != NULL) && (!is_xhat))
				lwp->lwp_nostop--;
			return (FC_NOMAP);
		}

		as_lock_held = 1;
	}

	addrsav = raddr;
	segsav = seg;

	for (; rsize != 0; rsize -= ssize, raddr += ssize) {
		if (raddr >= seg->s_base + seg->s_size) {
			seg = AS_SEGNEXT(as, seg);
			if (seg == NULL || raddr != seg->s_base) {
				res = FC_NOMAP;
				break;
			}
		}
		if (raddr + rsize > seg->s_base + seg->s_size)
			ssize = seg->s_base + seg->s_size - raddr;
		else
			ssize = rsize;

		if (!is_xhat || (seg->s_ops != &segdev_ops)) {

			if (is_xhat && avl_numnodes(&as->a_wpage) != 0 &&
			    pr_is_watchpage_as(raddr, rw, as)) {
				/*
				 * Handle watch pages.  If we're faulting on a
				 * watched page from an X-hat, we have to
				 * restore the original permissions while we
				 * handle the fault.
				 */
				as_clearwatch(as);
				holding_wpage = 1;
			}

			res = SEGOP_FAULT(hat, seg, raddr, ssize, type, rw);

			/* Restore watchpoints */
			if (holding_wpage) {
				as_setwatch(as);
				holding_wpage = 0;
			}

			if (res != 0)
				break;
		} else {
			/* XHAT does not support seg_dev */
			res = FC_NOSUPPORT;
			break;
		}
	}

	/*
	 * If we were SOFTLOCKing and encountered a failure,
	 * we must SOFTUNLOCK the range we already did. (Maybe we
	 * should just panic if we are SOFTLOCKing or even SOFTUNLOCKing
	 * right here...)
	 */
	if (res != 0 && type == F_SOFTLOCK) {
		for (seg = segsav; addrsav < raddr; addrsav += ssize) {
			if (addrsav >= seg->s_base + seg->s_size)
				seg = AS_SEGNEXT(as, seg);
			ASSERT(seg != NULL);
			/*
			 * Now call the fault routine again to perform the
			 * unlock using S_OTHER instead of the rw variable
			 * since we never got a chance to touch the pages.
			 */
			if (raddr > seg->s_base + seg->s_size)
				ssize = seg->s_base + seg->s_size - addrsav;
			else
				ssize = raddr - addrsav;
			(void) SEGOP_FAULT(hat, seg, addrsav, ssize,
			    F_SOFTUNLOCK, S_OTHER);
		}
	}
	if (as_lock_held)
		AS_LOCK_EXIT(as, &as->a_lock);
	if ((lwp != NULL) && (!is_xhat))
		lwp->lwp_nostop--;

	/*
	 * If the lower levels returned EDEADLK for a fault,
	 * it means that we should retry the fault.  Let's wait
	 * a bit also to let the deadlock causing condition clear.
	 * This is part of a gross hack to work around a design flaw
	 * in the ufs/sds logging code and should go away when the
	 * logging code is re-designed to fix the problem. See bug
	 * 4125102 for details of the problem.
	 */
	if (FC_ERRNO(res) == EDEADLK) {
		delay(deadlk_wait);
		res = 0;
		goto retry;
	}
	return (res);
}



/*
 * Asynchronous ``fault'' at addr for size bytes.
 */
faultcode_t
as_faulta(struct as *as, caddr_t addr, size_t size)
{
	struct seg *seg;
	caddr_t raddr;			/* rounded down addr */
	size_t rsize;			/* rounded up size */
	faultcode_t res = 0;
	klwp_t *lwp = ttolwp(curthread);

retry:
	/*
	 * Indicate that the lwp is not to be stopped while waiting
	 * for a pagefault.  This is to avoid deadlock while debugging
	 * a process via /proc over NFS (in particular).
	 */
	if (lwp != NULL)
		lwp->lwp_nostop++;

	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
	    (size_t)raddr;

	AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
	seg = as_segat(as, raddr);
	if (seg == NULL) {
		AS_LOCK_EXIT(as, &as->a_lock);
		if (lwp != NULL)
			lwp->lwp_nostop--;
		return (FC_NOMAP);
	}

	for (; rsize != 0; rsize -= PAGESIZE, raddr += PAGESIZE) {
		if (raddr >= seg->s_base + seg->s_size) {
			seg = AS_SEGNEXT(as, seg);
			if (seg == NULL || raddr != seg->s_base) {
				res = FC_NOMAP;
				break;
			}
		}
		res = SEGOP_FAULTA(seg, raddr);
		if (res != 0)
			break;
	}
	AS_LOCK_EXIT(as, &as->a_lock);
	if (lwp != NULL)
		lwp->lwp_nostop--;
	/*
	 * If the lower levels returned EDEADLK for a fault,
	 * it means that we should retry the fault.  Let's wait
	 * a bit also to let the deadlock causing condition clear.
	 * This is part of a gross hack to work around a design flaw
	 * in the ufs/sds logging code and should go away when the
	 * logging code is re-designed to fix the problem. See bug
	 * 4125102 for details of the problem.
	 */
	if (FC_ERRNO(res) == EDEADLK) {
		delay(deadlk_wait);
		res = 0;
		goto retry;
	}
	return (res);
}

/*
 * Set the virtual mapping for the interval from [addr : addr + size)
 * in address space `as' to have the specified protection.
 * It is ok for the range to cross over several segments,
 * as long as they are contiguous.
 */
int
as_setprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
{
	struct seg *seg;
	struct as_callback *cb;
	size_t ssize;
	caddr_t raddr;			/* rounded down addr */
	size_t rsize;			/* rounded up size */
	int error = 0, writer = 0;
	caddr_t saveraddr;
	size_t saversize;

setprot_top:
	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
	    (size_t)raddr;

	if (raddr + rsize < raddr)		/* check for wraparound */
		return (ENOMEM);

	saveraddr = raddr;
	saversize = rsize;

	/*
	 * Normally we only lock the as as a reader. But
	 * if due to setprot the segment driver needs to split
	 * a segment it will return IE_RETRY. Therefore we re-acquire
	 * the as lock as a writer so the segment driver can change
	 * the seg list. Also the segment driver will return IE_RETRY
	 * after it has changed the segment list so we therefore keep
	 * locking as a writer. Since these operations should be rare
	 * we want to only lock as a writer when necessary.
	 */
	if (writer || avl_numnodes(&as->a_wpage) != 0) {
		AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
	} else {
		AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
	}

	as_clearwatchprot(as, raddr, rsize);
	seg = as_segat(as, raddr);
	if (seg == NULL) {
		as_setwatch(as);
		AS_LOCK_EXIT(as, &as->a_lock);
		return (ENOMEM);
	}

	for (; rsize != 0; rsize -= ssize, raddr += ssize) {
		if (raddr >= seg->s_base + seg->s_size) {
			seg = AS_SEGNEXT(as, seg);
			if (seg == NULL || raddr != seg->s_base) {
				error = ENOMEM;
				break;
			}
		}
		if ((raddr + rsize) > (seg->s_base + seg->s_size))
			ssize = seg->s_base + seg->s_size - raddr;
		else
			ssize = rsize;
retry:
		error = SEGOP_SETPROT(seg, raddr, ssize, prot);

		if (error == IE_NOMEM) {
			error = EAGAIN;
			break;
		}

		if (error == IE_RETRY) {
			AS_LOCK_EXIT(as, &as->a_lock);
			writer = 1;
			goto setprot_top;
		}

		if (error == EAGAIN) {
			/*
			 * Make sure we have a_lock as writer.
			 */
			if (writer == 0) {
				AS_LOCK_EXIT(as, &as->a_lock);
				writer = 1;
				goto setprot_top;
			}

			/*
			 * Memory is currently locked.  It must be unlocked
			 * before this operation can succeed through a retry.
			 * The possible reasons for locked memory and
			 * corresponding strategies for unlocking are:
			 * (1) Normal I/O
			 *	wait for a signal that the I/O operation
			 *	has completed and the memory is unlocked.
			 * (2) Asynchronous I/O
			 *	The aio subsystem does not unlock pages when
			 *	the I/O is completed. Those pages are unlocked
			 *	when the application calls aiowait/aioerror.
			 *	So, to prevent blocking forever, cv_broadcast()
			 *	is done to wake up aio_cleanup_thread.
			 *	Subsequently, segvn_reclaim will be called, and
			 *	that will do AS_CLRUNMAPWAIT() and wake us up.
			 * (3) Long term page locking:
			 *	Drivers intending to have pages locked for a
			 *	period considerably longer than for normal I/O
			 *	(essentially forever) may have registered for a
			 *	callback so they may unlock these pages on
			 *	request. This is needed to allow this operation
			 *	to succeed. Each entry on the callback list is
			 *	examined. If the event or address range pertains
			 *	the callback is invoked (unless it already is in
			 *	progress). The a_contents lock must be dropped
			 *	before the callback, so only one callback can
			 *	be done at a time. Go to the top and do more
			 *	until zero is returned. If zero is returned,
			 *	either there were no callbacks for this event
			 *	or they were already in progress.
			 */
			mutex_enter(&as->a_contents);
			if (as->a_callbacks &&
			    (cb = as_find_callback(as, AS_SETPROT_EVENT,
			    seg->s_base, seg->s_size))) {
				AS_LOCK_EXIT(as, &as->a_lock);
				as_execute_callback(as, cb, AS_SETPROT_EVENT);
			} else if (!AS_ISNOUNMAPWAIT(as)) {
				if (AS_ISUNMAPWAIT(as) == 0)
					cv_broadcast(&as->a_cv);
				AS_SETUNMAPWAIT(as);
				AS_LOCK_EXIT(as, &as->a_lock);
				while (AS_ISUNMAPWAIT(as))
					cv_wait(&as->a_cv, &as->a_contents);
			} else {
				/*
				 * We may have raced with
				 * segvn_reclaim()/segspt_reclaim(). In this
				 * case clean nounmapwait flag and retry since
				 * softlockcnt in this segment may be already
				 * 0.  We don't drop as writer lock so our
				 * number of retries without sleeping should
				 * be very small.
				 * See segvn_reclaim() for more comments.
				 */
				AS_CLRNOUNMAPWAIT(as);
				mutex_exit(&as->a_contents);
				goto retry;
			}
			mutex_exit(&as->a_contents);
			goto setprot_top;
		} else if (error != 0)
			break;
	}
	if (error != 0) {
		as_setwatch(as);
	} else {
		as_setwatchprot(as, saveraddr, saversize, prot);
	}
	AS_LOCK_EXIT(as, &as->a_lock);
	return (error);
}

/*
 * Check to make sure that the interval [addr, addr + size)
 * in address space `as' has at least the specified protection.
 * It is ok for the range to cross over several segments, as long
 * as they are contiguous.
 */
int
as_checkprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
{
	struct seg *seg;
	size_t ssize;
	caddr_t raddr;			/* rounded down addr */
	size_t rsize;			/* rounded up size */
	int error = 0;

	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
	    (size_t)raddr;

	if (raddr + rsize < raddr)		/* check for wraparound */
		return (ENOMEM);

	/*
	 * This is ugly as sin...
	 * Normally, we only acquire the address space readers lock.
	 * However, if the address space has watchpoints present,
	 * we must acquire the writer lock on the address space for
	 * the benefit of as_clearwatchprot() and as_setwatchprot().
	 */
	if (avl_numnodes(&as->a_wpage) != 0)
		AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
	else
		AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
	as_clearwatchprot(as, raddr, rsize);
	seg = as_segat(as, raddr);
	if (seg == NULL) {
		as_setwatch(as);
		AS_LOCK_EXIT(as, &as->a_lock);
		return (ENOMEM);
	}

	for (; rsize != 0; rsize -= ssize, raddr += ssize) {
		if (raddr >= seg->s_base + seg->s_size) {
			seg = AS_SEGNEXT(as, seg);
			if (seg == NULL || raddr != seg->s_base) {
				error = ENOMEM;
				break;
			}
		}
		if ((raddr + rsize) > (seg->s_base + seg->s_size))
			ssize = seg->s_base + seg->s_size - raddr;
		else
			ssize = rsize;

		error = SEGOP_CHECKPROT(seg, raddr, ssize, prot);
		if (error != 0)
			break;
	}
	as_setwatch(as);
	AS_LOCK_EXIT(as, &as->a_lock);
	return (error);
}

int
as_unmap(struct as *as, caddr_t addr, size_t size)
{
	struct seg *seg, *seg_next;
	struct as_callback *cb;
	caddr_t raddr, eaddr;
	size_t ssize;
	int err;

top:
	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	eaddr = (caddr_t)(((uintptr_t)(addr + size) + PAGEOFFSET) &
	    (uintptr_t)PAGEMASK);

	AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);

	as->a_updatedir = 1;	/* inform /proc */
	gethrestime(&as->a_updatetime);

	/*
	 * Use as_findseg to find the first segment in the range, then
	 * step through the segments in order, following s_next.
	 */
	as_clearwatchprot(as, raddr, eaddr - raddr);

	for (seg = as_findseg(as, raddr, 0); seg != NULL; seg = seg_next) {
		if (eaddr <= seg->s_base)
			break;		/* eaddr was in a gap; all done */

		/* this is implied by the test above */
		ASSERT(raddr < eaddr);

		if (raddr < seg->s_base)
			raddr = seg->s_base;	/* raddr was in a gap */

		if (eaddr > (seg->s_base + seg->s_size))
			ssize = seg->s_base + seg->s_size - raddr;
		else
			ssize = eaddr - raddr;

		/*
		 * Save next segment pointer since seg can be
		 * destroyed during the segment unmap operation.
		 */
		seg_next = AS_SEGNEXT(as, seg);

retry:
		err = SEGOP_UNMAP(seg, raddr, ssize);
		if (err == EAGAIN) {
			/*
			 * Memory is currently locked.  It must be unlocked
			 * before this operation can succeed through a retry.
			 * The possible reasons for locked memory and
			 * corresponding strategies for unlocking are:
			 * (1) Normal I/O
			 *	wait for a signal that the I/O operation
			 *	has completed and the memory is unlocked.
			 * (2) Asynchronous I/O
			 *	The aio subsystem does not unlock pages when
			 *	the I/O is completed. Those pages are unlocked
			 *	when the application calls aiowait/aioerror.
			 *	So, to prevent blocking forever, cv_broadcast()
			 *	is done to wake up aio_cleanup_thread.
			 *	Subsequently, segvn_reclaim will be called, and
			 *	that will do AS_CLRUNMAPWAIT() and wake us up.
			 * (3) Long term page locking:
			 *	Drivers intending to have pages locked for a
			 *	period considerably longer than for normal I/O
			 *	(essentially forever) may have registered for a
			 *	callback so they may unlock these pages on
			 *	request. This is needed to allow this operation
			 *	to succeed. Each entry on the callback list is
			 *	examined. If the event or address range pertains
			 *	the callback is invoked (unless it already is in
			 *	progress). The a_contents lock must be dropped
			 *	before the callback, so only one callback can
			 *	be done at a time. Go to the top and do more
			 *	until zero is returned. If zero is returned,
			 *	either there were no callbacks for this event
			 *	or they were already in progress.
			 */
			mutex_enter(&as->a_contents);
			if (as->a_callbacks &&
			    (cb = as_find_callback(as, AS_UNMAP_EVENT,
			    seg->s_base, seg->s_size))) {
				AS_LOCK_EXIT(as, &as->a_lock);
				as_execute_callback(as, cb, AS_UNMAP_EVENT);
			} else if (!AS_ISNOUNMAPWAIT(as)) {
				if (AS_ISUNMAPWAIT(as) == 0)
					cv_broadcast(&as->a_cv);
				AS_SETUNMAPWAIT(as);
				AS_LOCK_EXIT(as, &as->a_lock);
				while (AS_ISUNMAPWAIT(as))
					cv_wait(&as->a_cv, &as->a_contents);
			} else {
				/*
				 * We may have raced with
				 * segvn_reclaim()/segspt_reclaim(). In this
				 * case clean nounmapwait flag and retry since
				 * softlockcnt in this segment may be already
				 * 0.  We don't drop as writer lock so our
				 * number of retries without sleeping should
				 * be very small. See segvn_reclaim() for
				 * more comments.
				 */
				AS_CLRNOUNMAPWAIT(as);
				mutex_exit(&as->a_contents);
				goto retry;
			}
			mutex_exit(&as->a_contents);
			goto top;
		} else if (err == IE_RETRY) {
			AS_LOCK_EXIT(as, &as->a_lock);
			goto top;
		} else if (err) {
			as_setwatch(as);
			AS_LOCK_EXIT(as, &as->a_lock);
			return (-1);
		}

		as->a_size -= ssize;
		raddr += ssize;
	}
	AS_LOCK_EXIT(as, &as->a_lock);
	return (0);
}

static int
as_map_segvn_segs(struct as *as, caddr_t addr, size_t size, uint_t szcvec,
    int (*crfp)(), struct segvn_crargs *vn_a, int *segcreated)
{
	uint_t szc;
	uint_t nszc;
	int error;
	caddr_t a;
	caddr_t eaddr;
	size_t segsize;
	struct seg *seg;
	size_t pgsz;
	int do_off = (vn_a->vp != NULL || vn_a->amp != NULL);
	uint_t save_szcvec;

	ASSERT(AS_WRITE_HELD(as, &as->a_lock));
	ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
	ASSERT(IS_P2ALIGNED(size, PAGESIZE));
	ASSERT(vn_a->vp == NULL || vn_a->amp == NULL);
	if (!do_off) {
		vn_a->offset = 0;
	}

	if (szcvec <= 1) {
		seg = seg_alloc(as, addr, size);
		if (seg == NULL) {
			return (ENOMEM);
		}
		vn_a->szc = 0;
		error = (*crfp)(seg, vn_a);
		if (error != 0) {
			seg_free(seg);
		} else {
			as->a_size += size;
		}
		return (error);
	}

	eaddr = addr + size;
	save_szcvec = szcvec;
	szcvec >>= 1;
	szc = 0;
	nszc = 0;
	while (szcvec) {
		if ((szcvec & 0x1) == 0) {
			nszc++;
			szcvec >>= 1;
			continue;
		}
		nszc++;
		pgsz = page_get_pagesize(nszc);
		a = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
		if (a != addr) {
			ASSERT(a < eaddr);
			segsize = a - addr;
			seg = seg_alloc(as, addr, segsize);
			if (seg == NULL) {
				return (ENOMEM);
			}
			vn_a->szc = szc;
			error = (*crfp)(seg, vn_a);
			if (error != 0) {
				seg_free(seg);
				return (error);
			}
			as->a_size += segsize;
			*segcreated = 1;
			if (do_off) {
				vn_a->offset += segsize;
			}
			addr = a;
		}
		szc = nszc;
		szcvec >>= 1;
	}

	ASSERT(addr < eaddr);
	szcvec = save_szcvec | 1; /* add 8K pages */
	while (szcvec) {
		a = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
		ASSERT(a >= addr);
		if (a != addr) {
			segsize = a - addr;
			seg = seg_alloc(as, addr, segsize);
			if (seg == NULL) {
				return (ENOMEM);
			}
			vn_a->szc = szc;
			error = (*crfp)(seg, vn_a);
			if (error != 0) {
				seg_free(seg);
				return (error);
			}
			as->a_size += segsize;
			*segcreated = 1;
			if (do_off) {
				vn_a->offset += segsize;
			}
			addr = a;
		}
		szcvec &= ~(1 << szc);
		if (szcvec) {
			szc = highbit(szcvec) - 1;
			pgsz = page_get_pagesize(szc);
		}
	}
	ASSERT(addr == eaddr);

	return (0);
}

static int
as_map_vnsegs(struct as *as, caddr_t addr, size_t size,
    int (*crfp)(), struct segvn_crargs *vn_a, int *segcreated)
{
	uint_t mapflags = vn_a->flags & (MAP_TEXT | MAP_INITDATA);
	int type = (vn_a->type == MAP_SHARED) ?
	    MAPPGSZC_SHM : MAPPGSZC_PRIVM;
	uint_t szcvec = map_pgszcvec(addr, size, (uintptr_t)addr, mapflags,
	    type, 0);
	int error;
	struct seg *seg;
	struct vattr va;
	u_offset_t eoff;
	size_t save_size = 0;
	extern size_t textrepl_size_thresh;

	ASSERT(AS_WRITE_HELD(as, &as->a_lock));
	ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
	ASSERT(IS_P2ALIGNED(size, PAGESIZE));
	ASSERT(vn_a->vp != NULL);
	ASSERT(vn_a->amp == NULL);

again:
	if (szcvec <= 1) {
		seg = seg_alloc(as, addr, size);
		if (seg == NULL) {
			return (ENOMEM);
		}
		vn_a->szc = 0;
		error = (*crfp)(seg, vn_a);
		if (error != 0) {
			seg_free(seg);
		} else {
			as->a_size += size;
		}
		return (error);
	}

	va.va_mask = AT_SIZE;
	if (VOP_GETATTR(vn_a->vp, &va, ATTR_HINT, vn_a->cred, NULL) != 0) {
		szcvec = 0;
		goto again;
	}
	eoff = vn_a->offset & PAGEMASK;
	if (eoff >= va.va_size) {
		szcvec = 0;
		goto again;
	}
	eoff += size;
	if (btopr(va.va_size) < btopr(eoff)) {
		save_size = size;
		size = va.va_size - (vn_a->offset & PAGEMASK);
		size = P2ROUNDUP_TYPED(size, PAGESIZE, size_t);
		szcvec = map_pgszcvec(addr, size, (uintptr_t)addr, mapflags,
		    type, 0);
		if (szcvec <= 1) {
			size = save_size;
			goto again;
		}
	}

	if (size > textrepl_size_thresh) {
		vn_a->flags |= _MAP_TEXTREPL;
	}
	error = as_map_segvn_segs(as, addr, size, szcvec, crfp, vn_a,
	    segcreated);
	if (error != 0) {
		return (error);
	}
	if (save_size) {
		addr += size;
		size = save_size - size;
		szcvec = 0;
		goto again;
	}
	return (0);
}

/*
 * as_map_ansegs: shared or private anonymous memory.  Note that the flags
 * passed to map_pgszvec cannot be MAP_INITDATA, for anon.
 */
static int
as_map_ansegs(struct as *as, caddr_t addr, size_t size,
    int (*crfp)(), struct segvn_crargs *vn_a, int *segcreated)
{
	uint_t szcvec;
	uchar_t type;

	ASSERT(vn_a->type == MAP_SHARED || vn_a->type == MAP_PRIVATE);
	if (vn_a->type == MAP_SHARED) {
		type = MAPPGSZC_SHM;
	} else if (vn_a->type == MAP_PRIVATE) {
		if (vn_a->szc == AS_MAP_HEAP) {
			type = MAPPGSZC_HEAP;
		} else if (vn_a->szc == AS_MAP_STACK) {
			type = MAPPGSZC_STACK;
		} else {
			type = MAPPGSZC_PRIVM;
		}
	}
	szcvec = map_pgszcvec(addr, size, vn_a->amp == NULL ?
	    (uintptr_t)addr : (uintptr_t)P2ROUNDUP(vn_a->offset, PAGESIZE),
	    (vn_a->flags & MAP_TEXT), type, 0);
	ASSERT(AS_WRITE_HELD(as, &as->a_lock));
	ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
	ASSERT(IS_P2ALIGNED(size, PAGESIZE));
	ASSERT(vn_a->vp == NULL);

	return (as_map_segvn_segs(as, addr, size, szcvec,
	    crfp, vn_a, segcreated));
}

int
as_map(struct as *as, caddr_t addr, size_t size, int (*crfp)(), void *argsp)
{
	AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
	return (as_map_locked(as, addr, size, crfp, argsp));
}

int
as_map_locked(struct as *as, caddr_t addr, size_t size, int (*crfp)(),
    void *argsp)
{
	struct seg *seg = NULL;
	caddr_t raddr;			/* rounded down addr */
	size_t rsize;			/* rounded up size */
	int error;
	int unmap = 0;
	struct proc *p = curproc;
	struct segvn_crargs crargs;

	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
	    (size_t)raddr;

	/*
	 * check for wrap around
	 */
	if ((raddr + rsize < raddr) || (as->a_size > (ULONG_MAX - size))) {
		AS_LOCK_EXIT(as, &as->a_lock);
		return (ENOMEM);
	}

	as->a_updatedir = 1;	/* inform /proc */
	gethrestime(&as->a_updatetime);

	if (as != &kas && as->a_size + rsize > (size_t)p->p_vmem_ctl) {
		AS_LOCK_EXIT(as, &as->a_lock);

		(void) rctl_action(rctlproc_legacy[RLIMIT_VMEM], p->p_rctls, p,
		    RCA_UNSAFE_ALL);

		return (ENOMEM);
	}

	if (AS_MAP_CHECK_VNODE_LPOOB(crfp, argsp)) {
		crargs = *(struct segvn_crargs *)argsp;
		error = as_map_vnsegs(as, raddr, rsize, crfp, &crargs, &unmap);
		if (error != 0) {
			AS_LOCK_EXIT(as, &as->a_lock);
			if (unmap) {
				(void) as_unmap(as, addr, size);
			}
			return (error);
		}
	} else if (AS_MAP_CHECK_ANON_LPOOB(crfp, argsp)) {
		crargs = *(struct segvn_crargs *)argsp;
		error = as_map_ansegs(as, raddr, rsize, crfp, &crargs, &unmap);
		if (error != 0) {
			AS_LOCK_EXIT(as, &as->a_lock);
			if (unmap) {
				(void) as_unmap(as, addr, size);
			}
			return (error);
		}
	} else {
		seg = seg_alloc(as, addr, size);
		if (seg == NULL) {
			AS_LOCK_EXIT(as, &as->a_lock);
			return (ENOMEM);
		}

		error = (*crfp)(seg, argsp);
		if (error != 0) {
			seg_free(seg);
			AS_LOCK_EXIT(as, &as->a_lock);
			return (error);
		}
		/*
		 * Add size now so as_unmap will work if as_ctl fails.
		 */
		as->a_size += rsize;
	}

	as_setwatch(as);

	/*
	 * If the address space is locked,
	 * establish memory locks for the new segment.
	 */
	mutex_enter(&as->a_contents);
	if (AS_ISPGLCK(as)) {
		mutex_exit(&as->a_contents);
		AS_LOCK_EXIT(as, &as->a_lock);
		error = as_ctl(as, addr, size, MC_LOCK, 0, 0, NULL, 0);
		if (error != 0)
			(void) as_unmap(as, addr, size);
	} else {
		mutex_exit(&as->a_contents);
		AS_LOCK_EXIT(as, &as->a_lock);
	}
	return (error);
}


/*
 * Delete all segments in the address space marked with S_PURGE.
 * This is currently used for Sparc V9 nofault ASI segments (seg_nf.c).
 * These segments are deleted as a first step before calls to as_gap(), so
 * that they don't affect mmap() or shmat().
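 *
 * In other words, a caller about to search for a gap is expected to purge
 * first, roughly as in the sketch below (the surrounding code is a
 * hypothetical caller, not taken from this file):
 *
 *	as_rangelock(as);
 *	as_purge(as);
 *	if (as_gap(as, minlen, &base, &len, AH_LO, NULL) == 0)
 *		... map something within [base, base + len) ...
 *	as_rangeunlock(as);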
 */
void
as_purge(struct as *as)
{
	struct seg *seg;
	struct seg *next_seg;

	/*
	 * the setting of NEEDSPURGE is protected by as_rangelock(), so
	 * no need to grab a_contents mutex for this check
	 */
	if ((as->a_flags & AS_NEEDSPURGE) == 0)
		return;

	AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
	next_seg = NULL;
	seg = AS_SEGFIRST(as);
	while (seg != NULL) {
		next_seg = AS_SEGNEXT(as, seg);
		if (seg->s_flags & S_PURGE)
			SEGOP_UNMAP(seg, seg->s_base, seg->s_size);
		seg = next_seg;
	}
	AS_LOCK_EXIT(as, &as->a_lock);

	mutex_enter(&as->a_contents);
	as->a_flags &= ~AS_NEEDSPURGE;
	mutex_exit(&as->a_contents);
}

/*
 * Find a hole within [*basep, *basep + *lenp), which contains a mappable
 * range of addresses at least "minlen" long, where the base of the range is
 * at "off" phase from an "align" boundary and there is space for a
 * "redzone"-sized redzone on either side of the range.  Thus,
 * if align was 4M and off was 16k, the user wants a hole which will start
 * 16k into a 4M page.
 *
 * If flags specifies AH_HI, the hole will have the highest possible address
 * in the range.  We use the as->a_lastgap field to figure out where to
 * start looking for a gap.
 *
 * Otherwise, the gap will have the lowest possible address.
 *
 * If flags specifies AH_CONTAIN, the hole will contain the address addr.
 *
 * If an adequate hole is found, *basep and *lenp are set to reflect the part of
 * the hole that is within range, and 0 is returned.  On failure, -1 is returned.
 *
 * NOTE: This routine is not correct when base+len overflows caddr_t.
 */
int
as_gap_aligned(struct as *as, size_t minlen, caddr_t *basep, size_t *lenp,
    uint_t flags, caddr_t addr, size_t align, size_t redzone, size_t off)
{
	caddr_t lobound = *basep;
	caddr_t hibound = lobound + *lenp;
	struct seg *lseg, *hseg;
	caddr_t lo, hi;
	int forward;
	caddr_t save_base;
	size_t save_len;
	size_t save_minlen;
	size_t save_redzone;
	int fast_path = 1;

	save_base = *basep;
	save_len = *lenp;
	save_minlen = minlen;
	save_redzone = redzone;

	/*
	 * For the first pass/fast_path, just add align and redzone into
	 * minlen since if we get an allocation, we can guarantee that it
	 * will fit the alignment and redzone requested.
	 * This increases the chance that hibound will be adjusted to
	 * a_lastgap->s_base which will likely allow us to find an
	 * acceptable hole in the address space quicker.
	 * If we can't find a hole with this fast_path, then we look for
	 * smaller holes in which the alignment and offset may allow
	 * the allocation to fit.
	 */
	minlen += align;
	minlen += 2 * redzone;
	redzone = 0;

	AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
	if (AS_SEGFIRST(as) == NULL) {
		if (valid_va_range_aligned(basep, lenp, minlen, flags & AH_DIR,
		    align, redzone, off)) {
			AS_LOCK_EXIT(as, &as->a_lock);
			return (0);
		} else {
			AS_LOCK_EXIT(as, &as->a_lock);
			*basep = save_base;
			*lenp = save_len;
			return (-1);
		}
	}

retry:
	/*
	 * Set up to iterate over all the inter-segment holes in the given
	 * direction.  lseg is NULL for the lowest-addressed hole and hseg is
	 * NULL for the highest-addressed hole.
If moving backwards, we reset 1937 * sseg to denote the highest-addressed segment. 1938 */ 1939 forward = (flags & AH_DIR) == AH_LO; 1940 if (forward) { 1941 hseg = as_findseg(as, lobound, 1); 1942 lseg = AS_SEGPREV(as, hseg); 1943 } else { 1944 1945 /* 1946 * If allocating at least as much as the last allocation, 1947 * use a_lastgap's base as a better estimate of hibound. 1948 */ 1949 if (as->a_lastgap && 1950 minlen >= as->a_lastgap->s_size && 1951 hibound >= as->a_lastgap->s_base) 1952 hibound = as->a_lastgap->s_base; 1953 1954 hseg = as_findseg(as, hibound, 1); 1955 if (hseg->s_base + hseg->s_size < hibound) { 1956 lseg = hseg; 1957 hseg = NULL; 1958 } else { 1959 lseg = AS_SEGPREV(as, hseg); 1960 } 1961 } 1962 1963 for (;;) { 1964 /* 1965 * Set lo and hi to the hole's boundaries. (We should really 1966 * use MAXADDR in place of hibound in the expression below, 1967 * but can't express it easily; using hibound in its place is 1968 * harmless.) 1969 */ 1970 lo = (lseg == NULL) ? 0 : lseg->s_base + lseg->s_size; 1971 hi = (hseg == NULL) ? hibound : hseg->s_base; 1972 /* 1973 * If the iteration has moved past the interval from lobound 1974 * to hibound it's pointless to continue. 1975 */ 1976 if ((forward && lo > hibound) || (!forward && hi < lobound)) 1977 break; 1978 else if (lo > hibound || hi < lobound) 1979 goto cont; 1980 /* 1981 * Candidate hole lies at least partially within the allowable 1982 * range. Restrict it to fall completely within that range, 1983 * i.e., to [max(lo, lobound), min(hi, hibound)]. 1984 */ 1985 if (lo < lobound) 1986 lo = lobound; 1987 if (hi > hibound) 1988 hi = hibound; 1989 /* 1990 * Verify that the candidate hole is big enough and meets 1991 * hardware constraints. If the hole is too small, no need 1992 * to do the further checks since they will fail. 1993 */ 1994 *basep = lo; 1995 *lenp = hi - lo; 1996 if (*lenp >= minlen && valid_va_range_aligned(basep, lenp, 1997 minlen, forward ? AH_LO : AH_HI, align, redzone, off) && 1998 ((flags & AH_CONTAIN) == 0 || 1999 (*basep <= addr && *basep + *lenp > addr))) { 2000 if (!forward) 2001 as->a_lastgap = hseg; 2002 if (hseg != NULL) 2003 as->a_lastgaphl = hseg; 2004 else 2005 as->a_lastgaphl = lseg; 2006 AS_LOCK_EXIT(as, &as->a_lock); 2007 return (0); 2008 } 2009 cont: 2010 /* 2011 * Move to the next hole. 2012 */ 2013 if (forward) { 2014 lseg = hseg; 2015 if (lseg == NULL) 2016 break; 2017 hseg = AS_SEGNEXT(as, hseg); 2018 } else { 2019 hseg = lseg; 2020 if (hseg == NULL) 2021 break; 2022 lseg = AS_SEGPREV(as, lseg); 2023 } 2024 } 2025 if (fast_path && (align != 0 || save_redzone != 0)) { 2026 fast_path = 0; 2027 minlen = save_minlen; 2028 redzone = save_redzone; 2029 goto retry; 2030 } 2031 *basep = save_base; 2032 *lenp = save_len; 2033 AS_LOCK_EXIT(as, &as->a_lock); 2034 return (-1); 2035 } 2036 2037 /* 2038 * Find a hole of at least size minlen within [*basep, *basep + *lenp). 2039 * 2040 * If flags specifies AH_HI, the hole will have the highest possible address 2041 * in the range. We use the as->a_lastgap field to figure out where to 2042 * start looking for a gap. 2043 * 2044 * Otherwise, the gap will have the lowest possible address. 2045 * 2046 * If flags specifies AH_CONTAIN, the hole will contain the address addr. 2047 * 2048 * If an adequate hole is found, base and len are set to reflect the part of 2049 * the hole that is within range, and 0 is returned, otherwise, 2050 * -1 is returned. 2051 * 2052 * NOTE: This routine is not correct when base+len overflows caddr_t. 
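 *
 * For example (hypothetical caller wanting to grow an existing mapping in
 * place), AH_CONTAIN can be used to ask for the hole that contains a
 * particular address:
 *
 *	base = search_start;
 *	len = search_end - search_start;
 *	if (as_gap(as, grow_len, &base, &len, AH_LO | AH_CONTAIN,
 *	    old_end) == 0) {
 *		... the unmapped range [base, base + len) contains old_end
 *	}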
2053 */ 2054 int 2055 as_gap(struct as *as, size_t minlen, caddr_t *basep, size_t *lenp, uint_t flags, 2056 caddr_t addr) 2057 { 2058 2059 return (as_gap_aligned(as, minlen, basep, lenp, flags, addr, 0, 0, 0)); 2060 } 2061 2062 /* 2063 * Return the next range within [base, base + len) that is backed 2064 * with "real memory". Skip holes and non-seg_vn segments. 2065 * We're lazy and only return one segment at a time. 2066 */ 2067 int 2068 as_memory(struct as *as, caddr_t *basep, size_t *lenp) 2069 { 2070 extern struct seg_ops segspt_shmops; /* needs a header file */ 2071 struct seg *seg; 2072 caddr_t addr, eaddr; 2073 caddr_t segend; 2074 2075 AS_LOCK_ENTER(as, &as->a_lock, RW_READER); 2076 2077 addr = *basep; 2078 eaddr = addr + *lenp; 2079 2080 seg = as_findseg(as, addr, 0); 2081 if (seg != NULL) 2082 addr = MAX(seg->s_base, addr); 2083 2084 for (;;) { 2085 if (seg == NULL || addr >= eaddr || eaddr <= seg->s_base) { 2086 AS_LOCK_EXIT(as, &as->a_lock); 2087 return (EINVAL); 2088 } 2089 2090 if (seg->s_ops == &segvn_ops) { 2091 segend = seg->s_base + seg->s_size; 2092 break; 2093 } 2094 2095 /* 2096 * We do ISM by looking into the private data 2097 * to determine the real size of the segment. 2098 */ 2099 if (seg->s_ops == &segspt_shmops) { 2100 segend = seg->s_base + spt_realsize(seg); 2101 if (addr < segend) 2102 break; 2103 } 2104 2105 seg = AS_SEGNEXT(as, seg); 2106 2107 if (seg != NULL) 2108 addr = seg->s_base; 2109 } 2110 2111 *basep = addr; 2112 2113 if (segend > eaddr) 2114 *lenp = eaddr - addr; 2115 else 2116 *lenp = segend - addr; 2117 2118 AS_LOCK_EXIT(as, &as->a_lock); 2119 return (0); 2120 } 2121 2122 /* 2123 * Swap the pages associated with the address space as out to 2124 * secondary storage, returning the number of bytes actually 2125 * swapped. 2126 * 2127 * The value returned is intended to correlate well with the process's 2128 * memory requirements. Its usefulness for this purpose depends on 2129 * how well the segment-level routines do at returning accurate 2130 * information. 2131 */ 2132 size_t 2133 as_swapout(struct as *as) 2134 { 2135 struct seg *seg; 2136 size_t swpcnt = 0; 2137 2138 /* 2139 * Kernel-only processes have given up their address 2140 * spaces. Of course, we shouldn't be attempting to 2141 * swap out such processes in the first place... 2142 */ 2143 if (as == NULL) 2144 return (0); 2145 2146 AS_LOCK_ENTER(as, &as->a_lock, RW_READER); 2147 2148 /* Prevent XHATs from attaching */ 2149 mutex_enter(&as->a_contents); 2150 AS_SETBUSY(as); 2151 mutex_exit(&as->a_contents); 2152 2153 2154 /* 2155 * Free all mapping resources associated with the address 2156 * space. The segment-level swapout routines capitalize 2157 * on this unmapping by scavenging pages that have become 2158 * unmapped here. 2159 */ 2160 hat_swapout(as->a_hat); 2161 if (as->a_xhat != NULL) 2162 xhat_swapout_all(as); 2163 2164 mutex_enter(&as->a_contents); 2165 AS_CLRBUSY(as); 2166 mutex_exit(&as->a_contents); 2167 2168 /* 2169 * Call the swapout routines of all segments in the address 2170 * space to do the actual work, accumulating the amount of 2171 * space reclaimed. 2172 */ 2173 for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) { 2174 struct seg_ops *ov = seg->s_ops; 2175 2176 /* 2177 * We have to check to see if the seg has 2178 * an ops vector because the seg may have 2179 * been in the middle of being set up when 2180 * the process was picked for swapout.
2181 */ 2182 if ((ov != NULL) && (ov->swapout != NULL)) 2183 swpcnt += SEGOP_SWAPOUT(seg); 2184 } 2185 AS_LOCK_EXIT(as, &as->a_lock); 2186 return (swpcnt); 2187 } 2188 2189 /* 2190 * Determine whether data from the mappings in interval [addr, addr + size) 2191 * are in the primary memory (core) cache. 2192 */ 2193 int 2194 as_incore(struct as *as, caddr_t addr, 2195 size_t size, char *vec, size_t *sizep) 2196 { 2197 struct seg *seg; 2198 size_t ssize; 2199 caddr_t raddr; /* rounded down addr */ 2200 size_t rsize; /* rounded up size */ 2201 size_t isize; /* iteration size */ 2202 int error = 0; /* result, assume success */ 2203 2204 *sizep = 0; 2205 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); 2206 rsize = ((((size_t)addr + size) + PAGEOFFSET) & PAGEMASK) - 2207 (size_t)raddr; 2208 2209 if (raddr + rsize < raddr) /* check for wraparound */ 2210 return (ENOMEM); 2211 2212 AS_LOCK_ENTER(as, &as->a_lock, RW_READER); 2213 seg = as_segat(as, raddr); 2214 if (seg == NULL) { 2215 AS_LOCK_EXIT(as, &as->a_lock); 2216 return (-1); 2217 } 2218 2219 for (; rsize != 0; rsize -= ssize, raddr += ssize) { 2220 if (raddr >= seg->s_base + seg->s_size) { 2221 seg = AS_SEGNEXT(as, seg); 2222 if (seg == NULL || raddr != seg->s_base) { 2223 error = -1; 2224 break; 2225 } 2226 } 2227 if ((raddr + rsize) > (seg->s_base + seg->s_size)) 2228 ssize = seg->s_base + seg->s_size - raddr; 2229 else 2230 ssize = rsize; 2231 *sizep += isize = SEGOP_INCORE(seg, raddr, ssize, vec); 2232 if (isize != ssize) { 2233 error = -1; 2234 break; 2235 } 2236 vec += btopr(ssize); 2237 } 2238 AS_LOCK_EXIT(as, &as->a_lock); 2239 return (error); 2240 } 2241 2242 static void 2243 as_segunlock(struct seg *seg, caddr_t addr, int attr, 2244 ulong_t *bitmap, size_t position, size_t npages) 2245 { 2246 caddr_t range_start; 2247 size_t pos1 = position; 2248 size_t pos2; 2249 size_t size; 2250 size_t end_pos = npages + position; 2251 2252 while (bt_range(bitmap, &pos1, &pos2, end_pos)) { 2253 size = ptob((pos2 - pos1)); 2254 range_start = (caddr_t)((uintptr_t)addr + 2255 ptob(pos1 - position)); 2256 2257 (void) SEGOP_LOCKOP(seg, range_start, size, attr, MC_UNLOCK, 2258 (ulong_t *)NULL, (size_t)NULL); 2259 pos1 = pos2; 2260 } 2261 } 2262 2263 static void 2264 as_unlockerr(struct as *as, int attr, ulong_t *mlock_map, 2265 caddr_t raddr, size_t rsize) 2266 { 2267 struct seg *seg = as_segat(as, raddr); 2268 size_t ssize; 2269 2270 while (rsize != 0) { 2271 if (raddr >= seg->s_base + seg->s_size) 2272 seg = AS_SEGNEXT(as, seg); 2273 2274 if ((raddr + rsize) > (seg->s_base + seg->s_size)) 2275 ssize = seg->s_base + seg->s_size - raddr; 2276 else 2277 ssize = rsize; 2278 2279 as_segunlock(seg, raddr, attr, mlock_map, 0, btopr(ssize)); 2280 2281 rsize -= ssize; 2282 raddr += ssize; 2283 } 2284 } 2285 2286 /* 2287 * Cache control operations over the interval [addr, addr + size) in 2288 * address space "as". 2289 */ 2290 /*ARGSUSED*/ 2291 int 2292 as_ctl(struct as *as, caddr_t addr, size_t size, int func, int attr, 2293 uintptr_t arg, ulong_t *lock_map, size_t pos) 2294 { 2295 struct seg *seg; /* working segment */ 2296 caddr_t raddr; /* rounded down addr */ 2297 caddr_t initraddr; /* saved initial rounded down addr */ 2298 size_t rsize; /* rounded up size */ 2299 size_t initrsize; /* saved initial rounded up size */ 2300 size_t ssize; /* size of seg */ 2301 int error = 0; /* result */ 2302 size_t mlock_size; /* size of bitmap */ 2303 ulong_t *mlock_map; /* pointer to bitmap used */ 2304 /* to represent the locked */ 2305 /* pages. 
*/ 2306 retry: 2307 if (error == IE_RETRY) 2308 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER); 2309 else 2310 AS_LOCK_ENTER(as, &as->a_lock, RW_READER); 2311 2312 /* 2313 * If these are address space lock/unlock operations, loop over 2314 * all segments in the address space, as appropriate. 2315 */ 2316 if (func == MC_LOCKAS) { 2317 size_t npages, idx; 2318 size_t rlen = 0; /* rounded as length */ 2319 2320 idx = pos; 2321 2322 if (arg & MCL_FUTURE) { 2323 mutex_enter(&as->a_contents); 2324 AS_SETPGLCK(as); 2325 mutex_exit(&as->a_contents); 2326 } 2327 if ((arg & MCL_CURRENT) == 0) { 2328 AS_LOCK_EXIT(as, &as->a_lock); 2329 return (0); 2330 } 2331 2332 seg = AS_SEGFIRST(as); 2333 if (seg == NULL) { 2334 AS_LOCK_EXIT(as, &as->a_lock); 2335 return (0); 2336 } 2337 2338 do { 2339 raddr = (caddr_t)((uintptr_t)seg->s_base & 2340 (uintptr_t)PAGEMASK); 2341 rlen += (((uintptr_t)(seg->s_base + seg->s_size) + 2342 PAGEOFFSET) & PAGEMASK) - (uintptr_t)raddr; 2343 } while ((seg = AS_SEGNEXT(as, seg)) != NULL); 2344 2345 mlock_size = BT_BITOUL(btopr(rlen)); 2346 if ((mlock_map = (ulong_t *)kmem_zalloc(mlock_size * 2347 sizeof (ulong_t), KM_NOSLEEP)) == NULL) { 2348 AS_LOCK_EXIT(as, &as->a_lock); 2349 return (EAGAIN); 2350 } 2351 2352 for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) { 2353 error = SEGOP_LOCKOP(seg, seg->s_base, 2354 seg->s_size, attr, MC_LOCK, mlock_map, pos); 2355 if (error != 0) 2356 break; 2357 pos += seg_pages(seg); 2358 } 2359 2360 if (error) { 2361 for (seg = AS_SEGFIRST(as); seg != NULL; 2362 seg = AS_SEGNEXT(as, seg)) { 2363 2364 raddr = (caddr_t)((uintptr_t)seg->s_base & 2365 (uintptr_t)PAGEMASK); 2366 npages = seg_pages(seg); 2367 as_segunlock(seg, raddr, attr, mlock_map, 2368 idx, npages); 2369 idx += npages; 2370 } 2371 } 2372 2373 kmem_free(mlock_map, mlock_size * sizeof (ulong_t)); 2374 AS_LOCK_EXIT(as, &as->a_lock); 2375 goto lockerr; 2376 } else if (func == MC_UNLOCKAS) { 2377 mutex_enter(&as->a_contents); 2378 AS_CLRPGLCK(as); 2379 mutex_exit(&as->a_contents); 2380 2381 for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) { 2382 error = SEGOP_LOCKOP(seg, seg->s_base, 2383 seg->s_size, attr, MC_UNLOCK, NULL, 0); 2384 if (error != 0) 2385 break; 2386 } 2387 2388 AS_LOCK_EXIT(as, &as->a_lock); 2389 goto lockerr; 2390 } 2391 2392 /* 2393 * Normalize addresses and sizes. 2394 */ 2395 initraddr = raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); 2396 initrsize = rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) - 2397 (size_t)raddr; 2398 2399 if (raddr + rsize < raddr) { /* check for wraparound */ 2400 AS_LOCK_EXIT(as, &as->a_lock); 2401 return (ENOMEM); 2402 } 2403 2404 /* 2405 * Get initial segment. 2406 */ 2407 if ((seg = as_segat(as, raddr)) == NULL) { 2408 AS_LOCK_EXIT(as, &as->a_lock); 2409 return (ENOMEM); 2410 } 2411 2412 if (func == MC_LOCK) { 2413 mlock_size = BT_BITOUL(btopr(rsize)); 2414 if ((mlock_map = (ulong_t *)kmem_zalloc(mlock_size * 2415 sizeof (ulong_t), KM_NOSLEEP)) == NULL) { 2416 AS_LOCK_EXIT(as, &as->a_lock); 2417 return (EAGAIN); 2418 } 2419 } 2420 2421 /* 2422 * Loop over all segments. If a hole in the address range is 2423 * discovered, then fail. For each segment, perform the appropriate 2424 * control operation. 2425 */ 2426 while (rsize != 0) { 2427 2428 /* 2429 * Make sure there's no hole, calculate the portion 2430 * of the next segment to be operated over. 
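 *
 * For example (hypothetical addresses): with the current segment at
 * [0x10000, 0x20000) and a remaining request of [0x1c000, 0x24000),
 * this pass clamps ssize to 0x20000 - 0x1c000 = 0x4000; the next pass
 * must then find a segment whose s_base is exactly 0x20000, otherwise
 * the range has a hole and the operation fails with ENOMEM.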
2431 */ 2432 if (raddr >= seg->s_base + seg->s_size) { 2433 seg = AS_SEGNEXT(as, seg); 2434 if (seg == NULL || raddr != seg->s_base) { 2435 if (func == MC_LOCK) { 2436 as_unlockerr(as, attr, mlock_map, 2437 initraddr, initrsize - rsize); 2438 kmem_free(mlock_map, 2439 mlock_size * sizeof (ulong_t)); 2440 } 2441 AS_LOCK_EXIT(as, &as->a_lock); 2442 return (ENOMEM); 2443 } 2444 } 2445 if ((raddr + rsize) > (seg->s_base + seg->s_size)) 2446 ssize = seg->s_base + seg->s_size - raddr; 2447 else 2448 ssize = rsize; 2449 2450 /* 2451 * Dispatch on specific function. 2452 */ 2453 switch (func) { 2454 2455 /* 2456 * Synchronize cached data from mappings with backing 2457 * objects. 2458 */ 2459 case MC_SYNC: 2460 if (error = SEGOP_SYNC(seg, raddr, ssize, 2461 attr, (uint_t)arg)) { 2462 AS_LOCK_EXIT(as, &as->a_lock); 2463 return (error); 2464 } 2465 break; 2466 2467 /* 2468 * Lock pages in memory. 2469 */ 2470 case MC_LOCK: 2471 if (error = SEGOP_LOCKOP(seg, raddr, ssize, 2472 attr, func, mlock_map, pos)) { 2473 as_unlockerr(as, attr, mlock_map, initraddr, 2474 initrsize - rsize + ssize); 2475 kmem_free(mlock_map, mlock_size * 2476 sizeof (ulong_t)); 2477 AS_LOCK_EXIT(as, &as->a_lock); 2478 goto lockerr; 2479 } 2480 break; 2481 2482 /* 2483 * Unlock mapped pages. 2484 */ 2485 case MC_UNLOCK: 2486 (void) SEGOP_LOCKOP(seg, raddr, ssize, attr, func, 2487 (ulong_t *)NULL, (size_t)NULL); 2488 break; 2489 2490 /* 2491 * Store VM advise for mapped pages in segment layer. 2492 */ 2493 case MC_ADVISE: 2494 error = SEGOP_ADVISE(seg, raddr, ssize, (uint_t)arg); 2495 2496 /* 2497 * Check for regular errors and special retry error 2498 */ 2499 if (error) { 2500 if (error == IE_RETRY) { 2501 /* 2502 * Need to acquire writers lock, so 2503 * have to drop readers lock and start 2504 * all over again 2505 */ 2506 AS_LOCK_EXIT(as, &as->a_lock); 2507 goto retry; 2508 } else if (error == IE_REATTACH) { 2509 /* 2510 * Find segment for current address 2511 * because current segment just got 2512 * split or concatenated 2513 */ 2514 seg = as_segat(as, raddr); 2515 if (seg == NULL) { 2516 AS_LOCK_EXIT(as, &as->a_lock); 2517 return (ENOMEM); 2518 } 2519 } else { 2520 /* 2521 * Regular error 2522 */ 2523 AS_LOCK_EXIT(as, &as->a_lock); 2524 return (error); 2525 } 2526 } 2527 break; 2528 2529 /* 2530 * Can't happen. 2531 */ 2532 default: 2533 panic("as_ctl: bad operation %d", func); 2534 /*NOTREACHED*/ 2535 } 2536 2537 rsize -= ssize; 2538 raddr += ssize; 2539 } 2540 2541 if (func == MC_LOCK) 2542 kmem_free(mlock_map, mlock_size * sizeof (ulong_t)); 2543 AS_LOCK_EXIT(as, &as->a_lock); 2544 return (0); 2545 lockerr: 2546 2547 /* 2548 * If the lower levels returned EDEADLK for a segment lockop, 2549 * it means that we should retry the operation. Let's wait 2550 * a bit also to let the deadlock causing condition clear. 2551 * This is part of a gross hack to work around a design flaw 2552 * in the ufs/sds logging code and should go away when the 2553 * logging code is re-designed to fix the problem. See bug 2554 * 4125102 for details of the problem. 2555 */ 2556 if (error == EDEADLK) { 2557 delay(deadlk_wait); 2558 error = 0; 2559 goto retry; 2560 } 2561 return (error); 2562 } 2563 2564 /* 2565 * Special code for exec to move the stack segment from its interim 2566 * place in the old address to the right place in the new address space. 
2567 */ 2568 /*ARGSUSED*/ 2569 int 2570 as_exec(struct as *oas, caddr_t ostka, size_t stksz, 2571 struct as *nas, caddr_t nstka, uint_t hatflag) 2572 { 2573 struct seg *stkseg; 2574 2575 AS_LOCK_ENTER(oas, &oas->a_lock, RW_WRITER); 2576 stkseg = as_segat(oas, ostka); 2577 stkseg = as_removeseg(oas, stkseg); 2578 ASSERT(stkseg != NULL); 2579 ASSERT(stkseg->s_base == ostka && stkseg->s_size == stksz); 2580 stkseg->s_as = nas; 2581 stkseg->s_base = nstka; 2582 2583 /* 2584 * It's ok to lock the address space we are about to exec to. 2585 */ 2586 AS_LOCK_ENTER(nas, &nas->a_lock, RW_WRITER); 2587 ASSERT(avl_numnodes(&nas->a_wpage) == 0); 2588 nas->a_size += stkseg->s_size; 2589 oas->a_size -= stkseg->s_size; 2590 (void) as_addseg(nas, stkseg); 2591 AS_LOCK_EXIT(nas, &nas->a_lock); 2592 AS_LOCK_EXIT(oas, &oas->a_lock); 2593 return (0); 2594 } 2595 2596 int 2597 fc_decode(faultcode_t fault_err) 2598 { 2599 int error = 0; 2600 2601 switch (FC_CODE(fault_err)) { 2602 case FC_OBJERR: 2603 error = FC_ERRNO(fault_err); 2604 break; 2605 case FC_PROT: 2606 error = EACCES; 2607 break; 2608 default: 2609 error = EFAULT; 2610 break; 2611 } 2612 return (error); 2613 } 2614 2615 /* 2616 * Pagelock pages from a range that spans more than 1 segment. Obtain shadow 2617 * lists from each segment and copy them to one contiguous shadow list (plist) 2618 * as expected by the caller. Save pointers to per segment shadow lists at 2619 * the tail of plist so that they can be used during as_pageunlock(). 2620 */ 2621 static int 2622 as_pagelock_segs(struct as *as, struct seg *seg, struct page ***ppp, 2623 caddr_t addr, size_t size, enum seg_rw rw) 2624 { 2625 caddr_t sv_addr = addr; 2626 size_t sv_size = size; 2627 struct seg *sv_seg = seg; 2628 ulong_t segcnt = 1; 2629 ulong_t cnt; 2630 size_t ssize; 2631 pgcnt_t npages = btop(size); 2632 page_t **plist; 2633 page_t **pl; 2634 int error; 2635 caddr_t eaddr; 2636 faultcode_t fault_err = 0; 2637 pgcnt_t pl_off; 2638 extern struct seg_ops segspt_shmops; 2639 2640 ASSERT(AS_LOCK_HELD(as, &as->a_lock)); 2641 ASSERT(seg != NULL); 2642 ASSERT(addr >= seg->s_base && addr < seg->s_base + seg->s_size); 2643 ASSERT(addr + size > seg->s_base + seg->s_size); 2644 ASSERT(IS_P2ALIGNED(size, PAGESIZE)); 2645 ASSERT(IS_P2ALIGNED(addr, PAGESIZE)); 2646 2647 /* 2648 * Count the number of segments covered by the range we are about to 2649 * lock. The segment count is used to size the shadow list we return 2650 * back to the caller. 2651 */ 2652 for (; size != 0; size -= ssize, addr += ssize) { 2653 if (addr >= seg->s_base + seg->s_size) { 2654 2655 seg = AS_SEGNEXT(as, seg); 2656 if (seg == NULL || addr != seg->s_base) { 2657 AS_LOCK_EXIT(as, &as->a_lock); 2658 return (EFAULT); 2659 } 2660 /* 2661 * Do a quick check if subsequent segments 2662 * will most likely support pagelock. 
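 *
 * In practice that means anonymous segvn segments (no vnode behind
 * them) and SYSV shared memory segments (segspt_shmops); anything
 * else sends us to the slow F_SOFTLOCK path below.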
2663 */ 2664 if (seg->s_ops == &segvn_ops) { 2665 vnode_t *vp; 2666 2667 if (SEGOP_GETVP(seg, addr, &vp) != 0 || 2668 vp != NULL) { 2669 AS_LOCK_EXIT(as, &as->a_lock); 2670 goto slow; 2671 } 2672 } else if (seg->s_ops != &segspt_shmops) { 2673 AS_LOCK_EXIT(as, &as->a_lock); 2674 goto slow; 2675 } 2676 segcnt++; 2677 } 2678 if (addr + size > seg->s_base + seg->s_size) { 2679 ssize = seg->s_base + seg->s_size - addr; 2680 } else { 2681 ssize = size; 2682 } 2683 } 2684 ASSERT(segcnt > 1); 2685 2686 plist = kmem_zalloc((npages + segcnt) * sizeof (page_t *), KM_SLEEP); 2687 2688 addr = sv_addr; 2689 size = sv_size; 2690 seg = sv_seg; 2691 2692 for (cnt = 0, pl_off = 0; size != 0; size -= ssize, addr += ssize) { 2693 if (addr >= seg->s_base + seg->s_size) { 2694 seg = AS_SEGNEXT(as, seg); 2695 ASSERT(seg != NULL && addr == seg->s_base); 2696 cnt++; 2697 ASSERT(cnt < segcnt); 2698 } 2699 if (addr + size > seg->s_base + seg->s_size) { 2700 ssize = seg->s_base + seg->s_size - addr; 2701 } else { 2702 ssize = size; 2703 } 2704 pl = &plist[npages + cnt]; 2705 error = SEGOP_PAGELOCK(seg, addr, ssize, (page_t ***)pl, 2706 L_PAGELOCK, rw); 2707 if (error) { 2708 break; 2709 } 2710 ASSERT(plist[npages + cnt] != NULL); 2711 ASSERT(pl_off + btop(ssize) <= npages); 2712 bcopy(plist[npages + cnt], &plist[pl_off], 2713 btop(ssize) * sizeof (page_t *)); 2714 pl_off += btop(ssize); 2715 } 2716 2717 if (size == 0) { 2718 AS_LOCK_EXIT(as, &as->a_lock); 2719 ASSERT(cnt == segcnt - 1); 2720 *ppp = plist; 2721 return (0); 2722 } 2723 2724 /* 2725 * One of the pagelock calls failed. The error type is in the error 2726 * variable. Unlock what we've locked so far and retry with F_SOFTLOCK 2727 * if the error type is either EFAULT or ENOTSUP. Otherwise just return 2728 * the error back to the caller. 2729 */ 2730 2731 eaddr = addr; 2732 seg = sv_seg; 2733 2734 for (cnt = 0, addr = sv_addr; addr < eaddr; addr += ssize) { 2735 if (addr >= seg->s_base + seg->s_size) { 2736 seg = AS_SEGNEXT(as, seg); 2737 ASSERT(seg != NULL && addr == seg->s_base); 2738 cnt++; 2739 ASSERT(cnt < segcnt); 2740 } 2741 if (eaddr > seg->s_base + seg->s_size) { 2742 ssize = seg->s_base + seg->s_size - addr; 2743 } else { 2744 ssize = eaddr - addr; 2745 } 2746 pl = &plist[npages + cnt]; 2747 ASSERT(*pl != NULL); 2748 (void) SEGOP_PAGELOCK(seg, addr, ssize, (page_t ***)pl, 2749 L_PAGEUNLOCK, rw); 2750 } 2751 2752 AS_LOCK_EXIT(as, &as->a_lock); 2753 2754 kmem_free(plist, (npages + segcnt) * sizeof (page_t *)); 2755 2756 if (error != ENOTSUP && error != EFAULT) { 2757 return (error); 2758 } 2759 2760 slow: 2761 /* 2762 * If we are here because pagelock failed due to the need to cow-fault 2763 * in the pages we want to lock, F_SOFTLOCK will do this job, and in 2764 * the next as_pagelock() call for this address range pagelock will 2765 * hopefully succeed. 2766 */ 2767 fault_err = as_fault(as->a_hat, as, sv_addr, sv_size, F_SOFTLOCK, rw); 2768 if (fault_err != 0) { 2769 return (fc_decode(fault_err)); 2770 } 2771 *ppp = NULL; 2772 2773 return (0); 2774 } 2775 2776 /* 2777 * lock pages in a given address space. Return shadow list. If 2778 * the list is NULL, the MMU mapping is also locked.
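 *
 * A minimal usage sketch (hypothetical driver-style caller preparing a
 * user buffer for I/O; error handling trimmed):
 *
 *	struct page **pplist;
 *
 *	if (as_pagelock(as, &pplist, uaddr, ulen, S_WRITE) != 0)
 *		return (EFAULT);
 *	... access or DMA the locked range [uaddr, uaddr + ulen) ...
 *	as_pageunlock(as, pplist, uaddr, ulen, S_WRITE);
 *
 * Note that pplist may legitimately come back NULL (the fault path was
 * used); as_pageunlock() copes with that case itself.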
2779 */ 2780 int 2781 as_pagelock(struct as *as, struct page ***ppp, caddr_t addr, 2782 size_t size, enum seg_rw rw) 2783 { 2784 size_t rsize; 2785 caddr_t raddr; 2786 faultcode_t fault_err; 2787 struct seg *seg; 2788 int err; 2789 2790 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_AS_LOCK_START, 2791 "as_pagelock_start: addr %p size %ld", addr, size); 2792 2793 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); 2794 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) - 2795 (size_t)raddr; 2796 2797 /* 2798 * if the request crosses two segments let 2799 * as_fault handle it. 2800 */ 2801 AS_LOCK_ENTER(as, &as->a_lock, RW_READER); 2802 2803 seg = as_segat(as, raddr); 2804 if (seg == NULL) { 2805 AS_LOCK_EXIT(as, &as->a_lock); 2806 return (EFAULT); 2807 } 2808 ASSERT(raddr >= seg->s_base && raddr < seg->s_base + seg->s_size); 2809 if (raddr + rsize > seg->s_base + seg->s_size) { 2810 return (as_pagelock_segs(as, seg, ppp, raddr, rsize, rw)); 2811 } 2812 if (raddr + rsize <= raddr) { 2813 AS_LOCK_EXIT(as, &as->a_lock); 2814 return (EFAULT); 2815 } 2816 2817 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEG_LOCK_START, 2818 "seg_lock_1_start: raddr %p rsize %ld", raddr, rsize); 2819 2820 /* 2821 * try to lock pages and pass back shadow list 2822 */ 2823 err = SEGOP_PAGELOCK(seg, raddr, rsize, ppp, L_PAGELOCK, rw); 2824 2825 TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_SEG_LOCK_END, "seg_lock_1_end"); 2826 2827 AS_LOCK_EXIT(as, &as->a_lock); 2828 2829 if (err == 0 || (err != ENOTSUP && err != EFAULT)) { 2830 return (err); 2831 } 2832 2833 /* 2834 * Use F_SOFTLOCK to lock the pages because pagelock failed either due 2835 * to no pagelock support for this segment or because pages need to be 2836 * cow-faulted in. If a fault is needed, F_SOFTLOCK will do this job 2837 * for this as_pagelock() call, and the next as_pagelock() call for the 2838 * same address range will hopefully succeed. 2839 */ 2840 fault_err = as_fault(as->a_hat, as, addr, size, F_SOFTLOCK, rw); 2841 if (fault_err != 0) { 2842 return (fc_decode(fault_err)); 2843 } 2844 *ppp = NULL; 2845 2846 TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_AS_LOCK_END, "as_pagelock_end"); 2847 return (0); 2848 } 2849 2850 /* 2851 * unlock pages locked by as_pagelock_segs(). Retrieve per segment shadow 2852 * lists from the end of plist and call pageunlock interface for each segment. 2853 * Drop as lock and free plist.
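 *
 * For illustration (hypothetical range of N pages spanning two segments),
 * the shadow list built by as_pagelock_segs() is laid out as:
 *
 *	plist[0 .. N-1]	flat page_t pointers handed to the caller
 *	plist[N]	shadow list returned by the first segment's pagelock
 *	plist[N + 1]	shadow list returned by the second segment's pagelock
 *
 * so this routine only needs the tail entries in order to hand each
 * segment back the exact cookie its own pagelock op produced.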
2854 */ 2855 static void 2856 as_pageunlock_segs(struct as *as, struct seg *seg, caddr_t addr, size_t size, 2857 struct page **plist, enum seg_rw rw) 2858 { 2859 ulong_t cnt; 2860 caddr_t eaddr = addr + size; 2861 pgcnt_t npages = btop(size); 2862 size_t ssize; 2863 page_t **pl; 2864 2865 ASSERT(AS_LOCK_HELD(as, &as->a_lock)); 2866 ASSERT(seg != NULL); 2867 ASSERT(addr >= seg->s_base && addr < seg->s_base + seg->s_size); 2868 ASSERT(addr + size > seg->s_base + seg->s_size); 2869 ASSERT(IS_P2ALIGNED(size, PAGESIZE)); 2870 ASSERT(IS_P2ALIGNED(addr, PAGESIZE)); 2871 ASSERT(plist != NULL); 2872 2873 for (cnt = 0; addr < eaddr; addr += ssize) { 2874 if (addr >= seg->s_base + seg->s_size) { 2875 seg = AS_SEGNEXT(as, seg); 2876 ASSERT(seg != NULL && addr == seg->s_base); 2877 cnt++; 2878 } 2879 if (eaddr > seg->s_base + seg->s_size) { 2880 ssize = seg->s_base + seg->s_size - addr; 2881 } else { 2882 ssize = eaddr - addr; 2883 } 2884 pl = &plist[npages + cnt]; 2885 ASSERT(*pl != NULL); 2886 (void) SEGOP_PAGELOCK(seg, addr, ssize, (page_t ***)pl, 2887 L_PAGEUNLOCK, rw); 2888 } 2889 ASSERT(cnt > 0); 2890 AS_LOCK_EXIT(as, &as->a_lock); 2891 2892 cnt++; 2893 kmem_free(plist, (npages + cnt) * sizeof (page_t *)); 2894 } 2895 2896 /* 2897 * unlock pages in a given address range 2898 */ 2899 void 2900 as_pageunlock(struct as *as, struct page **pp, caddr_t addr, size_t size, 2901 enum seg_rw rw) 2902 { 2903 struct seg *seg; 2904 size_t rsize; 2905 caddr_t raddr; 2906 2907 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_AS_UNLOCK_START, 2908 "as_pageunlock_start: addr %p size %ld", addr, size); 2909 2910 /* 2911 * if the shadow list is NULL, as_pagelock was 2912 * falling back to as_fault 2913 */ 2914 if (pp == NULL) { 2915 (void) as_fault(as->a_hat, as, addr, size, F_SOFTUNLOCK, rw); 2916 return; 2917 } 2918 2919 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); 2920 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) - 2921 (size_t)raddr; 2922 2923 AS_LOCK_ENTER(as, &as->a_lock, RW_READER); 2924 seg = as_segat(as, raddr); 2925 ASSERT(seg != NULL); 2926 2927 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEG_UNLOCK_START, 2928 "seg_unlock_start: raddr %p rsize %ld", raddr, rsize); 2929 2930 ASSERT(raddr >= seg->s_base && raddr < seg->s_base + seg->s_size); 2931 if (raddr + rsize <= seg->s_base + seg->s_size) { 2932 SEGOP_PAGELOCK(seg, raddr, rsize, &pp, L_PAGEUNLOCK, rw); 2933 } else { 2934 as_pageunlock_segs(as, seg, raddr, rsize, pp, rw); 2935 return; 2936 } 2937 AS_LOCK_EXIT(as, &as->a_lock); 2938 TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_AS_UNLOCK_END, "as_pageunlock_end"); 2939 } 2940 2941 int 2942 as_setpagesize(struct as *as, caddr_t addr, size_t size, uint_t szc, 2943 boolean_t wait) 2944 { 2945 struct seg *seg; 2946 size_t ssize; 2947 caddr_t raddr; /* rounded down addr */ 2948 size_t rsize; /* rounded up size */ 2949 int error = 0; 2950 size_t pgsz = page_get_pagesize(szc); 2951 2952 setpgsz_top: 2953 if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(size, pgsz)) { 2954 return (EINVAL); 2955 } 2956 2957 raddr = addr; 2958 rsize = size; 2959 2960 if (raddr + rsize < raddr) /* check for wraparound */ 2961 return (ENOMEM); 2962 2963 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER); 2964 as_clearwatchprot(as, raddr, rsize); 2965 seg = as_segat(as, raddr); 2966 if (seg == NULL) { 2967 as_setwatch(as); 2968 AS_LOCK_EXIT(as, &as->a_lock); 2969 return (ENOMEM); 2970 } 2971 2972 for (; rsize != 0; rsize -= ssize, raddr += ssize) { 2973 if (raddr >= seg->s_base + seg->s_size) { 2974 seg = AS_SEGNEXT(as, seg); 2975 if (seg == NULL || raddr 
!= seg->s_base) { 2976 error = ENOMEM; 2977 break; 2978 } 2979 } 2980 if ((raddr + rsize) > (seg->s_base + seg->s_size)) { 2981 ssize = seg->s_base + seg->s_size - raddr; 2982 } else { 2983 ssize = rsize; 2984 } 2985 2986 retry: 2987 error = SEGOP_SETPAGESIZE(seg, raddr, ssize, szc); 2988 2989 if (error == IE_NOMEM) { 2990 error = EAGAIN; 2991 break; 2992 } 2993 2994 if (error == IE_RETRY) { 2995 AS_LOCK_EXIT(as, &as->a_lock); 2996 goto setpgsz_top; 2997 } 2998 2999 if (error == ENOTSUP) { 3000 error = EINVAL; 3001 break; 3002 } 3003 3004 if (wait && (error == EAGAIN)) { 3005 /* 3006 * Memory is currently locked. It must be unlocked 3007 * before this operation can succeed through a retry. 3008 * The possible reasons for locked memory and 3009 * corresponding strategies for unlocking are: 3010 * (1) Normal I/O 3011 * wait for a signal that the I/O operation 3012 * has completed and the memory is unlocked. 3013 * (2) Asynchronous I/O 3014 * The aio subsystem does not unlock pages when 3015 * the I/O is completed. Those pages are unlocked 3016 * when the application calls aiowait/aioerror. 3017 * So, to prevent blocking forever, cv_broadcast() 3018 * is done to wake up aio_cleanup_thread. 3019 * Subsequently, segvn_reclaim will be called, and 3020 * that will do AS_CLRUNMAPWAIT() and wake us up. 3021 * (3) Long term page locking: 3022 * This is not relevant for as_setpagesize() 3023 * because we cannot change the page size for 3024 * driver memory. The attempt to do so will 3025 * fail with a different error than EAGAIN so 3026 * there's no need to trigger as callbacks like 3027 * as_unmap, as_setprot or as_free would do. 3028 */ 3029 mutex_enter(&as->a_contents); 3030 if (!AS_ISNOUNMAPWAIT(as)) { 3031 if (AS_ISUNMAPWAIT(as) == 0) { 3032 cv_broadcast(&as->a_cv); 3033 } 3034 AS_SETUNMAPWAIT(as); 3035 AS_LOCK_EXIT(as, &as->a_lock); 3036 while (AS_ISUNMAPWAIT(as)) { 3037 cv_wait(&as->a_cv, &as->a_contents); 3038 } 3039 } else { 3040 /* 3041 * We may have raced with 3042 * segvn_reclaim()/segspt_reclaim(). In this 3043 * case clean nounmapwait flag and retry since 3044 * softlockcnt in this segment may be already 3045 * 0. We don't drop as writer lock so our 3046 * number of retries without sleeping should 3047 * be very small. See segvn_reclaim() for 3048 * more comments. 3049 */ 3050 AS_CLRNOUNMAPWAIT(as); 3051 mutex_exit(&as->a_contents); 3052 goto retry; 3053 } 3054 mutex_exit(&as->a_contents); 3055 goto setpgsz_top; 3056 } else if (error != 0) { 3057 break; 3058 } 3059 } 3060 as_setwatch(as); 3061 AS_LOCK_EXIT(as, &as->a_lock); 3062 return (error); 3063 } 3064 3065 /* 3066 * as_iset3_default_lpsize() just calls SEGOP_SETPAGESIZE() on all segments 3067 * in its chunk where s_szc is less than the szc we want to set. 
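 *
 * For context, the as_iset*_default_lpsize() helpers below all run under
 * as_set_default_lpsize(), which is normally reached from userland via
 * memcntl(2) with MC_HAT_ADVISE and a page size of 0. A rough sketch of
 * such a call (illustrative only; see memcntl(2) for the real contract):
 *
 *	struct memcntl_mha mha;
 *
 *	mha.mha_cmd = MHA_MAPSIZE_VA;
 *	mha.mha_flags = 0;
 *	mha.mha_pagesize = 0;	(0 asks for the default large page size)
 *	(void) memcntl(addr, len, MC_HAT_ADVISE, (caddr_t)&mha, 0, 0);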
3068 */ 3069 static int 3070 as_iset3_default_lpsize(struct as *as, caddr_t raddr, size_t rsize, uint_t szc, 3071 int *retry) 3072 { 3073 struct seg *seg; 3074 size_t ssize; 3075 int error; 3076 3077 ASSERT(AS_WRITE_HELD(as, &as->a_lock)); 3078 3079 seg = as_segat(as, raddr); 3080 if (seg == NULL) { 3081 panic("as_iset3_default_lpsize: no seg"); 3082 } 3083 3084 for (; rsize != 0; rsize -= ssize, raddr += ssize) { 3085 if (raddr >= seg->s_base + seg->s_size) { 3086 seg = AS_SEGNEXT(as, seg); 3087 if (seg == NULL || raddr != seg->s_base) { 3088 panic("as_iset3_default_lpsize: as changed"); 3089 } 3090 } 3091 if ((raddr + rsize) > (seg->s_base + seg->s_size)) { 3092 ssize = seg->s_base + seg->s_size - raddr; 3093 } else { 3094 ssize = rsize; 3095 } 3096 3097 if (szc > seg->s_szc) { 3098 error = SEGOP_SETPAGESIZE(seg, raddr, ssize, szc); 3099 /* Only retry on EINVAL segments that have no vnode. */ 3100 if (error == EINVAL) { 3101 vnode_t *vp = NULL; 3102 if ((SEGOP_GETTYPE(seg, raddr) & MAP_SHARED) && 3103 (SEGOP_GETVP(seg, raddr, &vp) != 0 || 3104 vp == NULL)) { 3105 *retry = 1; 3106 } else { 3107 *retry = 0; 3108 } 3109 } 3110 if (error) { 3111 return (error); 3112 } 3113 } 3114 } 3115 return (0); 3116 } 3117 3118 /* 3119 * as_iset2_default_lpsize() calls as_iset3_default_lpsize() to set the 3120 * pagesize on each segment in its range, but if any fails with EINVAL, 3121 * then it reduces the pagesizes to the next size in the bitmap and 3122 * retries as_iset3_default_lpsize(). The reason why the code retries 3123 * smaller allowed sizes on EINVAL is because (a) the anon offset may not 3124 * match the bigger sizes, and (b) it's hard to get this offset (to begin 3125 * with) to pass to map_pgszcvec(). 3126 */ 3127 static int 3128 as_iset2_default_lpsize(struct as *as, caddr_t addr, size_t size, uint_t szc, 3129 uint_t szcvec) 3130 { 3131 int error; 3132 int retry; 3133 3134 ASSERT(AS_WRITE_HELD(as, &as->a_lock)); 3135 3136 for (;;) { 3137 error = as_iset3_default_lpsize(as, addr, size, szc, &retry); 3138 if (error == EINVAL && retry) { 3139 szcvec &= ~(1 << szc); 3140 if (szcvec <= 1) { 3141 return (EINVAL); 3142 } 3143 szc = highbit(szcvec) - 1; 3144 } else { 3145 return (error); 3146 } 3147 } 3148 } 3149 3150 /* 3151 * as_iset1_default_lpsize() breaks its chunk into areas where existing 3152 * segments have a smaller szc than we want to set. 
For each such area, 3153 * it calls as_iset2_default_lpsize() 3154 */ 3155 static int 3156 as_iset1_default_lpsize(struct as *as, caddr_t raddr, size_t rsize, uint_t szc, 3157 uint_t szcvec) 3158 { 3159 struct seg *seg; 3160 size_t ssize; 3161 caddr_t setaddr = raddr; 3162 size_t setsize = 0; 3163 int set; 3164 int error; 3165 3166 ASSERT(AS_WRITE_HELD(as, &as->a_lock)); 3167 3168 seg = as_segat(as, raddr); 3169 if (seg == NULL) { 3170 panic("as_iset1_default_lpsize: no seg"); 3171 } 3172 if (seg->s_szc < szc) { 3173 set = 1; 3174 } else { 3175 set = 0; 3176 } 3177 3178 for (; rsize != 0; rsize -= ssize, raddr += ssize, setsize += ssize) { 3179 if (raddr >= seg->s_base + seg->s_size) { 3180 seg = AS_SEGNEXT(as, seg); 3181 if (seg == NULL || raddr != seg->s_base) { 3182 panic("as_iset1_default_lpsize: as changed"); 3183 } 3184 if (seg->s_szc >= szc && set) { 3185 ASSERT(setsize != 0); 3186 error = as_iset2_default_lpsize(as, 3187 setaddr, setsize, szc, szcvec); 3188 if (error) { 3189 return (error); 3190 } 3191 set = 0; 3192 } else if (seg->s_szc < szc && !set) { 3193 setaddr = raddr; 3194 setsize = 0; 3195 set = 1; 3196 } 3197 } 3198 if ((raddr + rsize) > (seg->s_base + seg->s_size)) { 3199 ssize = seg->s_base + seg->s_size - raddr; 3200 } else { 3201 ssize = rsize; 3202 } 3203 } 3204 error = 0; 3205 if (set) { 3206 ASSERT(setsize != 0); 3207 error = as_iset2_default_lpsize(as, setaddr, setsize, 3208 szc, szcvec); 3209 } 3210 return (error); 3211 } 3212 3213 /* 3214 * as_iset_default_lpsize() breaks its chunk according to the size code bitmap 3215 * returned by map_pgszcvec() (similar to as_map_segvn_segs()), and passes each 3216 * chunk to as_iset1_default_lpsize(). 3217 */ 3218 static int 3219 as_iset_default_lpsize(struct as *as, caddr_t addr, size_t size, int flags, 3220 int type) 3221 { 3222 int rtype = (type & MAP_SHARED) ? MAPPGSZC_SHM : MAPPGSZC_PRIVM; 3223 uint_t szcvec = map_pgszcvec(addr, size, (uintptr_t)addr, 3224 flags, rtype, 1); 3225 uint_t szc; 3226 uint_t nszc; 3227 int error; 3228 caddr_t a; 3229 caddr_t eaddr; 3230 size_t segsize; 3231 size_t pgsz; 3232 uint_t save_szcvec; 3233 3234 ASSERT(AS_WRITE_HELD(as, &as->a_lock)); 3235 ASSERT(IS_P2ALIGNED(addr, PAGESIZE)); 3236 ASSERT(IS_P2ALIGNED(size, PAGESIZE)); 3237 3238 szcvec &= ~1; 3239 if (szcvec <= 1) { /* skip if base page size */ 3240 return (0); 3241 } 3242 3243 /* Get the pagesize of the first larger page size. 
*/ 3244 szc = lowbit(szcvec) - 1; 3245 pgsz = page_get_pagesize(szc); 3246 eaddr = addr + size; 3247 addr = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz); 3248 eaddr = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz); 3249 3250 save_szcvec = szcvec; 3251 szcvec >>= (szc + 1); 3252 nszc = szc; 3253 while (szcvec) { 3254 if ((szcvec & 0x1) == 0) { 3255 nszc++; 3256 szcvec >>= 1; 3257 continue; 3258 } 3259 nszc++; 3260 pgsz = page_get_pagesize(nszc); 3261 a = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz); 3262 if (a != addr) { 3263 ASSERT(szc > 0); 3264 ASSERT(a < eaddr); 3265 segsize = a - addr; 3266 error = as_iset1_default_lpsize(as, addr, segsize, szc, 3267 save_szcvec); 3268 if (error) { 3269 return (error); 3270 } 3271 addr = a; 3272 } 3273 szc = nszc; 3274 szcvec >>= 1; 3275 } 3276 3277 ASSERT(addr < eaddr); 3278 szcvec = save_szcvec; 3279 while (szcvec) { 3280 a = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz); 3281 ASSERT(a >= addr); 3282 if (a != addr) { 3283 ASSERT(szc > 0); 3284 segsize = a - addr; 3285 error = as_iset1_default_lpsize(as, addr, segsize, szc, 3286 save_szcvec); 3287 if (error) { 3288 return (error); 3289 } 3290 addr = a; 3291 } 3292 szcvec &= ~(1 << szc); 3293 if (szcvec) { 3294 szc = highbit(szcvec) - 1; 3295 pgsz = page_get_pagesize(szc); 3296 } 3297 } 3298 ASSERT(addr == eaddr); 3299 3300 return (0); 3301 } 3302 3303 /* 3304 * Set the default large page size for the range. Called via memcntl with 3305 * page size set to 0. as_set_default_lpsize breaks the range down into 3306 * chunks with the same type/flags, ignores non-segvn segments, and passes 3307 * each chunk to as_iset_default_lpsize(). 3308 */ 3309 int 3310 as_set_default_lpsize(struct as *as, caddr_t addr, size_t size) 3311 { 3312 struct seg *seg; 3313 caddr_t raddr; 3314 size_t rsize; 3315 size_t ssize; 3316 int rtype, rflags; 3317 int stype, sflags; 3318 int error; 3319 caddr_t setaddr; 3320 size_t setsize; 3321 int segvn; 3322 3323 if (size == 0) 3324 return (0); 3325 3326 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER); 3327 again: 3328 error = 0; 3329 3330 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); 3331 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) - 3332 (size_t)raddr; 3333 3334 if (raddr + rsize < raddr) { /* check for wraparound */ 3335 AS_LOCK_EXIT(as, &as->a_lock); 3336 return (ENOMEM); 3337 } 3338 as_clearwatchprot(as, raddr, rsize); 3339 seg = as_segat(as, raddr); 3340 if (seg == NULL) { 3341 as_setwatch(as); 3342 AS_LOCK_EXIT(as, &as->a_lock); 3343 return (ENOMEM); 3344 } 3345 if (seg->s_ops == &segvn_ops) { 3346 rtype = SEGOP_GETTYPE(seg, addr); 3347 rflags = rtype & (MAP_TEXT | MAP_INITDATA); 3348 rtype = rtype & (MAP_SHARED | MAP_PRIVATE); 3349 segvn = 1; 3350 } else { 3351 segvn = 0; 3352 } 3353 setaddr = raddr; 3354 setsize = 0; 3355 3356 for (; rsize != 0; rsize -= ssize, raddr += ssize, setsize += ssize) { 3357 if (raddr >= (seg->s_base + seg->s_size)) { 3358 seg = AS_SEGNEXT(as, seg); 3359 if (seg == NULL || raddr != seg->s_base) { 3360 error = ENOMEM; 3361 break; 3362 } 3363 if (seg->s_ops == &segvn_ops) { 3364 stype = SEGOP_GETTYPE(seg, raddr); 3365 sflags = stype & (MAP_TEXT | MAP_INITDATA); 3366 stype &= (MAP_SHARED | MAP_PRIVATE); 3367 if (segvn && (rflags != sflags || 3368 rtype != stype)) { 3369 /* 3370 * The next segment is also segvn but 3371 * has different flags and/or type.
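 *
 * For example (hypothetical layout): a MAP_PRIVATE | MAP_TEXT mapping
 * followed by a MAP_PRIVATE | MAP_INITDATA mapping followed by a
 * MAP_SHARED mapping is split into three chunks here, and each chunk
 * is handed to as_iset_default_lpsize() on its own.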
3372 */ 3373 ASSERT(setsize != 0); 3374 error = as_iset_default_lpsize(as, 3375 setaddr, setsize, rflags, rtype); 3376 if (error) { 3377 break; 3378 } 3379 rflags = sflags; 3380 rtype = stype; 3381 setaddr = raddr; 3382 setsize = 0; 3383 } else if (!segvn) { 3384 rflags = sflags; 3385 rtype = stype; 3386 setaddr = raddr; 3387 setsize = 0; 3388 segvn = 1; 3389 } 3390 } else if (segvn) { 3391 /* The next segment is not segvn. */ 3392 ASSERT(setsize != 0); 3393 error = as_iset_default_lpsize(as, 3394 setaddr, setsize, rflags, rtype); 3395 if (error) { 3396 break; 3397 } 3398 segvn = 0; 3399 } 3400 } 3401 if ((raddr + rsize) > (seg->s_base + seg->s_size)) { 3402 ssize = seg->s_base + seg->s_size - raddr; 3403 } else { 3404 ssize = rsize; 3405 } 3406 } 3407 if (error == 0 && segvn) { 3408 /* The last chunk when rsize == 0. */ 3409 ASSERT(setsize != 0); 3410 error = as_iset_default_lpsize(as, setaddr, setsize, 3411 rflags, rtype); 3412 } 3413 3414 if (error == IE_RETRY) { 3415 goto again; 3416 } else if (error == IE_NOMEM) { 3417 error = EAGAIN; 3418 } else if (error == ENOTSUP) { 3419 error = EINVAL; 3420 } else if (error == EAGAIN) { 3421 mutex_enter(&as->a_contents); 3422 if (!AS_ISNOUNMAPWAIT(as)) { 3423 if (AS_ISUNMAPWAIT(as) == 0) { 3424 cv_broadcast(&as->a_cv); 3425 } 3426 AS_SETUNMAPWAIT(as); 3427 AS_LOCK_EXIT(as, &as->a_lock); 3428 while (AS_ISUNMAPWAIT(as)) { 3429 cv_wait(&as->a_cv, &as->a_contents); 3430 } 3431 mutex_exit(&as->a_contents); 3432 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER); 3433 } else { 3434 /* 3435 * We may have raced with 3436 * segvn_reclaim()/segspt_reclaim(). In this case 3437 * clean nounmapwait flag and retry since softlockcnt 3438 * in this segment may be already 0. We don't drop as 3439 * writer lock so our number of retries without 3440 * sleeping should be very small. See segvn_reclaim() 3441 * for more comments. 3442 */ 3443 AS_CLRNOUNMAPWAIT(as); 3444 mutex_exit(&as->a_contents); 3445 } 3446 goto again; 3447 } 3448 3449 as_setwatch(as); 3450 AS_LOCK_EXIT(as, &as->a_lock); 3451 return (error); 3452 } 3453 3454 /* 3455 * Setup all of the uninitialized watched pages that we can. 3456 */ 3457 void 3458 as_setwatch(struct as *as) 3459 { 3460 struct watched_page *pwp; 3461 struct seg *seg; 3462 caddr_t vaddr; 3463 uint_t prot; 3464 int err, retrycnt; 3465 3466 if (avl_numnodes(&as->a_wpage) == 0) 3467 return; 3468 3469 ASSERT(AS_WRITE_HELD(as, &as->a_lock)); 3470 3471 for (pwp = avl_first(&as->a_wpage); pwp != NULL; 3472 pwp = AVL_NEXT(&as->a_wpage, pwp)) { 3473 retrycnt = 0; 3474 retry: 3475 vaddr = pwp->wp_vaddr; 3476 if (pwp->wp_oprot != 0 || /* already set up */ 3477 (seg = as_segat(as, vaddr)) == NULL || 3478 SEGOP_GETPROT(seg, vaddr, 0, &prot) != 0) 3479 continue; 3480 3481 pwp->wp_oprot = prot; 3482 if (pwp->wp_read) 3483 prot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC); 3484 if (pwp->wp_write) 3485 prot &= ~PROT_WRITE; 3486 if (pwp->wp_exec) 3487 prot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC); 3488 if (!(pwp->wp_flags & WP_NOWATCH) && prot != pwp->wp_oprot) { 3489 err = SEGOP_SETPROT(seg, vaddr, PAGESIZE, prot); 3490 if (err == IE_RETRY) { 3491 pwp->wp_oprot = 0; 3492 ASSERT(retrycnt == 0); 3493 retrycnt++; 3494 goto retry; 3495 } 3496 } 3497 pwp->wp_prot = prot; 3498 } 3499 } 3500 3501 /* 3502 * Clear all of the watched pages in the address space. 
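 *
 * (Worked example of the watchpoint protection dance: if a page's real
 * protection is PROT_READ|PROT_WRITE and it is watched for writes
 * (wp_write), as_setwatch() above records wp_oprot = PROT_READ|PROT_WRITE
 * and drops the mapping to PROT_READ only, so the next store faults and
 * can be reported; this routine simply puts wp_oprot back.)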
3503 */ 3504 void 3505 as_clearwatch(struct as *as) 3506 { 3507 struct watched_page *pwp; 3508 struct seg *seg; 3509 caddr_t vaddr; 3510 uint_t prot; 3511 int err, retrycnt; 3512 3513 if (avl_numnodes(&as->a_wpage) == 0) 3514 return; 3515 3516 ASSERT(AS_WRITE_HELD(as, &as->a_lock)); 3517 3518 for (pwp = avl_first(&as->a_wpage); pwp != NULL; 3519 pwp = AVL_NEXT(&as->a_wpage, pwp)) { 3520 retrycnt = 0; 3521 retry: 3522 vaddr = pwp->wp_vaddr; 3523 if (pwp->wp_oprot == 0 || /* not set up */ 3524 (seg = as_segat(as, vaddr)) == NULL) 3525 continue; 3526 3527 if ((prot = pwp->wp_oprot) != pwp->wp_prot) { 3528 err = SEGOP_SETPROT(seg, vaddr, PAGESIZE, prot); 3529 if (err == IE_RETRY) { 3530 ASSERT(retrycnt == 0); 3531 retrycnt++; 3532 goto retry; 3533 } 3534 } 3535 pwp->wp_oprot = 0; 3536 pwp->wp_prot = 0; 3537 } 3538 } 3539 3540 /* 3541 * Force a new setup for all the watched pages in the range. 3542 */ 3543 static void 3544 as_setwatchprot(struct as *as, caddr_t addr, size_t size, uint_t prot) 3545 { 3546 struct watched_page *pwp; 3547 struct watched_page tpw; 3548 caddr_t eaddr = addr + size; 3549 caddr_t vaddr; 3550 struct seg *seg; 3551 int err, retrycnt; 3552 uint_t wprot; 3553 avl_index_t where; 3554 3555 if (avl_numnodes(&as->a_wpage) == 0) 3556 return; 3557 3558 ASSERT(AS_WRITE_HELD(as, &as->a_lock)); 3559 3560 tpw.wp_vaddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); 3561 if ((pwp = avl_find(&as->a_wpage, &tpw, &where)) == NULL) 3562 pwp = avl_nearest(&as->a_wpage, where, AVL_AFTER); 3563 3564 while (pwp != NULL && pwp->wp_vaddr < eaddr) { 3565 retrycnt = 0; 3566 vaddr = pwp->wp_vaddr; 3567 3568 wprot = prot; 3569 if (pwp->wp_read) 3570 wprot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC); 3571 if (pwp->wp_write) 3572 wprot &= ~PROT_WRITE; 3573 if (pwp->wp_exec) 3574 wprot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC); 3575 if (!(pwp->wp_flags & WP_NOWATCH) && wprot != pwp->wp_oprot) { 3576 retry: 3577 seg = as_segat(as, vaddr); 3578 if (seg == NULL) { 3579 panic("as_setwatchprot: no seg"); 3580 /*NOTREACHED*/ 3581 } 3582 err = SEGOP_SETPROT(seg, vaddr, PAGESIZE, wprot); 3583 if (err == IE_RETRY) { 3584 ASSERT(retrycnt == 0); 3585 retrycnt++; 3586 goto retry; 3587 } 3588 } 3589 pwp->wp_oprot = prot; 3590 pwp->wp_prot = wprot; 3591 3592 pwp = AVL_NEXT(&as->a_wpage, pwp); 3593 } 3594 } 3595 3596 /* 3597 * Clear all of the watched pages in the range. 
3598 */ 3599 static void 3600 as_clearwatchprot(struct as *as, caddr_t addr, size_t size) 3601 { 3602 caddr_t eaddr = addr + size; 3603 struct watched_page *pwp; 3604 struct watched_page tpw; 3605 uint_t prot; 3606 struct seg *seg; 3607 int err, retrycnt; 3608 avl_index_t where; 3609 3610 if (avl_numnodes(&as->a_wpage) == 0) 3611 return; 3612 3613 tpw.wp_vaddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); 3614 if ((pwp = avl_find(&as->a_wpage, &tpw, &where)) == NULL) 3615 pwp = avl_nearest(&as->a_wpage, where, AVL_AFTER); 3616 3617 ASSERT(AS_WRITE_HELD(as, &as->a_lock)); 3618 3619 while (pwp != NULL && pwp->wp_vaddr < eaddr) { 3620 3621 if ((prot = pwp->wp_oprot) != 0) { 3622 retrycnt = 0; 3623 3624 if (prot != pwp->wp_prot) { 3625 retry: 3626 seg = as_segat(as, pwp->wp_vaddr); 3627 if (seg == NULL) 3628 continue; 3629 err = SEGOP_SETPROT(seg, pwp->wp_vaddr, 3630 PAGESIZE, prot); 3631 if (err == IE_RETRY) { 3632 ASSERT(retrycnt == 0); 3633 retrycnt++; 3634 goto retry; 3635 3636 } 3637 } 3638 pwp->wp_oprot = 0; 3639 pwp->wp_prot = 0; 3640 } 3641 3642 pwp = AVL_NEXT(&as->a_wpage, pwp); 3643 } 3644 } 3645 3646 void 3647 as_signal_proc(struct as *as, k_siginfo_t *siginfo) 3648 { 3649 struct proc *p; 3650 3651 mutex_enter(&pidlock); 3652 for (p = practive; p; p = p->p_next) { 3653 if (p->p_as == as) { 3654 mutex_enter(&p->p_lock); 3655 if (p->p_as == as) 3656 sigaddq(p, NULL, siginfo, KM_NOSLEEP); 3657 mutex_exit(&p->p_lock); 3658 } 3659 } 3660 mutex_exit(&pidlock); 3661 } 3662 3663 /* 3664 * return memory object ID 3665 */ 3666 int 3667 as_getmemid(struct as *as, caddr_t addr, memid_t *memidp) 3668 { 3669 struct seg *seg; 3670 int sts; 3671 3672 AS_LOCK_ENTER(as, &as->a_lock, RW_READER); 3673 seg = as_segat(as, addr); 3674 if (seg == NULL) { 3675 AS_LOCK_EXIT(as, &as->a_lock); 3676 return (EFAULT); 3677 } 3678 /* 3679 * catch old drivers which may not support getmemid 3680 */ 3681 if (seg->s_ops->getmemid == NULL) { 3682 AS_LOCK_EXIT(as, &as->a_lock); 3683 return (ENODEV); 3684 } 3685 3686 sts = SEGOP_GETMEMID(seg, addr, memidp); 3687 3688 AS_LOCK_EXIT(as, &as->a_lock); 3689 return (sts); 3690 } 3691
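
/*
 * A minimal usage sketch for as_getmemid() (hypothetical caller; error
 * handling trimmed). A driver that wants a stable identity for the
 * memory object backing a user address might do:
 *
 *	memid_t memid;
 *
 *	if (as_getmemid(as, uaddr, &memid) == 0) {
 *		... memid now identifies the backing object and offset
 *	}
 */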