1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ 27 /* All Rights Reserved */ 28 29 /* 30 * University Copyright- Copyright (c) 1982, 1986, 1988 31 * The Regents of the University of California 32 * All Rights Reserved 33 * 34 * University Acknowledgment- Portions of this document are derived from 35 * software developed by the University of California, Berkeley, and its 36 * contributors. 37 */ 38 39 #pragma ident "%Z%%M% %I% %E% SMI" 40 41 /* 42 * VM - address spaces. 
43 */ 44 45 #include <sys/types.h> 46 #include <sys/t_lock.h> 47 #include <sys/param.h> 48 #include <sys/errno.h> 49 #include <sys/systm.h> 50 #include <sys/mman.h> 51 #include <sys/sysmacros.h> 52 #include <sys/cpuvar.h> 53 #include <sys/sysinfo.h> 54 #include <sys/kmem.h> 55 #include <sys/vnode.h> 56 #include <sys/vmsystm.h> 57 #include <sys/cmn_err.h> 58 #include <sys/debug.h> 59 #include <sys/tnf_probe.h> 60 #include <sys/vtrace.h> 61 62 #include <vm/hat.h> 63 #include <vm/xhat.h> 64 #include <vm/as.h> 65 #include <vm/seg.h> 66 #include <vm/seg_vn.h> 67 #include <vm/seg_dev.h> 68 #include <vm/seg_kmem.h> 69 #include <vm/seg_map.h> 70 #include <vm/seg_spt.h> 71 #include <vm/page.h> 72 73 clock_t deadlk_wait = 1; /* number of ticks to wait before retrying */ 74 75 static struct kmem_cache *as_cache; 76 77 static void as_setwatchprot(struct as *, caddr_t, size_t, uint_t); 78 static void as_clearwatchprot(struct as *, caddr_t, size_t); 79 int as_map_locked(struct as *, caddr_t, size_t, int ((*)()), void *); 80 81 82 /* 83 * Verifying the segment lists is very time-consuming; it may not be 84 * desirable always to define VERIFY_SEGLIST when DEBUG is set. 85 */ 86 #ifdef DEBUG 87 #define VERIFY_SEGLIST 88 int do_as_verify = 0; 89 #endif 90 91 /* 92 * Allocate a new callback data structure entry and fill in the events of 93 * interest, the address range of interest, and the callback argument. 94 * Link the entry on the as->a_callbacks list. A callback entry for the 95 * entire address space may be specified with vaddr = 0 and size = -1. 96 * 97 * CALLERS RESPONSIBILITY: If not calling from within the process context for 98 * the specified as, the caller must guarantee persistence of the specified as 99 * for the duration of this function (eg. pages being locked within the as 100 * will guarantee persistence). 
101 */ 102 int 103 as_add_callback(struct as *as, void (*cb_func)(), void *arg, uint_t events, 104 caddr_t vaddr, size_t size, int sleepflag) 105 { 106 struct as_callback *current_head, *cb; 107 caddr_t saddr; 108 size_t rsize; 109 110 /* callback function and an event are mandatory */ 111 if ((cb_func == NULL) || ((events & AS_ALL_EVENT) == 0)) 112 return (EINVAL); 113 114 /* Adding a callback after as_free has been called is not allowed */ 115 if (as == &kas) 116 return (ENOMEM); 117 118 /* 119 * vaddr = 0 and size = -1 is used to indicate that the callback range 120 * is the entire address space so no rounding is done in that case. 121 */ 122 if (size != -1) { 123 saddr = (caddr_t)((uintptr_t)vaddr & (uintptr_t)PAGEMASK); 124 rsize = (((size_t)(vaddr + size) + PAGEOFFSET) & PAGEMASK) - 125 (size_t)saddr; 126 /* check for wraparound */ 127 if (saddr + rsize < saddr) 128 return (ENOMEM); 129 } else { 130 if (vaddr != 0) 131 return (EINVAL); 132 saddr = vaddr; 133 rsize = size; 134 } 135 136 /* Allocate and initialize a callback entry */ 137 cb = kmem_zalloc(sizeof (struct as_callback), sleepflag); 138 if (cb == NULL) 139 return (EAGAIN); 140 141 cb->ascb_func = cb_func; 142 cb->ascb_arg = arg; 143 cb->ascb_events = events; 144 cb->ascb_saddr = saddr; 145 cb->ascb_len = rsize; 146 147 /* Add the entry to the list */ 148 mutex_enter(&as->a_contents); 149 current_head = as->a_callbacks; 150 as->a_callbacks = cb; 151 cb->ascb_next = current_head; 152 153 /* 154 * The call to this function may lose in a race with 155 * a pertinent event - eg. a thread does long term memory locking 156 * but before the callback is added another thread executes as_unmap. 157 * A broadcast here resolves that. 158 */ 159 if ((cb->ascb_events & AS_UNMAPWAIT_EVENT) && AS_ISUNMAPWAIT(as)) { 160 AS_CLRUNMAPWAIT(as); 161 cv_broadcast(&as->a_cv); 162 } 163 164 mutex_exit(&as->a_contents); 165 return (0); 166 } 167 168 /* 169 * Search the callback list for an entry which pertains to arg. 
170 * 171 * This is called from within the client upon completion of the callback. 172 * RETURN VALUES: 173 * AS_CALLBACK_DELETED (callback entry found and deleted) 174 * AS_CALLBACK_NOTFOUND (no callback entry found - this is ok) 175 * AS_CALLBACK_DELETE_DEFERRED (callback is in process, delete of this 176 * entry will be made in as_do_callbacks) 177 * 178 * If as_delete_callback encounters a matching entry with AS_CALLBACK_CALLED 179 * set, it indicates that as_do_callbacks is processing this entry. The 180 * AS_ALL_EVENT events are cleared in the entry, and a broadcast is made 181 * to unblock as_do_callbacks, in case it is blocked. 182 * 183 * CALLERS RESPONSIBILITY: If not calling from within the process context for 184 * the specified as, the caller must guarantee persistence of the specified as 185 * for the duration of this function (eg. pages being locked within the as 186 * will guarantee persistence). 187 */ 188 uint_t 189 as_delete_callback(struct as *as, void *arg) 190 { 191 struct as_callback **prevcb = &as->a_callbacks; 192 struct as_callback *cb; 193 uint_t rc = AS_CALLBACK_NOTFOUND; 194 195 mutex_enter(&as->a_contents); 196 for (cb = as->a_callbacks; cb; prevcb = &cb->ascb_next, cb = *prevcb) { 197 if (cb->ascb_arg != arg) 198 continue; 199 200 /* 201 * If the events indicate AS_CALLBACK_CALLED, just clear 202 * AS_ALL_EVENT in the events field and wakeup the thread 203 * that may be waiting in as_do_callbacks. as_do_callbacks 204 * will take care of removing this entry from the list. In 205 * that case, return AS_CALLBACK_DELETE_DEFERRED. Otherwise 206 * (AS_CALLBACK_CALLED not set), just remove it from the 207 * list, return the memory and return AS_CALLBACK_DELETED. 
208 */ 209 if ((cb->ascb_events & AS_CALLBACK_CALLED) != 0) { 210 /* leave AS_CALLBACK_CALLED */ 211 cb->ascb_events &= ~AS_ALL_EVENT; 212 rc = AS_CALLBACK_DELETE_DEFERRED; 213 cv_broadcast(&as->a_cv); 214 } else { 215 *prevcb = cb->ascb_next; 216 kmem_free(cb, sizeof (struct as_callback)); 217 rc = AS_CALLBACK_DELETED; 218 } 219 break; 220 } 221 mutex_exit(&as->a_contents); 222 return (rc); 223 } 224 225 /* 226 * Searches the as callback list for a matching entry. 227 * Returns a pointer to the first matching callback, or NULL if 228 * nothing is found. 229 * This function never sleeps so it is ok to call it with more 230 * locks held but the (required) a_contents mutex. 231 * 232 * See also comment on as_do_callbacks below. 233 */ 234 static struct as_callback * 235 as_find_callback(struct as *as, uint_t events, caddr_t event_addr, 236 size_t event_len) 237 { 238 struct as_callback *cb; 239 240 ASSERT(MUTEX_HELD(&as->a_contents)); 241 for (cb = as->a_callbacks; cb != NULL; cb = cb->ascb_next) { 242 /* 243 * If the callback has not already been called, then 244 * check if events or address range pertains. An event_len 245 * of zero means do an unconditional callback. 246 */ 247 if (((cb->ascb_events & AS_CALLBACK_CALLED) != 0) || 248 ((event_len != 0) && (((cb->ascb_events & events) == 0) || 249 (event_addr + event_len < cb->ascb_saddr) || 250 (event_addr > (cb->ascb_saddr + cb->ascb_len))))) { 251 continue; 252 } 253 break; 254 } 255 return (cb); 256 } 257 258 /* 259 * Executes a given callback and removes it from the callback list for 260 * this address space. 261 * This function may sleep so the caller must drop all locks except 262 * a_contents before calling this func. 263 * 264 * See also comments on as_do_callbacks below. 
265 */ 266 static void 267 as_execute_callback(struct as *as, struct as_callback *cb, 268 uint_t events) 269 { 270 struct as_callback **prevcb; 271 void *cb_arg; 272 273 ASSERT(MUTEX_HELD(&as->a_contents) && (cb->ascb_events & events)); 274 cb->ascb_events |= AS_CALLBACK_CALLED; 275 mutex_exit(&as->a_contents); 276 (*cb->ascb_func)(as, cb->ascb_arg, events); 277 mutex_enter(&as->a_contents); 278 /* 279 * the callback function is required to delete the callback 280 * when the callback function determines it is OK for 281 * this thread to continue. as_delete_callback will clear 282 * the AS_ALL_EVENT in the events field when it is deleted. 283 * If the callback function called as_delete_callback, 284 * events will already be cleared and there will be no blocking. 285 */ 286 while ((cb->ascb_events & events) != 0) { 287 cv_wait(&as->a_cv, &as->a_contents); 288 } 289 /* 290 * This entry needs to be taken off the list. Normally, the 291 * callback func itself does that, but unfortunately the list 292 * may have changed while the callback was running because the 293 * a_contents mutex was dropped and someone else other than the 294 * callback func itself could have called as_delete_callback, 295 * so we have to search to find this entry again. The entry 296 * must have AS_CALLBACK_CALLED, and have the same 'arg'. 297 */ 298 cb_arg = cb->ascb_arg; 299 prevcb = &as->a_callbacks; 300 for (cb = as->a_callbacks; cb != NULL; 301 prevcb = &cb->ascb_next, cb = *prevcb) { 302 if (((cb->ascb_events & AS_CALLBACK_CALLED) == 0) || 303 (cb_arg != cb->ascb_arg)) { 304 continue; 305 } 306 *prevcb = cb->ascb_next; 307 kmem_free(cb, sizeof (struct as_callback)); 308 break; 309 } 310 } 311 312 /* 313 * Check the callback list for a matching event and intersection of 314 * address range. If there is a match invoke the callback. 
Skip an entry if: 315 * - a callback is already in progress for this entry (AS_CALLBACK_CALLED) 316 * - not event of interest 317 * - not address range of interest 318 * 319 * An event_len of zero indicates a request for an unconditional callback 320 * (regardless of event), only the AS_CALLBACK_CALLED is checked. The 321 * a_contents lock must be dropped before a callback, so only one callback 322 * can be done before returning. Return -1 (true) if a callback was 323 * executed and removed from the list, else return 0 (false). 324 * 325 * The logically separate parts, i.e. finding a matching callback and 326 * executing a given callback have been separated into two functions 327 * so that they can be called with different sets of locks held beyond 328 * the always-required a_contents. as_find_callback does not sleep so 329 * it is ok to call it if more locks than a_contents (i.e. the a_lock 330 * rwlock) are held. as_execute_callback on the other hand may sleep 331 * so all locks beyond a_contents must be dropped by the caller if one 332 * does not want to end comatose. 333 */ 334 static int 335 as_do_callbacks(struct as *as, uint_t events, caddr_t event_addr, 336 size_t event_len) 337 { 338 struct as_callback *cb; 339 340 if ((cb = as_find_callback(as, events, event_addr, event_len))) { 341 as_execute_callback(as, cb, events); 342 return (-1); 343 } 344 return (0); 345 } 346 347 /* 348 * Search for the segment containing addr. If a segment containing addr 349 * exists, that segment is returned. If no such segment exists, and 350 * the list spans addresses greater than addr, then the first segment 351 * whose base is greater than addr is returned; otherwise, NULL is 352 * returned unless tail is true, in which case the last element of the 353 * list is returned. 354 * 355 * a_seglast is used to cache the last found segment for repeated 356 * searches to the same addr (which happens frequently). 
357 */ 358 struct seg * 359 as_findseg(struct as *as, caddr_t addr, int tail) 360 { 361 struct seg *seg = as->a_seglast; 362 avl_index_t where; 363 364 ASSERT(AS_LOCK_HELD(as, &as->a_lock)); 365 366 if (seg != NULL && 367 seg->s_base <= addr && 368 addr < seg->s_base + seg->s_size) 369 return (seg); 370 371 seg = avl_find(&as->a_segtree, &addr, &where); 372 if (seg != NULL) 373 return (as->a_seglast = seg); 374 375 seg = avl_nearest(&as->a_segtree, where, AVL_AFTER); 376 if (seg == NULL && tail) 377 seg = avl_last(&as->a_segtree); 378 return (as->a_seglast = seg); 379 } 380 381 #ifdef VERIFY_SEGLIST 382 /* 383 * verify that the linked list is coherent 384 */ 385 static void 386 as_verify(struct as *as) 387 { 388 struct seg *seg, *seglast, *p, *n; 389 uint_t nsegs = 0; 390 391 if (do_as_verify == 0) 392 return; 393 394 seglast = as->a_seglast; 395 396 for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) { 397 ASSERT(seg->s_as == as); 398 p = AS_SEGPREV(as, seg); 399 n = AS_SEGNEXT(as, seg); 400 ASSERT(p == NULL || p->s_as == as); 401 ASSERT(p == NULL || p->s_base < seg->s_base); 402 ASSERT(n == NULL || n->s_base > seg->s_base); 403 ASSERT(n != NULL || seg == avl_last(&as->a_segtree)); 404 if (seg == seglast) 405 seglast = NULL; 406 nsegs++; 407 } 408 ASSERT(seglast == NULL); 409 ASSERT(avl_numnodes(&as->a_segtree) == nsegs); 410 } 411 #endif /* VERIFY_SEGLIST */ 412 413 /* 414 * Add a new segment to the address space. The avl_find() 415 * may be expensive so we attempt to use last segment accessed 416 * in as_gap() as an insertion point. 
417 */ 418 int 419 as_addseg(struct as *as, struct seg *newseg) 420 { 421 struct seg *seg; 422 caddr_t addr; 423 caddr_t eaddr; 424 avl_index_t where; 425 426 ASSERT(AS_WRITE_HELD(as, &as->a_lock)); 427 428 as->a_updatedir = 1; /* inform /proc */ 429 gethrestime(&as->a_updatetime); 430 431 if (as->a_lastgaphl != NULL) { 432 struct seg *hseg = NULL; 433 struct seg *lseg = NULL; 434 435 if (as->a_lastgaphl->s_base > newseg->s_base) { 436 hseg = as->a_lastgaphl; 437 lseg = AVL_PREV(&as->a_segtree, hseg); 438 } else { 439 lseg = as->a_lastgaphl; 440 hseg = AVL_NEXT(&as->a_segtree, lseg); 441 } 442 443 if (hseg && lseg && lseg->s_base < newseg->s_base && 444 hseg->s_base > newseg->s_base) { 445 avl_insert_here(&as->a_segtree, newseg, lseg, 446 AVL_AFTER); 447 as->a_lastgaphl = NULL; 448 as->a_seglast = newseg; 449 return (0); 450 } 451 as->a_lastgaphl = NULL; 452 } 453 454 addr = newseg->s_base; 455 eaddr = addr + newseg->s_size; 456 again: 457 458 seg = avl_find(&as->a_segtree, &addr, &where); 459 460 if (seg == NULL) 461 seg = avl_nearest(&as->a_segtree, where, AVL_AFTER); 462 463 if (seg == NULL) 464 seg = avl_last(&as->a_segtree); 465 466 if (seg != NULL) { 467 caddr_t base = seg->s_base; 468 469 /* 470 * If top of seg is below the requested address, then 471 * the insertion point is at the end of the linked list, 472 * and seg points to the tail of the list. Otherwise, 473 * the insertion point is immediately before seg. 474 */ 475 if (base + seg->s_size > addr) { 476 if (addr >= base || eaddr > base) { 477 #ifdef __sparc 478 extern struct seg_ops segnf_ops; 479 480 /* 481 * no-fault segs must disappear if overlaid. 
482 * XXX need new segment type so 483 * we don't have to check s_ops 484 */ 485 if (seg->s_ops == &segnf_ops) { 486 seg_unmap(seg); 487 goto again; 488 } 489 #endif 490 return (-1); /* overlapping segment */ 491 } 492 } 493 } 494 as->a_seglast = newseg; 495 avl_insert(&as->a_segtree, newseg, where); 496 497 #ifdef VERIFY_SEGLIST 498 as_verify(as); 499 #endif 500 return (0); 501 } 502 503 struct seg * 504 as_removeseg(struct as *as, struct seg *seg) 505 { 506 avl_tree_t *t; 507 508 ASSERT(AS_WRITE_HELD(as, &as->a_lock)); 509 510 as->a_updatedir = 1; /* inform /proc */ 511 gethrestime(&as->a_updatetime); 512 513 if (seg == NULL) 514 return (NULL); 515 516 t = &as->a_segtree; 517 if (as->a_seglast == seg) 518 as->a_seglast = NULL; 519 as->a_lastgaphl = NULL; 520 521 /* 522 * if this segment is at an address higher than 523 * a_lastgap, set a_lastgap to the next segment (NULL if last segment) 524 */ 525 if (as->a_lastgap && 526 (seg == as->a_lastgap || seg->s_base > as->a_lastgap->s_base)) 527 as->a_lastgap = AVL_NEXT(t, seg); 528 529 /* 530 * remove the segment from the seg tree 531 */ 532 avl_remove(t, seg); 533 534 #ifdef VERIFY_SEGLIST 535 as_verify(as); 536 #endif 537 return (seg); 538 } 539 540 /* 541 * Find a segment containing addr. 542 */ 543 struct seg * 544 as_segat(struct as *as, caddr_t addr) 545 { 546 struct seg *seg = as->a_seglast; 547 548 ASSERT(AS_LOCK_HELD(as, &as->a_lock)); 549 550 if (seg != NULL && seg->s_base <= addr && 551 addr < seg->s_base + seg->s_size) 552 return (seg); 553 554 seg = avl_find(&as->a_segtree, &addr, NULL); 555 return (seg); 556 } 557 558 /* 559 * Serialize all searches for holes in an address space to 560 * prevent two or more threads from allocating the same virtual 561 * address range. The address space must not be "read/write" 562 * locked by the caller since we may block. 
563 */ 564 void 565 as_rangelock(struct as *as) 566 { 567 mutex_enter(&as->a_contents); 568 while (AS_ISCLAIMGAP(as)) 569 cv_wait(&as->a_cv, &as->a_contents); 570 AS_SETCLAIMGAP(as); 571 mutex_exit(&as->a_contents); 572 } 573 574 /* 575 * Release hold on a_state & AS_CLAIMGAP and signal any other blocked threads. 576 */ 577 void 578 as_rangeunlock(struct as *as) 579 { 580 mutex_enter(&as->a_contents); 581 AS_CLRCLAIMGAP(as); 582 cv_signal(&as->a_cv); 583 mutex_exit(&as->a_contents); 584 } 585 586 /* 587 * compar segments (or just an address) by segment address range 588 */ 589 static int 590 as_segcompar(const void *x, const void *y) 591 { 592 struct seg *a = (struct seg *)x; 593 struct seg *b = (struct seg *)y; 594 595 if (a->s_base < b->s_base) 596 return (-1); 597 if (a->s_base >= b->s_base + b->s_size) 598 return (1); 599 return (0); 600 } 601 602 603 void 604 as_avlinit(struct as *as) 605 { 606 avl_create(&as->a_segtree, as_segcompar, sizeof (struct seg), 607 offsetof(struct seg, s_tree)); 608 avl_create(&as->a_wpage, wp_compare, sizeof (struct watched_page), 609 offsetof(struct watched_page, wp_link)); 610 } 611 612 /*ARGSUSED*/ 613 static int 614 as_constructor(void *buf, void *cdrarg, int kmflags) 615 { 616 struct as *as = buf; 617 618 mutex_init(&as->a_contents, NULL, MUTEX_DEFAULT, NULL); 619 cv_init(&as->a_cv, NULL, CV_DEFAULT, NULL); 620 rw_init(&as->a_lock, NULL, RW_DEFAULT, NULL); 621 as_avlinit(as); 622 return (0); 623 } 624 625 /*ARGSUSED1*/ 626 static void 627 as_destructor(void *buf, void *cdrarg) 628 { 629 struct as *as = buf; 630 631 avl_destroy(&as->a_segtree); 632 mutex_destroy(&as->a_contents); 633 cv_destroy(&as->a_cv); 634 rw_destroy(&as->a_lock); 635 } 636 637 void 638 as_init(void) 639 { 640 as_cache = kmem_cache_create("as_cache", sizeof (struct as), 0, 641 as_constructor, as_destructor, NULL, NULL, NULL, 0); 642 } 643 644 /* 645 * Allocate and initialize an address space data structure. 
646 * We call hat_alloc to allow any machine dependent 647 * information in the hat structure to be initialized. 648 */ 649 struct as * 650 as_alloc(void) 651 { 652 struct as *as; 653 654 as = kmem_cache_alloc(as_cache, KM_SLEEP); 655 656 as->a_flags = 0; 657 as->a_vbits = 0; 658 as->a_hrm = NULL; 659 as->a_seglast = NULL; 660 as->a_size = 0; 661 as->a_updatedir = 0; 662 gethrestime(&as->a_updatetime); 663 as->a_objectdir = NULL; 664 as->a_sizedir = 0; 665 as->a_userlimit = (caddr_t)USERLIMIT; 666 as->a_lastgap = NULL; 667 as->a_lastgaphl = NULL; 668 as->a_callbacks = NULL; 669 670 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER); 671 as->a_hat = hat_alloc(as); /* create hat for default system mmu */ 672 AS_LOCK_EXIT(as, &as->a_lock); 673 674 as->a_xhat = NULL; 675 676 return (as); 677 } 678 679 /* 680 * Free an address space data structure. 681 * Need to free the hat first and then 682 * all the segments on this as and finally 683 * the space for the as struct itself. 684 */ 685 void 686 as_free(struct as *as) 687 { 688 struct hat *hat = as->a_hat; 689 struct seg *seg, *next; 690 int called = 0; 691 692 top: 693 /* 694 * Invoke ALL callbacks. as_do_callbacks will do one callback 695 * per call, and not return (-1) until the callback has completed. 696 * When as_do_callbacks returns zero, all callbacks have completed. 
697 */ 698 mutex_enter(&as->a_contents); 699 while (as->a_callbacks && as_do_callbacks(as, AS_ALL_EVENT, 0, 0)) 700 ; 701 702 /* This will prevent new XHATs from attaching to as */ 703 if (!called) 704 AS_SETBUSY(as); 705 mutex_exit(&as->a_contents); 706 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER); 707 708 if (!called) { 709 called = 1; 710 hat_free_start(hat); 711 if (as->a_xhat != NULL) 712 xhat_free_start_all(as); 713 } 714 for (seg = AS_SEGFIRST(as); seg != NULL; seg = next) { 715 int err; 716 717 next = AS_SEGNEXT(as, seg); 718 retry: 719 err = SEGOP_UNMAP(seg, seg->s_base, seg->s_size); 720 if (err == EAGAIN) { 721 mutex_enter(&as->a_contents); 722 if (as->a_callbacks) { 723 AS_LOCK_EXIT(as, &as->a_lock); 724 } else if (!AS_ISNOUNMAPWAIT(as)) { 725 /* 726 * Memory is currently locked. Wait for a 727 * cv_signal that it has been unlocked, then 728 * try the operation again. 729 */ 730 if (AS_ISUNMAPWAIT(as) == 0) 731 cv_broadcast(&as->a_cv); 732 AS_SETUNMAPWAIT(as); 733 AS_LOCK_EXIT(as, &as->a_lock); 734 while (AS_ISUNMAPWAIT(as)) 735 cv_wait(&as->a_cv, &as->a_contents); 736 } else { 737 /* 738 * We may have raced with 739 * segvn_reclaim()/segspt_reclaim(). In this 740 * case clean nounmapwait flag and retry since 741 * softlockcnt in this segment may be already 742 * 0. We don't drop as writer lock so our 743 * number of retries without sleeping should 744 * be very small. See segvn_reclaim() for 745 * more comments. 746 */ 747 AS_CLRNOUNMAPWAIT(as); 748 mutex_exit(&as->a_contents); 749 goto retry; 750 } 751 mutex_exit(&as->a_contents); 752 goto top; 753 } else { 754 /* 755 * We do not expect any other error return at this 756 * time. 
This is similar to an ASSERT in seg_unmap() 757 */ 758 ASSERT(err == 0); 759 } 760 } 761 hat_free_end(hat); 762 if (as->a_xhat != NULL) 763 xhat_free_end_all(as); 764 AS_LOCK_EXIT(as, &as->a_lock); 765 766 /* /proc stuff */ 767 ASSERT(avl_numnodes(&as->a_wpage) == 0); 768 if (as->a_objectdir) { 769 kmem_free(as->a_objectdir, as->a_sizedir * sizeof (vnode_t *)); 770 as->a_objectdir = NULL; 771 as->a_sizedir = 0; 772 } 773 774 /* 775 * Free the struct as back to kmem. Assert it has no segments. 776 */ 777 ASSERT(avl_numnodes(&as->a_segtree) == 0); 778 kmem_cache_free(as_cache, as); 779 } 780 781 int 782 as_dup(struct as *as, struct as **outas) 783 { 784 struct as *newas; 785 struct seg *seg, *newseg; 786 int error; 787 788 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER); 789 as_clearwatch(as); 790 newas = as_alloc(); 791 newas->a_userlimit = as->a_userlimit; 792 AS_LOCK_ENTER(newas, &newas->a_lock, RW_WRITER); 793 794 /* This will prevent new XHATs from attaching */ 795 mutex_enter(&as->a_contents); 796 AS_SETBUSY(as); 797 mutex_exit(&as->a_contents); 798 mutex_enter(&newas->a_contents); 799 AS_SETBUSY(newas); 800 mutex_exit(&newas->a_contents); 801 802 (void) hat_dup(as->a_hat, newas->a_hat, NULL, 0, HAT_DUP_SRD); 803 804 for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) { 805 806 if (seg->s_flags & S_PURGE) 807 continue; 808 809 newseg = seg_alloc(newas, seg->s_base, seg->s_size); 810 if (newseg == NULL) { 811 AS_LOCK_EXIT(newas, &newas->a_lock); 812 as_setwatch(as); 813 mutex_enter(&as->a_contents); 814 AS_CLRBUSY(as); 815 mutex_exit(&as->a_contents); 816 AS_LOCK_EXIT(as, &as->a_lock); 817 as_free(newas); 818 return (-1); 819 } 820 if ((error = SEGOP_DUP(seg, newseg)) != 0) { 821 /* 822 * We call seg_free() on the new seg 823 * because the segment is not set up 824 * completely; i.e. it has no ops. 
825 */ 826 as_setwatch(as); 827 mutex_enter(&as->a_contents); 828 AS_CLRBUSY(as); 829 mutex_exit(&as->a_contents); 830 AS_LOCK_EXIT(as, &as->a_lock); 831 seg_free(newseg); 832 AS_LOCK_EXIT(newas, &newas->a_lock); 833 as_free(newas); 834 return (error); 835 } 836 newas->a_size += seg->s_size; 837 } 838 839 error = hat_dup(as->a_hat, newas->a_hat, NULL, 0, HAT_DUP_ALL); 840 if (as->a_xhat != NULL) 841 error |= xhat_dup_all(as, newas, NULL, 0, HAT_DUP_ALL); 842 843 mutex_enter(&newas->a_contents); 844 AS_CLRBUSY(newas); 845 mutex_exit(&newas->a_contents); 846 AS_LOCK_EXIT(newas, &newas->a_lock); 847 848 as_setwatch(as); 849 mutex_enter(&as->a_contents); 850 AS_CLRBUSY(as); 851 mutex_exit(&as->a_contents); 852 AS_LOCK_EXIT(as, &as->a_lock); 853 if (error != 0) { 854 as_free(newas); 855 return (error); 856 } 857 *outas = newas; 858 return (0); 859 } 860 861 /* 862 * Handle a ``fault'' at addr for size bytes. 863 */ 864 faultcode_t 865 as_fault(struct hat *hat, struct as *as, caddr_t addr, size_t size, 866 enum fault_type type, enum seg_rw rw) 867 { 868 struct seg *seg; 869 caddr_t raddr; /* rounded down addr */ 870 size_t rsize; /* rounded up size */ 871 size_t ssize; 872 faultcode_t res = 0; 873 caddr_t addrsav; 874 struct seg *segsav; 875 int as_lock_held; 876 klwp_t *lwp = ttolwp(curthread); 877 int is_xhat = 0; 878 int holding_wpage = 0; 879 extern struct seg_ops segdev_ops; 880 881 882 883 if (as->a_hat != hat) { 884 /* This must be an XHAT then */ 885 is_xhat = 1; 886 887 if ((type != F_INVAL) || (as == &kas)) 888 return (FC_NOSUPPORT); 889 } 890 891 retry: 892 if (!is_xhat) { 893 /* 894 * Indicate that the lwp is not to be stopped while waiting 895 * for a pagefault. This is to avoid deadlock while debugging 896 * a process via /proc over NFS (in particular). 897 */ 898 if (lwp != NULL) 899 lwp->lwp_nostop++; 900 901 /* 902 * same length must be used when we softlock and softunlock. 
903 * We don't support softunlocking lengths less than 904 * the original length when there is largepage support. 905 * See seg_dev.c for more comments. 906 */ 907 switch (type) { 908 909 case F_SOFTLOCK: 910 CPU_STATS_ADD_K(vm, softlock, 1); 911 break; 912 913 case F_SOFTUNLOCK: 914 break; 915 916 case F_PROT: 917 CPU_STATS_ADD_K(vm, prot_fault, 1); 918 break; 919 920 case F_INVAL: 921 CPU_STATS_ENTER_K(); 922 CPU_STATS_ADDQ(CPU, vm, as_fault, 1); 923 if (as == &kas) 924 CPU_STATS_ADDQ(CPU, vm, kernel_asflt, 1); 925 CPU_STATS_EXIT_K(); 926 break; 927 } 928 } 929 930 /* Kernel probe */ 931 TNF_PROBE_3(address_fault, "vm pagefault", /* CSTYLED */, 932 tnf_opaque, address, addr, 933 tnf_fault_type, fault_type, type, 934 tnf_seg_access, access, rw); 935 936 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); 937 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) - 938 (size_t)raddr; 939 940 /* 941 * XXX -- Don't grab the as lock for segkmap. We should grab it for 942 * correctness, but then we could be stuck holding this lock for 943 * a LONG time if the fault needs to be resolved on a slow 944 * filesystem, and then no-one will be able to exec new commands, 945 * as exec'ing requires the write lock on the as. 946 */ 947 if (as == &kas && segkmap && segkmap->s_base <= raddr && 948 raddr + size < segkmap->s_base + segkmap->s_size) { 949 /* 950 * if (as==&kas), this can't be XHAT: we've already returned 951 * FC_NOSUPPORT. 952 */ 953 seg = segkmap; 954 as_lock_held = 0; 955 } else { 956 AS_LOCK_ENTER(as, &as->a_lock, RW_READER); 957 if (is_xhat && avl_numnodes(&as->a_wpage) != 0) { 958 /* 959 * Grab and hold the writers' lock on the as 960 * if the fault is to a watched page. 961 * This will keep CPUs from "peeking" at the 962 * address range while we're temporarily boosting 963 * the permissions for the XHAT device to 964 * resolve the fault in the segment layer. 
965 * 966 * We could check whether faulted address 967 * is within a watched page and only then grab 968 * the writer lock, but this is simpler. 969 */ 970 AS_LOCK_EXIT(as, &as->a_lock); 971 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER); 972 } 973 974 seg = as_segat(as, raddr); 975 if (seg == NULL) { 976 AS_LOCK_EXIT(as, &as->a_lock); 977 if ((lwp != NULL) && (!is_xhat)) 978 lwp->lwp_nostop--; 979 return (FC_NOMAP); 980 } 981 982 as_lock_held = 1; 983 } 984 985 addrsav = raddr; 986 segsav = seg; 987 988 for (; rsize != 0; rsize -= ssize, raddr += ssize) { 989 if (raddr >= seg->s_base + seg->s_size) { 990 seg = AS_SEGNEXT(as, seg); 991 if (seg == NULL || raddr != seg->s_base) { 992 res = FC_NOMAP; 993 break; 994 } 995 } 996 if (raddr + rsize > seg->s_base + seg->s_size) 997 ssize = seg->s_base + seg->s_size - raddr; 998 else 999 ssize = rsize; 1000 1001 if (!is_xhat || (seg->s_ops != &segdev_ops)) { 1002 1003 if (is_xhat && avl_numnodes(&as->a_wpage) != 0 && 1004 pr_is_watchpage_as(raddr, rw, as)) { 1005 /* 1006 * Handle watch pages. If we're faulting on a 1007 * watched page from an X-hat, we have to 1008 * restore the original permissions while we 1009 * handle the fault. 1010 */ 1011 as_clearwatch(as); 1012 holding_wpage = 1; 1013 } 1014 1015 res = SEGOP_FAULT(hat, seg, raddr, ssize, type, rw); 1016 1017 /* Restore watchpoints */ 1018 if (holding_wpage) { 1019 as_setwatch(as); 1020 holding_wpage = 0; 1021 } 1022 1023 if (res != 0) 1024 break; 1025 } else { 1026 /* XHAT does not support seg_dev */ 1027 res = FC_NOSUPPORT; 1028 break; 1029 } 1030 } 1031 1032 /* 1033 * If we were SOFTLOCKing and encountered a failure, 1034 * we must SOFTUNLOCK the range we already did. (Maybe we 1035 * should just panic if we are SOFTLOCKing or even SOFTUNLOCKing 1036 * right here...) 
1037 */ 1038 if (res != 0 && type == F_SOFTLOCK) { 1039 for (seg = segsav; addrsav < raddr; addrsav += ssize) { 1040 if (addrsav >= seg->s_base + seg->s_size) 1041 seg = AS_SEGNEXT(as, seg); 1042 ASSERT(seg != NULL); 1043 /* 1044 * Now call the fault routine again to perform the 1045 * unlock using S_OTHER instead of the rw variable 1046 * since we never got a chance to touch the pages. 1047 */ 1048 if (raddr > seg->s_base + seg->s_size) 1049 ssize = seg->s_base + seg->s_size - addrsav; 1050 else 1051 ssize = raddr - addrsav; 1052 (void) SEGOP_FAULT(hat, seg, addrsav, ssize, 1053 F_SOFTUNLOCK, S_OTHER); 1054 } 1055 } 1056 if (as_lock_held) 1057 AS_LOCK_EXIT(as, &as->a_lock); 1058 if ((lwp != NULL) && (!is_xhat)) 1059 lwp->lwp_nostop--; 1060 1061 /* 1062 * If the lower levels returned EDEADLK for a fault, 1063 * It means that we should retry the fault. Let's wait 1064 * a bit also to let the deadlock causing condition clear. 1065 * This is part of a gross hack to work around a design flaw 1066 * in the ufs/sds logging code and should go away when the 1067 * logging code is re-designed to fix the problem. See bug 1068 * 4125102 for details of the problem. 1069 */ 1070 if (FC_ERRNO(res) == EDEADLK) { 1071 delay(deadlk_wait); 1072 res = 0; 1073 goto retry; 1074 } 1075 return (res); 1076 } 1077 1078 1079 1080 /* 1081 * Asynchronous ``fault'' at addr for size bytes. 1082 */ 1083 faultcode_t 1084 as_faulta(struct as *as, caddr_t addr, size_t size) 1085 { 1086 struct seg *seg; 1087 caddr_t raddr; /* rounded down addr */ 1088 size_t rsize; /* rounded up size */ 1089 faultcode_t res = 0; 1090 klwp_t *lwp = ttolwp(curthread); 1091 1092 retry: 1093 /* 1094 * Indicate that the lwp is not to be stopped while waiting 1095 * for a pagefault. This is to avoid deadlock while debugging 1096 * a process via /proc over NFS (in particular). 
1097 */ 1098 if (lwp != NULL) 1099 lwp->lwp_nostop++; 1100 1101 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); 1102 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) - 1103 (size_t)raddr; 1104 1105 AS_LOCK_ENTER(as, &as->a_lock, RW_READER); 1106 seg = as_segat(as, raddr); 1107 if (seg == NULL) { 1108 AS_LOCK_EXIT(as, &as->a_lock); 1109 if (lwp != NULL) 1110 lwp->lwp_nostop--; 1111 return (FC_NOMAP); 1112 } 1113 1114 for (; rsize != 0; rsize -= PAGESIZE, raddr += PAGESIZE) { 1115 if (raddr >= seg->s_base + seg->s_size) { 1116 seg = AS_SEGNEXT(as, seg); 1117 if (seg == NULL || raddr != seg->s_base) { 1118 res = FC_NOMAP; 1119 break; 1120 } 1121 } 1122 res = SEGOP_FAULTA(seg, raddr); 1123 if (res != 0) 1124 break; 1125 } 1126 AS_LOCK_EXIT(as, &as->a_lock); 1127 if (lwp != NULL) 1128 lwp->lwp_nostop--; 1129 /* 1130 * If the lower levels returned EDEADLK for a fault, 1131 * It means that we should retry the fault. Let's wait 1132 * a bit also to let the deadlock causing condition clear. 1133 * This is part of a gross hack to work around a design flaw 1134 * in the ufs/sds logging code and should go away when the 1135 * logging code is re-designed to fix the problem. See bug 1136 * 4125102 for details of the problem. 1137 */ 1138 if (FC_ERRNO(res) == EDEADLK) { 1139 delay(deadlk_wait); 1140 res = 0; 1141 goto retry; 1142 } 1143 return (res); 1144 } 1145 1146 /* 1147 * Set the virtual mapping for the interval from [addr : addr + size) 1148 * in address space `as' to have the specified protection. 1149 * It is ok for the range to cross over several segments, 1150 * as long as they are contiguous. 
 */
int
as_setprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
{
	struct seg *seg;
	struct as_callback *cb;
	size_t ssize;
	caddr_t raddr;			/* rounded down addr */
	size_t rsize;			/* rounded up size */
	int error = 0, writer = 0;	/* writer: hold a_lock RW_WRITER */
	caddr_t saveraddr;
	size_t saversize;

setprot_top:
	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
	    (size_t)raddr;

	if (raddr + rsize < raddr)		/* check for wraparound */
		return (ENOMEM);

	/* Remember the full range for as_setwatchprot() on success. */
	saveraddr = raddr;
	saversize = rsize;

	/*
	 * Normally we only lock the as as a reader. But
	 * if due to setprot the segment driver needs to split
	 * a segment it will return IE_RETRY. Therefore we re-acquire
	 * the as lock as a writer so the segment driver can change
	 * the seg list. Also the segment driver will return IE_RETRY
	 * after it has changed the segment list so we therefore keep
	 * locking as a writer. Since these operations should be rare
	 * want to only lock as a writer when necessary.
	 */
	if (writer || avl_numnodes(&as->a_wpage) != 0) {
		AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
	} else {
		AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
	}

	as_clearwatchprot(as, raddr, rsize);
	seg = as_segat(as, raddr);
	if (seg == NULL) {
		as_setwatch(as);
		AS_LOCK_EXIT(as, &as->a_lock);
		return (ENOMEM);
	}

	/*
	 * Apply the protection one segment (or sub-range thereof) at a
	 * time; segments covering the range must be contiguous.
	 */
	for (; rsize != 0; rsize -= ssize, raddr += ssize) {
		if (raddr >= seg->s_base + seg->s_size) {
			seg = AS_SEGNEXT(as, seg);
			if (seg == NULL || raddr != seg->s_base) {
				error = ENOMEM;
				break;
			}
		}
		if ((raddr + rsize) > (seg->s_base + seg->s_size))
			ssize = seg->s_base + seg->s_size - raddr;
		else
			ssize = rsize;
retry:
		error = SEGOP_SETPROT(seg, raddr, ssize, prot);

		if (error == IE_NOMEM) {
			error = EAGAIN;
			break;
		}

		if (error == IE_RETRY) {
			/* Segment driver needs the list locked for write. */
			AS_LOCK_EXIT(as, &as->a_lock);
			writer = 1;
			goto setprot_top;
		}

		if (error == EAGAIN) {
			/*
			 * Make sure we have a_lock as writer.
			 */
			if (writer == 0) {
				AS_LOCK_EXIT(as, &as->a_lock);
				writer = 1;
				goto setprot_top;
			}

			/*
			 * Memory is currently locked.  It must be unlocked
			 * before this operation can succeed through a retry.
			 * The possible reasons for locked memory and
			 * corresponding strategies for unlocking are:
			 * (1) Normal I/O
			 *	wait for a signal that the I/O operation
			 *	has completed and the memory is unlocked.
			 * (2) Asynchronous I/O
			 *	The aio subsystem does not unlock pages when
			 *	the I/O is completed. Those pages are unlocked
			 *	when the application calls aiowait/aioerror.
			 *	So, to prevent blocking forever, cv_broadcast()
			 *	is done to wake up aio_cleanup_thread.
			 *	Subsequently, segvn_reclaim will be called, and
			 *	that will do AS_CLRUNMAPWAIT() and wake us up.
			 * (3) Long term page locking:
			 *	Drivers intending to have pages locked for a
			 *	period considerably longer than for normal I/O
			 *	(essentially forever) may have registered for a
			 *	callback so they may unlock these pages on
			 *	request. This is needed to allow this operation
			 *	to succeed. Each entry on the callback list is
			 *	examined. If the event or address range pertains
			 *	the callback is invoked (unless it already is in
			 *	progress). The a_contents lock must be dropped
			 *	before the callback, so only one callback can
			 *	be done at a time. Go to the top and do more
			 *	until zero is returned. If zero is returned,
			 *	either there were no callbacks for this event
			 *	or they were already in progress.
			 */
			mutex_enter(&as->a_contents);
			if (as->a_callbacks &&
			    (cb = as_find_callback(as, AS_SETPROT_EVENT,
			    seg->s_base, seg->s_size))) {
				AS_LOCK_EXIT(as, &as->a_lock);
				as_execute_callback(as, cb, AS_SETPROT_EVENT);
			} else if (!AS_ISNOUNMAPWAIT(as)) {
				if (AS_ISUNMAPWAIT(as) == 0)
					cv_broadcast(&as->a_cv);
				AS_SETUNMAPWAIT(as);
				AS_LOCK_EXIT(as, &as->a_lock);
				while (AS_ISUNMAPWAIT(as))
					cv_wait(&as->a_cv, &as->a_contents);
			} else {
				/*
				 * We may have raced with
				 * segvn_reclaim()/segspt_reclaim(). In this
				 * case clean nounmapwait flag and retry since
				 * softlockcnt in this segment may be already
				 * 0. We don't drop as writer lock so our
				 * number of retries without sleeping should
				 * be very small. See segvn_reclaim() for
				 * more comments.
				 */
				AS_CLRNOUNMAPWAIT(as);
				mutex_exit(&as->a_contents);
				goto retry;
			}
			mutex_exit(&as->a_contents);
			goto setprot_top;
		} else if (error != 0)
			break;
	}
	if (error != 0) {
		as_setwatch(as);
	} else {
		as_setwatchprot(as, saveraddr, saversize, prot);
	}
	AS_LOCK_EXIT(as, &as->a_lock);
	return (error);
}

/*
 * Check to make sure that the interval [addr, addr + size)
 * in address space `as' has at least the specified protection.
 * It is ok for the range to cross over several segments, as long
 * as they are contiguous.
 */
int
as_checkprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
{
	struct seg *seg;
	size_t ssize;
	caddr_t raddr;			/* rounded down addr */
	size_t rsize;			/* rounded up size */
	int error = 0;

	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
	    (size_t)raddr;

	if (raddr + rsize < raddr)		/* check for wraparound */
		return (ENOMEM);

	/*
	 * This is ugly as sin...
	 * Normally, we only acquire the address space readers lock.
	 * However, if the address space has watchpoints present,
	 * we must acquire the writer lock on the address space for
	 * the benefit of as_clearwatchprot() and as_setwatchprot().
 */
	if (avl_numnodes(&as->a_wpage) != 0)
		AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
	else
		AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
	as_clearwatchprot(as, raddr, rsize);
	seg = as_segat(as, raddr);
	if (seg == NULL) {
		as_setwatch(as);
		AS_LOCK_EXIT(as, &as->a_lock);
		return (ENOMEM);
	}

	/*
	 * Check each segment (or the sub-range of it inside [raddr,
	 * raddr + rsize)); segments must be contiguous across the range.
	 */
	for (; rsize != 0; rsize -= ssize, raddr += ssize) {
		if (raddr >= seg->s_base + seg->s_size) {
			seg = AS_SEGNEXT(as, seg);
			if (seg == NULL || raddr != seg->s_base) {
				error = ENOMEM;
				break;
			}
		}
		if ((raddr + rsize) > (seg->s_base + seg->s_size))
			ssize = seg->s_base + seg->s_size - raddr;
		else
			ssize = rsize;

		error = SEGOP_CHECKPROT(seg, raddr, ssize, prot);
		if (error != 0)
			break;
	}
	as_setwatch(as);
	AS_LOCK_EXIT(as, &as->a_lock);
	return (error);
}

/*
 * Unmap the page-aligned range [addr, addr + size) from address space
 * `as'.  Gaps in the range are skipped.  If a segment's pages are locked
 * the routine waits (or runs registered AS_UNMAP_EVENT callbacks) and
 * retries from the top.  Returns 0 on success, -1 on failure.
 */
int
as_unmap(struct as *as, caddr_t addr, size_t size)
{
	struct seg *seg, *seg_next;
	struct as_callback *cb;
	caddr_t raddr, eaddr;
	size_t ssize;
	int err;

top:
	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	eaddr = (caddr_t)(((uintptr_t)(addr + size) + PAGEOFFSET) &
	    (uintptr_t)PAGEMASK);

	AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);

	as->a_updatedir = 1;	/* inform /proc */
	gethrestime(&as->a_updatetime);

	/*
	 * Use as_findseg to find the first segment in the range, then
	 * step through the segments in order, following s_next.
	 */
	as_clearwatchprot(as, raddr, eaddr - raddr);

	for (seg = as_findseg(as, raddr, 0); seg != NULL; seg = seg_next) {
		if (eaddr <= seg->s_base)
			break;		/* eaddr was in a gap; all done */

		/* this is implied by the test above */
		ASSERT(raddr < eaddr);

		if (raddr < seg->s_base)
			raddr = seg->s_base;	/* raddr was in a gap */

		if (eaddr > (seg->s_base + seg->s_size))
			ssize = seg->s_base + seg->s_size - raddr;
		else
			ssize = eaddr - raddr;

		/*
		 * Save next segment pointer since seg can be
		 * destroyed during the segment unmap operation.
		 */
		seg_next = AS_SEGNEXT(as, seg);

retry:
		err = SEGOP_UNMAP(seg, raddr, ssize);
		if (err == EAGAIN) {
			/*
			 * Memory is currently locked.  It must be unlocked
			 * before this operation can succeed through a retry.
			 * The possible reasons for locked memory and
			 * corresponding strategies for unlocking are:
			 * (1) Normal I/O
			 *	wait for a signal that the I/O operation
			 *	has completed and the memory is unlocked.
			 * (2) Asynchronous I/O
			 *	The aio subsystem does not unlock pages when
			 *	the I/O is completed. Those pages are unlocked
			 *	when the application calls aiowait/aioerror.
			 *	So, to prevent blocking forever, cv_broadcast()
			 *	is done to wake up aio_cleanup_thread.
			 *	Subsequently, segvn_reclaim will be called, and
			 *	that will do AS_CLRUNMAPWAIT() and wake us up.
			 * (3) Long term page locking:
			 *	Drivers intending to have pages locked for a
			 *	period considerably longer than for normal I/O
			 *	(essentially forever) may have registered for a
			 *	callback so they may unlock these pages on
			 *	request. This is needed to allow this operation
			 *	to succeed. Each entry on the callback list is
			 *	examined. If the event or address range pertains
			 *	the callback is invoked (unless it already is in
			 *	progress). The a_contents lock must be dropped
			 *	before the callback, so only one callback can
			 *	be done at a time. Go to the top and do more
			 *	until zero is returned. If zero is returned,
			 *	either there were no callbacks for this event
			 *	or they were already in progress.
			 */
			mutex_enter(&as->a_contents);
			if (as->a_callbacks &&
			    (cb = as_find_callback(as, AS_UNMAP_EVENT,
			    seg->s_base, seg->s_size))) {
				AS_LOCK_EXIT(as, &as->a_lock);
				as_execute_callback(as, cb, AS_UNMAP_EVENT);
			} else if (!AS_ISNOUNMAPWAIT(as)) {
				if (AS_ISUNMAPWAIT(as) == 0)
					cv_broadcast(&as->a_cv);
				AS_SETUNMAPWAIT(as);
				AS_LOCK_EXIT(as, &as->a_lock);
				while (AS_ISUNMAPWAIT(as))
					cv_wait(&as->a_cv, &as->a_contents);
			} else {
				/*
				 * We may have raced with
				 * segvn_reclaim()/segspt_reclaim(). In this
				 * case clean nounmapwait flag and retry since
				 * softlockcnt in this segment may be already
				 * 0. We don't drop as writer lock so our
				 * number of retries without sleeping should
				 * be very small. See segvn_reclaim() for
				 * more comments.
				 */
				AS_CLRNOUNMAPWAIT(as);
				mutex_exit(&as->a_contents);
				goto retry;
			}
			mutex_exit(&as->a_contents);
			goto top;
		} else if (err == IE_RETRY) {
			AS_LOCK_EXIT(as, &as->a_lock);
			goto top;
		} else if (err) {
			as_setwatch(as);
			AS_LOCK_EXIT(as, &as->a_lock);
			return (-1);
		}

		as->a_size -= ssize;
		raddr += ssize;
	}
	AS_LOCK_EXIT(as, &as->a_lock);
	return (0);
}

/*
 * Create segvn segment(s) covering [addr, addr + size).  szcvec is a
 * bitmask of candidate page size codes; when more than one bit is set the
 * range is carved into multiple segments so that each piece is aligned
 * for the page size it uses.  *segcreated is set to 1 once any segment
 * has been created, so the caller knows cleanup via as_unmap is needed
 * on error.  Called with the as lock held for write.
 */
static int
as_map_segvn_segs(struct as *as, caddr_t addr, size_t size, uint_t szcvec,
    int (*crfp)(), struct segvn_crargs *vn_a, int *segcreated)
{
	uint_t szc;
	uint_t nszc;
	int error;
	caddr_t a;
	caddr_t eaddr;
	size_t segsize;
	struct seg *seg;
	size_t pgsz;
	int do_off = (vn_a->vp != NULL || vn_a->amp != NULL);
	uint_t save_szcvec;

	ASSERT(AS_WRITE_HELD(as, &as->a_lock));
	ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
	ASSERT(IS_P2ALIGNED(size, PAGESIZE));
	ASSERT(vn_a->vp == NULL || vn_a->amp == NULL);
	if (!do_off) {
		vn_a->offset = 0;
	}

	/* Single (or no) page size: one segment covers the whole range. */
	if (szcvec <= 1) {
		seg = seg_alloc(as, addr, size);
		if (seg == NULL) {
			return (ENOMEM);
		}
		vn_a->szc = 0;
		error = (*crfp)(seg, vn_a);
		if (error != 0) {
			seg_free(seg);
		} else {
			as->a_size += size;
		}
		return (error);
	}

	eaddr = addr + size;
	save_szcvec = szcvec;
	szcvec >>= 1;
	szc = 0;
	nszc = 0;
	/*
	 * Walk the page size codes from small to large, creating a leading
	 * segment at each step to bring `addr' up to the alignment of the
	 * next larger page size.
	 */
	while (szcvec) {
		if ((szcvec & 0x1) == 0) {
			nszc++;
			szcvec >>= 1;
			continue;
		}
		nszc++;
		pgsz = page_get_pagesize(nszc);
		a = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
		if (a != addr) {
			ASSERT(a < eaddr);
			segsize = a - addr;
			seg = seg_alloc(as, addr, segsize);
			if (seg == NULL) {
				return (ENOMEM);
			}
			vn_a->szc = szc;
			error = (*crfp)(seg, vn_a);
			if (error != 0) {
				seg_free(seg);
				return (error);
			}
			as->a_size += segsize;
			*segcreated = 1;
			if (do_off) {
				vn_a->offset += segsize;
			}
			addr = a;
		}
		szc = nszc;
		szcvec >>= 1;
	}

	ASSERT(addr < eaddr);
	szcvec = save_szcvec | 1;	/* add 8K pages */
	/*
	 * Now work back down: for each selected size (largest first),
	 * create a segment out to the largest aligned tail, then fall
	 * back to smaller sizes for what remains.
	 */
	while (szcvec) {
		a = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
		ASSERT(a >= addr);
		if (a != addr) {
			segsize = a - addr;
			seg = seg_alloc(as, addr, segsize);
			if (seg == NULL) {
				return (ENOMEM);
			}
			vn_a->szc = szc;
			error = (*crfp)(seg, vn_a);
			if (error != 0) {
				seg_free(seg);
				return (error);
			}
			as->a_size += segsize;
			*segcreated = 1;
			if (do_off) {
				vn_a->offset += segsize;
			}
			addr = a;
		}
		szcvec &= ~(1 << szc);
		if (szcvec) {
			szc = highbit(szcvec) - 1;
			pgsz = page_get_pagesize(szc);
		}
	}
	ASSERT(addr == eaddr);

	return (0);
}

/*
 * Map a vnode-backed range, choosing large page sizes where possible.
 * The candidate size vector is clamped against the file size (via
 * VOP_GETATTR); any part of the range beyond EOF is mapped separately
 * with small pages.  Called with the as lock held for write.
 */
static int
as_map_vnsegs(struct as *as, caddr_t addr, size_t size,
    int (*crfp)(), struct segvn_crargs *vn_a, int *segcreated)
{
	uint_t mapflags = vn_a->flags & (MAP_TEXT | MAP_INITDATA);
	int type = (vn_a->type == MAP_SHARED) ? MAPPGSZC_SHM : MAPPGSZC_PRIVM;
	uint_t szcvec = map_pgszcvec(addr, size, (uintptr_t)addr, mapflags,
	    type, 0);
	int error;
	struct seg *seg;
	struct vattr va;
	u_offset_t eoff;
	size_t save_size = 0;
	extern size_t textrepl_size_thresh;

	ASSERT(AS_WRITE_HELD(as, &as->a_lock));
	ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
	ASSERT(IS_P2ALIGNED(size, PAGESIZE));
	ASSERT(vn_a->vp != NULL);
	ASSERT(vn_a->amp == NULL);

again:
	if (szcvec <= 1) {
		/* Only small pages apply: single segment. */
		seg = seg_alloc(as, addr, size);
		if (seg == NULL) {
			return (ENOMEM);
		}
		vn_a->szc = 0;
		error = (*crfp)(seg, vn_a);
		if (error != 0) {
			seg_free(seg);
		} else {
			as->a_size += size;
		}
		return (error);
	}

	va.va_mask = AT_SIZE;
	if (VOP_GETATTR(vn_a->vp, &va, ATTR_HINT, vn_a->cred, NULL) != 0) {
		/* Can't learn the file size: fall back to small pages. */
		szcvec = 0;
		goto again;
	}
	eoff = vn_a->offset & PAGEMASK;
	if (eoff >= va.va_size) {
		/* Mapping starts at/after EOF: small pages only. */
		szcvec = 0;
		goto again;
	}
	eoff += size;
	if (btopr(va.va_size) < btopr(eoff)) {
		/*
		 * The range extends past EOF.  Shrink `size' to the portion
		 * backed by the file and recompute the size vector; the tail
		 * is mapped afterwards (see save_size below).
		 */
		save_size = size;
		size = va.va_size - (vn_a->offset & PAGEMASK);
		size = P2ROUNDUP_TYPED(size, PAGESIZE, size_t);
		szcvec = map_pgszcvec(addr, size, (uintptr_t)addr, mapflags,
		    type, 0);
		if (szcvec <= 1) {
			size = save_size;
			goto again;
		}
	}

	if (size > textrepl_size_thresh) {
		vn_a->flags |= _MAP_TEXTREPL;
	}
	error = as_map_segvn_segs(as, addr, size, szcvec, crfp, vn_a,
	    segcreated);
	if (error != 0) {
		return (error);
	}
	if (save_size) {
		/* Map the remainder beyond EOF with small pages. */
		addr += size;
		size = save_size - size;
		szcvec = 0;
		goto again;
	}
	return (0);
}

/*
 * as_map_ansegs: shared or private anonymous memory.  Note that the flags
 * passed to map_pgszvec cannot be MAP_INITDATA, for anon.
 */
static int
as_map_ansegs(struct as *as, caddr_t addr, size_t size,
    int (*crfp)(), struct segvn_crargs *vn_a, int *segcreated)
{
	uint_t szcvec;
	uchar_t type;

	/*
	 * NOTE(review): `type' is only assigned for MAP_SHARED/MAP_PRIVATE;
	 * the ASSERT guards this on DEBUG kernels, non-DEBUG relies on
	 * callers always passing one of the two.
	 */
	ASSERT(vn_a->type == MAP_SHARED || vn_a->type == MAP_PRIVATE);
	if (vn_a->type == MAP_SHARED) {
		type = MAPPGSZC_SHM;
	} else if (vn_a->type == MAP_PRIVATE) {
		if (vn_a->szc == AS_MAP_HEAP) {
			type = MAPPGSZC_HEAP;
		} else if (vn_a->szc == AS_MAP_STACK) {
			type = MAPPGSZC_STACK;
		} else {
			type = MAPPGSZC_PRIVM;
		}
	}
	szcvec = map_pgszcvec(addr, size, vn_a->amp == NULL ?
	    (uintptr_t)addr : (uintptr_t)P2ROUNDUP(vn_a->offset, PAGESIZE),
	    (vn_a->flags & MAP_TEXT), type, 0);
	ASSERT(AS_WRITE_HELD(as, &as->a_lock));
	ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
	ASSERT(IS_P2ALIGNED(size, PAGESIZE));
	ASSERT(vn_a->vp == NULL);

	return (as_map_segvn_segs(as, addr, size, szcvec,
	    crfp, vn_a, segcreated));
}

/*
 * Map [addr, addr + size) into `as' using segment create routine crfp.
 * Takes the as lock as writer; as_map_locked drops it on all paths.
 */
int
as_map(struct as *as, caddr_t addr, size_t size, int (*crfp)(), void *argsp)
{
	AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
	return (as_map_locked(as, addr, size, crfp, argsp));
}

/*
 * Guts of as_map: entered with as->a_lock held as writer; the lock is
 * released on every return path.  Enforces the RLIMIT_VMEM resource
 * control and dispatches to the large-page-aware vnode/anon paths when
 * the create routine and args qualify.
 */
int
as_map_locked(struct as *as, caddr_t addr, size_t size, int (*crfp)(),
    void *argsp)
{
	struct seg *seg = NULL;
	caddr_t raddr;			/* rounded down addr */
	size_t rsize;			/* rounded up size */
	int error;
	int unmap = 0;			/* set if partial mappings exist */
	struct proc *p = curproc;
	struct segvn_crargs crargs;

	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
	    (size_t)raddr;

	/*
	 * check for wrap around
	 */
	if ((raddr + rsize < raddr) || (as->a_size > (ULONG_MAX - size))) {
		AS_LOCK_EXIT(as, &as->a_lock);
		return (ENOMEM);
	}

	as->a_updatedir = 1;	/* inform /proc */
	gethrestime(&as->a_updatetime);

	/* Enforce the process virtual-memory resource control. */
	if (as != &kas && as->a_size + rsize > (size_t)p->p_vmem_ctl) {
		AS_LOCK_EXIT(as, &as->a_lock);

		(void) rctl_action(rctlproc_legacy[RLIMIT_VMEM], p->p_rctls, p,
		    RCA_UNSAFE_ALL);

		return (ENOMEM);
	}

	if (AS_MAP_CHECK_VNODE_LPOOB(crfp, argsp)) {
		/* Large-page capable vnode mapping. */
		crargs = *(struct segvn_crargs *)argsp;
		error = as_map_vnsegs(as, raddr, rsize, crfp, &crargs, &unmap);
		if (error != 0) {
			AS_LOCK_EXIT(as, &as->a_lock);
			if (unmap) {
				/* Tear down any segments already created. */
				(void) as_unmap(as, addr, size);
			}
			return (error);
		}
	} else if (AS_MAP_CHECK_ANON_LPOOB(crfp, argsp)) {
		/* Large-page capable anonymous mapping. */
		crargs = *(struct segvn_crargs *)argsp;
		error = as_map_ansegs(as, raddr, rsize, crfp, &crargs, &unmap);
		if (error != 0) {
			AS_LOCK_EXIT(as, &as->a_lock);
			if (unmap) {
				(void) as_unmap(as, addr, size);
			}
			return (error);
		}
	} else {
		seg = seg_alloc(as, addr, size);
		if (seg == NULL) {
			AS_LOCK_EXIT(as, &as->a_lock);
			return (ENOMEM);
		}

		error = (*crfp)(seg, argsp);
		if (error != 0) {
			seg_free(seg);
			AS_LOCK_EXIT(as, &as->a_lock);
			return (error);
		}
		/*
		 * Add size now so as_unmap will work if as_ctl fails.
		 */
		as->a_size += rsize;
	}

	as_setwatch(as);

	/*
	 * If the address space is locked,
	 * establish memory locks for the new segment.
	 */
	mutex_enter(&as->a_contents);
	if (AS_ISPGLCK(as)) {
		mutex_exit(&as->a_contents);
		AS_LOCK_EXIT(as, &as->a_lock);
		error = as_ctl(as, addr, size, MC_LOCK, 0, 0, NULL, 0);
		if (error != 0)
			(void) as_unmap(as, addr, size);
	} else {
		mutex_exit(&as->a_contents);
		AS_LOCK_EXIT(as, &as->a_lock);
	}
	return (error);
}


/*
 * Delete all segments in the address space marked with S_PURGE.
 * This is currently used for Sparc V9 nofault ASI segments (seg_nf.c).
 * These segments are deleted as a first step before calls to as_gap(), so
 * that they don't affect mmap() or shmat().
 */
void
as_purge(struct as *as)
{
	struct seg *seg;
	struct seg *next_seg;

	/*
	 * the setting of NEEDSPURGE is protect by as_rangelock(), so
	 * no need to grab a_contents mutex for this check
	 */
	if ((as->a_flags & AS_NEEDSPURGE) == 0)
		return;

	AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
	next_seg = NULL;
	seg = AS_SEGFIRST(as);
	while (seg != NULL) {
		/* Grab s_next first: unmapping may destroy seg. */
		next_seg = AS_SEGNEXT(as, seg);
		if (seg->s_flags & S_PURGE)
			SEGOP_UNMAP(seg, seg->s_base, seg->s_size);
		seg = next_seg;
	}
	AS_LOCK_EXIT(as, &as->a_lock);

	mutex_enter(&as->a_contents);
	as->a_flags &= ~AS_NEEDSPURGE;
	mutex_exit(&as->a_contents);
}

/*
 * Find a hole within [*basep, *basep + *lenp), which contains a mappable
 * range of addresses at least "minlen" long, where the base of the range is
 * at "off" phase from an "align" boundary and there is space for a
 * "redzone"-sized redzone on either side of the range.  Thus,
 * if align was 4M and off was 16k, the user wants a hole which will start
 * 16k into a 4M page.
 *
 * If flags specifies AH_HI, the hole will have the highest possible address
 * in the range.  We use the as->a_lastgap field to figure out where to
 * start looking for a gap.
 *
 * Otherwise, the gap will have the lowest possible address.
 *
 * If flags specifies AH_CONTAIN, the hole will contain the address addr.
 *
 * If an adequate hole is found, *basep and *lenp are set to reflect the part of
 * the hole that is within range, and 0 is returned.  On failure, -1 is returned.
 *
 * NOTE: This routine is not correct when base+len overflows caddr_t.
 */
int
as_gap_aligned(struct as *as, size_t minlen, caddr_t *basep, size_t *lenp,
    uint_t flags, caddr_t addr, size_t align, size_t redzone, size_t off)
{
	caddr_t lobound = *basep;
	caddr_t hibound = lobound + *lenp;
	struct seg *lseg, *hseg;
	caddr_t lo, hi;
	int forward;
	caddr_t save_base;
	size_t save_len;
	size_t save_minlen;
	size_t save_redzone;
	int fast_path = 1;

	/* Preserve the caller's request so it can be restored on retry. */
	save_base = *basep;
	save_len = *lenp;
	save_minlen = minlen;
	save_redzone = redzone;

	/*
	 * For the first pass/fast_path, just add align and redzone into
	 * minlen since if we get an allocation, we can guarantee that it
	 * will fit the alignment and redzone requested.
	 * This increases the chance that hibound will be adjusted to
	 * a_lastgap->s_base which will likely allow us to find an
	 * acceptable hole in the address space quicker.
	 * If we can't find a hole with this fast_path, then we look for
	 * smaller holes in which the alignment and offset may allow
	 * the allocation to fit.
	 */
	minlen += align;
	minlen += 2 * redzone;
	redzone = 0;

	AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
	if (AS_SEGFIRST(as) == NULL) {
		/* Empty address space: the whole range is one big hole. */
		if (valid_va_range_aligned(basep, lenp, minlen, flags & AH_DIR,
		    align, redzone, off)) {
			AS_LOCK_EXIT(as, &as->a_lock);
			return (0);
		} else {
			AS_LOCK_EXIT(as, &as->a_lock);
			*basep = save_base;
			*lenp = save_len;
			return (-1);
		}
	}

retry:
	/*
	 * Set up to iterate over all the inter-segment holes in the given
	 * direction.  lseg is NULL for the lowest-addressed hole and hseg is
	 * NULL for the highest-addressed hole.  If moving backwards, we reset
	 * sseg to denote the highest-addressed segment.
	 */
	forward = (flags & AH_DIR) == AH_LO;
	if (forward) {
		hseg = as_findseg(as, lobound, 1);
		lseg = AS_SEGPREV(as, hseg);
	} else {

		/*
		 * If allocating at least as much as the last allocation,
		 * use a_lastgap's base as a better estimate of hibound.
		 */
		if (as->a_lastgap &&
		    minlen >= as->a_lastgap->s_size &&
		    hibound >= as->a_lastgap->s_base)
			hibound = as->a_lastgap->s_base;

		hseg = as_findseg(as, hibound, 1);
		if (hseg->s_base + hseg->s_size < hibound) {
			lseg = hseg;
			hseg = NULL;
		} else {
			lseg = AS_SEGPREV(as, hseg);
		}
	}

	for (;;) {
		/*
		 * Set lo and hi to the hole's boundaries.  (We should really
		 * use MAXADDR in place of hibound in the expression below,
		 * but can't express it easily; using hibound in its place is
		 * harmless.)
		 */
		lo = (lseg == NULL) ? 0 : lseg->s_base + lseg->s_size;
		hi = (hseg == NULL) ? hibound : hseg->s_base;
		/*
		 * If the iteration has moved past the interval from lobound
		 * to hibound it's pointless to continue.
		 */
		if ((forward && lo > hibound) || (!forward && hi < lobound))
			break;
		else if (lo > hibound || hi < lobound)
			goto cont;
		/*
		 * Candidate hole lies at least partially within the allowable
		 * range.  Restrict it to fall completely within that range,
		 * i.e., to [max(lo, lobound), min(hi, hibound)].
		 */
		if (lo < lobound)
			lo = lobound;
		if (hi > hibound)
			hi = hibound;
		/*
		 * Verify that the candidate hole is big enough and meets
		 * hardware constraints.  If the hole is too small, no need
		 * to do the further checks since they will fail.
		 */
		*basep = lo;
		*lenp = hi - lo;
		if (*lenp >= minlen && valid_va_range_aligned(basep, lenp,
		    minlen, forward ? AH_LO : AH_HI, align, redzone, off) &&
		    ((flags & AH_CONTAIN) == 0 ||
		    (*basep <= addr && *basep + *lenp > addr))) {
			if (!forward)
				as->a_lastgap = hseg;
			if (hseg != NULL)
				as->a_lastgaphl = hseg;
			else
				as->a_lastgaphl = lseg;
			AS_LOCK_EXIT(as, &as->a_lock);
			return (0);
		}
	cont:
		/*
		 * Move to the next hole.
		 */
		if (forward) {
			lseg = hseg;
			if (lseg == NULL)
				break;
			hseg = AS_SEGNEXT(as, hseg);
		} else {
			hseg = lseg;
			if (hseg == NULL)
				break;
			lseg = AS_SEGPREV(as, lseg);
		}
	}
	/*
	 * Fast path (inflated minlen, no redzone) found nothing; retry
	 * with the caller's exact constraints if they could still fit.
	 */
	if (fast_path && (align != 0 || save_redzone != 0)) {
		fast_path = 0;
		minlen = save_minlen;
		redzone = save_redzone;
		goto retry;
	}
	*basep = save_base;
	*lenp = save_len;
	AS_LOCK_EXIT(as, &as->a_lock);
	return (-1);
}

/*
 * Find a hole of at least size minlen within [*basep, *basep + *lenp).
 *
 * If flags specifies AH_HI, the hole will have the highest possible address
 * in the range.  We use the as->a_lastgap field to figure out where to
 * start looking for a gap.
 *
 * Otherwise, the gap will have the lowest possible address.
 *
 * If flags specifies AH_CONTAIN, the hole will contain the address addr.
 *
 * If an adequate hole is found, base and len are set to reflect the part of
 * the hole that is within range, and 0 is returned, otherwise,
 * -1 is returned.
 *
 * NOTE: This routine is not correct when base+len overflows caddr_t.
 */
int
as_gap(struct as *as, size_t minlen, caddr_t *basep, size_t *lenp, uint_t flags,
    caddr_t addr)
{

	return (as_gap_aligned(as, minlen, basep, lenp, flags, addr, 0, 0, 0));
}

/*
 * Return the next range within [base, base + len) that is backed
 * with "real memory".  Skip holes and non-seg_vn segments.
 * We're lazy and only return one segment at a time.
 */
int
as_memory(struct as *as, caddr_t *basep, size_t *lenp)
{
	extern struct seg_ops segspt_shmops;	/* needs a header file */
	struct seg *seg;
	caddr_t addr, eaddr;
	caddr_t segend;

	AS_LOCK_ENTER(as, &as->a_lock, RW_READER);

	addr = *basep;
	eaddr = addr + *lenp;

	seg = as_findseg(as, addr, 0);
	if (seg != NULL)
		addr = MAX(seg->s_base, addr);

	for (;;) {
		if (seg == NULL || addr >= eaddr || eaddr <= seg->s_base) {
			AS_LOCK_EXIT(as, &as->a_lock);
			return (EINVAL);
		}

		if (seg->s_ops == &segvn_ops) {
			segend = seg->s_base + seg->s_size;
			break;
		}

		/*
		 * We do ISM by looking into the private data
		 * to determine the real size of the segment.
		 */
		if (seg->s_ops == &segspt_shmops) {
			segend = seg->s_base + spt_realsize(seg);
			if (addr < segend)
				break;
		}

		seg = AS_SEGNEXT(as, seg);

		if (seg != NULL)
			addr = seg->s_base;
	}

	*basep = addr;

	/* Clip the returned length to the caller's original end address. */
	if (segend > eaddr)
		*lenp = eaddr - addr;
	else
		*lenp = segend - addr;

	AS_LOCK_EXIT(as, &as->a_lock);
	return (0);
}

/*
 * Swap the pages associated with the address space as out to
 * secondary storage, returning the number of bytes actually
 * swapped.
 *
 * The value returned is intended to correlate well with the process's
 * memory requirements.  Its usefulness for this purpose depends on
 * how well the segment-level routines do at returning accurate
 * information.
 */
size_t
as_swapout(struct as *as)
{
	struct seg *seg;
	size_t swpcnt = 0;

	/*
	 * Kernel-only processes have given up their address
	 * spaces.  Of course, we shouldn't be attempting to
	 * swap out such processes in the first place...
	 */
	if (as == NULL)
		return (0);

	AS_LOCK_ENTER(as, &as->a_lock, RW_READER);

	/* Prevent XHATs from attaching */
	mutex_enter(&as->a_contents);
	AS_SETBUSY(as);
	mutex_exit(&as->a_contents);


	/*
	 * Free all mapping resources associated with the address
	 * space.  The segment-level swapout routines capitalize
	 * on this unmapping by scavanging pages that have become
	 * unmapped here.
	 */
	hat_swapout(as->a_hat);
	if (as->a_xhat != NULL)
		xhat_swapout_all(as);

	mutex_enter(&as->a_contents);
	AS_CLRBUSY(as);
	mutex_exit(&as->a_contents);

	/*
	 * Call the swapout routines of all segments in the address
	 * space to do the actual work, accumulating the amount of
	 * space reclaimed.
	 */
	for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
		struct seg_ops *ov = seg->s_ops;

		/*
		 * We have to check to see if the seg has
		 * an ops vector because the seg may have
		 * been in the middle of being set up when
		 * the process was picked for swapout.
		 */
		if ((ov != NULL) && (ov->swapout != NULL))
			swpcnt += SEGOP_SWAPOUT(seg);
	}
	AS_LOCK_EXIT(as, &as->a_lock);
	return (swpcnt);
}

/*
 * Determine whether data from the mappings in interval [addr, addr + size)
 * are in the primary memory (core) cache.
 */
int
as_incore(struct as *as, caddr_t addr,
    size_t size, char *vec, size_t *sizep)
{
	struct seg *seg;
	size_t ssize;
	caddr_t raddr;		/* rounded down addr */
	size_t rsize;		/* rounded up size */
	size_t isize;		/* iteration size */
	int error = 0;		/* result, assume success */

	*sizep = 0;
	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	rsize = ((((size_t)addr + size) + PAGEOFFSET) & PAGEMASK) -
	    (size_t)raddr;

	if (raddr + rsize < raddr)		/* check for wraparound */
		return (ENOMEM);

	AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
	seg = as_segat(as, raddr);
	if (seg == NULL) {
		AS_LOCK_EXIT(as, &as->a_lock);
		return (-1);
	}

	for (; rsize != 0; rsize -= ssize, raddr += ssize) {
		if (raddr >= seg->s_base + seg->s_size) {
			seg = AS_SEGNEXT(as, seg);
			if (seg == NULL || raddr != seg->s_base) {
				error = -1;
				break;
			}
		}
		if ((raddr + rsize) > (seg->s_base + seg->s_size))
			ssize = seg->s_base + seg->s_size - raddr;
		else
			ssize = rsize;
		/* A short count from the segment driver means failure. */
		*sizep += isize = SEGOP_INCORE(seg, raddr, ssize, vec);
		if (isize != ssize) {
			error = -1;
			break;
		}
		vec += btopr(ssize);
	}
	AS_LOCK_EXIT(as, &as->a_lock);
	return (error);
}

/*
 * Unlock each contiguous run of locked pages recorded in `bitmap'
 * (starting at bit `position', npages bits long) within segment `seg'.
 */
static void
as_segunlock(struct seg *seg, caddr_t addr, int attr,
    ulong_t *bitmap, size_t position, size_t npages)
{
	caddr_t	range_start;
	size_t	pos1 = position;
	size_t	pos2;
	size_t	size;
	size_t	end_pos = npages + position;

	while (bt_range(bitmap, &pos1, &pos2, end_pos)) {
		size = ptob((pos2 - pos1));
		range_start = (caddr_t)((uintptr_t)addr +
		    ptob(pos1 - position));

		(void) SEGOP_LOCKOP(seg, range_start, size, attr, MC_UNLOCK,
		    (ulong_t *)NULL, (size_t)NULL);
		pos1 = pos2;
	}
}

/*
 * Back out the page locks recorded in mlock_map over [raddr, raddr + rsize)
 * after a failed MC_LOCK operation.
 *
 * NOTE(review): position 0 is passed to as_segunlock for every segment,
 * i.e. the bitmap is treated as restarting per call here, unlike the
 * MC_LOCKAS error path in as_ctl which advances an index per segment;
 * presumably the maps are built accordingly by the callers -- confirm
 * against the MC_LOCK path in as_ctl.
 */
static void
as_unlockerr(struct as *as, int attr, ulong_t *mlock_map,
    caddr_t raddr, size_t rsize)
{
	struct seg *seg = as_segat(as, raddr);
	size_t ssize;

	while (rsize != 0) {
		if (raddr >= seg->s_base + seg->s_size)
			seg = AS_SEGNEXT(as, seg);

		if ((raddr + rsize) > (seg->s_base + seg->s_size))
			ssize = seg->s_base + seg->s_size - raddr;
		else
			ssize = rsize;

		as_segunlock(seg, raddr, attr, mlock_map, 0, btopr(ssize));

		rsize -= ssize;
		raddr += ssize;
	}
}

/*
 * Cache control operations over the interval [addr, addr + size) in
 * address space "as".
 */
/*ARGSUSED*/
int
as_ctl(struct as *as, caddr_t addr, size_t size, int func, int attr,
    uintptr_t arg, ulong_t *lock_map, size_t pos)
{
	struct seg *seg;	/* working segment */
	caddr_t raddr;		/* rounded down addr */
	caddr_t initraddr;	/* saved initial rounded down addr */
	size_t rsize;		/* rounded up size */
	size_t initrsize;	/* saved initial rounded up size */
	size_t ssize;		/* size of seg */
	int error = 0;		/* result */
	size_t mlock_size;	/* size of bitmap */
	ulong_t *mlock_map;	/* pointer to bitmap used */
				/* to represent the locked */
				/* pages. */
retry:
	if (error == IE_RETRY)
		AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
	else
		AS_LOCK_ENTER(as, &as->a_lock, RW_READER);

	/*
	 * If these are address space lock/unlock operations, loop over
	 * all segments in the address space, as appropriate.
2313 */ 2314 if (func == MC_LOCKAS) { 2315 size_t npages, idx; 2316 size_t rlen = 0; /* rounded as length */ 2317 2318 idx = pos; 2319 2320 if (arg & MCL_FUTURE) { 2321 mutex_enter(&as->a_contents); 2322 AS_SETPGLCK(as); 2323 mutex_exit(&as->a_contents); 2324 } 2325 if ((arg & MCL_CURRENT) == 0) { 2326 AS_LOCK_EXIT(as, &as->a_lock); 2327 return (0); 2328 } 2329 2330 seg = AS_SEGFIRST(as); 2331 if (seg == NULL) { 2332 AS_LOCK_EXIT(as, &as->a_lock); 2333 return (0); 2334 } 2335 2336 do { 2337 raddr = (caddr_t)((uintptr_t)seg->s_base & 2338 (uintptr_t)PAGEMASK); 2339 rlen += (((uintptr_t)(seg->s_base + seg->s_size) + 2340 PAGEOFFSET) & PAGEMASK) - (uintptr_t)raddr; 2341 } while ((seg = AS_SEGNEXT(as, seg)) != NULL); 2342 2343 mlock_size = BT_BITOUL(btopr(rlen)); 2344 if ((mlock_map = (ulong_t *)kmem_zalloc(mlock_size * 2345 sizeof (ulong_t), KM_NOSLEEP)) == NULL) { 2346 AS_LOCK_EXIT(as, &as->a_lock); 2347 return (EAGAIN); 2348 } 2349 2350 for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) { 2351 error = SEGOP_LOCKOP(seg, seg->s_base, 2352 seg->s_size, attr, MC_LOCK, mlock_map, pos); 2353 if (error != 0) 2354 break; 2355 pos += seg_pages(seg); 2356 } 2357 2358 if (error) { 2359 for (seg = AS_SEGFIRST(as); seg != NULL; 2360 seg = AS_SEGNEXT(as, seg)) { 2361 2362 raddr = (caddr_t)((uintptr_t)seg->s_base & 2363 (uintptr_t)PAGEMASK); 2364 npages = seg_pages(seg); 2365 as_segunlock(seg, raddr, attr, mlock_map, 2366 idx, npages); 2367 idx += npages; 2368 } 2369 } 2370 2371 kmem_free(mlock_map, mlock_size * sizeof (ulong_t)); 2372 AS_LOCK_EXIT(as, &as->a_lock); 2373 goto lockerr; 2374 } else if (func == MC_UNLOCKAS) { 2375 mutex_enter(&as->a_contents); 2376 AS_CLRPGLCK(as); 2377 mutex_exit(&as->a_contents); 2378 2379 for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) { 2380 error = SEGOP_LOCKOP(seg, seg->s_base, 2381 seg->s_size, attr, MC_UNLOCK, NULL, 0); 2382 if (error != 0) 2383 break; 2384 } 2385 2386 AS_LOCK_EXIT(as, &as->a_lock); 2387 goto lockerr; 
2388 } 2389 2390 /* 2391 * Normalize addresses and sizes. 2392 */ 2393 initraddr = raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); 2394 initrsize = rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) - 2395 (size_t)raddr; 2396 2397 if (raddr + rsize < raddr) { /* check for wraparound */ 2398 AS_LOCK_EXIT(as, &as->a_lock); 2399 return (ENOMEM); 2400 } 2401 2402 /* 2403 * Get initial segment. 2404 */ 2405 if ((seg = as_segat(as, raddr)) == NULL) { 2406 AS_LOCK_EXIT(as, &as->a_lock); 2407 return (ENOMEM); 2408 } 2409 2410 if (func == MC_LOCK) { 2411 mlock_size = BT_BITOUL(btopr(rsize)); 2412 if ((mlock_map = (ulong_t *)kmem_zalloc(mlock_size * 2413 sizeof (ulong_t), KM_NOSLEEP)) == NULL) { 2414 AS_LOCK_EXIT(as, &as->a_lock); 2415 return (EAGAIN); 2416 } 2417 } 2418 2419 /* 2420 * Loop over all segments. If a hole in the address range is 2421 * discovered, then fail. For each segment, perform the appropriate 2422 * control operation. 2423 */ 2424 while (rsize != 0) { 2425 2426 /* 2427 * Make sure there's no hole, calculate the portion 2428 * of the next segment to be operated over. 2429 */ 2430 if (raddr >= seg->s_base + seg->s_size) { 2431 seg = AS_SEGNEXT(as, seg); 2432 if (seg == NULL || raddr != seg->s_base) { 2433 if (func == MC_LOCK) { 2434 as_unlockerr(as, attr, mlock_map, 2435 initraddr, initrsize - rsize); 2436 kmem_free(mlock_map, 2437 mlock_size * sizeof (ulong_t)); 2438 } 2439 AS_LOCK_EXIT(as, &as->a_lock); 2440 return (ENOMEM); 2441 } 2442 } 2443 if ((raddr + rsize) > (seg->s_base + seg->s_size)) 2444 ssize = seg->s_base + seg->s_size - raddr; 2445 else 2446 ssize = rsize; 2447 2448 /* 2449 * Dispatch on specific function. 2450 */ 2451 switch (func) { 2452 2453 /* 2454 * Synchronize cached data from mappings with backing 2455 * objects. 
2456 */ 2457 case MC_SYNC: 2458 if (error = SEGOP_SYNC(seg, raddr, ssize, 2459 attr, (uint_t)arg)) { 2460 AS_LOCK_EXIT(as, &as->a_lock); 2461 return (error); 2462 } 2463 break; 2464 2465 /* 2466 * Lock pages in memory. 2467 */ 2468 case MC_LOCK: 2469 if (error = SEGOP_LOCKOP(seg, raddr, ssize, 2470 attr, func, mlock_map, pos)) { 2471 as_unlockerr(as, attr, mlock_map, initraddr, 2472 initrsize - rsize + ssize); 2473 kmem_free(mlock_map, mlock_size * 2474 sizeof (ulong_t)); 2475 AS_LOCK_EXIT(as, &as->a_lock); 2476 goto lockerr; 2477 } 2478 break; 2479 2480 /* 2481 * Unlock mapped pages. 2482 */ 2483 case MC_UNLOCK: 2484 (void) SEGOP_LOCKOP(seg, raddr, ssize, attr, func, 2485 (ulong_t *)NULL, (size_t)NULL); 2486 break; 2487 2488 /* 2489 * Store VM advise for mapped pages in segment layer. 2490 */ 2491 case MC_ADVISE: 2492 error = SEGOP_ADVISE(seg, raddr, ssize, (uint_t)arg); 2493 2494 /* 2495 * Check for regular errors and special retry error 2496 */ 2497 if (error) { 2498 if (error == IE_RETRY) { 2499 /* 2500 * Need to acquire writers lock, so 2501 * have to drop readers lock and start 2502 * all over again 2503 */ 2504 AS_LOCK_EXIT(as, &as->a_lock); 2505 goto retry; 2506 } else if (error == IE_REATTACH) { 2507 /* 2508 * Find segment for current address 2509 * because current segment just got 2510 * split or concatenated 2511 */ 2512 seg = as_segat(as, raddr); 2513 if (seg == NULL) { 2514 AS_LOCK_EXIT(as, &as->a_lock); 2515 return (ENOMEM); 2516 } 2517 } else { 2518 /* 2519 * Regular error 2520 */ 2521 AS_LOCK_EXIT(as, &as->a_lock); 2522 return (error); 2523 } 2524 } 2525 break; 2526 2527 /* 2528 * Can't happen. 
2529 */ 2530 default: 2531 panic("as_ctl: bad operation %d", func); 2532 /*NOTREACHED*/ 2533 } 2534 2535 rsize -= ssize; 2536 raddr += ssize; 2537 } 2538 2539 if (func == MC_LOCK) 2540 kmem_free(mlock_map, mlock_size * sizeof (ulong_t)); 2541 AS_LOCK_EXIT(as, &as->a_lock); 2542 return (0); 2543 lockerr: 2544 2545 /* 2546 * If the lower levels returned EDEADLK for a segment lockop, 2547 * it means that we should retry the operation. Let's wait 2548 * a bit also to let the deadlock causing condition clear. 2549 * This is part of a gross hack to work around a design flaw 2550 * in the ufs/sds logging code and should go away when the 2551 * logging code is re-designed to fix the problem. See bug 2552 * 4125102 for details of the problem. 2553 */ 2554 if (error == EDEADLK) { 2555 delay(deadlk_wait); 2556 error = 0; 2557 goto retry; 2558 } 2559 return (error); 2560 } 2561 2562 /* 2563 * Special code for exec to move the stack segment from its interim 2564 * place in the old address to the right place in the new address space. 2565 */ 2566 /*ARGSUSED*/ 2567 int 2568 as_exec(struct as *oas, caddr_t ostka, size_t stksz, 2569 struct as *nas, caddr_t nstka, uint_t hatflag) 2570 { 2571 struct seg *stkseg; 2572 2573 AS_LOCK_ENTER(oas, &oas->a_lock, RW_WRITER); 2574 stkseg = as_segat(oas, ostka); 2575 stkseg = as_removeseg(oas, stkseg); 2576 ASSERT(stkseg != NULL); 2577 ASSERT(stkseg->s_base == ostka && stkseg->s_size == stksz); 2578 stkseg->s_as = nas; 2579 stkseg->s_base = nstka; 2580 2581 /* 2582 * It's ok to lock the address space we are about to exec to. 
2583 */ 2584 AS_LOCK_ENTER(nas, &nas->a_lock, RW_WRITER); 2585 ASSERT(avl_numnodes(&nas->a_wpage) == 0); 2586 nas->a_size += stkseg->s_size; 2587 oas->a_size -= stkseg->s_size; 2588 (void) as_addseg(nas, stkseg); 2589 AS_LOCK_EXIT(nas, &nas->a_lock); 2590 AS_LOCK_EXIT(oas, &oas->a_lock); 2591 return (0); 2592 } 2593 2594 int 2595 fc_decode(faultcode_t fault_err) 2596 { 2597 int error = 0; 2598 2599 switch (FC_CODE(fault_err)) { 2600 case FC_OBJERR: 2601 error = FC_ERRNO(fault_err); 2602 break; 2603 case FC_PROT: 2604 error = EACCES; 2605 break; 2606 default: 2607 error = EFAULT; 2608 break; 2609 } 2610 return (error); 2611 } 2612 2613 /* 2614 * Pagelock pages from a range that spans more than 1 segment. Obtain shadow 2615 * lists from each segment and copy them to one contiguous shadow list (plist) 2616 * as expected by the caller. Save pointers to per segment shadow lists at 2617 * the tail of plist so that they can be used during as_pageunlock(). 2618 */ 2619 static int 2620 as_pagelock_segs(struct as *as, struct seg *seg, struct page ***ppp, 2621 caddr_t addr, size_t size, enum seg_rw rw) 2622 { 2623 caddr_t sv_addr = addr; 2624 size_t sv_size = size; 2625 struct seg *sv_seg = seg; 2626 ulong_t segcnt = 1; 2627 ulong_t cnt; 2628 size_t ssize; 2629 pgcnt_t npages = btop(size); 2630 page_t **plist; 2631 page_t **pl; 2632 int error; 2633 caddr_t eaddr; 2634 faultcode_t fault_err = 0; 2635 pgcnt_t pl_off; 2636 extern struct seg_ops segspt_shmops; 2637 2638 ASSERT(AS_LOCK_HELD(as, &as->a_lock)); 2639 ASSERT(seg != NULL); 2640 ASSERT(addr >= seg->s_base && addr < seg->s_base + seg->s_size); 2641 ASSERT(addr + size > seg->s_base + seg->s_size); 2642 ASSERT(IS_P2ALIGNED(size, PAGESIZE)); 2643 ASSERT(IS_P2ALIGNED(addr, PAGESIZE)); 2644 2645 /* 2646 * Count the number of segments covered by the range we are about to 2647 * lock. The segment count is used to size the shadow list we return 2648 * back to the caller. 
2649 */ 2650 for (; size != 0; size -= ssize, addr += ssize) { 2651 if (addr >= seg->s_base + seg->s_size) { 2652 2653 seg = AS_SEGNEXT(as, seg); 2654 if (seg == NULL || addr != seg->s_base) { 2655 AS_LOCK_EXIT(as, &as->a_lock); 2656 return (EFAULT); 2657 } 2658 /* 2659 * Do a quick check if subsequent segments 2660 * will most likely support pagelock. 2661 */ 2662 if (seg->s_ops == &segvn_ops) { 2663 vnode_t *vp; 2664 2665 if (SEGOP_GETVP(seg, addr, &vp) != 0 || 2666 vp != NULL) { 2667 AS_LOCK_EXIT(as, &as->a_lock); 2668 goto slow; 2669 } 2670 } else if (seg->s_ops != &segspt_shmops) { 2671 AS_LOCK_EXIT(as, &as->a_lock); 2672 goto slow; 2673 } 2674 segcnt++; 2675 } 2676 if (addr + size > seg->s_base + seg->s_size) { 2677 ssize = seg->s_base + seg->s_size - addr; 2678 } else { 2679 ssize = size; 2680 } 2681 } 2682 ASSERT(segcnt > 1); 2683 2684 plist = kmem_zalloc((npages + segcnt) * sizeof (page_t *), KM_SLEEP); 2685 2686 addr = sv_addr; 2687 size = sv_size; 2688 seg = sv_seg; 2689 2690 for (cnt = 0, pl_off = 0; size != 0; size -= ssize, addr += ssize) { 2691 if (addr >= seg->s_base + seg->s_size) { 2692 seg = AS_SEGNEXT(as, seg); 2693 ASSERT(seg != NULL && addr == seg->s_base); 2694 cnt++; 2695 ASSERT(cnt < segcnt); 2696 } 2697 if (addr + size > seg->s_base + seg->s_size) { 2698 ssize = seg->s_base + seg->s_size - addr; 2699 } else { 2700 ssize = size; 2701 } 2702 pl = &plist[npages + cnt]; 2703 error = SEGOP_PAGELOCK(seg, addr, ssize, (page_t ***)pl, 2704 L_PAGELOCK, rw); 2705 if (error) { 2706 break; 2707 } 2708 ASSERT(plist[npages + cnt] != NULL); 2709 ASSERT(pl_off + btop(ssize) <= npages); 2710 bcopy(plist[npages + cnt], &plist[pl_off], 2711 btop(ssize) * sizeof (page_t *)); 2712 pl_off += btop(ssize); 2713 } 2714 2715 if (size == 0) { 2716 AS_LOCK_EXIT(as, &as->a_lock); 2717 ASSERT(cnt == segcnt - 1); 2718 *ppp = plist; 2719 return (0); 2720 } 2721 2722 /* 2723 * one of pagelock calls failed. The error type is in error variable. 
2724 * Unlock what we've locked so far and retry with F_SOFTLOCK if error 2725 * type is either EFAULT or ENOTSUP. Otherwise just return the error 2726 * back to the caller. 2727 */ 2728 2729 eaddr = addr; 2730 seg = sv_seg; 2731 2732 for (cnt = 0, addr = sv_addr; addr < eaddr; addr += ssize) { 2733 if (addr >= seg->s_base + seg->s_size) { 2734 seg = AS_SEGNEXT(as, seg); 2735 ASSERT(seg != NULL && addr == seg->s_base); 2736 cnt++; 2737 ASSERT(cnt < segcnt); 2738 } 2739 if (eaddr > seg->s_base + seg->s_size) { 2740 ssize = seg->s_base + seg->s_size - addr; 2741 } else { 2742 ssize = eaddr - addr; 2743 } 2744 pl = &plist[npages + cnt]; 2745 ASSERT(*pl != NULL); 2746 (void) SEGOP_PAGELOCK(seg, addr, ssize, (page_t ***)pl, 2747 L_PAGEUNLOCK, rw); 2748 } 2749 2750 AS_LOCK_EXIT(as, &as->a_lock); 2751 2752 kmem_free(plist, (npages + segcnt) * sizeof (page_t *)); 2753 2754 if (error != ENOTSUP && error != EFAULT) { 2755 return (error); 2756 } 2757 2758 slow: 2759 /* 2760 * If we are here because pagelock failed due to the need to cow fault 2761 * in the pages we want to lock F_SOFTLOCK will do this job and in 2762 * next as_pagelock() call for this address range pagelock will 2763 * hopefully succeed. 2764 */ 2765 fault_err = as_fault(as->a_hat, as, sv_addr, sv_size, F_SOFTLOCK, rw); 2766 if (fault_err != 0) { 2767 return (fc_decode(fault_err)); 2768 } 2769 *ppp = NULL; 2770 2771 return (0); 2772 } 2773 2774 /* 2775 * lock pages in a given address space. Return shadow list. If 2776 * the list is NULL, the MMU mapping is also locked. 
 */
int
as_pagelock(struct as *as, struct page ***ppp, caddr_t addr,
    size_t size, enum seg_rw rw)
{
	size_t rsize;
	caddr_t raddr;
	faultcode_t fault_err;
	struct seg *seg;
	int err;

	TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_AS_LOCK_START,
	    "as_pagelock_start: addr %p size %ld", addr, size);

	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
	    (size_t)raddr;

	/*
	 * if the request crosses two segments let
	 * as_fault handle it.
	 */
	AS_LOCK_ENTER(as, &as->a_lock, RW_READER);

	seg = as_segat(as, raddr);
	if (seg == NULL) {
		AS_LOCK_EXIT(as, &as->a_lock);
		return (EFAULT);
	}
	ASSERT(raddr >= seg->s_base && raddr < seg->s_base + seg->s_size);
	if (raddr + rsize > seg->s_base + seg->s_size) {
		/*
		 * Multi-segment case; as_pagelock_segs() drops the as
		 * lock on all of its return paths.
		 */
		return (as_pagelock_segs(as, seg, ppp, raddr, rsize, rw));
	}
	if (raddr + rsize <= raddr) {
		AS_LOCK_EXIT(as, &as->a_lock);
		return (EFAULT);
	}

	TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEG_LOCK_START,
	    "seg_lock_1_start: raddr %p rsize %ld", raddr, rsize);

	/*
	 * try to lock pages and pass back shadow list
	 */
	err = SEGOP_PAGELOCK(seg, raddr, rsize, ppp, L_PAGELOCK, rw);

	TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_SEG_LOCK_END, "seg_lock_1_end");

	AS_LOCK_EXIT(as, &as->a_lock);

	if (err == 0 || (err != ENOTSUP && err != EFAULT)) {
		return (err);
	}

	/*
	 * Use F_SOFTLOCK to lock the pages because pagelock failed either due
	 * to no pagelock support for this segment or pages need to be cow
	 * faulted in. If fault is needed F_SOFTLOCK will do this job for
	 * this as_pagelock() call and in the next as_pagelock() call for the
	 * same address range pagelock call will hopefull succeed.
	 */
	fault_err = as_fault(as->a_hat, as, addr, size, F_SOFTLOCK, rw);
	if (fault_err != 0) {
		return (fc_decode(fault_err));
	}
	*ppp = NULL;

	TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_AS_LOCK_END, "as_pagelock_end");
	return (0);
}

/*
 * unlock pages locked by as_pagelock_segs(). Retrieve per segment shadow
 * lists from the end of plist and call pageunlock interface for each segment.
 * Drop as lock and free plist.
 */
static void
as_pageunlock_segs(struct as *as, struct seg *seg, caddr_t addr, size_t size,
    struct page **plist, enum seg_rw rw)
{
	ulong_t cnt;
	caddr_t eaddr = addr + size;
	pgcnt_t npages = btop(size);
	size_t ssize;
	page_t **pl;

	ASSERT(AS_LOCK_HELD(as, &as->a_lock));
	ASSERT(seg != NULL);
	ASSERT(addr >= seg->s_base && addr < seg->s_base + seg->s_size);
	ASSERT(addr + size > seg->s_base + seg->s_size);
	ASSERT(IS_P2ALIGNED(size, PAGESIZE));
	ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
	ASSERT(plist != NULL);

	for (cnt = 0; addr < eaddr; addr += ssize) {
		if (addr >= seg->s_base + seg->s_size) {
			seg = AS_SEGNEXT(as, seg);
			ASSERT(seg != NULL && addr == seg->s_base);
			cnt++;
		}
		if (eaddr > seg->s_base + seg->s_size) {
			ssize = seg->s_base + seg->s_size - addr;
		} else {
			ssize = eaddr - addr;
		}
		/* per-segment shadow list pointers live past npages */
		pl = &plist[npages + cnt];
		ASSERT(*pl != NULL);
		(void) SEGOP_PAGELOCK(seg, addr, ssize, (page_t ***)pl,
		    L_PAGEUNLOCK, rw);
	}
	ASSERT(cnt > 0);
	AS_LOCK_EXIT(as, &as->a_lock);

	cnt++;
	kmem_free(plist, (npages + cnt) * sizeof (page_t *));
}

/*
 * unlock pages in a given address range
 */
void
as_pageunlock(struct as *as, struct page **pp, caddr_t addr, size_t size,
    enum seg_rw rw)
{
	struct seg *seg;
	size_t rsize;
	caddr_t raddr;

	TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_AS_UNLOCK_START,
	    "as_pageunlock_start: addr %p size %ld", addr, size);

	/*
	 * if the shadow list is NULL, as_pagelock was
	 * falling back to as_fault
	 */
	if (pp == NULL) {
		(void) as_fault(as->a_hat, as, addr, size, F_SOFTUNLOCK, rw);
		return;
	}

	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
	    (size_t)raddr;

	AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
	seg = as_segat(as, raddr);
	ASSERT(seg != NULL);

	TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEG_UNLOCK_START,
	    "seg_unlock_start: raddr %p rsize %ld", raddr, rsize);

	ASSERT(raddr >= seg->s_base && raddr < seg->s_base + seg->s_size);
	if (raddr + rsize <= seg->s_base + seg->s_size) {
		SEGOP_PAGELOCK(seg, raddr, rsize, &pp, L_PAGEUNLOCK, rw);
	} else {
		/* as_pageunlock_segs() drops the as lock itself */
		as_pageunlock_segs(as, seg, raddr, rsize, pp, rw);
		return;
	}
	AS_LOCK_EXIT(as, &as->a_lock);
	TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_AS_UNLOCK_END, "as_pageunlock_end");
}

/*
 * Set the page size code (szc) on the range [addr, addr + size), which
 * must be aligned to the page size implied by szc.  Walks every segment
 * in the range and asks the segment driver to change the page size.
 * If "wait" is true, an EAGAIN from the driver (memory locked) causes
 * us to wait for the memory to be unlocked and retry from the top.
 */
int
as_setpagesize(struct as *as, caddr_t addr, size_t size, uint_t szc,
    boolean_t wait)
{
	struct seg *seg;
	size_t ssize;
	caddr_t raddr;			/* rounded down addr */
	size_t rsize;			/* rounded up size */
	int error = 0;
	size_t pgsz = page_get_pagesize(szc);

setpgsz_top:
	if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(size, pgsz)) {
		return (EINVAL);
	}

	raddr = addr;
	rsize = size;

	if (raddr + rsize < raddr)		/* check for wraparound */
		return (ENOMEM);

	AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
	as_clearwatchprot(as, raddr, rsize);
	seg = as_segat(as, raddr);
	if (seg == NULL) {
		as_setwatch(as);
		AS_LOCK_EXIT(as, &as->a_lock);
		return (ENOMEM);
	}

	for (; rsize != 0; rsize -= ssize, raddr += ssize) {
		if (raddr >= seg->s_base + seg->s_size) {
			seg = AS_SEGNEXT(as, seg);
			if (seg == NULL || raddr != seg->s_base) {
				error = ENOMEM;
				break;
			}
		}
		if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
			ssize = seg->s_base + seg->s_size - raddr;
		} else {
			ssize = rsize;
		}

retry:
		error = SEGOP_SETPAGESIZE(seg, raddr, ssize, szc);

		if (error == IE_NOMEM) {
			error = EAGAIN;
			break;
		}

		if (error == IE_RETRY) {
			AS_LOCK_EXIT(as, &as->a_lock);
			goto setpgsz_top;
		}

		if (error == ENOTSUP) {
			error = EINVAL;
			break;
		}

		if (wait && (error == EAGAIN)) {
			/*
			 * Memory is currently locked.  It must be unlocked
			 * before this operation can succeed through a retry.
			 * The possible reasons for locked memory and
			 * corresponding strategies for unlocking are:
			 * (1) Normal I/O
			 *	wait for a signal that the I/O operation
			 *	has completed and the memory is unlocked.
			 * (2) Asynchronous I/O
			 *	The aio subsystem does not unlock pages when
			 *	the I/O is completed. Those pages are unlocked
			 *	when the application calls aiowait/aioerror.
			 *	So, to prevent blocking forever, cv_broadcast()
			 *	is done to wake up aio_cleanup_thread.
			 *	Subsequently, segvn_reclaim will be called, and
			 *	that will do AS_CLRUNMAPWAIT() and wake us up.
			 * (3) Long term page locking:
			 *	This is not relevant for as_setpagesize()
			 *	because we cannot change the page size for
			 *	driver memory. The attempt to do so will
			 *	fail with a different error than EAGAIN so
			 *	there's no need to trigger as callbacks like
			 *	as_unmap, as_setprot or as_free would do.
			 */
			mutex_enter(&as->a_contents);
			if (!AS_ISNOUNMAPWAIT(as)) {
				if (AS_ISUNMAPWAIT(as) == 0) {
					cv_broadcast(&as->a_cv);
				}
				AS_SETUNMAPWAIT(as);
				AS_LOCK_EXIT(as, &as->a_lock);
				while (AS_ISUNMAPWAIT(as)) {
					cv_wait(&as->a_cv, &as->a_contents);
				}
			} else {
				/*
				 * We may have raced with
				 * segvn_reclaim()/segspt_reclaim(). In this
				 * case clean nounmapwait flag and retry since
				 * softlockcnt in this segment may be already
				 * 0. We don't drop as writer lock so our
				 * number of retries without sleeping should
				 * be very small. See segvn_reclaim() for
				 * more comments.
				 */
				AS_CLRNOUNMAPWAIT(as);
				mutex_exit(&as->a_contents);
				goto retry;
			}
			mutex_exit(&as->a_contents);
			goto setpgsz_top;
		} else if (error != 0) {
			break;
		}
	}
	as_setwatch(as);
	AS_LOCK_EXIT(as, &as->a_lock);
	return (error);
}

/*
 * as_iset3_default_lpsize() just calls SEGOP_SETPAGESIZE() on all segments
 * in its chunk where s_szc is less than the szc we want to set.
3066 */ 3067 static int 3068 as_iset3_default_lpsize(struct as *as, caddr_t raddr, size_t rsize, uint_t szc, 3069 int *retry) 3070 { 3071 struct seg *seg; 3072 size_t ssize; 3073 int error; 3074 3075 ASSERT(AS_WRITE_HELD(as, &as->a_lock)); 3076 3077 seg = as_segat(as, raddr); 3078 if (seg == NULL) { 3079 panic("as_iset3_default_lpsize: no seg"); 3080 } 3081 3082 for (; rsize != 0; rsize -= ssize, raddr += ssize) { 3083 if (raddr >= seg->s_base + seg->s_size) { 3084 seg = AS_SEGNEXT(as, seg); 3085 if (seg == NULL || raddr != seg->s_base) { 3086 panic("as_iset3_default_lpsize: as changed"); 3087 } 3088 } 3089 if ((raddr + rsize) > (seg->s_base + seg->s_size)) { 3090 ssize = seg->s_base + seg->s_size - raddr; 3091 } else { 3092 ssize = rsize; 3093 } 3094 3095 if (szc > seg->s_szc) { 3096 error = SEGOP_SETPAGESIZE(seg, raddr, ssize, szc); 3097 /* Only retry on EINVAL segments that have no vnode. */ 3098 if (error == EINVAL) { 3099 vnode_t *vp = NULL; 3100 if ((SEGOP_GETTYPE(seg, raddr) & MAP_SHARED) && 3101 (SEGOP_GETVP(seg, raddr, &vp) != 0 || 3102 vp == NULL)) { 3103 *retry = 1; 3104 } else { 3105 *retry = 0; 3106 } 3107 } 3108 if (error) { 3109 return (error); 3110 } 3111 } 3112 } 3113 return (0); 3114 } 3115 3116 /* 3117 * as_iset2_default_lpsize() calls as_iset3_default_lpsize() to set the 3118 * pagesize on each segment in its range, but if any fails with EINVAL, 3119 * then it reduces the pagesizes to the next size in the bitmap and 3120 * retries as_iset3_default_lpsize(). The reason why the code retries 3121 * smaller allowed sizes on EINVAL is because (a) the anon offset may not 3122 * match the bigger sizes, and (b) it's hard to get this offset (to begin 3123 * with) to pass to map_pgszcvec(). 
 */
static int
as_iset2_default_lpsize(struct as *as, caddr_t addr, size_t size, uint_t szc,
    uint_t szcvec)
{
	int error;
	int retry;

	ASSERT(AS_WRITE_HELD(as, &as->a_lock));

	for (;;) {
		error = as_iset3_default_lpsize(as, addr, size, szc, &retry);
		if (error == EINVAL && retry) {
			/* drop the failed size and fall back to next-largest */
			szcvec &= ~(1 << szc);
			if (szcvec <= 1) {
				return (EINVAL);
			}
			szc = highbit(szcvec) - 1;
		} else {
			return (error);
		}
	}
}

/*
 * as_iset1_default_lpsize() breaks its chunk into areas where existing
 * segments have a smaller szc than we want to set. For each such area,
 * it calls as_iset2_default_lpsize()
 */
static int
as_iset1_default_lpsize(struct as *as, caddr_t raddr, size_t rsize, uint_t szc,
    uint_t szcvec)
{
	struct seg *seg;
	size_t ssize;
	caddr_t setaddr = raddr;
	size_t setsize = 0;
	int set;	/* nonzero while accumulating an area with s_szc < szc */
	int error;

	ASSERT(AS_WRITE_HELD(as, &as->a_lock));

	seg = as_segat(as, raddr);
	if (seg == NULL) {
		panic("as_iset1_default_lpsize: no seg");
	}
	if (seg->s_szc < szc) {
		set = 1;
	} else {
		set = 0;
	}

	for (; rsize != 0; rsize -= ssize, raddr += ssize, setsize += ssize) {
		if (raddr >= seg->s_base + seg->s_size) {
			seg = AS_SEGNEXT(as, seg);
			if (seg == NULL || raddr != seg->s_base) {
				panic("as_iset1_default_lpsize: as changed");
			}
			if (seg->s_szc >= szc && set) {
				/* flush the accumulated smaller-szc area */
				ASSERT(setsize != 0);
				error = as_iset2_default_lpsize(as,
				    setaddr, setsize, szc, szcvec);
				if (error) {
					return (error);
				}
				set = 0;
			} else if (seg->s_szc < szc && !set) {
				/* start accumulating a new area */
				setaddr = raddr;
				setsize = 0;
				set = 1;
			}
		}
		if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
			ssize = seg->s_base + seg->s_size - raddr;
		} else {
			ssize = rsize;
		}
	}
	error = 0;
	if (set) {
		/* flush the final accumulated area */
		ASSERT(setsize != 0);
		error = as_iset2_default_lpsize(as, setaddr, setsize,
		    szc, szcvec);
	}
	return (error);
}

/*
 * as_iset_default_lpsize() breaks its chunk according to the size code bitmap
 * returned by map_pgszcvec() (similar to as_map_segvn_segs()), and passes each
 * chunk to as_iset1_default_lpsize().
 */
static int
as_iset_default_lpsize(struct as *as, caddr_t addr, size_t size, int flags,
    int type)
{
	int rtype = (type & MAP_SHARED) ? MAPPGSZC_SHM : MAPPGSZC_PRIVM;
	uint_t szcvec = map_pgszcvec(addr, size, (uintptr_t)addr,
	    flags, rtype, 1);
	uint_t szc;
	uint_t nszc;
	int error;
	caddr_t a;
	caddr_t eaddr;
	size_t segsize;
	size_t pgsz;
	uint_t save_szcvec;

	ASSERT(AS_WRITE_HELD(as, &as->a_lock));
	ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
	ASSERT(IS_P2ALIGNED(size, PAGESIZE));

	szcvec &= ~1;
	if (szcvec <= 1) {	/* skip if base page size */
		return (0);
	}

	/*
	 * Get the pagesize of the first larger page size.
	 */
	szc = lowbit(szcvec) - 1;
	pgsz = page_get_pagesize(szc);
	eaddr = addr + size;
	addr = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
	eaddr = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);

	save_szcvec = szcvec;
	szcvec >>= (szc + 1);
	nszc = szc;
	/*
	 * Walk up through the allowed sizes; each time alignment for the
	 * next larger size advances "addr", hand the skipped-over leading
	 * chunk to as_iset1_default_lpsize() at the current size.
	 */
	while (szcvec) {
		if ((szcvec & 0x1) == 0) {
			nszc++;
			szcvec >>= 1;
			continue;
		}
		nszc++;
		pgsz = page_get_pagesize(nszc);
		a = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
		if (a != addr) {
			ASSERT(szc > 0);
			ASSERT(a < eaddr);
			segsize = a - addr;
			error = as_iset1_default_lpsize(as, addr, segsize, szc,
			    save_szcvec);
			if (error) {
				return (error);
			}
			addr = a;
		}
		szc = nszc;
		szcvec >>= 1;
	}

	ASSERT(addr < eaddr);
	szcvec = save_szcvec;
	/*
	 * Now walk back down, emitting the tail chunks at successively
	 * smaller sizes until the whole range has been covered.
	 */
	while (szcvec) {
		a = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
		ASSERT(a >= addr);
		if (a != addr) {
			ASSERT(szc > 0);
			segsize = a - addr;
			error = as_iset1_default_lpsize(as, addr, segsize, szc,
			    save_szcvec);
			if (error) {
				return (error);
			}
			addr = a;
		}
		szcvec &= ~(1 << szc);
		if (szcvec) {
			szc = highbit(szcvec) - 1;
			pgsz = page_get_pagesize(szc);
		}
	}
	ASSERT(addr == eaddr);

	return (0);
}

/*
 * Set the default large page size for the range. Called via memcntl with
 * page size set to 0. as_set_default_lpsize breaks the range down into
 * chunks with the same type/flags, ignores-non segvn segments, and passes
 * each chunk to as_iset_default_lpsize().
 */
int
as_set_default_lpsize(struct as *as, caddr_t addr, size_t size)
{
	struct seg *seg;
	caddr_t raddr;
	size_t rsize;
	size_t ssize;
	int rtype, rflags;	/* type/flags of the chunk being accumulated */
	int stype, sflags;	/* type/flags of the next segment */
	int error;
	caddr_t setaddr;
	size_t setsize;
	int segvn;	/* nonzero while the current chunk is segvn-backed */

	if (size == 0)
		return (0);

	AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
again:
	/*
	 * Re-entered here after a retry; raddr/rsize are re-derived
	 * because the as may have changed while the lock was dropped.
	 */
	error = 0;

	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
	    (size_t)raddr;

	if (raddr + rsize < raddr) {		/* check for wraparound */
		AS_LOCK_EXIT(as, &as->a_lock);
		return (ENOMEM);
	}
	as_clearwatchprot(as, raddr, rsize);
	seg = as_segat(as, raddr);
	if (seg == NULL) {
		as_setwatch(as);
		AS_LOCK_EXIT(as, &as->a_lock);
		return (ENOMEM);
	}
	if (seg->s_ops == &segvn_ops) {
		rtype = SEGOP_GETTYPE(seg, addr);
		rflags = rtype & (MAP_TEXT | MAP_INITDATA);
		rtype = rtype & (MAP_SHARED | MAP_PRIVATE);
		segvn = 1;
	} else {
		segvn = 0;
	}
	setaddr = raddr;
	setsize = 0;

	for (; rsize != 0; rsize -= ssize, raddr += ssize, setsize += ssize) {
		if (raddr >= (seg->s_base + seg->s_size)) {
			seg = AS_SEGNEXT(as, seg);
			if (seg == NULL || raddr != seg->s_base) {
				error = ENOMEM;
				break;
			}
			if (seg->s_ops == &segvn_ops) {
				stype = SEGOP_GETTYPE(seg, raddr);
				sflags = stype & (MAP_TEXT | MAP_INITDATA);
				stype &= (MAP_SHARED | MAP_PRIVATE);
				if (segvn && (rflags != sflags ||
				    rtype != stype)) {
					/*
					 * The next segment is also segvn but
					 * has different flags and/or type.
					 */
					ASSERT(setsize != 0);
					error = as_iset_default_lpsize(as,
					    setaddr, setsize, rflags, rtype);
					if (error) {
						break;
					}
					rflags = sflags;
					rtype = stype;
					setaddr = raddr;
					setsize = 0;
				} else if (!segvn) {
					rflags = sflags;
					rtype = stype;
					setaddr = raddr;
					setsize = 0;
					segvn = 1;
				}
			} else if (segvn) {
				/* The next segment is not segvn. */
				ASSERT(setsize != 0);
				error = as_iset_default_lpsize(as,
				    setaddr, setsize, rflags, rtype);
				if (error) {
					break;
				}
				segvn = 0;
			}
		}
		if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
			ssize = seg->s_base + seg->s_size - raddr;
		} else {
			ssize = rsize;
		}
	}
	if (error == 0 && segvn) {
		/* The last chunk when rsize == 0. */
		ASSERT(setsize != 0);
		error = as_iset_default_lpsize(as, setaddr, setsize,
		    rflags, rtype);
	}

	if (error == IE_RETRY) {
		goto again;
	} else if (error == IE_NOMEM) {
		error = EAGAIN;
	} else if (error == ENOTSUP) {
		error = EINVAL;
	} else if (error == EAGAIN) {
		/*
		 * Memory is locked; wait for it to be unlocked, then retry.
		 * Same protocol as in as_setpagesize() — see the detailed
		 * comment there.
		 */
		mutex_enter(&as->a_contents);
		if (!AS_ISNOUNMAPWAIT(as)) {
			if (AS_ISUNMAPWAIT(as) == 0) {
				cv_broadcast(&as->a_cv);
			}
			AS_SETUNMAPWAIT(as);
			AS_LOCK_EXIT(as, &as->a_lock);
			while (AS_ISUNMAPWAIT(as)) {
				cv_wait(&as->a_cv, &as->a_contents);
			}
			mutex_exit(&as->a_contents);
			AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
		} else {
			/*
			 * We may have raced with
			 * segvn_reclaim()/segspt_reclaim(). In this case
			 * clean nounmapwait flag and retry since softlockcnt
			 * in this segment may be already 0. We don't drop as
			 * writer lock so our number of retries without
			 * sleeping should be very small. See segvn_reclaim()
			 * for more comments.
			 */
			AS_CLRNOUNMAPWAIT(as);
			mutex_exit(&as->a_contents);
		}
		goto again;
	}

	as_setwatch(as);
	AS_LOCK_EXIT(as, &as->a_lock);
	return (error);
}

/*
 * Setup all of the uninitialized watched pages that we can.
 */
void
as_setwatch(struct as *as)
{
	struct watched_page *pwp;
	struct seg *seg;
	caddr_t vaddr;
	uint_t prot;
	int err, retrycnt;

	if (avl_numnodes(&as->a_wpage) == 0)
		return;

	ASSERT(AS_WRITE_HELD(as, &as->a_lock));

	for (pwp = avl_first(&as->a_wpage); pwp != NULL;
	    pwp = AVL_NEXT(&as->a_wpage, pwp)) {
		retrycnt = 0;
	retry:
		vaddr = pwp->wp_vaddr;
		if (pwp->wp_oprot != 0 ||	/* already set up */
		    (seg = as_segat(as, vaddr)) == NULL ||
		    SEGOP_GETPROT(seg, vaddr, 0, &prot) != 0)
			continue;

		pwp->wp_oprot = prot;
		if (pwp->wp_read)
			prot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
		if (pwp->wp_write)
			prot &= ~PROT_WRITE;
		if (pwp->wp_exec)
			prot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
		if (!(pwp->wp_flags & WP_NOWATCH) && prot != pwp->wp_oprot) {
			err = SEGOP_SETPROT(seg, vaddr, PAGESIZE, prot);
			if (err == IE_RETRY) {
				/* at most one retry is expected per page */
				pwp->wp_oprot = 0;
				ASSERT(retrycnt == 0);
				retrycnt++;
				goto retry;
			}
		}
		pwp->wp_prot = prot;
	}
}

/*
 * Clear all of the watched pages in the address space.
3501 */ 3502 void 3503 as_clearwatch(struct as *as) 3504 { 3505 struct watched_page *pwp; 3506 struct seg *seg; 3507 caddr_t vaddr; 3508 uint_t prot; 3509 int err, retrycnt; 3510 3511 if (avl_numnodes(&as->a_wpage) == 0) 3512 return; 3513 3514 ASSERT(AS_WRITE_HELD(as, &as->a_lock)); 3515 3516 for (pwp = avl_first(&as->a_wpage); pwp != NULL; 3517 pwp = AVL_NEXT(&as->a_wpage, pwp)) { 3518 retrycnt = 0; 3519 retry: 3520 vaddr = pwp->wp_vaddr; 3521 if (pwp->wp_oprot == 0 || /* not set up */ 3522 (seg = as_segat(as, vaddr)) == NULL) 3523 continue; 3524 3525 if ((prot = pwp->wp_oprot) != pwp->wp_prot) { 3526 err = SEGOP_SETPROT(seg, vaddr, PAGESIZE, prot); 3527 if (err == IE_RETRY) { 3528 ASSERT(retrycnt == 0); 3529 retrycnt++; 3530 goto retry; 3531 } 3532 } 3533 pwp->wp_oprot = 0; 3534 pwp->wp_prot = 0; 3535 } 3536 } 3537 3538 /* 3539 * Force a new setup for all the watched pages in the range. 3540 */ 3541 static void 3542 as_setwatchprot(struct as *as, caddr_t addr, size_t size, uint_t prot) 3543 { 3544 struct watched_page *pwp; 3545 struct watched_page tpw; 3546 caddr_t eaddr = addr + size; 3547 caddr_t vaddr; 3548 struct seg *seg; 3549 int err, retrycnt; 3550 uint_t wprot; 3551 avl_index_t where; 3552 3553 if (avl_numnodes(&as->a_wpage) == 0) 3554 return; 3555 3556 ASSERT(AS_WRITE_HELD(as, &as->a_lock)); 3557 3558 tpw.wp_vaddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); 3559 if ((pwp = avl_find(&as->a_wpage, &tpw, &where)) == NULL) 3560 pwp = avl_nearest(&as->a_wpage, where, AVL_AFTER); 3561 3562 while (pwp != NULL && pwp->wp_vaddr < eaddr) { 3563 retrycnt = 0; 3564 vaddr = pwp->wp_vaddr; 3565 3566 wprot = prot; 3567 if (pwp->wp_read) 3568 wprot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC); 3569 if (pwp->wp_write) 3570 wprot &= ~PROT_WRITE; 3571 if (pwp->wp_exec) 3572 wprot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC); 3573 if (!(pwp->wp_flags & WP_NOWATCH) && wprot != pwp->wp_oprot) { 3574 retry: 3575 seg = as_segat(as, vaddr); 3576 if (seg == NULL) { 3577 
panic("as_setwatchprot: no seg"); 3578 /*NOTREACHED*/ 3579 } 3580 err = SEGOP_SETPROT(seg, vaddr, PAGESIZE, wprot); 3581 if (err == IE_RETRY) { 3582 ASSERT(retrycnt == 0); 3583 retrycnt++; 3584 goto retry; 3585 } 3586 } 3587 pwp->wp_oprot = prot; 3588 pwp->wp_prot = wprot; 3589 3590 pwp = AVL_NEXT(&as->a_wpage, pwp); 3591 } 3592 } 3593 3594 /* 3595 * Clear all of the watched pages in the range. 3596 */ 3597 static void 3598 as_clearwatchprot(struct as *as, caddr_t addr, size_t size) 3599 { 3600 caddr_t eaddr = addr + size; 3601 struct watched_page *pwp; 3602 struct watched_page tpw; 3603 uint_t prot; 3604 struct seg *seg; 3605 int err, retrycnt; 3606 avl_index_t where; 3607 3608 if (avl_numnodes(&as->a_wpage) == 0) 3609 return; 3610 3611 tpw.wp_vaddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); 3612 if ((pwp = avl_find(&as->a_wpage, &tpw, &where)) == NULL) 3613 pwp = avl_nearest(&as->a_wpage, where, AVL_AFTER); 3614 3615 ASSERT(AS_WRITE_HELD(as, &as->a_lock)); 3616 3617 while (pwp != NULL && pwp->wp_vaddr < eaddr) { 3618 3619 if ((prot = pwp->wp_oprot) != 0) { 3620 retrycnt = 0; 3621 3622 if (prot != pwp->wp_prot) { 3623 retry: 3624 seg = as_segat(as, pwp->wp_vaddr); 3625 if (seg == NULL) 3626 continue; 3627 err = SEGOP_SETPROT(seg, pwp->wp_vaddr, 3628 PAGESIZE, prot); 3629 if (err == IE_RETRY) { 3630 ASSERT(retrycnt == 0); 3631 retrycnt++; 3632 goto retry; 3633 3634 } 3635 } 3636 pwp->wp_oprot = 0; 3637 pwp->wp_prot = 0; 3638 } 3639 3640 pwp = AVL_NEXT(&as->a_wpage, pwp); 3641 } 3642 } 3643 3644 void 3645 as_signal_proc(struct as *as, k_siginfo_t *siginfo) 3646 { 3647 struct proc *p; 3648 3649 mutex_enter(&pidlock); 3650 for (p = practive; p; p = p->p_next) { 3651 if (p->p_as == as) { 3652 mutex_enter(&p->p_lock); 3653 if (p->p_as == as) 3654 sigaddq(p, NULL, siginfo, KM_NOSLEEP); 3655 mutex_exit(&p->p_lock); 3656 } 3657 } 3658 mutex_exit(&pidlock); 3659 } 3660 3661 /* 3662 * return memory object ID 3663 */ 3664 int 3665 as_getmemid(struct as *as, caddr_t 
addr, memid_t *memidp) 3666 { 3667 struct seg *seg; 3668 int sts; 3669 3670 AS_LOCK_ENTER(as, &as->a_lock, RW_READER); 3671 seg = as_segat(as, addr); 3672 if (seg == NULL) { 3673 AS_LOCK_EXIT(as, &as->a_lock); 3674 return (EFAULT); 3675 } 3676 /* 3677 * catch old drivers which may not support getmemid 3678 */ 3679 if (seg->s_ops->getmemid == NULL) { 3680 AS_LOCK_EXIT(as, &as->a_lock); 3681 return (ENODEV); 3682 } 3683 3684 sts = SEGOP_GETMEMID(seg, addr, memidp); 3685 3686 AS_LOCK_EXIT(as, &as->a_lock); 3687 return (sts); 3688 } 3689