/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 * Copyright 2018 Joyent, Inc.
 * Copyright (c) 2016 by Delphix. All rights reserved.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */

/*
 * VM - address spaces.
 */

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/errno.h>
#include <sys/systm.h>
#include <sys/mman.h>
#include <sys/sysmacros.h>
#include <sys/cpuvar.h>
#include <sys/sysinfo.h>
#include <sys/kmem.h>
#include <sys/vnode.h>
#include <sys/vmsystm.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/tnf_probe.h>
#include <sys/vtrace.h>

#include <vm/hat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_vn.h>
#include <vm/seg_dev.h>
#include <vm/seg_kmem.h>
#include <vm/seg_map.h>
#include <vm/seg_spt.h>
#include <vm/seg_hole.h>
#include <vm/page.h>

clock_t deadlk_wait = 1;	/* number of ticks to wait before retrying */

static struct kmem_cache *as_cache;

static void as_setwatchprot(struct as *, caddr_t, size_t, uint_t);
static void as_clearwatchprot(struct as *, caddr_t, size_t);


/*
 * Verifying the segment lists is very time-consuming; it may not be
 * desirable always to define VERIFY_SEGLIST when DEBUG is set.
 */
#ifdef DEBUG
#define	VERIFY_SEGLIST
int do_as_verify = 0;
#endif

/*
 * Allocate a new callback data structure entry and fill in the events of
 * interest, the address range of interest, and the callback argument.
 * Link the entry on the as->a_callbacks list.  A callback entry for the
 * entire address space may be specified with vaddr = 0 and size = -1.
 *
 * CALLERS RESPONSIBILITY: If not calling from within the process context for
 * the specified as, the caller must guarantee persistence of the specified as
 * for the duration of this function (eg. pages being locked within the as
 * will guarantee persistence).
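 *
 * A minimal usage sketch (illustrative only; xx_unmap_cb, xx_cookie and
 * xx_unlock_pages are hypothetical driver code, not part of this interface):
 *
 *	static void
 *	xx_unmap_cb(struct as *as, void *arg, uint_t events)
 *	{
 *		xx_unlock_pages(arg);
 *		(void) as_delete_callback(as, arg);
 *	}
 *
 *	if (as_add_callback(as, xx_unmap_cb, xx_cookie, AS_UNMAP_EVENT,
 *	    addr, len, KM_NOSLEEP) != 0)
 *		return (EAGAIN);	 the driver keeps the pages locked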
 */
int
as_add_callback(struct as *as, void (*cb_func)(), void *arg, uint_t events,
    caddr_t vaddr, size_t size, int sleepflag)
{
	struct as_callback *current_head, *cb;
	caddr_t saddr;
	size_t rsize;

	/* callback function and an event are mandatory */
	if ((cb_func == NULL) || ((events & AS_ALL_EVENT) == 0))
		return (EINVAL);

	/* Adding a callback after as_free has been called is not allowed */
	if (as == &kas)
		return (ENOMEM);

	/*
	 * vaddr = 0 and size = -1 is used to indicate that the callback range
	 * is the entire address space so no rounding is done in that case.
	 */
	if (size != -1) {
		saddr = (caddr_t)((uintptr_t)vaddr & (uintptr_t)PAGEMASK);
		rsize = (((size_t)(vaddr + size) + PAGEOFFSET) & PAGEMASK) -
		    (size_t)saddr;
		/* check for wraparound */
		if (saddr + rsize < saddr)
			return (ENOMEM);
	} else {
		if (vaddr != 0)
			return (EINVAL);
		saddr = vaddr;
		rsize = size;
	}

	/* Allocate and initialize a callback entry */
	cb = kmem_zalloc(sizeof (struct as_callback), sleepflag);
	if (cb == NULL)
		return (EAGAIN);

	cb->ascb_func = cb_func;
	cb->ascb_arg = arg;
	cb->ascb_events = events;
	cb->ascb_saddr = saddr;
	cb->ascb_len = rsize;

	/* Add the entry to the list */
	mutex_enter(&as->a_contents);
	current_head = as->a_callbacks;
	as->a_callbacks = cb;
	cb->ascb_next = current_head;

	/*
	 * The call to this function may lose in a race with
	 * a pertinent event - eg. a thread does long term memory locking
	 * but before the callback is added another thread executes as_unmap.
	 * A broadcast here resolves that.
	 */
	if ((cb->ascb_events & AS_UNMAPWAIT_EVENT) && AS_ISUNMAPWAIT(as)) {
		AS_CLRUNMAPWAIT(as);
		cv_broadcast(&as->a_cv);
	}

	mutex_exit(&as->a_contents);
	return (0);
}

/*
 * Search the callback list for an entry which pertains to arg.
 *
 * This is called from within the client upon completion of the callback.
 * RETURN VALUES:
 *	AS_CALLBACK_DELETED		(callback entry found and deleted)
 *	AS_CALLBACK_NOTFOUND		(no callback entry found - this is ok)
 *	AS_CALLBACK_DELETE_DEFERRED	(callback is in process, delete of this
 *					entry will be made in as_do_callbacks)
 *
 * If as_delete_callback encounters a matching entry with AS_CALLBACK_CALLED
 * set, it indicates that as_do_callbacks is processing this entry.  The
 * AS_ALL_EVENT events are cleared in the entry, and a broadcast is made
 * to unblock as_do_callbacks, in case it is blocked.
 *
 * CALLERS RESPONSIBILITY: If not calling from within the process context for
 * the specified as, the caller must guarantee persistence of the specified as
 * for the duration of this function (eg. pages being locked within the as
 * will guarantee persistence).
 */
uint_t
as_delete_callback(struct as *as, void *arg)
{
	struct as_callback **prevcb = &as->a_callbacks;
	struct as_callback *cb;
	uint_t rc = AS_CALLBACK_NOTFOUND;

	mutex_enter(&as->a_contents);
	for (cb = as->a_callbacks; cb; prevcb = &cb->ascb_next, cb = *prevcb) {
		if (cb->ascb_arg != arg)
			continue;

		/*
		 * If the events indicate AS_CALLBACK_CALLED, just clear
		 * AS_ALL_EVENT in the events field and wakeup the thread
		 * that may be waiting in as_do_callbacks.  as_do_callbacks
		 * will take care of removing this entry from the list.
		 * In that case, return AS_CALLBACK_DELETE_DEFERRED.
		 * Otherwise (AS_CALLBACK_CALLED not set), just remove it
		 * from the list, return the memory and return
		 * AS_CALLBACK_DELETED.
		 */
		if ((cb->ascb_events & AS_CALLBACK_CALLED) != 0) {
			/* leave AS_CALLBACK_CALLED */
			cb->ascb_events &= ~AS_ALL_EVENT;
			rc = AS_CALLBACK_DELETE_DEFERRED;
			cv_broadcast(&as->a_cv);
		} else {
			*prevcb = cb->ascb_next;
			kmem_free(cb, sizeof (struct as_callback));
			rc = AS_CALLBACK_DELETED;
		}
		break;
	}
	mutex_exit(&as->a_contents);
	return (rc);
}

/*
 * Searches the as callback list for a matching entry.
 * Returns a pointer to the first matching callback, or NULL if
 * nothing is found.
 * This function never sleeps so it is ok to call it with more
 * locks held but the (required) a_contents mutex.
 *
 * See also comment on as_do_callbacks below.
 */
static struct as_callback *
as_find_callback(struct as *as, uint_t events, caddr_t event_addr,
    size_t event_len)
{
	struct as_callback *cb;

	ASSERT(MUTEX_HELD(&as->a_contents));
	for (cb = as->a_callbacks; cb != NULL; cb = cb->ascb_next) {
		/*
		 * If the callback has not already been called, then
		 * check if events or address range pertains.  An event_len
		 * of zero means do an unconditional callback.
		 */
		if (((cb->ascb_events & AS_CALLBACK_CALLED) != 0) ||
		    ((event_len != 0) && (((cb->ascb_events & events) == 0) ||
		    (event_addr + event_len < cb->ascb_saddr) ||
		    (event_addr > (cb->ascb_saddr + cb->ascb_len))))) {
			continue;
		}
		break;
	}
	return (cb);
}

/*
 * Executes a given callback and removes it from the callback list for
 * this address space.
 * This function may sleep so the caller must drop all locks except
 * a_contents before calling this func.
 *
 * See also comments on as_do_callbacks below.
 */
static void
as_execute_callback(struct as *as, struct as_callback *cb,
    uint_t events)
{
	struct as_callback **prevcb;
	void *cb_arg;

	ASSERT(MUTEX_HELD(&as->a_contents) && (cb->ascb_events & events));
	cb->ascb_events |= AS_CALLBACK_CALLED;
	mutex_exit(&as->a_contents);
	(*cb->ascb_func)(as, cb->ascb_arg, events);
	mutex_enter(&as->a_contents);
	/*
	 * the callback function is required to delete the callback
	 * when the callback function determines it is OK for
	 * this thread to continue. as_delete_callback will clear
	 * the AS_ALL_EVENT in the events field when it is deleted.
	 * If the callback function called as_delete_callback,
	 * events will already be cleared and there will be no blocking.
	 */
	while ((cb->ascb_events & events) != 0) {
		cv_wait(&as->a_cv, &as->a_contents);
	}
	/*
	 * This entry needs to be taken off the list. Normally, the
	 * callback func itself does that, but unfortunately the list
	 * may have changed while the callback was running because the
	 * a_contents mutex was dropped and someone else other than the
	 * callback func itself could have called as_delete_callback,
	 * so we have to search to find this entry again.  The entry
	 * must have AS_CALLBACK_CALLED, and have the same 'arg'.
	 */
	cb_arg = cb->ascb_arg;
	prevcb = &as->a_callbacks;
	for (cb = as->a_callbacks; cb != NULL;
	    prevcb = &cb->ascb_next, cb = *prevcb) {
		if (((cb->ascb_events & AS_CALLBACK_CALLED) == 0) ||
		    (cb_arg != cb->ascb_arg)) {
			continue;
		}
		*prevcb = cb->ascb_next;
		kmem_free(cb, sizeof (struct as_callback));
		break;
	}
}

/*
 * Check the callback list for a matching event and intersection of
 * address range. If there is a match invoke the callback.  Skip an entry if:
 *    - a callback is already in progress for this entry (AS_CALLBACK_CALLED)
 *    - not event of interest
 *    - not address range of interest
 *
 * An event_len of zero indicates a request for an unconditional callback
 * (regardless of event), only the AS_CALLBACK_CALLED is checked.  The
 * a_contents lock must be dropped before a callback, so only one callback
 * can be done before returning.  Return -1 (true) if a callback was
 * executed and removed from the list, else return 0 (false).
 *
 * The logically separate parts, i.e. finding a matching callback and
 * executing a given callback have been separated into two functions
 * so that they can be called with different sets of locks held beyond
 * the always-required a_contents. as_find_callback does not sleep so
 * it is ok to call it if more locks than a_contents (i.e. the a_lock
 * rwlock) are held. as_execute_callback on the other hand may sleep
 * so all locks beyond a_contents must be dropped by the caller if one
 * does not want to end comatose.
 */
static int
as_do_callbacks(struct as *as, uint_t events, caddr_t event_addr,
    size_t event_len)
{
	struct as_callback *cb;

	if ((cb = as_find_callback(as, events, event_addr, event_len))) {
		as_execute_callback(as, cb, events);
		return (-1);
	}
	return (0);
}

/*
 * Search for the segment containing addr. If a segment containing addr
 * exists, that segment is returned.  If no such segment exists, and
 * the list spans addresses greater than addr, then the first segment
 * whose base is greater than addr is returned; otherwise, NULL is
 * returned unless tail is true, in which case the last element of the
 * list is returned.
 *
 * a_seglast is used to cache the last found segment for repeated
 * searches to the same addr (which happens frequently).
 */
struct seg *
as_findseg(struct as *as, caddr_t addr, int tail)
{
	struct seg *seg = as->a_seglast;
	avl_index_t where;

	ASSERT(AS_LOCK_HELD(as));

	if (seg != NULL &&
	    seg->s_base <= addr &&
	    addr < seg->s_base + seg->s_size)
		return (seg);

	seg = avl_find(&as->a_segtree, &addr, &where);
	if (seg != NULL)
		return (as->a_seglast = seg);

	seg = avl_nearest(&as->a_segtree, where, AVL_AFTER);
	if (seg == NULL && tail)
		seg = avl_last(&as->a_segtree);
	return (as->a_seglast = seg);
}

#ifdef VERIFY_SEGLIST
/*
 * verify that the linked list is coherent
 */
static void
as_verify(struct as *as)
{
	struct seg *seg, *seglast, *p, *n;
	uint_t nsegs = 0;

	if (do_as_verify == 0)
		return;

	seglast = as->a_seglast;

	for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
		ASSERT(seg->s_as == as);
		p = AS_SEGPREV(as, seg);
		n = AS_SEGNEXT(as, seg);
		ASSERT(p == NULL || p->s_as == as);
		ASSERT(p == NULL || p->s_base < seg->s_base);
		ASSERT(n == NULL || n->s_base > seg->s_base);
		ASSERT(n != NULL || seg == avl_last(&as->a_segtree));
		if (seg == seglast)
			seglast = NULL;
		nsegs++;
	}
	ASSERT(seglast == NULL);
	ASSERT(avl_numnodes(&as->a_segtree) == nsegs);
}
#endif /* VERIFY_SEGLIST */

/*
 * Add a new segment to the address space. The avl_find()
 * may be expensive so we attempt to use last segment accessed
 * in as_gap() as an insertion point.
 */
int
as_addseg(struct as *as, struct seg *newseg)
{
	struct seg *seg;
	caddr_t addr;
	caddr_t eaddr;
	avl_index_t where;

	ASSERT(AS_WRITE_HELD(as));

	as->a_updatedir = 1;	/* inform /proc */
	gethrestime(&as->a_updatetime);

	if (as->a_lastgaphl != NULL) {
		struct seg *hseg = NULL;
		struct seg *lseg = NULL;

		if (as->a_lastgaphl->s_base > newseg->s_base) {
			hseg = as->a_lastgaphl;
			lseg = AVL_PREV(&as->a_segtree, hseg);
		} else {
			lseg = as->a_lastgaphl;
			hseg = AVL_NEXT(&as->a_segtree, lseg);
		}

		if (hseg && lseg && lseg->s_base < newseg->s_base &&
		    hseg->s_base > newseg->s_base) {
			avl_insert_here(&as->a_segtree, newseg, lseg,
			    AVL_AFTER);
			as->a_lastgaphl = NULL;
			as->a_seglast = newseg;
			return (0);
		}
		as->a_lastgaphl = NULL;
	}

	addr = newseg->s_base;
	eaddr = addr + newseg->s_size;

	seg = avl_find(&as->a_segtree, &addr, &where);

	if (seg == NULL)
		seg = avl_nearest(&as->a_segtree, where, AVL_AFTER);

	if (seg == NULL)
		seg = avl_last(&as->a_segtree);

	if (seg != NULL) {
		caddr_t base = seg->s_base;

		/*
		 * If top of seg is below the requested address, then
		 * the insertion point is at the end of the linked list,
		 * and seg points to the tail of the list.  Otherwise,
		 * the insertion point is immediately before seg.
		 */
		if (base + seg->s_size > addr) {
			if (addr >= base || eaddr > base) {
				return (-1);	/* overlapping segment */
			}
		}
	}
	as->a_seglast = newseg;
	avl_insert(&as->a_segtree, newseg, where);

#ifdef VERIFY_SEGLIST
	as_verify(as);
#endif
	return (0);
}

struct seg *
as_removeseg(struct as *as, struct seg *seg)
{
	avl_tree_t *t;

	ASSERT(AS_WRITE_HELD(as));

	as->a_updatedir = 1;	/* inform /proc */
	gethrestime(&as->a_updatetime);

	if (seg == NULL)
		return (NULL);

	t = &as->a_segtree;
	if (as->a_seglast == seg)
		as->a_seglast = NULL;
	as->a_lastgaphl = NULL;

	/*
	 * if this segment is at an address higher than
	 * a_lastgap, set a_lastgap to the next segment (NULL if last segment)
	 */
	if (as->a_lastgap &&
	    (seg == as->a_lastgap || seg->s_base > as->a_lastgap->s_base))
		as->a_lastgap = AVL_NEXT(t, seg);

	/*
	 * remove the segment from the seg tree
	 */
	avl_remove(t, seg);

#ifdef VERIFY_SEGLIST
	as_verify(as);
#endif
	return (seg);
}

/*
 * Find a segment containing addr.
 */
struct seg *
as_segat(struct as *as, caddr_t addr)
{
	struct seg *seg = as->a_seglast;

	ASSERT(AS_LOCK_HELD(as));

	if (seg != NULL && seg->s_base <= addr &&
	    addr < seg->s_base + seg->s_size)
		return (seg);

	seg = avl_find(&as->a_segtree, &addr, NULL);
	return (seg);
}

/*
 * Serialize all searches for holes in an address space to
 * prevent two or more threads from allocating the same virtual
 * address range.  The address space must not be "read/write"
 * locked by the caller since we may block.
 */
void
as_rangelock(struct as *as)
{
	mutex_enter(&as->a_contents);
	while (AS_ISCLAIMGAP(as))
		cv_wait(&as->a_cv, &as->a_contents);
	AS_SETCLAIMGAP(as);
	mutex_exit(&as->a_contents);
}

/*
 * Release hold on a_state & AS_CLAIMGAP and signal any other blocked threads.
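 *
 * Typical (sketch) pairing with as_rangelock() in an mmap-style allocation
 * path.  Here base/len initially describe the range to search, size is the
 * requested mapping size, and segvn_create/crargs stand in for whatever
 * segment driver and arguments the caller actually uses; error handling is
 * elided:
 *
 *	as_rangelock(as);
 *	if (as_gap(as, size, &base, &len, AH_LO, NULL) == 0)
 *		error = as_map(as, base, size, segvn_create, &crargs);
 *	else
 *		error = ENOMEM;
 *	as_rangeunlock(as);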
 */
void
as_rangeunlock(struct as *as)
{
	mutex_enter(&as->a_contents);
	AS_CLRCLAIMGAP(as);
	cv_signal(&as->a_cv);
	mutex_exit(&as->a_contents);
}

/*
 * compare segments (or just an address) by segment address range
 */
static int
as_segcompar(const void *x, const void *y)
{
	struct seg *a = (struct seg *)x;
	struct seg *b = (struct seg *)y;

	if (a->s_base < b->s_base)
		return (-1);
	if (a->s_base >= b->s_base + b->s_size)
		return (1);
	return (0);
}


void
as_avlinit(struct as *as)
{
	avl_create(&as->a_segtree, as_segcompar, sizeof (struct seg),
	    offsetof(struct seg, s_tree));
	avl_create(&as->a_wpage, wp_compare, sizeof (struct watched_page),
	    offsetof(struct watched_page, wp_link));
}

/*ARGSUSED*/
static int
as_constructor(void *buf, void *cdrarg, int kmflags)
{
	struct as *as = buf;

	mutex_init(&as->a_contents, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&as->a_cv, NULL, CV_DEFAULT, NULL);
	rw_init(&as->a_lock, NULL, RW_DEFAULT, NULL);
	as_avlinit(as);
	return (0);
}

/*ARGSUSED1*/
static void
as_destructor(void *buf, void *cdrarg)
{
	struct as *as = buf;

	avl_destroy(&as->a_segtree);
	mutex_destroy(&as->a_contents);
	cv_destroy(&as->a_cv);
	rw_destroy(&as->a_lock);
}

void
as_init(void)
{
	as_cache = kmem_cache_create("as_cache", sizeof (struct as), 0,
	    as_constructor, as_destructor, NULL, NULL, NULL, 0);
}

/*
 * Allocate and initialize an address space data structure.
 * We call hat_alloc to allow any machine dependent
 * information in the hat structure to be initialized.
 */
struct as *
as_alloc(void)
{
	struct as *as;

	as = kmem_cache_alloc(as_cache, KM_SLEEP);

	as->a_flags = 0;
	as->a_vbits = 0;
	as->a_hrm = NULL;
	as->a_seglast = NULL;
	as->a_size = 0;
	as->a_resvsize = 0;
	as->a_updatedir = 0;
	gethrestime(&as->a_updatetime);
	as->a_objectdir = NULL;
	as->a_sizedir = 0;
	as->a_userlimit = (caddr_t)USERLIMIT;
	as->a_lastgap = NULL;
	as->a_lastgaphl = NULL;
	as->a_callbacks = NULL;
	as->a_proc = NULL;

	AS_LOCK_ENTER(as, RW_WRITER);
	as->a_hat = hat_alloc(as);	/* create hat for default system mmu */
	AS_LOCK_EXIT(as);

	return (as);
}

/*
 * Free an address space data structure.
 * Need to free the hat first and then
 * all the segments on this as and finally
 * the space for the as struct itself.
 */
void
as_free(struct as *as)
{
	struct hat *hat = as->a_hat;
	struct seg *seg, *next;
	boolean_t free_started = B_FALSE;

top:
	/*
	 * Invoke ALL callbacks. as_do_callbacks will do one callback
	 * per call, and not return (-1) until the callback has completed.
	 * When as_do_callbacks returns zero, all callbacks have completed.
	 */
	mutex_enter(&as->a_contents);
	while (as->a_callbacks && as_do_callbacks(as, AS_ALL_EVENT, 0, 0))
		;

	mutex_exit(&as->a_contents);
	AS_LOCK_ENTER(as, RW_WRITER);

	if (!free_started) {
		free_started = B_TRUE;
		hat_free_start(hat);
	}
	for (seg = AS_SEGFIRST(as); seg != NULL; seg = next) {
		int err;

		next = AS_SEGNEXT(as, seg);
retry:
		err = SEGOP_UNMAP(seg, seg->s_base, seg->s_size);
		if (err == EAGAIN) {
			mutex_enter(&as->a_contents);
			if (as->a_callbacks) {
				AS_LOCK_EXIT(as);
			} else if (!AS_ISNOUNMAPWAIT(as)) {
				/*
				 * Memory is currently locked. Wait for a
				 * cv_signal that it has been unlocked, then
				 * try the operation again.
				 */
				if (AS_ISUNMAPWAIT(as) == 0)
					cv_broadcast(&as->a_cv);
				AS_SETUNMAPWAIT(as);
				AS_LOCK_EXIT(as);
				while (AS_ISUNMAPWAIT(as))
					cv_wait(&as->a_cv, &as->a_contents);
			} else {
				/*
				 * We may have raced with
				 * segvn_reclaim()/segspt_reclaim(). In this
				 * case clean nounmapwait flag and retry since
				 * softlockcnt in this segment may be already
				 * 0.  We don't drop as writer lock so our
				 * number of retries without sleeping should
				 * be very small. See segvn_reclaim() for
				 * more comments.
				 */
				AS_CLRNOUNMAPWAIT(as);
				mutex_exit(&as->a_contents);
				goto retry;
			}
			mutex_exit(&as->a_contents);
			goto top;
		} else {
			/*
			 * We do not expect any other error return at this
			 * time. This is similar to an ASSERT in seg_unmap()
			 */
			ASSERT(err == 0);
		}
	}
	hat_free_end(hat);
	AS_LOCK_EXIT(as);

	/* /proc stuff */
	ASSERT(avl_numnodes(&as->a_wpage) == 0);
	if (as->a_objectdir) {
		kmem_free(as->a_objectdir, as->a_sizedir * sizeof (vnode_t *));
		as->a_objectdir = NULL;
		as->a_sizedir = 0;
	}

	/*
	 * Free the struct as back to kmem.  Assert it has no segments.
	 */
	ASSERT(avl_numnodes(&as->a_segtree) == 0);
	kmem_cache_free(as_cache, as);
}

int
as_dup(struct as *as, struct proc *forkedproc)
{
	struct as *newas;
	struct seg *seg, *newseg;
	size_t purgesize = 0;
	int error;

	AS_LOCK_ENTER(as, RW_WRITER);
	as_clearwatch(as);
	newas = as_alloc();
	newas->a_userlimit = as->a_userlimit;
	newas->a_proc = forkedproc;

	AS_LOCK_ENTER(newas, RW_WRITER);

	(void) hat_dup(as->a_hat, newas->a_hat, NULL, 0, HAT_DUP_SRD);

	for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {

		if (seg->s_flags & S_PURGE) {
			purgesize += seg->s_size;
			continue;
		}

		newseg = seg_alloc(newas, seg->s_base, seg->s_size);
		if (newseg == NULL) {
			AS_LOCK_EXIT(newas);
			as_setwatch(as);
			AS_LOCK_EXIT(as);
			as_free(newas);
			return (-1);
		}
		if ((error = SEGOP_DUP(seg, newseg)) != 0) {
			/*
			 * We call seg_free() on the new seg
			 * because the segment is not set up
			 * completely; i.e. it has no ops.
			 */
			as_setwatch(as);
			AS_LOCK_EXIT(as);
			seg_free(newseg);
			AS_LOCK_EXIT(newas);
			as_free(newas);
			return (error);
		}
		if ((newseg->s_flags & S_HOLE) == 0) {
			newas->a_size += seg->s_size;
		}
	}
	newas->a_resvsize = as->a_resvsize - purgesize;

	error = hat_dup(as->a_hat, newas->a_hat, NULL, 0, HAT_DUP_ALL);

	AS_LOCK_EXIT(newas);

	as_setwatch(as);
	AS_LOCK_EXIT(as);
	if (error != 0) {
		as_free(newas);
		return (error);
	}
	forkedproc->p_as = newas;
	return (0);
}

/*
 * Handle a ``fault'' at addr for size bytes.
 */
faultcode_t
as_fault(struct hat *hat, struct as *as, caddr_t addr, size_t size,
    enum fault_type type, enum seg_rw rw)
{
	struct seg *seg;
	caddr_t raddr;			/* rounded down addr */
	size_t rsize;			/* rounded up size */
	size_t ssize;
	faultcode_t res = 0;
	caddr_t addrsav;
	struct seg *segsav;
	int as_lock_held;
	klwp_t *lwp = ttolwp(curthread);



retry:
	/*
	 * Indicate that the lwp is not to be stopped while waiting for a
	 * pagefault.  This is to avoid deadlock while debugging a process
	 * via /proc over NFS (in particular).
	 */
	if (lwp != NULL)
		lwp->lwp_nostop++;

	/*
	 * same length must be used when we softlock and softunlock.  We
	 * don't support softunlocking lengths less than the original length
	 * when there is largepage support.  See seg_dev.c for more
	 * comments.
	 */
	switch (type) {

	case F_SOFTLOCK:
		CPU_STATS_ADD_K(vm, softlock, 1);
		break;

	case F_SOFTUNLOCK:
		break;

	case F_PROT:
		CPU_STATS_ADD_K(vm, prot_fault, 1);
		break;

	case F_INVAL:
		CPU_STATS_ENTER_K();
		CPU_STATS_ADDQ(CPU, vm, as_fault, 1);
		if (as == &kas)
			CPU_STATS_ADDQ(CPU, vm, kernel_asflt, 1);
		CPU_STATS_EXIT_K();
		break;
	}

	/* Kernel probe */
	TNF_PROBE_3(address_fault, "vm pagefault", /* CSTYLED */,
	    tnf_opaque, address, addr,
	    tnf_fault_type, fault_type, type,
	    tnf_seg_access, access, rw);

	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
	    (size_t)raddr;

	/*
	 * XXX -- Don't grab the as lock for segkmap. We should grab it for
	 * correctness, but then we could be stuck holding this lock for
	 * a LONG time if the fault needs to be resolved on a slow
	 * filesystem, and then no-one will be able to exec new commands,
	 * as exec'ing requires the write lock on the as.
	 */
	if (as == &kas && segkmap && segkmap->s_base <= raddr &&
	    raddr + size < segkmap->s_base + segkmap->s_size) {
		seg = segkmap;
		as_lock_held = 0;
	} else {
		AS_LOCK_ENTER(as, RW_READER);

		seg = as_segat(as, raddr);
		if (seg == NULL) {
			AS_LOCK_EXIT(as);
			if (lwp != NULL)
				lwp->lwp_nostop--;
			return (FC_NOMAP);
		}

		as_lock_held = 1;
	}

	addrsav = raddr;
	segsav = seg;

	for (; rsize != 0; rsize -= ssize, raddr += ssize) {
		if (raddr >= seg->s_base + seg->s_size) {
			seg = AS_SEGNEXT(as, seg);
			if (seg == NULL || raddr != seg->s_base) {
				res = FC_NOMAP;
				break;
			}
		}
		if (raddr + rsize > seg->s_base + seg->s_size)
			ssize = seg->s_base + seg->s_size - raddr;
		else
			ssize = rsize;

		res = SEGOP_FAULT(hat, seg, raddr, ssize, type, rw);
		if (res != 0)
			break;
	}

	/*
	 * If we were SOFTLOCKing and encountered a failure,
	 * we must SOFTUNLOCK the range we already did. (Maybe we
	 * should just panic if we are SOFTLOCKing or even SOFTUNLOCKing
	 * right here...)
	 */
	if (res != 0 && type == F_SOFTLOCK) {
		for (seg = segsav; addrsav < raddr; addrsav += ssize) {
			if (addrsav >= seg->s_base + seg->s_size)
				seg = AS_SEGNEXT(as, seg);
			ASSERT(seg != NULL);
			/*
			 * Now call the fault routine again to perform the
			 * unlock using S_OTHER instead of the rw variable
			 * since we never got a chance to touch the pages.
			 */
			if (raddr > seg->s_base + seg->s_size)
				ssize = seg->s_base + seg->s_size - addrsav;
			else
				ssize = raddr - addrsav;
			(void) SEGOP_FAULT(hat, seg, addrsav, ssize,
			    F_SOFTUNLOCK, S_OTHER);
		}
	}
	if (as_lock_held)
		AS_LOCK_EXIT(as);
	if (lwp != NULL)
		lwp->lwp_nostop--;

	/*
	 * If the lower levels returned EDEADLK for a fault,
	 * it means that we should retry the fault.  Let's wait
	 * a bit also to let the deadlock causing condition clear.
	 * This is part of a gross hack to work around a design flaw
	 * in the ufs/sds logging code and should go away when the
	 * logging code is re-designed to fix the problem. See bug
	 * 4125102 for details of the problem.
	 */
	if (FC_ERRNO(res) == EDEADLK) {
		delay(deadlk_wait);
		res = 0;
		goto retry;
	}
	return (res);
}



/*
 * Asynchronous ``fault'' at addr for size bytes.
 */
faultcode_t
as_faulta(struct as *as, caddr_t addr, size_t size)
{
	struct seg *seg;
	caddr_t raddr;			/* rounded down addr */
	size_t rsize;			/* rounded up size */
	faultcode_t res = 0;
	klwp_t *lwp = ttolwp(curthread);

retry:
	/*
	 * Indicate that the lwp is not to be stopped while waiting
	 * for a pagefault.  This is to avoid deadlock while debugging
	 * a process via /proc over NFS (in particular).
	 */
	if (lwp != NULL)
		lwp->lwp_nostop++;

	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
	    (size_t)raddr;

	AS_LOCK_ENTER(as, RW_READER);
	seg = as_segat(as, raddr);
	if (seg == NULL) {
		AS_LOCK_EXIT(as);
		if (lwp != NULL)
			lwp->lwp_nostop--;
		return (FC_NOMAP);
	}

	for (; rsize != 0; rsize -= PAGESIZE, raddr += PAGESIZE) {
		if (raddr >= seg->s_base + seg->s_size) {
			seg = AS_SEGNEXT(as, seg);
			if (seg == NULL || raddr != seg->s_base) {
				res = FC_NOMAP;
				break;
			}
		}
		res = SEGOP_FAULTA(seg, raddr);
		if (res != 0)
			break;
	}
	AS_LOCK_EXIT(as);
	if (lwp != NULL)
		lwp->lwp_nostop--;
	/*
	 * If the lower levels returned EDEADLK for a fault,
	 * it means that we should retry the fault.  Let's wait
	 * a bit also to let the deadlock causing condition clear.
	 * This is part of a gross hack to work around a design flaw
	 * in the ufs/sds logging code and should go away when the
	 * logging code is re-designed to fix the problem. See bug
	 * 4125102 for details of the problem.
	 */
	if (FC_ERRNO(res) == EDEADLK) {
		delay(deadlk_wait);
		res = 0;
		goto retry;
	}
	return (res);
}

/*
 * Set the virtual mapping for the interval from [addr : addr + size)
 * in address space `as' to have the specified protection.
 * It is ok for the range to cross over several segments,
 * as long as they are contiguous.
 */
int
as_setprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
{
	struct seg *seg;
	struct as_callback *cb;
	size_t ssize;
	caddr_t raddr;			/* rounded down addr */
	size_t rsize;			/* rounded up size */
	int error = 0, writer = 0;
	caddr_t saveraddr;
	size_t saversize;

setprot_top:
	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
	    (size_t)raddr;

	if (raddr + rsize < raddr)		/* check for wraparound */
		return (ENOMEM);

	saveraddr = raddr;
	saversize = rsize;

	/*
	 * Normally we only lock the as as a reader. But
	 * if due to setprot the segment driver needs to split
	 * a segment it will return IE_RETRY. Therefore we re-acquire
	 * the as lock as a writer so the segment driver can change
	 * the seg list. Also the segment driver will return IE_RETRY
	 * after it has changed the segment list so we therefore keep
	 * locking as a writer. Since these operations should be rare
	 * we only want to lock as a writer when necessary.
	 */
	if (writer || avl_numnodes(&as->a_wpage) != 0) {
		AS_LOCK_ENTER(as, RW_WRITER);
	} else {
		AS_LOCK_ENTER(as, RW_READER);
	}

	as_clearwatchprot(as, raddr, rsize);
	seg = as_segat(as, raddr);
	if (seg == NULL) {
		as_setwatch(as);
		AS_LOCK_EXIT(as);
		return (ENOMEM);
	}

	for (; rsize != 0; rsize -= ssize, raddr += ssize) {
		if (raddr >= seg->s_base + seg->s_size) {
			seg = AS_SEGNEXT(as, seg);
			if (seg == NULL || raddr != seg->s_base) {
				error = ENOMEM;
				break;
			}
		}
		if ((raddr + rsize) > (seg->s_base + seg->s_size))
			ssize = seg->s_base + seg->s_size - raddr;
		else
			ssize = rsize;
retry:
		error = SEGOP_SETPROT(seg, raddr, ssize, prot);

		if (error == IE_NOMEM) {
			error = EAGAIN;
			break;
		}

		if (error == IE_RETRY) {
			AS_LOCK_EXIT(as);
			writer = 1;
			goto setprot_top;
		}

		if (error == EAGAIN) {
			/*
			 * Make sure we have a_lock as writer.
			 */
			if (writer == 0) {
				AS_LOCK_EXIT(as);
				writer = 1;
				goto setprot_top;
			}

			/*
			 * Memory is currently locked.  It must be unlocked
			 * before this operation can succeed through a retry.
			 * The possible reasons for locked memory and
			 * corresponding strategies for unlocking are:
			 * (1) Normal I/O
			 *	wait for a signal that the I/O operation
			 *	has completed and the memory is unlocked.
			 * (2) Asynchronous I/O
			 *	The aio subsystem does not unlock pages when
			 *	the I/O is completed. Those pages are unlocked
			 *	when the application calls aiowait/aioerror.
			 *	So, to prevent blocking forever, cv_broadcast()
			 *	is done to wake up aio_cleanup_thread.
			 *	Subsequently, segvn_reclaim will be called, and
			 *	that will do AS_CLRUNMAPWAIT() and wake us up.
			 * (3) Long term page locking:
			 *	Drivers intending to have pages locked for a
			 *	period considerably longer than for normal I/O
			 *	(essentially forever) may have registered for a
			 *	callback so they may unlock these pages on
			 *	request. This is needed to allow this operation
			 *	to succeed. Each entry on the callback list is
			 *	examined. If the event or address range pertains
			 *	the callback is invoked (unless it already is in
			 *	progress). The a_contents lock must be dropped
			 *	before the callback, so only one callback can
			 *	be done at a time. Go to the top and do more
			 *	until zero is returned. If zero is returned,
			 *	either there were no callbacks for this event
			 *	or they were already in progress.
			 */
			mutex_enter(&as->a_contents);
			if (as->a_callbacks &&
			    (cb = as_find_callback(as, AS_SETPROT_EVENT,
			    seg->s_base, seg->s_size))) {
				AS_LOCK_EXIT(as);
				as_execute_callback(as, cb, AS_SETPROT_EVENT);
			} else if (!AS_ISNOUNMAPWAIT(as)) {
				if (AS_ISUNMAPWAIT(as) == 0)
					cv_broadcast(&as->a_cv);
				AS_SETUNMAPWAIT(as);
				AS_LOCK_EXIT(as);
				while (AS_ISUNMAPWAIT(as))
					cv_wait(&as->a_cv, &as->a_contents);
			} else {
				/*
				 * We may have raced with
				 * segvn_reclaim()/segspt_reclaim(). In this
				 * case clean nounmapwait flag and retry since
				 * softlockcnt in this segment may be already
				 * 0.  We don't drop as writer lock so our
				 * number of retries without sleeping should
				 * be very small. See segvn_reclaim() for
				 * more comments.
				 */
				AS_CLRNOUNMAPWAIT(as);
				mutex_exit(&as->a_contents);
				goto retry;
			}
			mutex_exit(&as->a_contents);
			goto setprot_top;
		} else if (error != 0)
			break;
	}
	if (error != 0) {
		as_setwatch(as);
	} else {
		as_setwatchprot(as, saveraddr, saversize, prot);
	}
	AS_LOCK_EXIT(as);
	return (error);
}

/*
 * Check to make sure that the interval [addr, addr + size)
 * in address space `as' has at least the specified protection.
 * It is ok for the range to cross over several segments, as long
 * as they are contiguous.
 */
int
as_checkprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
{
	struct seg *seg;
	size_t ssize;
	caddr_t raddr;			/* rounded down addr */
	size_t rsize;			/* rounded up size */
	int error = 0;

	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
	    (size_t)raddr;

	if (raddr + rsize < raddr)		/* check for wraparound */
		return (ENOMEM);

	/*
	 * This is ugly as sin...
	 * Normally, we only acquire the address space readers lock.
	 * However, if the address space has watchpoints present,
	 * we must acquire the writer lock on the address space for
	 * the benefit of as_clearwatchprot() and as_setwatchprot().
	 */
	if (avl_numnodes(&as->a_wpage) != 0)
		AS_LOCK_ENTER(as, RW_WRITER);
	else
		AS_LOCK_ENTER(as, RW_READER);
	as_clearwatchprot(as, raddr, rsize);
	seg = as_segat(as, raddr);
	if (seg == NULL) {
		as_setwatch(as);
		AS_LOCK_EXIT(as);
		return (ENOMEM);
	}

	for (; rsize != 0; rsize -= ssize, raddr += ssize) {
		if (raddr >= seg->s_base + seg->s_size) {
			seg = AS_SEGNEXT(as, seg);
			if (seg == NULL || raddr != seg->s_base) {
				error = ENOMEM;
				break;
			}
		}
		if ((raddr + rsize) > (seg->s_base + seg->s_size))
			ssize = seg->s_base + seg->s_size - raddr;
		else
			ssize = rsize;

		error = SEGOP_CHECKPROT(seg, raddr, ssize, prot);
		if (error != 0)
			break;
	}
	as_setwatch(as);
	AS_LOCK_EXIT(as);
	return (error);
}

int
as_unmap(struct as *as, caddr_t addr, size_t size)
{
	struct seg *seg, *seg_next;
	struct as_callback *cb;
	caddr_t raddr, eaddr;
	size_t ssize, rsize = 0;
	int err;

top:
	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	eaddr = (caddr_t)(((uintptr_t)(addr + size) + PAGEOFFSET) &
	    (uintptr_t)PAGEMASK);

	AS_LOCK_ENTER(as, RW_WRITER);

	as->a_updatedir = 1;	/* inform /proc */
	gethrestime(&as->a_updatetime);

	/*
	 * Use as_findseg to find the first segment in the range, then
	 * step through the segments in order, following s_next.
	 */
	as_clearwatchprot(as, raddr, eaddr - raddr);

	for (seg = as_findseg(as, raddr, 0); seg != NULL; seg = seg_next) {
		const boolean_t is_hole = ((seg->s_flags & S_HOLE) != 0);

		if (eaddr <= seg->s_base)
			break;		/* eaddr was in a gap; all done */

		/* this is implied by the test above */
		ASSERT(raddr < eaddr);

		if (raddr < seg->s_base)
			raddr = seg->s_base;	/* raddr was in a gap */

		if (eaddr > (seg->s_base + seg->s_size))
			ssize = seg->s_base + seg->s_size - raddr;
		else
			ssize = eaddr - raddr;

		/*
		 * Save next segment pointer since seg can be
		 * destroyed during the segment unmap operation.
		 */
		seg_next = AS_SEGNEXT(as, seg);

		/*
		 * We didn't count /dev/null mappings, so ignore them here.
		 * We'll handle MAP_NORESERVE cases in segvn_unmap(). (Again,
		 * we have to do this check here while we have seg.)
		 */
		rsize = 0;
		if (!SEG_IS_DEVNULL_MAPPING(seg) &&
		    !SEG_IS_PARTIAL_RESV(seg))
			rsize = ssize;

retry:
		err = SEGOP_UNMAP(seg, raddr, ssize);
		if (err == EAGAIN) {
			/*
			 * Memory is currently locked.  It must be unlocked
			 * before this operation can succeed through a retry.
			 * The possible reasons for locked memory and
			 * corresponding strategies for unlocking are:
			 * (1) Normal I/O
			 *	wait for a signal that the I/O operation
			 *	has completed and the memory is unlocked.
			 * (2) Asynchronous I/O
			 *	The aio subsystem does not unlock pages when
			 *	the I/O is completed. Those pages are unlocked
			 *	when the application calls aiowait/aioerror.
			 *	So, to prevent blocking forever, cv_broadcast()
			 *	is done to wake up aio_cleanup_thread.
			 *	Subsequently, segvn_reclaim will be called, and
			 *	that will do AS_CLRUNMAPWAIT() and wake us up.
			 * (3) Long term page locking:
			 *	Drivers intending to have pages locked for a
			 *	period considerably longer than for normal I/O
			 *	(essentially forever) may have registered for a
			 *	callback so they may unlock these pages on
			 *	request. This is needed to allow this operation
			 *	to succeed. Each entry on the callback list is
			 *	examined. If the event or address range pertains
			 *	the callback is invoked (unless it already is in
			 *	progress). The a_contents lock must be dropped
			 *	before the callback, so only one callback can
			 *	be done at a time. Go to the top and do more
			 *	until zero is returned. If zero is returned,
			 *	either there were no callbacks for this event
			 *	or they were already in progress.
			 */
			mutex_enter(&as->a_contents);
			if (as->a_callbacks &&
			    (cb = as_find_callback(as, AS_UNMAP_EVENT,
			    seg->s_base, seg->s_size))) {
				AS_LOCK_EXIT(as);
				as_execute_callback(as, cb, AS_UNMAP_EVENT);
			} else if (!AS_ISNOUNMAPWAIT(as)) {
				if (AS_ISUNMAPWAIT(as) == 0)
					cv_broadcast(&as->a_cv);
				AS_SETUNMAPWAIT(as);
				AS_LOCK_EXIT(as);
				while (AS_ISUNMAPWAIT(as))
					cv_wait(&as->a_cv, &as->a_contents);
			} else {
				/*
				 * We may have raced with
				 * segvn_reclaim()/segspt_reclaim(). In this
				 * case clean nounmapwait flag and retry since
				 * softlockcnt in this segment may be already
				 * 0.  We don't drop as writer lock so our
				 * number of retries without sleeping should
				 * be very small. See segvn_reclaim() for
				 * more comments.
				 */
				AS_CLRNOUNMAPWAIT(as);
				mutex_exit(&as->a_contents);
				goto retry;
			}
			mutex_exit(&as->a_contents);
			goto top;
		} else if (err == IE_RETRY) {
			AS_LOCK_EXIT(as);
			goto top;
		} else if (err) {
			as_setwatch(as);
			AS_LOCK_EXIT(as);
			return (-1);
		}

		if (!is_hole) {
			as->a_size -= ssize;
			if (rsize)
				as->a_resvsize -= rsize;
		}
		raddr += ssize;
	}
	AS_LOCK_EXIT(as);
	return (0);
}

static int
as_map_segvn_segs(struct as *as, caddr_t addr, size_t size, uint_t szcvec,
    segcreate_func_t crfp, struct segvn_crargs *vn_a, boolean_t *segcreated)
{
	uint_t szc, nszc, save_szcvec;
	int error;
	caddr_t a, eaddr;
	size_t pgsz = 0;
	const boolean_t do_off = (vn_a->vp != NULL || vn_a->amp != NULL);

	ASSERT(AS_WRITE_HELD(as));
	ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
	ASSERT(IS_P2ALIGNED(size, PAGESIZE));
	ASSERT(vn_a->vp == NULL || vn_a->amp == NULL);

	if (!do_off) {
		vn_a->offset = 0;
	}

	if (szcvec <= 1) {
		struct seg *seg, *segref;

		seg = segref = seg_alloc(as, addr, size);
		if (seg == NULL) {
			return (ENOMEM);
		}
		vn_a->szc = 0;
		error = (*crfp)(&seg, vn_a);
		if (error != 0) {
			VERIFY3P(seg, ==, segref);
			seg_free(seg);
		} else {
			as->a_size += size;
			as->a_resvsize += size;
		}
		return (error);
	}

	eaddr = addr + size;
	save_szcvec = szcvec;
	szcvec >>= 1;
	szc = 0;
	nszc = 0;
	while (szcvec) {
		if ((szcvec & 0x1) == 0) {
			nszc++;
			szcvec >>= 1;
			continue;
		}
		nszc++;
		pgsz = page_get_pagesize(nszc);
		a = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
		if (a != addr) {
			struct seg *seg, *segref;
			size_t segsize;

			ASSERT(a < eaddr);

			segsize = a - addr;
			seg = segref = seg_alloc(as, addr, segsize);
			if (seg == NULL) {
				return (ENOMEM);
			}
			vn_a->szc = szc;
			error = (*crfp)(&seg, vn_a);
			if (error != 0) {
				VERIFY3P(seg, ==, segref);
				seg_free(seg);
				return (error);
			}
			as->a_size += segsize;
			as->a_resvsize += segsize;
			*segcreated = B_TRUE;
			if (do_off) {
				vn_a->offset += segsize;
			}
			addr = a;
		}
		szc = nszc;
		szcvec >>= 1;
	}

	ASSERT(addr < eaddr);
	szcvec = save_szcvec | 1;	/* add 8K pages */
	while (szcvec) {
		a = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
		ASSERT(a >= addr);
		if (a != addr) {
			struct seg *seg, *segref;
			size_t segsize;

			segsize = a - addr;
			seg = segref = seg_alloc(as, addr, segsize);
			if (seg == NULL) {
				return (ENOMEM);
			}
			vn_a->szc = szc;
			error = (*crfp)(&seg, vn_a);
			if (error != 0) {
				VERIFY3P(seg, ==, segref);
				seg_free(seg);
				return (error);
			}
			as->a_size += segsize;
			as->a_resvsize += segsize;
			*segcreated = B_TRUE;
			if (do_off) {
				vn_a->offset += segsize;
			}
			addr = a;
		}
		szcvec &= ~(1 << szc);
		if (szcvec) {
			szc = highbit(szcvec) - 1;
			pgsz = page_get_pagesize(szc);
		}
	}
	ASSERT(addr == eaddr);

	return (0);
}

static int
as_map_vnsegs(struct as *as, caddr_t addr, size_t size,
    segcreate_func_t crfp, struct segvn_crargs *vn_a, boolean_t *segcreated)
{
	uint_t mapflags = vn_a->flags & (MAP_TEXT | MAP_INITDATA);
	int type = (vn_a->type ==
	    MAP_SHARED) ? MAPPGSZC_SHM : MAPPGSZC_PRIVM;
	uint_t szcvec = map_pgszcvec(addr, size, (uintptr_t)addr, mapflags,
	    type, 0);
	int error;
	struct vattr va;
	u_offset_t eoff;
	size_t save_size = 0;
	extern size_t textrepl_size_thresh;

	ASSERT(AS_WRITE_HELD(as));
	ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
	ASSERT(IS_P2ALIGNED(size, PAGESIZE));
	ASSERT(vn_a->vp != NULL);
	ASSERT(vn_a->amp == NULL);

again:
	if (szcvec <= 1) {
		struct seg *seg, *segref;

		seg = segref = seg_alloc(as, addr, size);
		if (seg == NULL) {
			return (ENOMEM);
		}
		vn_a->szc = 0;
		error = (*crfp)(&seg, vn_a);
		if (error != 0) {
			VERIFY3P(seg, ==, segref);
			seg_free(seg);
		} else {
			as->a_size += size;
			as->a_resvsize += size;
		}
		return (error);
	}

	va.va_mask = AT_SIZE;
	if (VOP_GETATTR(vn_a->vp, &va, ATTR_HINT, vn_a->cred, NULL) != 0) {
		szcvec = 0;
		goto again;
	}
	eoff = vn_a->offset & PAGEMASK;
	if (eoff >= va.va_size) {
		szcvec = 0;
		goto again;
	}
	eoff += size;
	if (btopr(va.va_size) < btopr(eoff)) {
		save_size = size;
		size = va.va_size - (vn_a->offset & PAGEMASK);
		size = P2ROUNDUP_TYPED(size, PAGESIZE, size_t);
		szcvec = map_pgszcvec(addr, size, (uintptr_t)addr, mapflags,
		    type, 0);
		if (szcvec <= 1) {
			size = save_size;
			goto again;
		}
	}

	if (size > textrepl_size_thresh) {
		vn_a->flags |= _MAP_TEXTREPL;
	}
	error = as_map_segvn_segs(as, addr, size, szcvec, crfp, vn_a,
	    segcreated);
	if (error != 0) {
		return (error);
	}
	if (save_size) {
		addr += size;
		size = save_size - size;
		szcvec = 0;
		goto again;
	}
	return (0);
}

/*
 * as_map_ansegs: shared or private anonymous memory.  Note that the flags
 * passed to map_pgszvec cannot be MAP_INITDATA, for anon.
 */
static int
as_map_ansegs(struct as *as, caddr_t addr, size_t size,
    segcreate_func_t crfp, struct segvn_crargs *vn_a, boolean_t *segcreated)
{
	uint_t szcvec;
	uchar_t type = 0;

	ASSERT(vn_a->type == MAP_SHARED || vn_a->type == MAP_PRIVATE);
	if (vn_a->type == MAP_SHARED) {
		type = MAPPGSZC_SHM;
	} else if (vn_a->type == MAP_PRIVATE) {
		if (vn_a->szc == AS_MAP_HEAP) {
			type = MAPPGSZC_HEAP;
		} else if (vn_a->szc == AS_MAP_STACK) {
			type = MAPPGSZC_STACK;
		} else {
			type = MAPPGSZC_PRIVM;
		}
	}
	szcvec = map_pgszcvec(addr, size, vn_a->amp == NULL ?
	    (uintptr_t)addr : (uintptr_t)P2ROUNDUP(vn_a->offset, PAGESIZE),
	    (vn_a->flags & MAP_TEXT), type, 0);
	ASSERT(AS_WRITE_HELD(as));
	ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
	ASSERT(IS_P2ALIGNED(size, PAGESIZE));
	ASSERT(vn_a->vp == NULL);

	return (as_map_segvn_segs(as, addr, size, szcvec,
	    crfp, vn_a, segcreated));
}

int
as_map(struct as *as, caddr_t addr, size_t size, segcreate_func_t crfp,
    void *argsp)
{
	AS_LOCK_ENTER(as, RW_WRITER);
	return (as_map_locked(as, addr, size, crfp, argsp));
}

int
as_map_locked(struct as *as, caddr_t addr, size_t size, segcreate_func_t crfp,
    void *argsp)
{
	caddr_t raddr;			/* rounded down addr */
	size_t rsize;			/* rounded up size */
	int error;
	boolean_t is_hole = B_FALSE;
	/*
	 * The use of a_proc is preferred to handle the case where curproc is
	 * a door_call server and is allocating memory in the client's (a_proc)
	 * address space.
	 * When creating a shared memory segment a_proc will be NULL so we
	 * fallback to curproc in that case.
	 */
	struct proc *p = (as->a_proc == NULL) ? curproc : as->a_proc;
	struct segvn_crargs crargs;

	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
	    (size_t)raddr;

	/*
	 * check for wrap around
	 */
	if ((raddr + rsize < raddr) || (as->a_size > (ULONG_MAX - size))) {
		AS_LOCK_EXIT(as);
		return (ENOMEM);
	}

	as->a_updatedir = 1;	/* inform /proc */
	gethrestime(&as->a_updatetime);

	if (as != &kas) {
		/*
		 * Ensure that the virtual size of the process will not exceed
		 * the configured limit.  Since seg_hole segments will later
		 * set the S_HOLE flag indicating their status as a hole in the
		 * AS, they are excluded from this check.
		 */
		if (as->a_size + rsize > (size_t)p->p_vmem_ctl &&
		    !AS_MAP_CHECK_SEGHOLE(crfp)) {
			AS_LOCK_EXIT(as);

			(void) rctl_action(rctlproc_legacy[RLIMIT_VMEM],
			    p->p_rctls, p, RCA_UNSAFE_ALL);
			return (ENOMEM);
		}
	}

	if (AS_MAP_CHECK_VNODE_LPOOB(crfp, argsp)) {
		boolean_t do_unmap = B_FALSE;

		crargs = *(struct segvn_crargs *)argsp;
		error = as_map_vnsegs(as, raddr, rsize, crfp, &crargs,
		    &do_unmap);
		if (error != 0) {
			AS_LOCK_EXIT(as);
			if (do_unmap) {
				(void) as_unmap(as, addr, size);
			}
			return (error);
		}
	} else if (AS_MAP_CHECK_ANON_LPOOB(crfp, argsp)) {
		boolean_t do_unmap = B_FALSE;

		crargs = *(struct segvn_crargs *)argsp;
		error = as_map_ansegs(as, raddr, rsize, crfp, &crargs,
		    &do_unmap);
		if (error != 0) {
			AS_LOCK_EXIT(as);
			if (do_unmap) {
				(void) as_unmap(as, addr, size);
			}
			return (error);
		}
	} else {
		struct seg *seg, *segref;

		seg = segref = seg_alloc(as, addr, size);
		if (seg == NULL) {
			AS_LOCK_EXIT(as);
			return (ENOMEM);
		}

		/*
		 * It is possible that the segment creation routine will free
		 * 'seg' as part of a more advanced operation, such as when
		 * segvn concatenates adjacent segments together.  When this
		 * occurs, the seg*_create routine must communicate the
		 * resulting segment out via the 'struct seg **' parameter.
		 *
		 * If segment creation fails, it must not free the passed-in
		 * segment, nor alter the argument pointer.
		 */
		error = (*crfp)(&seg, argsp);
		if (error != 0) {
			VERIFY3P(seg, ==, segref);
			seg_free(seg);
			AS_LOCK_EXIT(as);
			return (error);
		}

		/*
		 * Check if the resulting segment represents a hole in the
		 * address space, rather than contributing to the AS size.
		 */
		is_hole = ((seg->s_flags & S_HOLE) != 0);

		/* Add size now so as_unmap will work if as_ctl fails. */
		if (!is_hole) {
			as->a_size += rsize;
			as->a_resvsize += rsize;
		}
	}

	as_setwatch(as);

	/*
	 * Establish memory locks for the segment if the address space is
	 * locked, provided it's not an explicit hole in the AS.
	 */
	mutex_enter(&as->a_contents);
	if (AS_ISPGLCK(as) && !is_hole) {
		mutex_exit(&as->a_contents);
		AS_LOCK_EXIT(as);
		error = as_ctl(as, addr, size, MC_LOCK, 0, 0, NULL, 0);
		if (error != 0)
			(void) as_unmap(as, addr, size);
	} else {
		mutex_exit(&as->a_contents);
		AS_LOCK_EXIT(as);
	}
	return (error);
}


/*
 * Delete all segments in the address space marked with S_PURGE.
 * This is currently used for Sparc V9 nofault ASI segments (seg_nf.c).
 * These segments are deleted as a first step before calls to as_gap(), so
 * that they don't affect mmap() or shmat().
 */
void
as_purge(struct as *as)
{
	struct seg *seg;
	struct seg *next_seg;

	/*
	 * the setting of NEEDSPURGE is protected by as_rangelock(), so
	 * no need to grab a_contents mutex for this check
	 */
	if ((as->a_flags & AS_NEEDSPURGE) == 0)
		return;

	AS_LOCK_ENTER(as, RW_WRITER);
	next_seg = NULL;
	seg = AS_SEGFIRST(as);
	while (seg != NULL) {
		next_seg = AS_SEGNEXT(as, seg);
		if (seg->s_flags & S_PURGE)
			SEGOP_UNMAP(seg, seg->s_base, seg->s_size);
		seg = next_seg;
	}
	AS_LOCK_EXIT(as);

	mutex_enter(&as->a_contents);
	as->a_flags &= ~AS_NEEDSPURGE;
	mutex_exit(&as->a_contents);
}

/*
 * Find a hole within [*basep, *basep + *lenp), which contains a mappable
 * range of addresses at least "minlen" long, where the base of the range is
 * at "off" phase from an "align" boundary and there is space for a
 * "redzone"-sized redzone on either side of the range.  Thus,
 * if align was 4M and off was 16k, the user wants a hole which will start
 * 16k into a 4M page.
 *
 * If flags specifies AH_HI, the hole will have the highest possible address
 * in the range.  We use the as->a_lastgap field to figure out where to
 * start looking for a gap.
 *
 * Otherwise, the gap will have the lowest possible address.
 *
 * If flags specifies AH_CONTAIN, the hole will contain the address addr.
 *
 * If an adequate hole is found, *basep and *lenp are set to reflect the part
 * of the hole that is within range, and 0 is returned.  On failure, -1 is
 * returned.
 *
 * NOTE: This routine is not correct when base+len overflows caddr_t.
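 *
 * Illustrative call (a sketch using the 4M/16k example above; "size", "base"
 * and "len" are caller-supplied): search downward for a hole of at least
 * size bytes that begins 16k past a 4M boundary, within [base, base + len):
 *
 *	if (as_gap_aligned(as, size, &base, &len, AH_HI, NULL,
 *	    4 * 1024 * 1024, 0, 16 * 1024) == 0) {
 *		now [base, base + len) satisfies the alignment and offset
 *	}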
 */
int
as_gap_aligned(struct as *as, size_t minlen, caddr_t *basep, size_t *lenp,
    uint_t flags, caddr_t addr, size_t align, size_t redzone, size_t off)
{
	caddr_t lobound = *basep;
	caddr_t hibound = lobound + *lenp;
	struct seg *lseg, *hseg;
	caddr_t lo, hi;
	int forward;
	caddr_t save_base;
	size_t save_len;
	size_t save_minlen;
	size_t save_redzone;
	int fast_path = 1;

	save_base = *basep;
	save_len = *lenp;
	save_minlen = minlen;
	save_redzone = redzone;

	/*
	 * For the first pass/fast_path, just add align and redzone into
	 * minlen since if we get an allocation, we can guarantee that it
	 * will fit the alignment and redzone requested.
	 * This increases the chance that hibound will be adjusted to
	 * a_lastgap->s_base which will likely allow us to find an
	 * acceptable hole in the address space quicker.
	 * If we can't find a hole with this fast_path, then we look for
	 * smaller holes in which the alignment and offset may allow
	 * the allocation to fit.
	 */
	minlen += align;
	minlen += 2 * redzone;
	redzone = 0;

	AS_LOCK_ENTER(as, RW_READER);
	if (AS_SEGFIRST(as) == NULL) {
		if (valid_va_range_aligned(basep, lenp, minlen, flags & AH_DIR,
		    align, redzone, off)) {
			AS_LOCK_EXIT(as);
			return (0);
		} else {
			AS_LOCK_EXIT(as);
			*basep = save_base;
			*lenp = save_len;
			return (-1);
		}
	}

retry:
	/*
	 * Set up to iterate over all the inter-segment holes in the given
	 * direction.  lseg is NULL for the lowest-addressed hole and hseg is
	 * NULL for the highest-addressed hole.  If moving backwards, we reset
	 * sseg to denote the highest-addressed segment.
	 */
	forward = (flags & AH_DIR) == AH_LO;
	if (forward) {
		hseg = as_findseg(as, lobound, 1);
		lseg = AS_SEGPREV(as, hseg);
	} else {

		/*
		 * If allocating at least as much as the last allocation,
		 * use a_lastgap's base as a better estimate of hibound.
		 */
		if (as->a_lastgap &&
		    minlen >= as->a_lastgap->s_size &&
		    hibound >= as->a_lastgap->s_base)
			hibound = as->a_lastgap->s_base;

		hseg = as_findseg(as, hibound, 1);
		if (hseg->s_base + hseg->s_size < hibound) {
			lseg = hseg;
			hseg = NULL;
		} else {
			lseg = AS_SEGPREV(as, hseg);
		}
	}

	for (;;) {
		/*
		 * Set lo and hi to the hole's boundaries.  (We should really
		 * use MAXADDR in place of hibound in the expression below,
		 * but can't express it easily; using hibound in its place is
		 * harmless.)
		 */
		lo = (lseg == NULL) ? 0 : lseg->s_base + lseg->s_size;
		hi = (hseg == NULL) ? hibound : hseg->s_base;
		/*
		 * If the iteration has moved past the interval from lobound
		 * to hibound it's pointless to continue.
		 */
		if ((forward && lo > hibound) || (!forward && hi < lobound))
			break;
		else if (lo > hibound || hi < lobound)
			goto cont;
		/*
		 * Candidate hole lies at least partially within the allowable
		 * range.  Restrict it to fall completely within that range,
		 * i.e., to [max(lo, lobound), min(hi, hibound)].
		 */
		if (lo < lobound)
			lo = lobound;
		if (hi > hibound)
			hi = hibound;
		/*
		 * Verify that the candidate hole is big enough and meets
		 * hardware constraints.  If the hole is too small, no need
		 * to do the further checks since they will fail.
1968 */ 1969 *basep = lo; 1970 *lenp = hi - lo; 1971 if (*lenp >= minlen && valid_va_range_aligned(basep, lenp, 1972 minlen, forward ? AH_LO : AH_HI, align, redzone, off) && 1973 ((flags & AH_CONTAIN) == 0 || 1974 (*basep <= addr && *basep + *lenp > addr))) { 1975 if (!forward) 1976 as->a_lastgap = hseg; 1977 if (hseg != NULL) 1978 as->a_lastgaphl = hseg; 1979 else 1980 as->a_lastgaphl = lseg; 1981 AS_LOCK_EXIT(as); 1982 return (0); 1983 } 1984 cont: 1985 /* 1986 * Move to the next hole. 1987 */ 1988 if (forward) { 1989 lseg = hseg; 1990 if (lseg == NULL) 1991 break; 1992 hseg = AS_SEGNEXT(as, hseg); 1993 } else { 1994 hseg = lseg; 1995 if (hseg == NULL) 1996 break; 1997 lseg = AS_SEGPREV(as, lseg); 1998 } 1999 } 2000 if (fast_path && (align != 0 || save_redzone != 0)) { 2001 fast_path = 0; 2002 minlen = save_minlen; 2003 redzone = save_redzone; 2004 goto retry; 2005 } 2006 *basep = save_base; 2007 *lenp = save_len; 2008 AS_LOCK_EXIT(as); 2009 return (-1); 2010 } 2011 2012 /* 2013 * Find a hole of at least size minlen within [*basep, *basep + *lenp). 2014 * 2015 * If flags specifies AH_HI, the hole will have the highest possible address 2016 * in the range. We use the as->a_lastgap field to figure out where to 2017 * start looking for a gap. 2018 * 2019 * Otherwise, the gap will have the lowest possible address. 2020 * 2021 * If flags specifies AH_CONTAIN, the hole will contain the address addr. 2022 * 2023 * If an adequate hole is found, base and len are set to reflect the part of 2024 * the hole that is within range, and 0 is returned, otherwise, 2025 * -1 is returned. 2026 * 2027 * NOTE: This routine is not correct when base+len overflows caddr_t. 2028 */ 2029 int 2030 as_gap(struct as *as, size_t minlen, caddr_t *basep, size_t *lenp, uint_t flags, 2031 caddr_t addr) 2032 { 2033 2034 return (as_gap_aligned(as, minlen, basep, lenp, flags, addr, 0, 0, 0)); 2035 } 2036 2037 /* 2038 * Return the next range within [base, base + len) that is backed 2039 * with "real memory". Skip holes and non-seg_vn segments. 2040 * We're lazy and only return one segment at a time. 2041 */ 2042 int 2043 as_memory(struct as *as, caddr_t *basep, size_t *lenp) 2044 { 2045 extern struct seg_ops segspt_shmops; /* needs a header file */ 2046 struct seg *seg; 2047 caddr_t addr, eaddr; 2048 caddr_t segend; 2049 2050 AS_LOCK_ENTER(as, RW_READER); 2051 2052 addr = *basep; 2053 eaddr = addr + *lenp; 2054 2055 seg = as_findseg(as, addr, 0); 2056 if (seg != NULL) 2057 addr = MAX(seg->s_base, addr); 2058 2059 for (;;) { 2060 if (seg == NULL || addr >= eaddr || eaddr <= seg->s_base) { 2061 AS_LOCK_EXIT(as); 2062 return (EINVAL); 2063 } 2064 2065 if (seg->s_ops == &segvn_ops) { 2066 segend = seg->s_base + seg->s_size; 2067 break; 2068 } 2069 2070 /* 2071 * We do ISM by looking into the private data 2072 * to determine the real size of the segment. 2073 */ 2074 if (seg->s_ops == &segspt_shmops) { 2075 segend = seg->s_base + spt_realsize(seg); 2076 if (addr < segend) 2077 break; 2078 } 2079 2080 seg = AS_SEGNEXT(as, seg); 2081 2082 if (seg != NULL) 2083 addr = seg->s_base; 2084 } 2085 2086 *basep = addr; 2087 2088 if (segend > eaddr) 2089 *lenp = eaddr - addr; 2090 else 2091 *lenp = segend - addr; 2092 2093 AS_LOCK_EXIT(as); 2094 return (0); 2095 } 2096 2097 /* 2098 * Swap the pages associated with the address space as out to 2099 * secondary storage, returning the number of bytes actually 2100 * swapped. 2101 * 2102 * The value returned is intended to correlate well with the process's 2103 * memory requirements. 
Its usefulness for this purpose depends on 2104 * how well the segment-level routines do at returning accurate 2105 * information. 2106 */ 2107 size_t 2108 as_swapout(struct as *as) 2109 { 2110 struct seg *seg; 2111 size_t swpcnt = 0; 2112 2113 /* 2114 * Kernel-only processes have given up their address 2115 * spaces. Of course, we shouldn't be attempting to 2116 * swap out such processes in the first place... 2117 */ 2118 if (as == NULL) 2119 return (0); 2120 2121 AS_LOCK_ENTER(as, RW_READER); 2122 2123 /* 2124 * Free all mapping resources associated with the address 2125 * space. The segment-level swapout routines capitalize 2126 * on this unmapping by scavenging pages that have become 2127 * unmapped here. 2128 */ 2129 hat_swapout(as->a_hat); 2130 2131 /* 2132 * Call the swapout routines of all segments in the address 2133 * space to do the actual work, accumulating the amount of 2134 * space reclaimed. 2135 */ 2136 for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) { 2137 struct seg_ops *ov = seg->s_ops; 2138 2139 /* 2140 * We have to check to see if the seg has 2141 * an ops vector because the seg may have 2142 * been in the middle of being set up when 2143 * the process was picked for swapout. 2144 */ 2145 if ((ov != NULL) && (ov->swapout != NULL)) 2146 swpcnt += SEGOP_SWAPOUT(seg); 2147 } 2148 AS_LOCK_EXIT(as); 2149 return (swpcnt); 2150 } 2151 2152 /* 2153 * Determine whether data from the mappings in interval [addr, addr + size) 2154 * are in the primary memory (core) cache. 2155 */ 2156 int 2157 as_incore(struct as *as, caddr_t addr, 2158 size_t size, char *vec, size_t *sizep) 2159 { 2160 struct seg *seg; 2161 size_t ssize; 2162 caddr_t raddr; /* rounded down addr */ 2163 size_t rsize; /* rounded up size */ 2164 size_t isize; /* iteration size */ 2165 int error = 0; /* result, assume success */ 2166 2167 *sizep = 0; 2168 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); 2169 rsize = ((((size_t)addr + size) + PAGEOFFSET) & PAGEMASK) - 2170 (size_t)raddr; 2171 2172 if (raddr + rsize < raddr) /* check for wraparound */ 2173 return (ENOMEM); 2174 2175 AS_LOCK_ENTER(as, RW_READER); 2176 seg = as_segat(as, raddr); 2177 if (seg == NULL) { 2178 AS_LOCK_EXIT(as); 2179 return (-1); 2180 } 2181 2182 for (; rsize != 0; rsize -= ssize, raddr += ssize) { 2183 if (raddr >= seg->s_base + seg->s_size) { 2184 seg = AS_SEGNEXT(as, seg); 2185 if (seg == NULL || raddr != seg->s_base) { 2186 error = -1; 2187 break; 2188 } 2189 } 2190 if ((raddr + rsize) > (seg->s_base + seg->s_size)) 2191 ssize = seg->s_base + seg->s_size - raddr; 2192 else 2193 ssize = rsize; 2194 *sizep += isize = SEGOP_INCORE(seg, raddr, ssize, vec); 2195 if (isize != ssize) { 2196 error = -1; 2197 break; 2198 } 2199 vec += btopr(ssize); 2200 } 2201 AS_LOCK_EXIT(as); 2202 return (error); 2203 } 2204 2205 static void 2206 as_segunlock(struct seg *seg, caddr_t addr, int attr, 2207 ulong_t *bitmap, size_t position, size_t npages) 2208 { 2209 caddr_t range_start; 2210 size_t pos1 = position; 2211 size_t pos2; 2212 size_t size; 2213 size_t end_pos = npages + position; 2214 2215 while (bt_range(bitmap, &pos1, &pos2, end_pos)) { 2216 size = ptob((pos2 - pos1)); 2217 range_start = (caddr_t)((uintptr_t)addr + 2218 ptob(pos1 - position)); 2219 2220 (void) SEGOP_LOCKOP(seg, range_start, size, attr, MC_UNLOCK, 2221 (ulong_t *)NULL, (size_t)NULL); 2222 pos1 = pos2; 2223 } 2224 } 2225 2226 static void 2227 as_unlockerr(struct as *as, int attr, ulong_t *mlock_map, 2228 caddr_t raddr, size_t rsize) 2229 { 2230 struct seg
*seg = as_segat(as, raddr); 2231 size_t ssize; 2232 2233 while (rsize != 0) { 2234 if (raddr >= seg->s_base + seg->s_size) 2235 seg = AS_SEGNEXT(as, seg); 2236 2237 if ((raddr + rsize) > (seg->s_base + seg->s_size)) 2238 ssize = seg->s_base + seg->s_size - raddr; 2239 else 2240 ssize = rsize; 2241 2242 as_segunlock(seg, raddr, attr, mlock_map, 0, btopr(ssize)); 2243 2244 rsize -= ssize; 2245 raddr += ssize; 2246 } 2247 } 2248 2249 /* 2250 * Cache control operations over the interval [addr, addr + size) in 2251 * address space "as". 2252 */ 2253 /*ARGSUSED*/ 2254 int 2255 as_ctl(struct as *as, caddr_t addr, size_t size, int func, int attr, 2256 uintptr_t arg, ulong_t *lock_map, size_t pos) 2257 { 2258 struct seg *seg; /* working segment */ 2259 caddr_t raddr; /* rounded down addr */ 2260 caddr_t initraddr; /* saved initial rounded down addr */ 2261 size_t rsize; /* rounded up size */ 2262 size_t initrsize; /* saved initial rounded up size */ 2263 size_t ssize; /* size of seg */ 2264 int error = 0; /* result */ 2265 size_t mlock_size; /* size of bitmap */ 2266 ulong_t *mlock_map; /* pointer to bitmap used */ 2267 /* to represent the locked */ 2268 /* pages. */ 2269 2270 mlock_size = 0; 2271 mlock_map = NULL; 2272 retry: 2273 if (error == IE_RETRY) 2274 AS_LOCK_ENTER(as, RW_WRITER); 2275 else 2276 AS_LOCK_ENTER(as, RW_READER); 2277 2278 /* 2279 * If these are address space lock/unlock operations, loop over 2280 * all segments in the address space, as appropriate. 2281 */ 2282 if (func == MC_LOCKAS) { 2283 size_t npages, idx; 2284 size_t rlen = 0; /* rounded as length */ 2285 2286 idx = pos; 2287 2288 if (arg & MCL_FUTURE) { 2289 mutex_enter(&as->a_contents); 2290 AS_SETPGLCK(as); 2291 mutex_exit(&as->a_contents); 2292 } 2293 if ((arg & MCL_CURRENT) == 0) { 2294 AS_LOCK_EXIT(as); 2295 return (0); 2296 } 2297 2298 seg = AS_SEGFIRST(as); 2299 if (seg == NULL) { 2300 AS_LOCK_EXIT(as); 2301 return (0); 2302 } 2303 2304 do { 2305 raddr = (caddr_t)((uintptr_t)seg->s_base & 2306 (uintptr_t)PAGEMASK); 2307 rlen += (((uintptr_t)(seg->s_base + seg->s_size) + 2308 PAGEOFFSET) & PAGEMASK) - (uintptr_t)raddr; 2309 } while ((seg = AS_SEGNEXT(as, seg)) != NULL); 2310 2311 mlock_size = BT_BITOUL(btopr(rlen)); 2312 if ((mlock_map = (ulong_t *)kmem_zalloc(mlock_size * 2313 sizeof (ulong_t), KM_NOSLEEP)) == NULL) { 2314 AS_LOCK_EXIT(as); 2315 return (EAGAIN); 2316 } 2317 2318 for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) { 2319 if ((seg->s_flags & S_HOLE) != 0) { 2320 continue; 2321 } 2322 error = SEGOP_LOCKOP(seg, seg->s_base, 2323 seg->s_size, attr, MC_LOCK, mlock_map, pos); 2324 if (error != 0) 2325 break; 2326 pos += seg_pages(seg); 2327 } 2328 2329 if (error) { 2330 for (seg = AS_SEGFIRST(as); seg != NULL; 2331 seg = AS_SEGNEXT(as, seg)) { 2332 2333 raddr = (caddr_t)((uintptr_t)seg->s_base & 2334 (uintptr_t)PAGEMASK); 2335 npages = seg_pages(seg); 2336 as_segunlock(seg, raddr, attr, mlock_map, 2337 idx, npages); 2338 idx += npages; 2339 } 2340 } 2341 2342 kmem_free(mlock_map, mlock_size * sizeof (ulong_t)); 2343 AS_LOCK_EXIT(as); 2344 goto lockerr; 2345 } else if (func == MC_UNLOCKAS) { 2346 mutex_enter(&as->a_contents); 2347 AS_CLRPGLCK(as); 2348 mutex_exit(&as->a_contents); 2349 2350 for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) { 2351 if ((seg->s_flags & S_HOLE) != 0) { 2352 continue; 2353 } 2354 error = SEGOP_LOCKOP(seg, seg->s_base, 2355 seg->s_size, attr, MC_UNLOCK, NULL, 0); 2356 if (error != 0) 2357 break; 2358 } 2359 2360 AS_LOCK_EXIT(as); 2361 goto lockerr; 2362 } 2363 
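	/*
	 * A minimal illustrative sketch (hedged, not taken from this file):
	 * the MC_LOCKAS branch above is the shape of request an mlockall()
	 * style caller would be expected to issue, with the MCL_* flags
	 * carried in "arg" and addr/size unused:
	 *
	 *	error = as_ctl(as, NULL, 0, MC_LOCKAS, 0,
	 *	    MCL_CURRENT | MCL_FUTURE, NULL, 0);
	 */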
2364 /* 2365 * Normalize addresses and sizes. 2366 */ 2367 initraddr = raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); 2368 initrsize = rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) - 2369 (size_t)raddr; 2370 2371 if (raddr + rsize < raddr) { /* check for wraparound */ 2372 AS_LOCK_EXIT(as); 2373 return (ENOMEM); 2374 } 2375 2376 /* 2377 * Get initial segment. 2378 */ 2379 if ((seg = as_segat(as, raddr)) == NULL) { 2380 AS_LOCK_EXIT(as); 2381 return (ENOMEM); 2382 } 2383 2384 if (func == MC_LOCK) { 2385 mlock_size = BT_BITOUL(btopr(rsize)); 2386 if ((mlock_map = (ulong_t *)kmem_zalloc(mlock_size * 2387 sizeof (ulong_t), KM_NOSLEEP)) == NULL) { 2388 AS_LOCK_EXIT(as); 2389 return (EAGAIN); 2390 } 2391 } 2392 2393 /* 2394 * Loop over all segments. If a hole in the address range is 2395 * discovered, then fail. For each segment, perform the appropriate 2396 * control operation. 2397 */ 2398 while (rsize != 0) { 2399 2400 /* 2401 * Make sure there's no hole, calculate the portion 2402 * of the next segment to be operated over. 2403 */ 2404 if (raddr >= seg->s_base + seg->s_size) { 2405 seg = AS_SEGNEXT(as, seg); 2406 if (seg == NULL || raddr != seg->s_base) { 2407 if (func == MC_LOCK) { 2408 as_unlockerr(as, attr, mlock_map, 2409 initraddr, initrsize - rsize); 2410 kmem_free(mlock_map, 2411 mlock_size * sizeof (ulong_t)); 2412 } 2413 AS_LOCK_EXIT(as); 2414 return (ENOMEM); 2415 } 2416 } 2417 if ((raddr + rsize) > (seg->s_base + seg->s_size)) 2418 ssize = seg->s_base + seg->s_size - raddr; 2419 else 2420 ssize = rsize; 2421 2422 /* 2423 * Dispatch on specific function. 2424 */ 2425 switch (func) { 2426 2427 /* 2428 * Synchronize cached data from mappings with backing 2429 * objects. 2430 */ 2431 case MC_SYNC: 2432 if (error = SEGOP_SYNC(seg, raddr, ssize, 2433 attr, (uint_t)arg)) { 2434 AS_LOCK_EXIT(as); 2435 return (error); 2436 } 2437 break; 2438 2439 /* 2440 * Lock pages in memory. 2441 */ 2442 case MC_LOCK: 2443 if (error = SEGOP_LOCKOP(seg, raddr, ssize, 2444 attr, func, mlock_map, pos)) { 2445 as_unlockerr(as, attr, mlock_map, initraddr, 2446 initrsize - rsize + ssize); 2447 kmem_free(mlock_map, mlock_size * 2448 sizeof (ulong_t)); 2449 AS_LOCK_EXIT(as); 2450 goto lockerr; 2451 } 2452 break; 2453 2454 /* 2455 * Unlock mapped pages. 2456 */ 2457 case MC_UNLOCK: 2458 (void) SEGOP_LOCKOP(seg, raddr, ssize, attr, func, 2459 (ulong_t *)NULL, (size_t)NULL); 2460 break; 2461 2462 /* 2463 * Store VM advise for mapped pages in segment layer. 
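		 *
		 * A hedged illustrative sketch of how this case is reached
		 * (the advice value is an assumption; any of the MADV_*
		 * values may arrive here via memcntl()/madvise()):
		 *
		 *	(void) as_ctl(as, addr, len, MC_ADVISE, 0,
		 *	    (uintptr_t)MADV_WILLNEED, NULL, 0);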
2464 */ 2465 case MC_ADVISE: 2466 error = SEGOP_ADVISE(seg, raddr, ssize, (uint_t)arg); 2467 2468 /* 2469 * Check for regular errors and special retry error 2470 */ 2471 if (error) { 2472 if (error == IE_RETRY) { 2473 /* 2474 * Need to acquire writers lock, so 2475 * have to drop readers lock and start 2476 * all over again 2477 */ 2478 AS_LOCK_EXIT(as); 2479 goto retry; 2480 } else if (error == IE_REATTACH) { 2481 /* 2482 * Find segment for current address 2483 * because current segment just got 2484 * split or concatenated 2485 */ 2486 seg = as_segat(as, raddr); 2487 if (seg == NULL) { 2488 AS_LOCK_EXIT(as); 2489 return (ENOMEM); 2490 } 2491 } else { 2492 /* 2493 * Regular error 2494 */ 2495 AS_LOCK_EXIT(as); 2496 return (error); 2497 } 2498 } 2499 break; 2500 2501 case MC_INHERIT_ZERO: 2502 if (seg->s_ops->inherit == NULL) { 2503 error = ENOTSUP; 2504 } else { 2505 error = SEGOP_INHERIT(seg, raddr, ssize, 2506 SEGP_INH_ZERO); 2507 } 2508 if (error != 0) { 2509 AS_LOCK_EXIT(as); 2510 return (error); 2511 } 2512 break; 2513 2514 /* 2515 * Can't happen. 2516 */ 2517 default: 2518 panic("as_ctl: bad operation %d", func); 2519 /*NOTREACHED*/ 2520 } 2521 2522 rsize -= ssize; 2523 raddr += ssize; 2524 } 2525 2526 if (func == MC_LOCK) 2527 kmem_free(mlock_map, mlock_size * sizeof (ulong_t)); 2528 AS_LOCK_EXIT(as); 2529 return (0); 2530 lockerr: 2531 2532 /* 2533 * If the lower levels returned EDEADLK for a segment lockop, 2534 * it means that we should retry the operation. Let's wait 2535 * a bit also to let the deadlock causing condition clear. 2536 * This is part of a gross hack to work around a design flaw 2537 * in the ufs/sds logging code and should go away when the 2538 * logging code is re-designed to fix the problem. See bug 2539 * 4125102 for details of the problem. 2540 */ 2541 if (error == EDEADLK) { 2542 delay(deadlk_wait); 2543 error = 0; 2544 goto retry; 2545 } 2546 return (error); 2547 } 2548 2549 int 2550 fc_decode(faultcode_t fault_err) 2551 { 2552 int error = 0; 2553 2554 switch (FC_CODE(fault_err)) { 2555 case FC_OBJERR: 2556 error = FC_ERRNO(fault_err); 2557 break; 2558 case FC_PROT: 2559 error = EACCES; 2560 break; 2561 default: 2562 error = EFAULT; 2563 break; 2564 } 2565 return (error); 2566 } 2567 2568 /* 2569 * Pagelock pages from a range that spans more than 1 segment. Obtain shadow 2570 * lists from each segment and copy them to one contiguous shadow list (plist) 2571 * as expected by the caller. Save pointers to per segment shadow lists at 2572 * the tail of plist so that they can be used during as_pageunlock(). 2573 */ 2574 static int 2575 as_pagelock_segs(struct as *as, struct seg *seg, struct page ***ppp, 2576 caddr_t addr, size_t size, enum seg_rw rw) 2577 { 2578 caddr_t sv_addr = addr; 2579 size_t sv_size = size; 2580 struct seg *sv_seg = seg; 2581 ulong_t segcnt = 1; 2582 ulong_t cnt; 2583 size_t ssize; 2584 pgcnt_t npages = btop(size); 2585 page_t **plist; 2586 page_t **pl; 2587 int error; 2588 caddr_t eaddr; 2589 faultcode_t fault_err = 0; 2590 pgcnt_t pl_off; 2591 extern struct seg_ops segspt_shmops; 2592 2593 ASSERT(AS_LOCK_HELD(as)); 2594 ASSERT(seg != NULL); 2595 ASSERT(addr >= seg->s_base && addr < seg->s_base + seg->s_size); 2596 ASSERT(addr + size > seg->s_base + seg->s_size); 2597 ASSERT(IS_P2ALIGNED(size, PAGESIZE)); 2598 ASSERT(IS_P2ALIGNED(addr, PAGESIZE)); 2599 2600 /* 2601 * Count the number of segments covered by the range we are about to 2602 * lock. The segment count is used to size the shadow list we return 2603 * back to the caller. 
2604 */ 2605 for (; size != 0; size -= ssize, addr += ssize) { 2606 if (addr >= seg->s_base + seg->s_size) { 2607 2608 seg = AS_SEGNEXT(as, seg); 2609 if (seg == NULL || addr != seg->s_base) { 2610 AS_LOCK_EXIT(as); 2611 return (EFAULT); 2612 } 2613 /* 2614 * Do a quick check if subsequent segments 2615 * will most likely support pagelock. 2616 */ 2617 if (seg->s_ops == &segvn_ops) { 2618 vnode_t *vp; 2619 2620 if (SEGOP_GETVP(seg, addr, &vp) != 0 || 2621 vp != NULL) { 2622 AS_LOCK_EXIT(as); 2623 goto slow; 2624 } 2625 } else if (seg->s_ops != &segspt_shmops) { 2626 AS_LOCK_EXIT(as); 2627 goto slow; 2628 } 2629 segcnt++; 2630 } 2631 if (addr + size > seg->s_base + seg->s_size) { 2632 ssize = seg->s_base + seg->s_size - addr; 2633 } else { 2634 ssize = size; 2635 } 2636 } 2637 ASSERT(segcnt > 1); 2638 2639 plist = kmem_zalloc((npages + segcnt) * sizeof (page_t *), KM_SLEEP); 2640 2641 addr = sv_addr; 2642 size = sv_size; 2643 seg = sv_seg; 2644 2645 for (cnt = 0, pl_off = 0; size != 0; size -= ssize, addr += ssize) { 2646 if (addr >= seg->s_base + seg->s_size) { 2647 seg = AS_SEGNEXT(as, seg); 2648 ASSERT(seg != NULL && addr == seg->s_base); 2649 cnt++; 2650 ASSERT(cnt < segcnt); 2651 } 2652 if (addr + size > seg->s_base + seg->s_size) { 2653 ssize = seg->s_base + seg->s_size - addr; 2654 } else { 2655 ssize = size; 2656 } 2657 pl = &plist[npages + cnt]; 2658 error = SEGOP_PAGELOCK(seg, addr, ssize, (page_t ***)pl, 2659 L_PAGELOCK, rw); 2660 if (error) { 2661 break; 2662 } 2663 ASSERT(plist[npages + cnt] != NULL); 2664 ASSERT(pl_off + btop(ssize) <= npages); 2665 bcopy(plist[npages + cnt], &plist[pl_off], 2666 btop(ssize) * sizeof (page_t *)); 2667 pl_off += btop(ssize); 2668 } 2669 2670 if (size == 0) { 2671 AS_LOCK_EXIT(as); 2672 ASSERT(cnt == segcnt - 1); 2673 *ppp = plist; 2674 return (0); 2675 } 2676 2677 /* 2678 * one of pagelock calls failed. The error type is in error variable. 2679 * Unlock what we've locked so far and retry with F_SOFTLOCK if error 2680 * type is either EFAULT or ENOTSUP. Otherwise just return the error 2681 * back to the caller. 2682 */ 2683 2684 eaddr = addr; 2685 seg = sv_seg; 2686 2687 for (cnt = 0, addr = sv_addr; addr < eaddr; addr += ssize) { 2688 if (addr >= seg->s_base + seg->s_size) { 2689 seg = AS_SEGNEXT(as, seg); 2690 ASSERT(seg != NULL && addr == seg->s_base); 2691 cnt++; 2692 ASSERT(cnt < segcnt); 2693 } 2694 if (eaddr > seg->s_base + seg->s_size) { 2695 ssize = seg->s_base + seg->s_size - addr; 2696 } else { 2697 ssize = eaddr - addr; 2698 } 2699 pl = &plist[npages + cnt]; 2700 ASSERT(*pl != NULL); 2701 (void) SEGOP_PAGELOCK(seg, addr, ssize, (page_t ***)pl, 2702 L_PAGEUNLOCK, rw); 2703 } 2704 2705 AS_LOCK_EXIT(as); 2706 2707 kmem_free(plist, (npages + segcnt) * sizeof (page_t *)); 2708 2709 if (error != ENOTSUP && error != EFAULT) { 2710 return (error); 2711 } 2712 2713 slow: 2714 /* 2715 * If we are here because pagelock failed due to the need to cow fault 2716 * in the pages we want to lock F_SOFTLOCK will do this job and in 2717 * next as_pagelock() call for this address range pagelock will 2718 * hopefully succeed. 2719 */ 2720 fault_err = as_fault(as->a_hat, as, sv_addr, sv_size, F_SOFTLOCK, rw); 2721 if (fault_err != 0) { 2722 return (fc_decode(fault_err)); 2723 } 2724 *ppp = NULL; 2725 2726 return (0); 2727 } 2728 2729 /* 2730 * lock pages in a given address space. Return shadow list. If 2731 * the list is NULL, the MMU mapping is also locked. 
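 *
 * A minimal illustrative sketch of the expected pairing (hypothetical
 * caller, e.g. a driver wiring down a user buffer; "uaddr" and "ulen"
 * are assumed inputs):
 *
 *	struct page **pplist;
 *
 *	if (as_pagelock(as, &pplist, uaddr, ulen, S_WRITE) == 0) {
 *		... do the I/O against the locked range ...
 *		as_pageunlock(as, pplist, uaddr, ulen, S_WRITE);
 *	}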
*/ 2733 int 2734 as_pagelock(struct as *as, struct page ***ppp, caddr_t addr, 2735 size_t size, enum seg_rw rw) 2736 { 2737 size_t rsize; 2738 caddr_t raddr; 2739 faultcode_t fault_err; 2740 struct seg *seg; 2741 int err; 2742 2743 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_AS_LOCK_START, 2744 "as_pagelock_start: addr %p size %ld", addr, size); 2745 2746 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); 2747 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) - 2748 (size_t)raddr; 2749 2750 /* 2751 * if the request crosses two segments let 2752 * as_fault handle it. 2753 */ 2754 AS_LOCK_ENTER(as, RW_READER); 2755 2756 seg = as_segat(as, raddr); 2757 if (seg == NULL) { 2758 AS_LOCK_EXIT(as); 2759 return (EFAULT); 2760 } 2761 ASSERT(raddr >= seg->s_base && raddr < seg->s_base + seg->s_size); 2762 if (raddr + rsize > seg->s_base + seg->s_size) { 2763 return (as_pagelock_segs(as, seg, ppp, raddr, rsize, rw)); 2764 } 2765 if (raddr + rsize <= raddr) { 2766 AS_LOCK_EXIT(as); 2767 return (EFAULT); 2768 } 2769 2770 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEG_LOCK_START, 2771 "seg_lock_1_start: raddr %p rsize %ld", raddr, rsize); 2772 2773 /* 2774 * try to lock pages and pass back shadow list 2775 */ 2776 err = SEGOP_PAGELOCK(seg, raddr, rsize, ppp, L_PAGELOCK, rw); 2777 2778 TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_SEG_LOCK_END, "seg_lock_1_end"); 2779 2780 AS_LOCK_EXIT(as); 2781 2782 if (err == 0 || (err != ENOTSUP && err != EFAULT)) { 2783 return (err); 2784 } 2785 2786 /* 2787 * Use F_SOFTLOCK to lock the pages because pagelock failed either due 2788 * to no pagelock support for this segment or pages need to be cow 2789 * faulted in. If fault is needed F_SOFTLOCK will do this job for 2790 * this as_pagelock() call and in the next as_pagelock() call for the 2791 * same address range pagelock call will hopefully succeed. 2792 */ 2793 fault_err = as_fault(as->a_hat, as, addr, size, F_SOFTLOCK, rw); 2794 if (fault_err != 0) { 2795 return (fc_decode(fault_err)); 2796 } 2797 *ppp = NULL; 2798 2799 TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_AS_LOCK_END, "as_pagelock_end"); 2800 return (0); 2801 } 2802 2803 /* 2804 * unlock pages locked by as_pagelock_segs(). Retrieve per segment shadow 2805 * lists from the end of plist and call pageunlock interface for each segment. 2806 * Drop as lock and free plist.
2807 */ 2808 static void 2809 as_pageunlock_segs(struct as *as, struct seg *seg, caddr_t addr, size_t size, 2810 struct page **plist, enum seg_rw rw) 2811 { 2812 ulong_t cnt; 2813 caddr_t eaddr = addr + size; 2814 pgcnt_t npages = btop(size); 2815 size_t ssize; 2816 page_t **pl; 2817 2818 ASSERT(AS_LOCK_HELD(as)); 2819 ASSERT(seg != NULL); 2820 ASSERT(addr >= seg->s_base && addr < seg->s_base + seg->s_size); 2821 ASSERT(addr + size > seg->s_base + seg->s_size); 2822 ASSERT(IS_P2ALIGNED(size, PAGESIZE)); 2823 ASSERT(IS_P2ALIGNED(addr, PAGESIZE)); 2824 ASSERT(plist != NULL); 2825 2826 for (cnt = 0; addr < eaddr; addr += ssize) { 2827 if (addr >= seg->s_base + seg->s_size) { 2828 seg = AS_SEGNEXT(as, seg); 2829 ASSERT(seg != NULL && addr == seg->s_base); 2830 cnt++; 2831 } 2832 if (eaddr > seg->s_base + seg->s_size) { 2833 ssize = seg->s_base + seg->s_size - addr; 2834 } else { 2835 ssize = eaddr - addr; 2836 } 2837 pl = &plist[npages + cnt]; 2838 ASSERT(*pl != NULL); 2839 (void) SEGOP_PAGELOCK(seg, addr, ssize, (page_t ***)pl, 2840 L_PAGEUNLOCK, rw); 2841 } 2842 ASSERT(cnt > 0); 2843 AS_LOCK_EXIT(as); 2844 2845 cnt++; 2846 kmem_free(plist, (npages + cnt) * sizeof (page_t *)); 2847 } 2848 2849 /* 2850 * unlock pages in a given address range 2851 */ 2852 void 2853 as_pageunlock(struct as *as, struct page **pp, caddr_t addr, size_t size, 2854 enum seg_rw rw) 2855 { 2856 struct seg *seg; 2857 size_t rsize; 2858 caddr_t raddr; 2859 2860 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_AS_UNLOCK_START, 2861 "as_pageunlock_start: addr %p size %ld", addr, size); 2862 2863 /* 2864 * if the shadow list is NULL, as_pagelock was 2865 * falling back to as_fault 2866 */ 2867 if (pp == NULL) { 2868 (void) as_fault(as->a_hat, as, addr, size, F_SOFTUNLOCK, rw); 2869 return; 2870 } 2871 2872 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); 2873 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) - 2874 (size_t)raddr; 2875 2876 AS_LOCK_ENTER(as, RW_READER); 2877 seg = as_segat(as, raddr); 2878 ASSERT(seg != NULL); 2879 2880 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEG_UNLOCK_START, 2881 "seg_unlock_start: raddr %p rsize %ld", raddr, rsize); 2882 2883 ASSERT(raddr >= seg->s_base && raddr < seg->s_base + seg->s_size); 2884 if (raddr + rsize <= seg->s_base + seg->s_size) { 2885 SEGOP_PAGELOCK(seg, raddr, rsize, &pp, L_PAGEUNLOCK, rw); 2886 } else { 2887 as_pageunlock_segs(as, seg, raddr, rsize, pp, rw); 2888 return; 2889 } 2890 AS_LOCK_EXIT(as); 2891 TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_AS_UNLOCK_END, "as_pageunlock_end"); 2892 } 2893 2894 int 2895 as_setpagesize(struct as *as, caddr_t addr, size_t size, uint_t szc, 2896 boolean_t wait) 2897 { 2898 struct seg *seg; 2899 size_t ssize; 2900 caddr_t raddr; /* rounded down addr */ 2901 size_t rsize; /* rounded up size */ 2902 int error = 0; 2903 size_t pgsz = page_get_pagesize(szc); 2904 2905 setpgsz_top: 2906 if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(size, pgsz)) { 2907 return (EINVAL); 2908 } 2909 2910 raddr = addr; 2911 rsize = size; 2912 2913 if (raddr + rsize < raddr) /* check for wraparound */ 2914 return (ENOMEM); 2915 2916 AS_LOCK_ENTER(as, RW_WRITER); 2917 as_clearwatchprot(as, raddr, rsize); 2918 seg = as_segat(as, raddr); 2919 if (seg == NULL) { 2920 as_setwatch(as); 2921 AS_LOCK_EXIT(as); 2922 return (ENOMEM); 2923 } 2924 2925 for (; rsize != 0; rsize -= ssize, raddr += ssize) { 2926 if (raddr >= seg->s_base + seg->s_size) { 2927 seg = AS_SEGNEXT(as, seg); 2928 if (seg == NULL || raddr != seg->s_base) { 2929 error = ENOMEM; 2930 break; 2931 } 2932 } 2933 if 
((raddr + rsize) > (seg->s_base + seg->s_size)) { 2934 ssize = seg->s_base + seg->s_size - raddr; 2935 } else { 2936 ssize = rsize; 2937 } 2938 2939 retry: 2940 error = SEGOP_SETPAGESIZE(seg, raddr, ssize, szc); 2941 2942 if (error == IE_NOMEM) { 2943 error = EAGAIN; 2944 break; 2945 } 2946 2947 if (error == IE_RETRY) { 2948 AS_LOCK_EXIT(as); 2949 goto setpgsz_top; 2950 } 2951 2952 if (error == ENOTSUP) { 2953 error = EINVAL; 2954 break; 2955 } 2956 2957 if (wait && (error == EAGAIN)) { 2958 /* 2959 * Memory is currently locked. It must be unlocked 2960 * before this operation can succeed through a retry. 2961 * The possible reasons for locked memory and 2962 * corresponding strategies for unlocking are: 2963 * (1) Normal I/O 2964 * wait for a signal that the I/O operation 2965 * has completed and the memory is unlocked. 2966 * (2) Asynchronous I/O 2967 * The aio subsystem does not unlock pages when 2968 * the I/O is completed. Those pages are unlocked 2969 * when the application calls aiowait/aioerror. 2970 * So, to prevent blocking forever, cv_broadcast() 2971 * is done to wake up aio_cleanup_thread. 2972 * Subsequently, segvn_reclaim will be called, and 2973 * that will do AS_CLRUNMAPWAIT() and wake us up. 2974 * (3) Long term page locking: 2975 * This is not relevant for as_setpagesize() 2976 * because we cannot change the page size for 2977 * driver memory. The attempt to do so will 2978 * fail with a different error than EAGAIN so 2979 * there's no need to trigger as callbacks like 2980 * as_unmap, as_setprot or as_free would do. 2981 */ 2982 mutex_enter(&as->a_contents); 2983 if (!AS_ISNOUNMAPWAIT(as)) { 2984 if (AS_ISUNMAPWAIT(as) == 0) { 2985 cv_broadcast(&as->a_cv); 2986 } 2987 AS_SETUNMAPWAIT(as); 2988 AS_LOCK_EXIT(as); 2989 while (AS_ISUNMAPWAIT(as)) { 2990 cv_wait(&as->a_cv, &as->a_contents); 2991 } 2992 } else { 2993 /* 2994 * We may have raced with 2995 * segvn_reclaim()/segspt_reclaim(). In this 2996 * case clean nounmapwait flag and retry since 2997 * softlockcnt in this segment may be already 2998 * 0. We don't drop as writer lock so our 2999 * number of retries without sleeping should 3000 * be very small. See segvn_reclaim() for 3001 * more comments. 3002 */ 3003 AS_CLRNOUNMAPWAIT(as); 3004 mutex_exit(&as->a_contents); 3005 goto retry; 3006 } 3007 mutex_exit(&as->a_contents); 3008 goto setpgsz_top; 3009 } else if (error != 0) { 3010 break; 3011 } 3012 } 3013 as_setwatch(as); 3014 AS_LOCK_EXIT(as); 3015 return (error); 3016 } 3017 3018 /* 3019 * as_iset3_default_lpsize() just calls SEGOP_SETPAGESIZE() on all segments 3020 * in its chunk where s_szc is less than the szc we want to set. 3021 */ 3022 static int 3023 as_iset3_default_lpsize(struct as *as, caddr_t raddr, size_t rsize, uint_t szc, 3024 int *retry) 3025 { 3026 struct seg *seg; 3027 size_t ssize; 3028 int error; 3029 3030 ASSERT(AS_WRITE_HELD(as)); 3031 3032 seg = as_segat(as, raddr); 3033 if (seg == NULL) { 3034 panic("as_iset3_default_lpsize: no seg"); 3035 } 3036 3037 for (; rsize != 0; rsize -= ssize, raddr += ssize) { 3038 if (raddr >= seg->s_base + seg->s_size) { 3039 seg = AS_SEGNEXT(as, seg); 3040 if (seg == NULL || raddr != seg->s_base) { 3041 panic("as_iset3_default_lpsize: as changed"); 3042 } 3043 } 3044 if ((raddr + rsize) > (seg->s_base + seg->s_size)) { 3045 ssize = seg->s_base + seg->s_size - raddr; 3046 } else { 3047 ssize = rsize; 3048 } 3049 3050 if (szc > seg->s_szc) { 3051 error = SEGOP_SETPAGESIZE(seg, raddr, ssize, szc); 3052 /* Only retry on EINVAL segments that have no vnode. 
*/ 3053 if (error == EINVAL) { 3054 vnode_t *vp = NULL; 3055 if ((SEGOP_GETTYPE(seg, raddr) & MAP_SHARED) && 3056 (SEGOP_GETVP(seg, raddr, &vp) != 0 || 3057 vp == NULL)) { 3058 *retry = 1; 3059 } else { 3060 *retry = 0; 3061 } 3062 } 3063 if (error) { 3064 return (error); 3065 } 3066 } 3067 } 3068 return (0); 3069 } 3070 3071 /* 3072 * as_iset2_default_lpsize() calls as_iset3_default_lpsize() to set the 3073 * pagesize on each segment in its range, but if any fails with EINVAL, 3074 * then it reduces the pagesizes to the next size in the bitmap and 3075 * retries as_iset3_default_lpsize(). The reason why the code retries 3076 * smaller allowed sizes on EINVAL is because (a) the anon offset may not 3077 * match the bigger sizes, and (b) it's hard to get this offset (to begin 3078 * with) to pass to map_pgszcvec(). 3079 */ 3080 static int 3081 as_iset2_default_lpsize(struct as *as, caddr_t addr, size_t size, uint_t szc, 3082 uint_t szcvec) 3083 { 3084 int error; 3085 int retry; 3086 3087 ASSERT(AS_WRITE_HELD(as)); 3088 3089 for (;;) { 3090 error = as_iset3_default_lpsize(as, addr, size, szc, &retry); 3091 if (error == EINVAL && retry) { 3092 szcvec &= ~(1 << szc); 3093 if (szcvec <= 1) { 3094 return (EINVAL); 3095 } 3096 szc = highbit(szcvec) - 1; 3097 } else { 3098 return (error); 3099 } 3100 } 3101 } 3102 3103 /* 3104 * as_iset1_default_lpsize() breaks its chunk into areas where existing 3105 * segments have a smaller szc than we want to set. For each such area, 3106 * it calls as_iset2_default_lpsize() 3107 */ 3108 static int 3109 as_iset1_default_lpsize(struct as *as, caddr_t raddr, size_t rsize, uint_t szc, 3110 uint_t szcvec) 3111 { 3112 struct seg *seg; 3113 size_t ssize; 3114 caddr_t setaddr = raddr; 3115 size_t setsize = 0; 3116 int set; 3117 int error; 3118 3119 ASSERT(AS_WRITE_HELD(as)); 3120 3121 seg = as_segat(as, raddr); 3122 if (seg == NULL) { 3123 panic("as_iset1_default_lpsize: no seg"); 3124 } 3125 if (seg->s_szc < szc) { 3126 set = 1; 3127 } else { 3128 set = 0; 3129 } 3130 3131 for (; rsize != 0; rsize -= ssize, raddr += ssize, setsize += ssize) { 3132 if (raddr >= seg->s_base + seg->s_size) { 3133 seg = AS_SEGNEXT(as, seg); 3134 if (seg == NULL || raddr != seg->s_base) { 3135 panic("as_iset1_default_lpsize: as changed"); 3136 } 3137 if (seg->s_szc >= szc && set) { 3138 ASSERT(setsize != 0); 3139 error = as_iset2_default_lpsize(as, 3140 setaddr, setsize, szc, szcvec); 3141 if (error) { 3142 return (error); 3143 } 3144 set = 0; 3145 } else if (seg->s_szc < szc && !set) { 3146 setaddr = raddr; 3147 setsize = 0; 3148 set = 1; 3149 } 3150 } 3151 if ((raddr + rsize) > (seg->s_base + seg->s_size)) { 3152 ssize = seg->s_base + seg->s_size - raddr; 3153 } else { 3154 ssize = rsize; 3155 } 3156 } 3157 error = 0; 3158 if (set) { 3159 ASSERT(setsize != 0); 3160 error = as_iset2_default_lpsize(as, setaddr, setsize, 3161 szc, szcvec); 3162 } 3163 return (error); 3164 } 3165 3166 /* 3167 * as_iset_default_lpsize() breaks its chunk according to the size code bitmap 3168 * returned by map_pgszcvec() (similar to as_map_segvn_segs()), and passes each 3169 * chunk to as_iset1_default_lpsize(). 3170 */ 3171 static int 3172 as_iset_default_lpsize(struct as *as, caddr_t addr, size_t size, int flags, 3173 int type) 3174 { 3175 int rtype = (type & MAP_SHARED) ? 
MAPPGSZC_SHM : MAPPGSZC_PRIVM; 3176 uint_t szcvec = map_pgszcvec(addr, size, (uintptr_t)addr, 3177 flags, rtype, 1); 3178 uint_t szc; 3179 uint_t nszc; 3180 int error; 3181 caddr_t a; 3182 caddr_t eaddr; 3183 size_t segsize; 3184 size_t pgsz; 3185 uint_t save_szcvec; 3186 3187 ASSERT(AS_WRITE_HELD(as)); 3188 ASSERT(IS_P2ALIGNED(addr, PAGESIZE)); 3189 ASSERT(IS_P2ALIGNED(size, PAGESIZE)); 3190 3191 szcvec &= ~1; 3192 if (szcvec <= 1) { /* skip if base page size */ 3193 return (0); 3194 } 3195 3196 /* Get the pagesize of the first larger page size. */ 3197 szc = lowbit(szcvec) - 1; 3198 pgsz = page_get_pagesize(szc); 3199 eaddr = addr + size; 3200 addr = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz); 3201 eaddr = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz); 3202 3203 save_szcvec = szcvec; 3204 szcvec >>= (szc + 1); 3205 nszc = szc; 3206 while (szcvec) { 3207 if ((szcvec & 0x1) == 0) { 3208 nszc++; 3209 szcvec >>= 1; 3210 continue; 3211 } 3212 nszc++; 3213 pgsz = page_get_pagesize(nszc); 3214 a = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz); 3215 if (a != addr) { 3216 ASSERT(szc > 0); 3217 ASSERT(a < eaddr); 3218 segsize = a - addr; 3219 error = as_iset1_default_lpsize(as, addr, segsize, szc, 3220 save_szcvec); 3221 if (error) { 3222 return (error); 3223 } 3224 addr = a; 3225 } 3226 szc = nszc; 3227 szcvec >>= 1; 3228 } 3229 3230 ASSERT(addr < eaddr); 3231 szcvec = save_szcvec; 3232 while (szcvec) { 3233 a = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz); 3234 ASSERT(a >= addr); 3235 if (a != addr) { 3236 ASSERT(szc > 0); 3237 segsize = a - addr; 3238 error = as_iset1_default_lpsize(as, addr, segsize, szc, 3239 save_szcvec); 3240 if (error) { 3241 return (error); 3242 } 3243 addr = a; 3244 } 3245 szcvec &= ~(1 << szc); 3246 if (szcvec) { 3247 szc = highbit(szcvec) - 1; 3248 pgsz = page_get_pagesize(szc); 3249 } 3250 } 3251 ASSERT(addr == eaddr); 3252 3253 return (0); 3254 } 3255 3256 /* 3257 * Set the default large page size for the range. Called via memcntl with 3258 * page size set to 0. as_set_default_lpsize breaks the range down into 3259 * chunks with the same type/flags, ignores non-segvn segments, and passes 3260 * each chunk to as_iset_default_lpsize().
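 *
 * A hedged illustrative sketch of the userland path described above
 * (the MC_HAT_ADVISE/memcntl_mha details are assumptions about the
 * caller, not taken from this file):
 *
 *	struct memcntl_mha mha;
 *
 *	mha.mha_cmd = MHA_MAPSIZE_VA;
 *	mha.mha_flags = 0;
 *	mha.mha_pagesize = 0;
 *	(void) memcntl(addr, len, MC_HAT_ADVISE, (caddr_t)&mha, 0, 0);
 *
 * where the zero mha_pagesize is what requests the default large page
 * size for the range.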
3261 */ 3262 int 3263 as_set_default_lpsize(struct as *as, caddr_t addr, size_t size) 3264 { 3265 struct seg *seg; 3266 caddr_t raddr; 3267 size_t rsize; 3268 size_t ssize; 3269 int rtype, rflags; 3270 int stype, sflags; 3271 int error; 3272 caddr_t setaddr; 3273 size_t setsize; 3274 int segvn; 3275 3276 if (size == 0) 3277 return (0); 3278 3279 AS_LOCK_ENTER(as, RW_WRITER); 3280 again: 3281 error = 0; 3282 3283 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); 3284 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) - 3285 (size_t)raddr; 3286 3287 if (raddr + rsize < raddr) { /* check for wraparound */ 3288 AS_LOCK_EXIT(as); 3289 return (ENOMEM); 3290 } 3291 as_clearwatchprot(as, raddr, rsize); 3292 seg = as_segat(as, raddr); 3293 if (seg == NULL) { 3294 as_setwatch(as); 3295 AS_LOCK_EXIT(as); 3296 return (ENOMEM); 3297 } 3298 if (seg->s_ops == &segvn_ops) { 3299 rtype = SEGOP_GETTYPE(seg, addr); 3300 rflags = rtype & (MAP_TEXT | MAP_INITDATA); 3301 rtype = rtype & (MAP_SHARED | MAP_PRIVATE); 3302 segvn = 1; 3303 } else { 3304 segvn = 0; 3305 } 3306 setaddr = raddr; 3307 setsize = 0; 3308 3309 for (; rsize != 0; rsize -= ssize, raddr += ssize, setsize += ssize) { 3310 if (raddr >= (seg->s_base + seg->s_size)) { 3311 seg = AS_SEGNEXT(as, seg); 3312 if (seg == NULL || raddr != seg->s_base) { 3313 error = ENOMEM; 3314 break; 3315 } 3316 if (seg->s_ops == &segvn_ops) { 3317 stype = SEGOP_GETTYPE(seg, raddr); 3318 sflags = stype & (MAP_TEXT | MAP_INITDATA); 3319 stype &= (MAP_SHARED | MAP_PRIVATE); 3320 if (segvn && (rflags != sflags || 3321 rtype != stype)) { 3322 /* 3323 * The next segment is also segvn but 3324 * has different flags and/or type. 3325 */ 3326 ASSERT(setsize != 0); 3327 error = as_iset_default_lpsize(as, 3328 setaddr, setsize, rflags, rtype); 3329 if (error) { 3330 break; 3331 } 3332 rflags = sflags; 3333 rtype = stype; 3334 setaddr = raddr; 3335 setsize = 0; 3336 } else if (!segvn) { 3337 rflags = sflags; 3338 rtype = stype; 3339 setaddr = raddr; 3340 setsize = 0; 3341 segvn = 1; 3342 } 3343 } else if (segvn) { 3344 /* The next segment is not segvn. */ 3345 ASSERT(setsize != 0); 3346 error = as_iset_default_lpsize(as, 3347 setaddr, setsize, rflags, rtype); 3348 if (error) { 3349 break; 3350 } 3351 segvn = 0; 3352 } 3353 } 3354 if ((raddr + rsize) > (seg->s_base + seg->s_size)) { 3355 ssize = seg->s_base + seg->s_size - raddr; 3356 } else { 3357 ssize = rsize; 3358 } 3359 } 3360 if (error == 0 && segvn) { 3361 /* The last chunk when rsize == 0. */ 3362 ASSERT(setsize != 0); 3363 error = as_iset_default_lpsize(as, setaddr, setsize, 3364 rflags, rtype); 3365 } 3366 3367 if (error == IE_RETRY) { 3368 goto again; 3369 } else if (error == IE_NOMEM) { 3370 error = EAGAIN; 3371 } else if (error == ENOTSUP) { 3372 error = EINVAL; 3373 } else if (error == EAGAIN) { 3374 mutex_enter(&as->a_contents); 3375 if (!AS_ISNOUNMAPWAIT(as)) { 3376 if (AS_ISUNMAPWAIT(as) == 0) { 3377 cv_broadcast(&as->a_cv); 3378 } 3379 AS_SETUNMAPWAIT(as); 3380 AS_LOCK_EXIT(as); 3381 while (AS_ISUNMAPWAIT(as)) { 3382 cv_wait(&as->a_cv, &as->a_contents); 3383 } 3384 mutex_exit(&as->a_contents); 3385 AS_LOCK_ENTER(as, RW_WRITER); 3386 } else { 3387 /* 3388 * We may have raced with 3389 * segvn_reclaim()/segspt_reclaim(). In this case 3390 * clean nounmapwait flag and retry since softlockcnt 3391 * in this segment may be already 0. We don't drop as 3392 * writer lock so our number of retries without 3393 * sleeping should be very small. See segvn_reclaim() 3394 * for more comments. 
3395 */ 3396 AS_CLRNOUNMAPWAIT(as); 3397 mutex_exit(&as->a_contents); 3398 } 3399 goto again; 3400 } 3401 3402 as_setwatch(as); 3403 AS_LOCK_EXIT(as); 3404 return (error); 3405 } 3406 3407 /* 3408 * Setup all of the uninitialized watched pages that we can. 3409 */ 3410 void 3411 as_setwatch(struct as *as) 3412 { 3413 struct watched_page *pwp; 3414 struct seg *seg; 3415 caddr_t vaddr; 3416 uint_t prot; 3417 int err, retrycnt; 3418 3419 if (avl_numnodes(&as->a_wpage) == 0) 3420 return; 3421 3422 ASSERT(AS_WRITE_HELD(as)); 3423 3424 for (pwp = avl_first(&as->a_wpage); pwp != NULL; 3425 pwp = AVL_NEXT(&as->a_wpage, pwp)) { 3426 retrycnt = 0; 3427 retry: 3428 vaddr = pwp->wp_vaddr; 3429 if (pwp->wp_oprot != 0 || /* already set up */ 3430 (seg = as_segat(as, vaddr)) == NULL || 3431 SEGOP_GETPROT(seg, vaddr, 0, &prot) != 0) 3432 continue; 3433 3434 pwp->wp_oprot = prot; 3435 if (pwp->wp_read) 3436 prot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC); 3437 if (pwp->wp_write) 3438 prot &= ~PROT_WRITE; 3439 if (pwp->wp_exec) 3440 prot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC); 3441 if (!(pwp->wp_flags & WP_NOWATCH) && prot != pwp->wp_oprot) { 3442 err = SEGOP_SETPROT(seg, vaddr, PAGESIZE, prot); 3443 if (err == IE_RETRY) { 3444 pwp->wp_oprot = 0; 3445 ASSERT(retrycnt == 0); 3446 retrycnt++; 3447 goto retry; 3448 } 3449 } 3450 pwp->wp_prot = prot; 3451 } 3452 } 3453 3454 /* 3455 * Clear all of the watched pages in the address space. 3456 */ 3457 void 3458 as_clearwatch(struct as *as) 3459 { 3460 struct watched_page *pwp; 3461 struct seg *seg; 3462 caddr_t vaddr; 3463 uint_t prot; 3464 int err, retrycnt; 3465 3466 if (avl_numnodes(&as->a_wpage) == 0) 3467 return; 3468 3469 ASSERT(AS_WRITE_HELD(as)); 3470 3471 for (pwp = avl_first(&as->a_wpage); pwp != NULL; 3472 pwp = AVL_NEXT(&as->a_wpage, pwp)) { 3473 retrycnt = 0; 3474 retry: 3475 vaddr = pwp->wp_vaddr; 3476 if (pwp->wp_oprot == 0 || /* not set up */ 3477 (seg = as_segat(as, vaddr)) == NULL) 3478 continue; 3479 3480 if ((prot = pwp->wp_oprot) != pwp->wp_prot) { 3481 err = SEGOP_SETPROT(seg, vaddr, PAGESIZE, prot); 3482 if (err == IE_RETRY) { 3483 ASSERT(retrycnt == 0); 3484 retrycnt++; 3485 goto retry; 3486 } 3487 } 3488 pwp->wp_oprot = 0; 3489 pwp->wp_prot = 0; 3490 } 3491 } 3492 3493 /* 3494 * Force a new setup for all the watched pages in the range. 
3495 */ 3496 static void 3497 as_setwatchprot(struct as *as, caddr_t addr, size_t size, uint_t prot) 3498 { 3499 struct watched_page *pwp; 3500 struct watched_page tpw; 3501 caddr_t eaddr = addr + size; 3502 caddr_t vaddr; 3503 struct seg *seg; 3504 int err, retrycnt; 3505 uint_t wprot; 3506 avl_index_t where; 3507 3508 if (avl_numnodes(&as->a_wpage) == 0) 3509 return; 3510 3511 ASSERT(AS_WRITE_HELD(as)); 3512 3513 tpw.wp_vaddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); 3514 if ((pwp = avl_find(&as->a_wpage, &tpw, &where)) == NULL) 3515 pwp = avl_nearest(&as->a_wpage, where, AVL_AFTER); 3516 3517 while (pwp != NULL && pwp->wp_vaddr < eaddr) { 3518 retrycnt = 0; 3519 vaddr = pwp->wp_vaddr; 3520 3521 wprot = prot; 3522 if (pwp->wp_read) 3523 wprot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC); 3524 if (pwp->wp_write) 3525 wprot &= ~PROT_WRITE; 3526 if (pwp->wp_exec) 3527 wprot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC); 3528 if (!(pwp->wp_flags & WP_NOWATCH) && wprot != pwp->wp_oprot) { 3529 retry: 3530 seg = as_segat(as, vaddr); 3531 if (seg == NULL) { 3532 panic("as_setwatchprot: no seg"); 3533 /*NOTREACHED*/ 3534 } 3535 err = SEGOP_SETPROT(seg, vaddr, PAGESIZE, wprot); 3536 if (err == IE_RETRY) { 3537 ASSERT(retrycnt == 0); 3538 retrycnt++; 3539 goto retry; 3540 } 3541 } 3542 pwp->wp_oprot = prot; 3543 pwp->wp_prot = wprot; 3544 3545 pwp = AVL_NEXT(&as->a_wpage, pwp); 3546 } 3547 } 3548 3549 /* 3550 * Clear all of the watched pages in the range. 3551 */ 3552 static void 3553 as_clearwatchprot(struct as *as, caddr_t addr, size_t size) 3554 { 3555 caddr_t eaddr = addr + size; 3556 struct watched_page *pwp; 3557 struct watched_page tpw; 3558 uint_t prot; 3559 struct seg *seg; 3560 int err, retrycnt; 3561 avl_index_t where; 3562 3563 if (avl_numnodes(&as->a_wpage) == 0) 3564 return; 3565 3566 tpw.wp_vaddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); 3567 if ((pwp = avl_find(&as->a_wpage, &tpw, &where)) == NULL) 3568 pwp = avl_nearest(&as->a_wpage, where, AVL_AFTER); 3569 3570 ASSERT(AS_WRITE_HELD(as)); 3571 3572 while (pwp != NULL && pwp->wp_vaddr < eaddr) { 3573 3574 if ((prot = pwp->wp_oprot) != 0) { 3575 retrycnt = 0; 3576 3577 if (prot != pwp->wp_prot) { 3578 retry: 3579 seg = as_segat(as, pwp->wp_vaddr); 3580 if (seg == NULL) 3581 continue; 3582 err = SEGOP_SETPROT(seg, pwp->wp_vaddr, 3583 PAGESIZE, prot); 3584 if (err == IE_RETRY) { 3585 ASSERT(retrycnt == 0); 3586 retrycnt++; 3587 goto retry; 3588 3589 } 3590 } 3591 pwp->wp_oprot = 0; 3592 pwp->wp_prot = 0; 3593 } 3594 3595 pwp = AVL_NEXT(&as->a_wpage, pwp); 3596 } 3597 } 3598 3599 void 3600 as_signal_proc(struct as *as, k_siginfo_t *siginfo) 3601 { 3602 struct proc *p; 3603 3604 mutex_enter(&pidlock); 3605 for (p = practive; p; p = p->p_next) { 3606 if (p->p_as == as) { 3607 mutex_enter(&p->p_lock); 3608 if (p->p_as == as) 3609 sigaddq(p, NULL, siginfo, KM_NOSLEEP); 3610 mutex_exit(&p->p_lock); 3611 } 3612 } 3613 mutex_exit(&pidlock); 3614 } 3615 3616 /* 3617 * return memory object ID 3618 */ 3619 int 3620 as_getmemid(struct as *as, caddr_t addr, memid_t *memidp) 3621 { 3622 struct seg *seg; 3623 int sts; 3624 3625 AS_LOCK_ENTER(as, RW_READER); 3626 seg = as_segat(as, addr); 3627 if (seg == NULL) { 3628 AS_LOCK_EXIT(as); 3629 return (EFAULT); 3630 } 3631 /* 3632 * catch old drivers which may not support getmemid 3633 */ 3634 if (seg->s_ops->getmemid == NULL) { 3635 AS_LOCK_EXIT(as); 3636 return (ENODEV); 3637 } 3638 3639 sts = SEGOP_GETMEMID(seg, addr, memidp); 3640 3641 AS_LOCK_EXIT(as); 3642 return (sts); 3643 } 3644
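/*
 * A minimal illustrative sketch of an as_getmemid() caller (hypothetical;
 * "uaddr" is an assumed user address within the current process):
 *
 *	memid_t memid;
 *
 *	if (as_getmemid(curproc->p_as, uaddr, &memid) == 0) {
 *		... memid now identifies the backing object for uaddr ...
 *	}
 */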