/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 * Copyright 2018 Joyent, Inc.
 * Copyright (c) 2016 by Delphix. All rights reserved.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */

/*
 * VM - address spaces.
 */

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/errno.h>
#include <sys/systm.h>
#include <sys/mman.h>
#include <sys/sysmacros.h>
#include <sys/cpuvar.h>
#include <sys/sysinfo.h>
#include <sys/kmem.h>
#include <sys/vnode.h>
#include <sys/vmsystm.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/tnf_probe.h>
#include <sys/vtrace.h>

#include <vm/hat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_vn.h>
#include <vm/seg_dev.h>
#include <vm/seg_kmem.h>
#include <vm/seg_map.h>
#include <vm/seg_spt.h>
#include <vm/seg_hole.h>
#include <vm/page.h>

clock_t deadlk_wait = 1; /* number of ticks to wait before retrying */

static struct kmem_cache *as_cache;

static void as_setwatchprot(struct as *, caddr_t, size_t, uint_t);
static void as_clearwatchprot(struct as *, caddr_t, size_t);


/*
 * Verifying the segment lists is very time-consuming; it may not be
 * desirable always to define VERIFY_SEGLIST when DEBUG is set.
 */
#ifdef DEBUG
#define	VERIFY_SEGLIST
int do_as_verify = 0;
#endif

/*
 * Allocate a new callback data structure entry and fill in the events of
 * interest, the address range of interest, and the callback argument.
 * Link the entry on the as->a_callbacks list.  A callback entry for the
 * entire address space may be specified with vaddr = 0 and size = -1.
 *
 * CALLERS RESPONSIBILITY: If not calling from within the process context for
 * the specified as, the caller must guarantee persistence of the specified as
 * for the duration of this function (eg. pages being locked within the as
 * will guarantee persistence).
 */
int
as_add_callback(struct as *as, void (*cb_func)(), void *arg, uint_t events,
    caddr_t vaddr, size_t size, int sleepflag)
{
	struct as_callback *current_head, *cb;
	caddr_t saddr;
	size_t rsize;

	/* callback function and an event are mandatory */
	if ((cb_func == NULL) || ((events & AS_ALL_EVENT) == 0))
		return (EINVAL);

	/* Adding a callback after as_free has been called is not allowed */
	if (as == &kas)
		return (ENOMEM);

	/*
	 * vaddr = 0 and size = -1 is used to indicate that the callback range
	 * is the entire address space so no rounding is done in that case.
	 */
	if (size != -1) {
		saddr = (caddr_t)((uintptr_t)vaddr & (uintptr_t)PAGEMASK);
		rsize = (((size_t)(vaddr + size) + PAGEOFFSET) & PAGEMASK) -
		    (size_t)saddr;
		/* check for wraparound */
		if (saddr + rsize < saddr)
			return (ENOMEM);
	} else {
		if (vaddr != 0)
			return (EINVAL);
		saddr = vaddr;
		rsize = size;
	}

	/* Allocate and initialize a callback entry */
	cb = kmem_zalloc(sizeof (struct as_callback), sleepflag);
	if (cb == NULL)
		return (EAGAIN);

	cb->ascb_func = cb_func;
	cb->ascb_arg = arg;
	cb->ascb_events = events;
	cb->ascb_saddr = saddr;
	cb->ascb_len = rsize;

	/* Add the entry to the list */
	mutex_enter(&as->a_contents);
	current_head = as->a_callbacks;
	as->a_callbacks = cb;
	cb->ascb_next = current_head;

	/*
	 * The call to this function may lose in a race with
	 * a pertinent event - eg. a thread does long term memory locking
	 * but before the callback is added another thread executes as_unmap.
	 * A broadcast here resolves that.
	 */
	if ((cb->ascb_events & AS_UNMAPWAIT_EVENT) && AS_ISUNMAPWAIT(as)) {
		AS_CLRUNMAPWAIT(as);
		cv_broadcast(&as->a_cv);
	}

	mutex_exit(&as->a_contents);
	return (0);
}
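/*
 * Illustrative sketch (editorial, not part of the original source): how a
 * driver that holds long-term page locks might use the callback interface
 * above.  The xx_* names and the __AS_EXAMPLES guard are hypothetical; the
 * guard is never defined, so this code is not compiled.
 */
#ifdef	__AS_EXAMPLES
static void
xx_unmap_cb(struct as *cb_as, void *arg, uint_t events)
{
	/* Release the driver's page locks tracked by 'arg' ... */

	/* ... then remove the callback entry (or defer to as_do_callbacks). */
	(void) as_delete_callback(cb_as, arg);
}

static int
xx_watch_range(struct as *as, void *xx_state, caddr_t addr, size_t len)
{
	/* Be called back if the range is unmapped or waited upon. */
	return (as_add_callback(as, xx_unmap_cb, xx_state,
	    AS_UNMAP_EVENT | AS_UNMAPWAIT_EVENT, addr, len, KM_SLEEP));
}
#endif	/* __AS_EXAMPLES */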
/*
 * Search the callback list for an entry which pertains to arg.
 *
 * This is called from within the client upon completion of the callback.
 * RETURN VALUES:
 *	AS_CALLBACK_DELETED		(callback entry found and deleted)
 *	AS_CALLBACK_NOTFOUND		(no callback entry found - this is ok)
 *	AS_CALLBACK_DELETE_DEFERRED	(callback is in process, delete of this
 *					entry will be made in as_do_callbacks)
 *
 * If as_delete_callback encounters a matching entry with AS_CALLBACK_CALLED
 * set, it indicates that as_do_callbacks is processing this entry.  The
 * AS_ALL_EVENT events are cleared in the entry, and a broadcast is made
 * to unblock as_do_callbacks, in case it is blocked.
 *
 * CALLERS RESPONSIBILITY: If not calling from within the process context for
 * the specified as, the caller must guarantee persistence of the specified as
 * for the duration of this function (eg. pages being locked within the as
 * will guarantee persistence).
 */
uint_t
as_delete_callback(struct as *as, void *arg)
{
	struct as_callback **prevcb = &as->a_callbacks;
	struct as_callback *cb;
	uint_t rc = AS_CALLBACK_NOTFOUND;

	mutex_enter(&as->a_contents);
	for (cb = as->a_callbacks; cb; prevcb = &cb->ascb_next, cb = *prevcb) {
		if (cb->ascb_arg != arg)
			continue;

		/*
		 * If the events indicate AS_CALLBACK_CALLED, just clear
		 * AS_ALL_EVENT in the events field and wakeup the thread
		 * that may be waiting in as_do_callbacks.  as_do_callbacks
		 * will take care of removing this entry from the list.  In
		 * that case, return AS_CALLBACK_DELETE_DEFERRED.  Otherwise
		 * (AS_CALLBACK_CALLED not set), just remove it from the
		 * list, return the memory and return AS_CALLBACK_DELETED.
		 */
		if ((cb->ascb_events & AS_CALLBACK_CALLED) != 0) {
			/* leave AS_CALLBACK_CALLED */
			cb->ascb_events &= ~AS_ALL_EVENT;
			rc = AS_CALLBACK_DELETE_DEFERRED;
			cv_broadcast(&as->a_cv);
		} else {
			*prevcb = cb->ascb_next;
			kmem_free(cb, sizeof (struct as_callback));
			rc = AS_CALLBACK_DELETED;
		}
		break;
	}
	mutex_exit(&as->a_contents);
	return (rc);
}

/*
 * Searches the as callback list for a matching entry.
 * Returns a pointer to the first matching callback, or NULL if
 * nothing is found.
 * This function never sleeps so it is ok to call it with additional
 * locks held beyond the (required) a_contents mutex.
 *
 * See also comment on as_do_callbacks below.
 */
static struct as_callback *
as_find_callback(struct as *as, uint_t events, caddr_t event_addr,
    size_t event_len)
{
	struct as_callback *cb;

	ASSERT(MUTEX_HELD(&as->a_contents));
	for (cb = as->a_callbacks; cb != NULL; cb = cb->ascb_next) {
		/*
		 * If the callback has not already been called, then
		 * check if events or address range pertains.  An event_len
		 * of zero means do an unconditional callback.
		 */
		if (((cb->ascb_events & AS_CALLBACK_CALLED) != 0) ||
		    ((event_len != 0) && (((cb->ascb_events & events) == 0) ||
		    (event_addr + event_len < cb->ascb_saddr) ||
		    (event_addr > (cb->ascb_saddr + cb->ascb_len))))) {
			continue;
		}
		break;
	}
	return (cb);
}

/*
 * Executes a given callback and removes it from the callback list for
 * this address space.
 * This function may sleep so the caller must drop all locks except
 * a_contents before calling this func.
 *
 * See also comments on as_do_callbacks below.
 */
static void
as_execute_callback(struct as *as, struct as_callback *cb,
    uint_t events)
{
	struct as_callback **prevcb;
	void *cb_arg;

	ASSERT(MUTEX_HELD(&as->a_contents) && (cb->ascb_events & events));
	cb->ascb_events |= AS_CALLBACK_CALLED;
	mutex_exit(&as->a_contents);
	(*cb->ascb_func)(as, cb->ascb_arg, events);
	mutex_enter(&as->a_contents);
	/*
	 * the callback function is required to delete the callback
	 * when the callback function determines it is OK for
	 * this thread to continue. as_delete_callback will clear
	 * the AS_ALL_EVENT in the events field when it is deleted.
	 * If the callback function called as_delete_callback,
	 * events will already be cleared and there will be no blocking.
	 */
	while ((cb->ascb_events & events) != 0) {
		cv_wait(&as->a_cv, &as->a_contents);
	}
	/*
	 * This entry needs to be taken off the list. Normally, the
	 * callback func itself does that, but unfortunately the list
	 * may have changed while the callback was running because the
	 * a_contents mutex was dropped and someone else other than the
	 * callback func itself could have called as_delete_callback,
	 * so we have to search to find this entry again.  The entry
	 * must have AS_CALLBACK_CALLED, and have the same 'arg'.
	 */
	cb_arg = cb->ascb_arg;
	prevcb = &as->a_callbacks;
	for (cb = as->a_callbacks; cb != NULL;
	    prevcb = &cb->ascb_next, cb = *prevcb) {
		if (((cb->ascb_events & AS_CALLBACK_CALLED) == 0) ||
		    (cb_arg != cb->ascb_arg)) {
			continue;
		}
		*prevcb = cb->ascb_next;
		kmem_free(cb, sizeof (struct as_callback));
		break;
	}
}

/*
 * Check the callback list for a matching event and intersection of
 * address range. If there is a match invoke the callback.  Skip an entry if:
 *    - a callback is already in progress for this entry (AS_CALLBACK_CALLED)
 *    - not an event of interest
 *    - not an address range of interest
 *
 * An event_len of zero indicates a request for an unconditional callback
 * (regardless of event), only the AS_CALLBACK_CALLED is checked.  The
 * a_contents lock must be dropped before a callback, so only one callback
 * can be done before returning. Return -1 (true) if a callback was
 * executed and removed from the list, else return 0 (false).
 *
 * The logically separate parts, i.e. finding a matching callback and
 * executing a given callback have been separated into two functions
 * so that they can be called with different sets of locks held beyond
 * the always-required a_contents. as_find_callback does not sleep so
 * it is ok to call it if more locks than a_contents (i.e. the a_lock
 * rwlock) are held. as_execute_callback on the other hand may sleep
 * so all locks beyond a_contents must be dropped by the caller if one
 * does not want to end comatose.
 */
static int
as_do_callbacks(struct as *as, uint_t events, caddr_t event_addr,
    size_t event_len)
{
	struct as_callback *cb;

	if ((cb = as_find_callback(as, events, event_addr, event_len))) {
		as_execute_callback(as, cb, events);
		return (-1);
	}
	return (0);
}

/*
 * Search for the segment containing addr. If a segment containing addr
 * exists, that segment is returned.  If no such segment exists, and
 * the list spans addresses greater than addr, then the first segment
 * whose base is greater than addr is returned; otherwise, NULL is
 * returned unless tail is true, in which case the last element of the
 * list is returned.
 *
 * a_seglast is used to cache the last found segment for repeated
 * searches to the same addr (which happens frequently).
 */
struct seg *
as_findseg(struct as *as, caddr_t addr, int tail)
{
	struct seg *seg = as->a_seglast;
	avl_index_t where;

	ASSERT(AS_LOCK_HELD(as));

	if (seg != NULL &&
	    seg->s_base <= addr &&
	    addr < seg->s_base + seg->s_size)
		return (seg);

	seg = avl_find(&as->a_segtree, &addr, &where);
	if (seg != NULL)
		return (as->a_seglast = seg);

	seg = avl_nearest(&as->a_segtree, where, AVL_AFTER);
	if (seg == NULL && tail)
		seg = avl_last(&as->a_segtree);
	return (as->a_seglast = seg);
}

#ifdef VERIFY_SEGLIST
/*
 * verify that the linked list is coherent
 */
static void
as_verify(struct as *as)
{
	struct seg *seg, *seglast, *p, *n;
	uint_t nsegs = 0;

	if (do_as_verify == 0)
		return;

	seglast = as->a_seglast;

	for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
		ASSERT(seg->s_as == as);
		p = AS_SEGPREV(as, seg);
		n = AS_SEGNEXT(as, seg);
		ASSERT(p == NULL || p->s_as == as);
		ASSERT(p == NULL || p->s_base < seg->s_base);
		ASSERT(n == NULL || n->s_base > seg->s_base);
		ASSERT(n != NULL || seg == avl_last(&as->a_segtree));
		if (seg == seglast)
			seglast = NULL;
		nsegs++;
	}
	ASSERT(seglast == NULL);
	ASSERT(avl_numnodes(&as->a_segtree) == nsegs);
}
#endif /* VERIFY_SEGLIST */

/*
 * Add a new segment to the address space. The avl_find()
 * may be expensive so we attempt to use last segment accessed
 * in as_gap() as an insertion point.
 */
int
as_addseg(struct as *as, struct seg *newseg)
{
	struct seg *seg;
	caddr_t addr;
	caddr_t eaddr;
	avl_index_t where;

	ASSERT(AS_WRITE_HELD(as));

	as->a_updatedir = 1;	/* inform /proc */
	gethrestime(&as->a_updatetime);

	if (as->a_lastgaphl != NULL) {
		struct seg *hseg = NULL;
		struct seg *lseg = NULL;

		if (as->a_lastgaphl->s_base > newseg->s_base) {
			hseg = as->a_lastgaphl;
			lseg = AVL_PREV(&as->a_segtree, hseg);
		} else {
			lseg = as->a_lastgaphl;
			hseg = AVL_NEXT(&as->a_segtree, lseg);
		}

		if (hseg && lseg && lseg->s_base < newseg->s_base &&
		    hseg->s_base > newseg->s_base) {
			avl_insert_here(&as->a_segtree, newseg, lseg,
			    AVL_AFTER);
			as->a_lastgaphl = NULL;
			as->a_seglast = newseg;
			return (0);
		}
		as->a_lastgaphl = NULL;
	}

	addr = newseg->s_base;
	eaddr = addr + newseg->s_size;
again:

	seg = avl_find(&as->a_segtree, &addr, &where);

	if (seg == NULL)
		seg = avl_nearest(&as->a_segtree, where, AVL_AFTER);

	if (seg == NULL)
		seg = avl_last(&as->a_segtree);

	if (seg != NULL) {
		caddr_t base = seg->s_base;

		/*
		 * If top of seg is below the requested address, then
		 * the insertion point is at the end of the linked list,
		 * and seg points to the tail of the list.  Otherwise,
		 * the insertion point is immediately before seg.
		 */
		if (base + seg->s_size > addr) {
			if (addr >= base || eaddr > base) {
#ifdef __sparc
				extern struct seg_ops segnf_ops;

				/*
				 * no-fault segs must disappear if overlaid.
				 * XXX need new segment type so
				 * we don't have to check s_ops
				 */
				if (seg->s_ops == &segnf_ops) {
					seg_unmap(seg);
					goto again;
				}
#endif
				return (-1);	/* overlapping segment */
			}
		}
	}
	as->a_seglast = newseg;
	avl_insert(&as->a_segtree, newseg, where);

#ifdef VERIFY_SEGLIST
	as_verify(as);
#endif
	return (0);
}

struct seg *
as_removeseg(struct as *as, struct seg *seg)
{
	avl_tree_t *t;

	ASSERT(AS_WRITE_HELD(as));

	as->a_updatedir = 1;	/* inform /proc */
	gethrestime(&as->a_updatetime);

	if (seg == NULL)
		return (NULL);

	t = &as->a_segtree;
	if (as->a_seglast == seg)
		as->a_seglast = NULL;
	as->a_lastgaphl = NULL;

	/*
	 * if this segment is at an address higher than
	 * a_lastgap, set a_lastgap to the next segment (NULL if last segment)
	 */
	if (as->a_lastgap &&
	    (seg == as->a_lastgap || seg->s_base > as->a_lastgap->s_base))
		as->a_lastgap = AVL_NEXT(t, seg);

	/*
	 * remove the segment from the seg tree
	 */
	avl_remove(t, seg);

#ifdef VERIFY_SEGLIST
	as_verify(as);
#endif
	return (seg);
}

/*
 * Find a segment containing addr.
 */
struct seg *
as_segat(struct as *as, caddr_t addr)
{
	struct seg *seg = as->a_seglast;

	ASSERT(AS_LOCK_HELD(as));

	if (seg != NULL && seg->s_base <= addr &&
	    addr < seg->s_base + seg->s_size)
		return (seg);

	seg = avl_find(&as->a_segtree, &addr, NULL);
	return (seg);
}

/*
 * Serialize all searches for holes in an address space to
 * prevent two or more threads from allocating the same virtual
 * address range.  The address space must not be "read/write"
 * locked by the caller since we may block.
 */
void
as_rangelock(struct as *as)
{
	mutex_enter(&as->a_contents);
	while (AS_ISCLAIMGAP(as))
		cv_wait(&as->a_cv, &as->a_contents);
	AS_SETCLAIMGAP(as);
	mutex_exit(&as->a_contents);
}
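/*
 * Illustrative sketch (editorial, not part of the original source): the
 * usual pattern for choosing and claiming an unmapped range is to bracket
 * the hole search and the subsequent as_map() with as_rangelock() and
 * as_rangeunlock(), so no other thread can claim the same hole in between.
 * The xx_* name, the search range, and the __AS_EXAMPLES guard are
 * hypothetical; the guard is never defined.
 */
#ifdef	__AS_EXAMPLES
static int
xx_pick_range(struct as *as, size_t len, caddr_t *addrp)
{
	caddr_t base = NULL;
	size_t slen = (size_t)as->a_userlimit;	/* search [0, a_userlimit) */
	int error = ENOMEM;

	as_rangelock(as);
	if (as_gap(as, len, &base, &slen, AH_LO, NULL) == 0) {
		/* Hole found; claim it (e.g. via as_map()) before unlocking. */
		*addrp = base;
		error = 0;
	}
	as_rangeunlock(as);
	return (error);
}
#endif	/* __AS_EXAMPLES */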
/*
 * Release hold on a_state & AS_CLAIMGAP and signal any other blocked threads.
 */
void
as_rangeunlock(struct as *as)
{
	mutex_enter(&as->a_contents);
	AS_CLRCLAIMGAP(as);
	cv_signal(&as->a_cv);
	mutex_exit(&as->a_contents);
}

/*
 * compare segments (or just an address) by segment address range
 */
static int
as_segcompar(const void *x, const void *y)
{
	struct seg *a = (struct seg *)x;
	struct seg *b = (struct seg *)y;

	if (a->s_base < b->s_base)
		return (-1);
	if (a->s_base >= b->s_base + b->s_size)
		return (1);
	return (0);
}


void
as_avlinit(struct as *as)
{
	avl_create(&as->a_segtree, as_segcompar, sizeof (struct seg),
	    offsetof(struct seg, s_tree));
	avl_create(&as->a_wpage, wp_compare, sizeof (struct watched_page),
	    offsetof(struct watched_page, wp_link));
}

/*ARGSUSED*/
static int
as_constructor(void *buf, void *cdrarg, int kmflags)
{
	struct as *as = buf;

	mutex_init(&as->a_contents, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&as->a_cv, NULL, CV_DEFAULT, NULL);
	rw_init(&as->a_lock, NULL, RW_DEFAULT, NULL);
	as_avlinit(as);
	return (0);
}

/*ARGSUSED1*/
static void
as_destructor(void *buf, void *cdrarg)
{
	struct as *as = buf;

	avl_destroy(&as->a_segtree);
	mutex_destroy(&as->a_contents);
	cv_destroy(&as->a_cv);
	rw_destroy(&as->a_lock);
}

void
as_init(void)
{
	as_cache = kmem_cache_create("as_cache", sizeof (struct as), 0,
	    as_constructor, as_destructor, NULL, NULL, NULL, 0);
}

/*
 * Allocate and initialize an address space data structure.
 * We call hat_alloc to allow any machine dependent
 * information in the hat structure to be initialized.
 */
struct as *
as_alloc(void)
{
	struct as *as;

	as = kmem_cache_alloc(as_cache, KM_SLEEP);

	as->a_flags = 0;
	as->a_vbits = 0;
	as->a_hrm = NULL;
	as->a_seglast = NULL;
	as->a_size = 0;
	as->a_resvsize = 0;
	as->a_updatedir = 0;
	gethrestime(&as->a_updatetime);
	as->a_objectdir = NULL;
	as->a_sizedir = 0;
	as->a_userlimit = (caddr_t)USERLIMIT;
	as->a_lastgap = NULL;
	as->a_lastgaphl = NULL;
	as->a_callbacks = NULL;
	as->a_proc = NULL;

	AS_LOCK_ENTER(as, RW_WRITER);
	as->a_hat = hat_alloc(as);	/* create hat for default system mmu */
	AS_LOCK_EXIT(as);

	return (as);
}

/*
 * Free an address space data structure.
 * Need to free the hat first and then
 * all the segments on this as and finally
 * the space for the as struct itself.
 */
void
as_free(struct as *as)
{
	struct hat *hat = as->a_hat;
	struct seg *seg, *next;
	boolean_t free_started = B_FALSE;

top:
	/*
	 * Invoke ALL callbacks. as_do_callbacks will do one callback
	 * per call, and not return (-1) until the callback has completed.
	 * When as_do_callbacks returns zero, all callbacks have completed.
	 */
	mutex_enter(&as->a_contents);
	while (as->a_callbacks && as_do_callbacks(as, AS_ALL_EVENT, 0, 0))
		;

	mutex_exit(&as->a_contents);
	AS_LOCK_ENTER(as, RW_WRITER);

	if (!free_started) {
		free_started = B_TRUE;
		hat_free_start(hat);
	}
	for (seg = AS_SEGFIRST(as); seg != NULL; seg = next) {
		int err;

		next = AS_SEGNEXT(as, seg);
retry:
		err = SEGOP_UNMAP(seg, seg->s_base, seg->s_size);
		if (err == EAGAIN) {
			mutex_enter(&as->a_contents);
			if (as->a_callbacks) {
				AS_LOCK_EXIT(as);
			} else if (!AS_ISNOUNMAPWAIT(as)) {
				/*
				 * Memory is currently locked. Wait for a
				 * cv_signal that it has been unlocked, then
				 * try the operation again.
				 */
				if (AS_ISUNMAPWAIT(as) == 0)
					cv_broadcast(&as->a_cv);
				AS_SETUNMAPWAIT(as);
				AS_LOCK_EXIT(as);
				while (AS_ISUNMAPWAIT(as))
					cv_wait(&as->a_cv, &as->a_contents);
			} else {
				/*
				 * We may have raced with
				 * segvn_reclaim()/segspt_reclaim(). In this
				 * case clean nounmapwait flag and retry since
				 * softlockcnt in this segment may be already
				 * 0.  We don't drop as writer lock so our
				 * number of retries without sleeping should
				 * be very small. See segvn_reclaim() for
				 * more comments.
				 */
				AS_CLRNOUNMAPWAIT(as);
				mutex_exit(&as->a_contents);
				goto retry;
			}
			mutex_exit(&as->a_contents);
			goto top;
		} else {
			/*
			 * We do not expect any other error return at this
			 * time. This is similar to an ASSERT in seg_unmap()
			 */
			ASSERT(err == 0);
		}
	}
	hat_free_end(hat);
	AS_LOCK_EXIT(as);

	/* /proc stuff */
	ASSERT(avl_numnodes(&as->a_wpage) == 0);
	if (as->a_objectdir) {
		kmem_free(as->a_objectdir, as->a_sizedir * sizeof (vnode_t *));
		as->a_objectdir = NULL;
		as->a_sizedir = 0;
	}

	/*
	 * Free the struct as back to kmem.  Assert it has no segments.
	 */
	ASSERT(avl_numnodes(&as->a_segtree) == 0);
	kmem_cache_free(as_cache, as);
}

int
as_dup(struct as *as, struct proc *forkedproc)
{
	struct as *newas;
	struct seg *seg, *newseg;
	size_t purgesize = 0;
	int error;

	AS_LOCK_ENTER(as, RW_WRITER);
	as_clearwatch(as);
	newas = as_alloc();
	newas->a_userlimit = as->a_userlimit;
	newas->a_proc = forkedproc;

	AS_LOCK_ENTER(newas, RW_WRITER);

	(void) hat_dup(as->a_hat, newas->a_hat, NULL, 0, HAT_DUP_SRD);

	for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {

		if (seg->s_flags & S_PURGE) {
			purgesize += seg->s_size;
			continue;
		}

		newseg = seg_alloc(newas, seg->s_base, seg->s_size);
		if (newseg == NULL) {
			AS_LOCK_EXIT(newas);
			as_setwatch(as);
			AS_LOCK_EXIT(as);
			as_free(newas);
			return (-1);
		}
		if ((error = SEGOP_DUP(seg, newseg)) != 0) {
			/*
			 * We call seg_free() on the new seg
			 * because the segment is not set up
			 * completely; i.e. it has no ops.
			 */
			as_setwatch(as);
			AS_LOCK_EXIT(as);
			seg_free(newseg);
			AS_LOCK_EXIT(newas);
			as_free(newas);
			return (error);
		}
		if ((newseg->s_flags & S_HOLE) == 0) {
			newas->a_size += seg->s_size;
		}
	}
	newas->a_resvsize = as->a_resvsize - purgesize;

	error = hat_dup(as->a_hat, newas->a_hat, NULL, 0, HAT_DUP_ALL);

	AS_LOCK_EXIT(newas);

	as_setwatch(as);
	AS_LOCK_EXIT(as);
	if (error != 0) {
		as_free(newas);
		return (error);
	}
	forkedproc->p_as = newas;
	return (0);
}

/*
 * Handle a ``fault'' at addr for size bytes.
 */
faultcode_t
as_fault(struct hat *hat, struct as *as, caddr_t addr, size_t size,
    enum fault_type type, enum seg_rw rw)
{
	struct seg *seg;
	caddr_t raddr;			/* rounded down addr */
	size_t rsize;			/* rounded up size */
	size_t ssize;
	faultcode_t res = 0;
	caddr_t addrsav;
	struct seg *segsav;
	int as_lock_held;
	klwp_t *lwp = ttolwp(curthread);

retry:
	/*
	 * Indicate that the lwp is not to be stopped while waiting for a
	 * pagefault.  This is to avoid deadlock while debugging a process
	 * via /proc over NFS (in particular).
	 */
	if (lwp != NULL)
		lwp->lwp_nostop++;

	/*
	 * same length must be used when we softlock and softunlock.  We
	 * don't support softunlocking lengths less than the original length
	 * when there is largepage support.  See seg_dev.c for more
	 * comments.
	 */
	switch (type) {

	case F_SOFTLOCK:
		CPU_STATS_ADD_K(vm, softlock, 1);
		break;

	case F_SOFTUNLOCK:
		break;

	case F_PROT:
		CPU_STATS_ADD_K(vm, prot_fault, 1);
		break;

	case F_INVAL:
		CPU_STATS_ENTER_K();
		CPU_STATS_ADDQ(CPU, vm, as_fault, 1);
		if (as == &kas)
			CPU_STATS_ADDQ(CPU, vm, kernel_asflt, 1);
		CPU_STATS_EXIT_K();
		break;
	}

	/* Kernel probe */
	TNF_PROBE_3(address_fault, "vm pagefault", /* CSTYLED */,
	    tnf_opaque, address, addr,
	    tnf_fault_type, fault_type, type,
	    tnf_seg_access, access, rw);

	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
	    (size_t)raddr;

	/*
	 * XXX -- Don't grab the as lock for segkmap. We should grab it for
	 * correctness, but then we could be stuck holding this lock for
	 * a LONG time if the fault needs to be resolved on a slow
	 * filesystem, and then no-one will be able to exec new commands,
	 * as exec'ing requires the write lock on the as.
	 */
	if (as == &kas && segkmap && segkmap->s_base <= raddr &&
	    raddr + size < segkmap->s_base + segkmap->s_size) {
		seg = segkmap;
		as_lock_held = 0;
	} else {
		AS_LOCK_ENTER(as, RW_READER);

		seg = as_segat(as, raddr);
		if (seg == NULL) {
			AS_LOCK_EXIT(as);
			if (lwp != NULL)
				lwp->lwp_nostop--;
			return (FC_NOMAP);
		}

		as_lock_held = 1;
	}

	addrsav = raddr;
	segsav = seg;

	for (; rsize != 0; rsize -= ssize, raddr += ssize) {
		if (raddr >= seg->s_base + seg->s_size) {
			seg = AS_SEGNEXT(as, seg);
			if (seg == NULL || raddr != seg->s_base) {
				res = FC_NOMAP;
				break;
			}
		}
		if (raddr + rsize > seg->s_base + seg->s_size)
			ssize = seg->s_base + seg->s_size - raddr;
		else
			ssize = rsize;

		res = SEGOP_FAULT(hat, seg, raddr, ssize, type, rw);
		if (res != 0)
			break;
	}

	/*
	 * If we were SOFTLOCKing and encountered a failure,
	 * we must SOFTUNLOCK the range we already did. (Maybe we
	 * should just panic if we are SOFTLOCKing or even SOFTUNLOCKing
	 * right here...)
	 */
	if (res != 0 && type == F_SOFTLOCK) {
		for (seg = segsav; addrsav < raddr; addrsav += ssize) {
			if (addrsav >= seg->s_base + seg->s_size)
				seg = AS_SEGNEXT(as, seg);
			ASSERT(seg != NULL);
			/*
			 * Now call the fault routine again to perform the
			 * unlock using S_OTHER instead of the rw variable
			 * since we never got a chance to touch the pages.
			 */
			if (raddr > seg->s_base + seg->s_size)
				ssize = seg->s_base + seg->s_size - addrsav;
			else
				ssize = raddr - addrsav;
			(void) SEGOP_FAULT(hat, seg, addrsav, ssize,
			    F_SOFTUNLOCK, S_OTHER);
		}
	}
	if (as_lock_held)
		AS_LOCK_EXIT(as);
	if (lwp != NULL)
		lwp->lwp_nostop--;

	/*
	 * If the lower levels returned EDEADLK for a fault,
	 * it means that we should retry the fault.  Let's wait
	 * a bit also to let the deadlock causing condition clear.
	 * This is part of a gross hack to work around a design flaw
	 * in the ufs/sds logging code and should go away when the
	 * logging code is re-designed to fix the problem. See bug
	 * 4125102 for details of the problem.
	 */
	if (FC_ERRNO(res) == EDEADLK) {
		delay(deadlk_wait);
		res = 0;
		goto retry;
	}
	return (res);
}
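/*
 * Worked example (editorial note, not in the original source) of the
 * raddr/rsize page-rounding idiom used by as_fault() above and by most of
 * the routines below, assuming PAGESIZE = 0x1000 (so PAGEMASK = ~0xfff and
 * PAGEOFFSET = 0xfff):
 *
 *	addr = 0x12f40, size = 0x200
 *	raddr = addr & PAGEMASK = 0x12000
 *	rsize = ((addr + size + PAGEOFFSET) & PAGEMASK) - raddr
 *	      = ((0x13140 + 0xfff) & ~0xfff) - 0x12000
 *	      = 0x14000 - 0x12000 = 0x2000
 *
 * i.e. the byte range [0x12f40, 0x13140) touches two pages, so the rounded
 * range covers both of them.
 */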
/*
 * Asynchronous ``fault'' at addr for size bytes.
 */
faultcode_t
as_faulta(struct as *as, caddr_t addr, size_t size)
{
	struct seg *seg;
	caddr_t raddr;			/* rounded down addr */
	size_t rsize;			/* rounded up size */
	faultcode_t res = 0;
	klwp_t *lwp = ttolwp(curthread);

retry:
	/*
	 * Indicate that the lwp is not to be stopped while waiting
	 * for a pagefault.  This is to avoid deadlock while debugging
	 * a process via /proc over NFS (in particular).
	 */
	if (lwp != NULL)
		lwp->lwp_nostop++;

	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
	    (size_t)raddr;

	AS_LOCK_ENTER(as, RW_READER);
	seg = as_segat(as, raddr);
	if (seg == NULL) {
		AS_LOCK_EXIT(as);
		if (lwp != NULL)
			lwp->lwp_nostop--;
		return (FC_NOMAP);
	}

	for (; rsize != 0; rsize -= PAGESIZE, raddr += PAGESIZE) {
		if (raddr >= seg->s_base + seg->s_size) {
			seg = AS_SEGNEXT(as, seg);
			if (seg == NULL || raddr != seg->s_base) {
				res = FC_NOMAP;
				break;
			}
		}
		res = SEGOP_FAULTA(seg, raddr);
		if (res != 0)
			break;
	}
	AS_LOCK_EXIT(as);
	if (lwp != NULL)
		lwp->lwp_nostop--;
	/*
	 * If the lower levels returned EDEADLK for a fault,
	 * it means that we should retry the fault.  Let's wait
	 * a bit also to let the deadlock causing condition clear.
	 * This is part of a gross hack to work around a design flaw
	 * in the ufs/sds logging code and should go away when the
	 * logging code is re-designed to fix the problem. See bug
	 * 4125102 for details of the problem.
	 */
	if (FC_ERRNO(res) == EDEADLK) {
		delay(deadlk_wait);
		res = 0;
		goto retry;
	}
	return (res);
}

/*
 * Set the virtual mapping for the interval from [addr : addr + size)
 * in address space `as' to have the specified protection.
 * It is ok for the range to cross over several segments,
 * as long as they are contiguous.
 */
int
as_setprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
{
	struct seg *seg;
	struct as_callback *cb;
	size_t ssize;
	caddr_t raddr;			/* rounded down addr */
	size_t rsize;			/* rounded up size */
	int error = 0, writer = 0;
	caddr_t saveraddr;
	size_t saversize;

setprot_top:
	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
	    (size_t)raddr;

	if (raddr + rsize < raddr)		/* check for wraparound */
		return (ENOMEM);

	saveraddr = raddr;
	saversize = rsize;

	/*
	 * Normally we only lock the as as a reader. But
	 * if due to setprot the segment driver needs to split
	 * a segment it will return IE_RETRY. Therefore we re-acquire
	 * the as lock as a writer so the segment driver can change
	 * the seg list. Also the segment driver will return IE_RETRY
	 * after it has changed the segment list so we therefore keep
	 * locking as a writer. Since these operations should be rare,
	 * we only want to lock as a writer when necessary.
	 */
	if (writer || avl_numnodes(&as->a_wpage) != 0) {
		AS_LOCK_ENTER(as, RW_WRITER);
	} else {
		AS_LOCK_ENTER(as, RW_READER);
	}

	as_clearwatchprot(as, raddr, rsize);
	seg = as_segat(as, raddr);
	if (seg == NULL) {
		as_setwatch(as);
		AS_LOCK_EXIT(as);
		return (ENOMEM);
	}

	for (; rsize != 0; rsize -= ssize, raddr += ssize) {
		if (raddr >= seg->s_base + seg->s_size) {
			seg = AS_SEGNEXT(as, seg);
			if (seg == NULL || raddr != seg->s_base) {
				error = ENOMEM;
				break;
			}
		}
		if ((raddr + rsize) > (seg->s_base + seg->s_size))
			ssize = seg->s_base + seg->s_size - raddr;
		else
			ssize = rsize;
retry:
		error = SEGOP_SETPROT(seg, raddr, ssize, prot);

		if (error == IE_NOMEM) {
			error = EAGAIN;
			break;
		}

		if (error == IE_RETRY) {
			AS_LOCK_EXIT(as);
			writer = 1;
			goto setprot_top;
		}

		if (error == EAGAIN) {
			/*
			 * Make sure we have a_lock as writer.
			 */
			if (writer == 0) {
				AS_LOCK_EXIT(as);
				writer = 1;
				goto setprot_top;
			}

			/*
			 * Memory is currently locked.  It must be unlocked
			 * before this operation can succeed through a retry.
			 * The possible reasons for locked memory and
			 * corresponding strategies for unlocking are:
			 * (1) Normal I/O
			 *	wait for a signal that the I/O operation
			 *	has completed and the memory is unlocked.
			 * (2) Asynchronous I/O
			 *	The aio subsystem does not unlock pages when
			 *	the I/O is completed. Those pages are unlocked
			 *	when the application calls aiowait/aioerror.
			 *	So, to prevent blocking forever, cv_broadcast()
			 *	is done to wake up aio_cleanup_thread.
			 *	Subsequently, segvn_reclaim will be called, and
			 *	that will do AS_CLRUNMAPWAIT() and wake us up.
			 * (3) Long term page locking:
			 *	Drivers intending to have pages locked for a
			 *	period considerably longer than for normal I/O
			 *	(essentially forever) may have registered for a
			 *	callback so they may unlock these pages on
			 *	request. This is needed to allow this operation
			 *	to succeed. Each entry on the callback list is
			 *	examined. If the event or address range pertains
			 *	the callback is invoked (unless it already is in
			 *	progress). The a_contents lock must be dropped
			 *	before the callback, so only one callback can
			 *	be done at a time. Go to the top and do more
			 *	until zero is returned. If zero is returned,
			 *	either there were no callbacks for this event
			 *	or they were already in progress.
			 */
			mutex_enter(&as->a_contents);
			if (as->a_callbacks &&
			    (cb = as_find_callback(as, AS_SETPROT_EVENT,
			    seg->s_base, seg->s_size))) {
				AS_LOCK_EXIT(as);
				as_execute_callback(as, cb, AS_SETPROT_EVENT);
			} else if (!AS_ISNOUNMAPWAIT(as)) {
				if (AS_ISUNMAPWAIT(as) == 0)
					cv_broadcast(&as->a_cv);
				AS_SETUNMAPWAIT(as);
				AS_LOCK_EXIT(as);
				while (AS_ISUNMAPWAIT(as))
					cv_wait(&as->a_cv, &as->a_contents);
			} else {
				/*
				 * We may have raced with
				 * segvn_reclaim()/segspt_reclaim(). In this
				 * case clean nounmapwait flag and retry since
				 * softlockcnt in this segment may be already
				 * 0.  We don't drop as writer lock so our
				 * number of retries without sleeping should
				 * be very small. See segvn_reclaim() for
				 * more comments.
				 */
				AS_CLRNOUNMAPWAIT(as);
				mutex_exit(&as->a_contents);
				goto retry;
			}
			mutex_exit(&as->a_contents);
			goto setprot_top;
		} else if (error != 0)
			break;
	}
	if (error != 0) {
		as_setwatch(as);
	} else {
		as_setwatchprot(as, saveraddr, saversize, prot);
	}
	AS_LOCK_EXIT(as);
	return (error);
}

/*
 * Check to make sure that the interval [addr, addr + size)
 * in address space `as' has at least the specified protection.
 * It is ok for the range to cross over several segments, as long
 * as they are contiguous.
 */
int
as_checkprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
{
	struct seg *seg;
	size_t ssize;
	caddr_t raddr;			/* rounded down addr */
	size_t rsize;			/* rounded up size */
	int error = 0;

	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
	    (size_t)raddr;

	if (raddr + rsize < raddr)		/* check for wraparound */
		return (ENOMEM);

	/*
	 * This is ugly as sin...
	 * Normally, we only acquire the address space readers lock.
	 * However, if the address space has watchpoints present,
	 * we must acquire the writer lock on the address space for
	 * the benefit of as_clearwatchprot() and as_setwatchprot().
	 */
	if (avl_numnodes(&as->a_wpage) != 0)
		AS_LOCK_ENTER(as, RW_WRITER);
	else
		AS_LOCK_ENTER(as, RW_READER);
	as_clearwatchprot(as, raddr, rsize);
	seg = as_segat(as, raddr);
	if (seg == NULL) {
		as_setwatch(as);
		AS_LOCK_EXIT(as);
		return (ENOMEM);
	}

	for (; rsize != 0; rsize -= ssize, raddr += ssize) {
		if (raddr >= seg->s_base + seg->s_size) {
			seg = AS_SEGNEXT(as, seg);
			if (seg == NULL || raddr != seg->s_base) {
				error = ENOMEM;
				break;
			}
		}
		if ((raddr + rsize) > (seg->s_base + seg->s_size))
			ssize = seg->s_base + seg->s_size - raddr;
		else
			ssize = rsize;

		error = SEGOP_CHECKPROT(seg, raddr, ssize, prot);
		if (error != 0)
			break;
	}
	as_setwatch(as);
	AS_LOCK_EXIT(as);
	return (error);
}
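/*
 * Illustrative sketch (editorial, not part of the original source): a
 * typical mprotect(2)-style caller simply hands the user range to
 * as_setprot() and lets it walk the contiguous segments; as_checkprot()
 * is the read-only counterpart used when a caller only needs to confirm
 * access.  The xx_* name and the __AS_EXAMPLES guard are hypothetical;
 * the guard is never defined.
 */
#ifdef	__AS_EXAMPLES
static int
xx_protect(struct as *as, caddr_t addr, size_t len, uint_t prot)
{
	/* as_setprot() rounds to page boundaries and returns errno values. */
	return (as_setprot(as, addr, len, prot | PROT_USER));
}
#endif	/* __AS_EXAMPLES */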
int
as_unmap(struct as *as, caddr_t addr, size_t size)
{
	struct seg *seg, *seg_next;
	struct as_callback *cb;
	caddr_t raddr, eaddr;
	size_t ssize, rsize = 0;
	int err;

top:
	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	eaddr = (caddr_t)(((uintptr_t)(addr + size) + PAGEOFFSET) &
	    (uintptr_t)PAGEMASK);

	AS_LOCK_ENTER(as, RW_WRITER);

	as->a_updatedir = 1;	/* inform /proc */
	gethrestime(&as->a_updatetime);

	/*
	 * Use as_findseg to find the first segment in the range, then
	 * step through the segments in order, following s_next.
	 */
	as_clearwatchprot(as, raddr, eaddr - raddr);

	for (seg = as_findseg(as, raddr, 0); seg != NULL; seg = seg_next) {
		const boolean_t is_hole = ((seg->s_flags & S_HOLE) != 0);

		if (eaddr <= seg->s_base)
			break;		/* eaddr was in a gap; all done */

		/* this is implied by the test above */
		ASSERT(raddr < eaddr);

		if (raddr < seg->s_base)
			raddr = seg->s_base;	/* raddr was in a gap */

		if (eaddr > (seg->s_base + seg->s_size))
			ssize = seg->s_base + seg->s_size - raddr;
		else
			ssize = eaddr - raddr;

		/*
		 * Save next segment pointer since seg can be
		 * destroyed during the segment unmap operation.
		 */
		seg_next = AS_SEGNEXT(as, seg);

		/*
		 * We didn't count /dev/null mappings, so ignore them here.
		 * We'll handle MAP_NORESERVE cases in segvn_unmap(). (Again,
		 * we have to do this check here while we have seg.)
		 */
		rsize = 0;
		if (!SEG_IS_DEVNULL_MAPPING(seg) &&
		    !SEG_IS_PARTIAL_RESV(seg))
			rsize = ssize;

retry:
		err = SEGOP_UNMAP(seg, raddr, ssize);
		if (err == EAGAIN) {
			/*
			 * Memory is currently locked.  It must be unlocked
			 * before this operation can succeed through a retry.
			 * The possible reasons for locked memory and
			 * corresponding strategies for unlocking are:
			 * (1) Normal I/O
			 *	wait for a signal that the I/O operation
			 *	has completed and the memory is unlocked.
			 * (2) Asynchronous I/O
			 *	The aio subsystem does not unlock pages when
			 *	the I/O is completed. Those pages are unlocked
			 *	when the application calls aiowait/aioerror.
			 *	So, to prevent blocking forever, cv_broadcast()
			 *	is done to wake up aio_cleanup_thread.
			 *	Subsequently, segvn_reclaim will be called, and
			 *	that will do AS_CLRUNMAPWAIT() and wake us up.
			 * (3) Long term page locking:
			 *	Drivers intending to have pages locked for a
			 *	period considerably longer than for normal I/O
			 *	(essentially forever) may have registered for a
			 *	callback so they may unlock these pages on
			 *	request. This is needed to allow this operation
			 *	to succeed. Each entry on the callback list is
			 *	examined. If the event or address range pertains
			 *	the callback is invoked (unless it already is in
			 *	progress). The a_contents lock must be dropped
			 *	before the callback, so only one callback can
			 *	be done at a time. Go to the top and do more
			 *	until zero is returned. If zero is returned,
			 *	either there were no callbacks for this event
			 *	or they were already in progress.
			 */
			mutex_enter(&as->a_contents);
			if (as->a_callbacks &&
			    (cb = as_find_callback(as, AS_UNMAP_EVENT,
			    seg->s_base, seg->s_size))) {
				AS_LOCK_EXIT(as);
				as_execute_callback(as, cb, AS_UNMAP_EVENT);
			} else if (!AS_ISNOUNMAPWAIT(as)) {
				if (AS_ISUNMAPWAIT(as) == 0)
					cv_broadcast(&as->a_cv);
				AS_SETUNMAPWAIT(as);
				AS_LOCK_EXIT(as);
				while (AS_ISUNMAPWAIT(as))
					cv_wait(&as->a_cv, &as->a_contents);
			} else {
				/*
				 * We may have raced with
				 * segvn_reclaim()/segspt_reclaim(). In this
				 * case clean nounmapwait flag and retry since
				 * softlockcnt in this segment may be already
				 * 0.  We don't drop as writer lock so our
				 * number of retries without sleeping should
				 * be very small. See segvn_reclaim() for
				 * more comments.
				 */
				AS_CLRNOUNMAPWAIT(as);
				mutex_exit(&as->a_contents);
				goto retry;
			}
			mutex_exit(&as->a_contents);
			goto top;
		} else if (err == IE_RETRY) {
			AS_LOCK_EXIT(as);
			goto top;
		} else if (err) {
			as_setwatch(as);
			AS_LOCK_EXIT(as);
			return (-1);
		}

		if (!is_hole) {
			as->a_size -= ssize;
			if (rsize)
				as->a_resvsize -= rsize;
		}
		raddr += ssize;
	}
	AS_LOCK_EXIT(as);
	return (0);
}

static int
as_map_segvn_segs(struct as *as, caddr_t addr, size_t size, uint_t szcvec,
    segcreate_func_t crfp, struct segvn_crargs *vn_a, boolean_t *segcreated)
{
	uint_t szc, nszc, save_szcvec;
	int error;
	caddr_t a, eaddr;
	size_t pgsz = 0;
	const boolean_t do_off = (vn_a->vp != NULL || vn_a->amp != NULL);

	ASSERT(AS_WRITE_HELD(as));
	ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
	ASSERT(IS_P2ALIGNED(size, PAGESIZE));
	ASSERT(vn_a->vp == NULL || vn_a->amp == NULL);

	if (!do_off) {
		vn_a->offset = 0;
	}

	if (szcvec <= 1) {
		struct seg *seg, *segref;

		seg = segref = seg_alloc(as, addr, size);
		if (seg == NULL) {
			return (ENOMEM);
		}
		vn_a->szc = 0;
		error = (*crfp)(&seg, vn_a);
		if (error != 0) {
			VERIFY3P(seg, ==, segref);
			seg_free(seg);
		} else {
			as->a_size += size;
			as->a_resvsize += size;
		}
		return (error);
	}

	eaddr = addr + size;
	save_szcvec = szcvec;
	szcvec >>= 1;
	szc = 0;
	nszc = 0;
	while (szcvec) {
		if ((szcvec & 0x1) == 0) {
			nszc++;
			szcvec >>= 1;
			continue;
		}
		nszc++;
		pgsz = page_get_pagesize(nszc);
		a = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
		if (a != addr) {
			struct seg *seg, *segref;
			size_t segsize;

			ASSERT(a < eaddr);

			segsize = a - addr;
			seg = segref = seg_alloc(as, addr, segsize);
			if (seg == NULL) {
				return (ENOMEM);
			}
			vn_a->szc = szc;
			error = (*crfp)(&seg, vn_a);
			if (error != 0) {
				VERIFY3P(seg, ==, segref);
				seg_free(seg);
				return (error);
			}
			as->a_size += segsize;
			as->a_resvsize += segsize;
			*segcreated = B_TRUE;
			if (do_off) {
				vn_a->offset += segsize;
			}
			addr = a;
		}
		szc = nszc;
		szcvec >>= 1;
	}

	ASSERT(addr < eaddr);
	szcvec = save_szcvec | 1; /* add 8K pages */
	while (szcvec) {
		a = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
		ASSERT(a >= addr);
		if (a != addr) {
			struct seg *seg, *segref;
			size_t segsize;

			segsize = a - addr;
			seg = segref = seg_alloc(as, addr, segsize);
			if (seg == NULL) {
				return (ENOMEM);
			}
			vn_a->szc = szc;
			error = (*crfp)(&seg, vn_a);
			if (error != 0) {
				VERIFY3P(seg, ==, segref);
				seg_free(seg);
				return (error);
			}
			as->a_size += segsize;
			as->a_resvsize += segsize;
			*segcreated = B_TRUE;
			if (do_off) {
				vn_a->offset += segsize;
			}
			addr = a;
		}
		szcvec &= ~(1 << szc);
		if (szcvec) {
			szc = highbit(szcvec) - 1;
			pgsz = page_get_pagesize(szc);
		}
	}
	ASSERT(addr == eaddr);

	return (0);
}
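/*
 * Worked example (editorial note, not in the original source) of how
 * as_map_segvn_segs() carves a request up according to szcvec.  The page
 * sizes below assume an x86-like configuration where szc 0 is 4K and
 * szc 1 is 2M; the exact sizes are platform dependent.
 *
 * Suppose szcvec = 0x3 (szc 0 and 1 usable), addr is 4K- but not 2M-aligned,
 * and [addr, eaddr) spans several 2M boundaries.  The first loop creates a
 * leading szc-0 segment from addr up to the first 2M boundary.  The second
 * loop then carves from eaddr downward, leaving a 2M-aligned szc-1 segment
 * in the middle and a trailing szc-0 segment below eaddr.  Each piece is a
 * separate segvn segment, so a failure part way through leaves *segcreated
 * set and the caller unmaps whatever was built so far.
 */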
static int
as_map_vnsegs(struct as *as, caddr_t addr, size_t size,
    segcreate_func_t crfp, struct segvn_crargs *vn_a, boolean_t *segcreated)
{
	uint_t mapflags = vn_a->flags & (MAP_TEXT | MAP_INITDATA);
	int type = (vn_a->type == MAP_SHARED) ? MAPPGSZC_SHM : MAPPGSZC_PRIVM;
	uint_t szcvec = map_pgszcvec(addr, size, (uintptr_t)addr, mapflags,
	    type, 0);
	int error;
	struct vattr va;
	u_offset_t eoff;
	size_t save_size = 0;
	extern size_t textrepl_size_thresh;

	ASSERT(AS_WRITE_HELD(as));
	ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
	ASSERT(IS_P2ALIGNED(size, PAGESIZE));
	ASSERT(vn_a->vp != NULL);
	ASSERT(vn_a->amp == NULL);

again:
	if (szcvec <= 1) {
		struct seg *seg, *segref;

		seg = segref = seg_alloc(as, addr, size);
		if (seg == NULL) {
			return (ENOMEM);
		}
		vn_a->szc = 0;
		error = (*crfp)(&seg, vn_a);
		if (error != 0) {
			VERIFY3P(seg, ==, segref);
			seg_free(seg);
		} else {
			as->a_size += size;
			as->a_resvsize += size;
		}
		return (error);
	}

	va.va_mask = AT_SIZE;
	if (VOP_GETATTR(vn_a->vp, &va, ATTR_HINT, vn_a->cred, NULL) != 0) {
		szcvec = 0;
		goto again;
	}
	eoff = vn_a->offset & PAGEMASK;
	if (eoff >= va.va_size) {
		szcvec = 0;
		goto again;
	}
	eoff += size;
	if (btopr(va.va_size) < btopr(eoff)) {
		save_size = size;
		size = va.va_size - (vn_a->offset & PAGEMASK);
		size = P2ROUNDUP_TYPED(size, PAGESIZE, size_t);
		szcvec = map_pgszcvec(addr, size, (uintptr_t)addr, mapflags,
		    type, 0);
		if (szcvec <= 1) {
			size = save_size;
			goto again;
		}
	}

	if (size > textrepl_size_thresh) {
		vn_a->flags |= _MAP_TEXTREPL;
	}
	error = as_map_segvn_segs(as, addr, size, szcvec, crfp, vn_a,
	    segcreated);
	if (error != 0) {
		return (error);
	}
	if (save_size) {
		addr += size;
		size = save_size - size;
		szcvec = 0;
		goto again;
	}
	return (0);
}

/*
 * as_map_ansegs: shared or private anonymous memory.  Note that the flags
 * passed to map_pgszvec cannot be MAP_INITDATA, for anon.
 */
static int
as_map_ansegs(struct as *as, caddr_t addr, size_t size,
    segcreate_func_t crfp, struct segvn_crargs *vn_a, boolean_t *segcreated)
{
	uint_t szcvec;
	uchar_t type = 0;

	ASSERT(vn_a->type == MAP_SHARED || vn_a->type == MAP_PRIVATE);
	if (vn_a->type == MAP_SHARED) {
		type = MAPPGSZC_SHM;
	} else if (vn_a->type == MAP_PRIVATE) {
		if (vn_a->szc == AS_MAP_HEAP) {
			type = MAPPGSZC_HEAP;
		} else if (vn_a->szc == AS_MAP_STACK) {
			type = MAPPGSZC_STACK;
		} else {
			type = MAPPGSZC_PRIVM;
		}
	}
	szcvec = map_pgszcvec(addr, size, vn_a->amp == NULL ?
	    (uintptr_t)addr : (uintptr_t)P2ROUNDUP(vn_a->offset, PAGESIZE),
	    (vn_a->flags & MAP_TEXT), type, 0);
	ASSERT(AS_WRITE_HELD(as));
	ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
	ASSERT(IS_P2ALIGNED(size, PAGESIZE));
	ASSERT(vn_a->vp == NULL);

	return (as_map_segvn_segs(as, addr, size, szcvec,
	    crfp, vn_a, segcreated));
}

int
as_map(struct as *as, caddr_t addr, size_t size, segcreate_func_t crfp,
    void *argsp)
{
	AS_LOCK_ENTER(as, RW_WRITER);
	return (as_map_locked(as, addr, size, crfp, argsp));
}

int
as_map_locked(struct as *as, caddr_t addr, size_t size, segcreate_func_t crfp,
    void *argsp)
{
	caddr_t raddr;			/* rounded down addr */
	size_t rsize;			/* rounded up size */
	int error;
	boolean_t is_hole = B_FALSE;
	/*
	 * The use of a_proc is preferred to handle the case where curproc is
	 * a door_call server and is allocating memory in the client's (a_proc)
	 * address space.
	 * When creating a shared memory segment a_proc will be NULL so we
	 * fallback to curproc in that case.
	 */
	struct proc *p = (as->a_proc == NULL) ? curproc : as->a_proc;
	struct segvn_crargs crargs;

	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
	    (size_t)raddr;

	/*
	 * check for wrap around
	 */
	if ((raddr + rsize < raddr) || (as->a_size > (ULONG_MAX - size))) {
		AS_LOCK_EXIT(as);
		return (ENOMEM);
	}

	as->a_updatedir = 1;	/* inform /proc */
	gethrestime(&as->a_updatetime);

	if (as != &kas) {
		/*
		 * Ensure that the virtual size of the process will not exceed
		 * the configured limit.  Since seg_hole segments will later
		 * set the S_HOLE flag indicating their status as a hole in
		 * the AS, they are excluded from this check.
		 */
		if (as->a_size + rsize > (size_t)p->p_vmem_ctl &&
		    !AS_MAP_CHECK_SEGHOLE(crfp)) {
			AS_LOCK_EXIT(as);

			(void) rctl_action(rctlproc_legacy[RLIMIT_VMEM],
			    p->p_rctls, p, RCA_UNSAFE_ALL);
			return (ENOMEM);
		}
	}

	if (AS_MAP_CHECK_VNODE_LPOOB(crfp, argsp)) {
		boolean_t do_unmap = B_FALSE;

		crargs = *(struct segvn_crargs *)argsp;
		error = as_map_vnsegs(as, raddr, rsize, crfp, &crargs,
		    &do_unmap);
		if (error != 0) {
			AS_LOCK_EXIT(as);
			if (do_unmap) {
				(void) as_unmap(as, addr, size);
			}
			return (error);
		}
	} else if (AS_MAP_CHECK_ANON_LPOOB(crfp, argsp)) {
		boolean_t do_unmap = B_FALSE;

		crargs = *(struct segvn_crargs *)argsp;
		error = as_map_ansegs(as, raddr, rsize, crfp, &crargs,
		    &do_unmap);
		if (error != 0) {
			AS_LOCK_EXIT(as);
			if (do_unmap) {
				(void) as_unmap(as, addr, size);
			}
			return (error);
		}
	} else {
		struct seg *seg, *segref;

		seg = segref = seg_alloc(as, addr, size);
		if (seg == NULL) {
			AS_LOCK_EXIT(as);
			return (ENOMEM);
		}

		/*
		 * It is possible that the segment creation routine will free
		 * 'seg' as part of a more advanced operation, such as when
		 * segvn concatenates adjacent segments together.  When this
		 * occurs, the seg*_create routine must communicate the
		 * resulting segment out via the 'struct seg **' parameter.
		 *
		 * If segment creation fails, it must not free the passed-in
		 * segment, nor alter the argument pointer.
		 */
		error = (*crfp)(&seg, argsp);
		if (error != 0) {
			VERIFY3P(seg, ==, segref);
			seg_free(seg);
			AS_LOCK_EXIT(as);
			return (error);
		}

		/*
		 * Check if the resulting segment represents a hole in the
		 * address space, rather than contributing to the AS size.
		 */
		is_hole = ((seg->s_flags & S_HOLE) != 0);

		/* Add size now so as_unmap will work if as_ctl fails. */
		if (!is_hole) {
			as->a_size += rsize;
			as->a_resvsize += rsize;
		}
	}

	as_setwatch(as);

	/*
	 * Establish memory locks for the segment if the address space is
	 * locked, provided it's not an explicit hole in the AS.
	 */
	mutex_enter(&as->a_contents);
	if (AS_ISPGLCK(as) && !is_hole) {
		mutex_exit(&as->a_contents);
		AS_LOCK_EXIT(as);
		error = as_ctl(as, addr, size, MC_LOCK, 0, 0, NULL, 0);
		if (error != 0)
			(void) as_unmap(as, addr, size);
	} else {
		mutex_exit(&as->a_contents);
		AS_LOCK_EXIT(as);
	}
	return (error);
}


/*
 * Delete all segments in the address space marked with S_PURGE.
 * This is currently used for Sparc V9 nofault ASI segments (seg_nf.c).
 * These segments are deleted as a first step before calls to as_gap(), so
 * that they don't affect mmap() or shmat().
 */
void
as_purge(struct as *as)
{
	struct seg *seg;
	struct seg *next_seg;

	/*
	 * the setting of NEEDSPURGE is protected by as_rangelock(), so
	 * no need to grab a_contents mutex for this check
	 */
	if ((as->a_flags & AS_NEEDSPURGE) == 0)
		return;

	AS_LOCK_ENTER(as, RW_WRITER);
	next_seg = NULL;
	seg = AS_SEGFIRST(as);
	while (seg != NULL) {
		next_seg = AS_SEGNEXT(as, seg);
		if (seg->s_flags & S_PURGE)
			SEGOP_UNMAP(seg, seg->s_base, seg->s_size);
		seg = next_seg;
	}
	AS_LOCK_EXIT(as);

	mutex_enter(&as->a_contents);
	as->a_flags &= ~AS_NEEDSPURGE;
	mutex_exit(&as->a_contents);
}

/*
 * Find a hole within [*basep, *basep + *lenp), which contains a mappable
 * range of addresses at least "minlen" long, where the base of the range is
 * at "off" phase from an "align" boundary and there is space for a
 * "redzone"-sized redzone on either side of the range.  Thus,
 * if align was 4M and off was 16k, the user wants a hole which will start
 * 16k into a 4M page.
 *
 * If flags specifies AH_HI, the hole will have the highest possible address
 * in the range.  We use the as->a_lastgap field to figure out where to
 * start looking for a gap.
 *
 * Otherwise, the gap will have the lowest possible address.
 *
 * If flags specifies AH_CONTAIN, the hole will contain the address addr.
 *
 * If an adequate hole is found, *basep and *lenp are set to reflect the part
 * of the hole that is within range, and 0 is returned.  On failure, -1 is
 * returned.
 *
 * NOTE: This routine is not correct when base+len overflows caddr_t.
 */
int
as_gap_aligned(struct as *as, size_t minlen, caddr_t *basep, size_t *lenp,
    uint_t flags, caddr_t addr, size_t align, size_t redzone, size_t off)
{
	caddr_t lobound = *basep;
	caddr_t hibound = lobound + *lenp;
	struct seg *lseg, *hseg;
	caddr_t lo, hi;
	int forward;
	caddr_t save_base;
	size_t save_len;
	size_t save_minlen;
	size_t save_redzone;
	int fast_path = 1;

	save_base = *basep;
	save_len = *lenp;
	save_minlen = minlen;
	save_redzone = redzone;

	/*
	 * For the first pass/fast_path, just add align and redzone into
	 * minlen since if we get an allocation, we can guarantee that it
	 * will fit the alignment and redzone requested.
	 * This increases the chance that hibound will be adjusted to
	 * a_lastgap->s_base which will likely allow us to find an
	 * acceptable hole in the address space quicker.
	 * If we can't find a hole with this fast_path, then we look for
	 * smaller holes in which the alignment and offset may allow
	 * the allocation to fit.
	 */
	minlen += align;
	minlen += 2 * redzone;
	redzone = 0;

	AS_LOCK_ENTER(as, RW_READER);
	if (AS_SEGFIRST(as) == NULL) {
		if (valid_va_range_aligned(basep, lenp, minlen, flags & AH_DIR,
		    align, redzone, off)) {
			AS_LOCK_EXIT(as);
			return (0);
		} else {
			AS_LOCK_EXIT(as);
			*basep = save_base;
			*lenp = save_len;
			return (-1);
		}
	}

retry:
	/*
	 * Set up to iterate over all the inter-segment holes in the given
	 * direction.  lseg is NULL for the lowest-addressed hole and hseg is
	 * NULL for the highest-addressed hole.  If moving backwards, we reset
	 * hseg to denote the highest-addressed segment.
	 */
	forward = (flags & AH_DIR) == AH_LO;
	if (forward) {
		hseg = as_findseg(as, lobound, 1);
		lseg = AS_SEGPREV(as, hseg);
	} else {

		/*
		 * If allocating at least as much as the last allocation,
		 * use a_lastgap's base as a better estimate of hibound.
		 */
		if (as->a_lastgap &&
		    minlen >= as->a_lastgap->s_size &&
		    hibound >= as->a_lastgap->s_base)
			hibound = as->a_lastgap->s_base;

		hseg = as_findseg(as, hibound, 1);
		if (hseg->s_base + hseg->s_size < hibound) {
			lseg = hseg;
			hseg = NULL;
		} else {
			lseg = AS_SEGPREV(as, hseg);
		}
	}

	for (;;) {
		/*
		 * Set lo and hi to the hole's boundaries.  (We should really
		 * use MAXADDR in place of hibound in the expression below,
		 * but can't express it easily; using hibound in its place is
		 * harmless.)
		 */
		lo = (lseg == NULL) ? 0 : lseg->s_base + lseg->s_size;
		hi = (hseg == NULL) ? hibound : hseg->s_base;
		/*
		 * If the iteration has moved past the interval from lobound
		 * to hibound it's pointless to continue.
		 */
		if ((forward && lo > hibound) || (!forward && hi < lobound))
			break;
		else if (lo > hibound || hi < lobound)
			goto cont;
		/*
		 * Candidate hole lies at least partially within the allowable
		 * range.  Restrict it to fall completely within that range,
		 * i.e., to [max(lo, lobound), min(hi, hibound)].
		 */
		if (lo < lobound)
			lo = lobound;
		if (hi > hibound)
			hi = hibound;
		/*
		 * Verify that the candidate hole is big enough and meets
		 * hardware constraints.  If the hole is too small, no need
		 * to do the further checks since they will fail.
1982 */ 1983 *basep = lo; 1984 *lenp = hi - lo; 1985 if (*lenp >= minlen && valid_va_range_aligned(basep, lenp, 1986 minlen, forward ? AH_LO : AH_HI, align, redzone, off) && 1987 ((flags & AH_CONTAIN) == 0 || 1988 (*basep <= addr && *basep + *lenp > addr))) { 1989 if (!forward) 1990 as->a_lastgap = hseg; 1991 if (hseg != NULL) 1992 as->a_lastgaphl = hseg; 1993 else 1994 as->a_lastgaphl = lseg; 1995 AS_LOCK_EXIT(as); 1996 return (0); 1997 } 1998 cont: 1999 /* 2000 * Move to the next hole. 2001 */ 2002 if (forward) { 2003 lseg = hseg; 2004 if (lseg == NULL) 2005 break; 2006 hseg = AS_SEGNEXT(as, hseg); 2007 } else { 2008 hseg = lseg; 2009 if (hseg == NULL) 2010 break; 2011 lseg = AS_SEGPREV(as, lseg); 2012 } 2013 } 2014 if (fast_path && (align != 0 || save_redzone != 0)) { 2015 fast_path = 0; 2016 minlen = save_minlen; 2017 redzone = save_redzone; 2018 goto retry; 2019 } 2020 *basep = save_base; 2021 *lenp = save_len; 2022 AS_LOCK_EXIT(as); 2023 return (-1); 2024 } 2025 2026 /* 2027 * Find a hole of at least size minlen within [*basep, *basep + *lenp). 2028 * 2029 * If flags specifies AH_HI, the hole will have the highest possible address 2030 * in the range. We use the as->a_lastgap field to figure out where to 2031 * start looking for a gap. 2032 * 2033 * Otherwise, the gap will have the lowest possible address. 2034 * 2035 * If flags specifies AH_CONTAIN, the hole will contain the address addr. 2036 * 2037 * If an adequate hole is found, base and len are set to reflect the part of 2038 * the hole that is within range, and 0 is returned, otherwise, 2039 * -1 is returned. 2040 * 2041 * NOTE: This routine is not correct when base+len overflows caddr_t. 2042 */ 2043 int 2044 as_gap(struct as *as, size_t minlen, caddr_t *basep, size_t *lenp, uint_t flags, 2045 caddr_t addr) 2046 { 2047 2048 return (as_gap_aligned(as, minlen, basep, lenp, flags, addr, 0, 0, 0)); 2049 } 2050 2051 /* 2052 * Return the next range within [base, base + len) that is backed 2053 * with "real memory". Skip holes and non-seg_vn segments. 2054 * We're lazy and only return one segment at a time. 2055 */ 2056 int 2057 as_memory(struct as *as, caddr_t *basep, size_t *lenp) 2058 { 2059 extern struct seg_ops segspt_shmops; /* needs a header file */ 2060 struct seg *seg; 2061 caddr_t addr, eaddr; 2062 caddr_t segend; 2063 2064 AS_LOCK_ENTER(as, RW_READER); 2065 2066 addr = *basep; 2067 eaddr = addr + *lenp; 2068 2069 seg = as_findseg(as, addr, 0); 2070 if (seg != NULL) 2071 addr = MAX(seg->s_base, addr); 2072 2073 for (;;) { 2074 if (seg == NULL || addr >= eaddr || eaddr <= seg->s_base) { 2075 AS_LOCK_EXIT(as); 2076 return (EINVAL); 2077 } 2078 2079 if (seg->s_ops == &segvn_ops) { 2080 segend = seg->s_base + seg->s_size; 2081 break; 2082 } 2083 2084 /* 2085 * We do ISM by looking into the private data 2086 * to determine the real size of the segment. 2087 */ 2088 if (seg->s_ops == &segspt_shmops) { 2089 segend = seg->s_base + spt_realsize(seg); 2090 if (addr < segend) 2091 break; 2092 } 2093 2094 seg = AS_SEGNEXT(as, seg); 2095 2096 if (seg != NULL) 2097 addr = seg->s_base; 2098 } 2099 2100 *basep = addr; 2101 2102 if (segend > eaddr) 2103 *lenp = eaddr - addr; 2104 else 2105 *lenp = segend - addr; 2106 2107 AS_LOCK_EXIT(as); 2108 return (0); 2109 } 2110 2111 /* 2112 * Swap the pages associated with the address space as out to 2113 * secondary storage, returning the number of bytes actually 2114 * swapped. 2115 * 2116 * The value returned is intended to correlate well with the process's 2117 * memory requirements. 
Its usefulness for this purpose depends on 2118 * how well the segment-level routines do at returning accurate 2119 * information. 2120 */ 2121 size_t 2122 as_swapout(struct as *as) 2123 { 2124 struct seg *seg; 2125 size_t swpcnt = 0; 2126 2127 /* 2128 * Kernel-only processes have given up their address 2129 * spaces. Of course, we shouldn't be attempting to 2130 * swap out such processes in the first place... 2131 */ 2132 if (as == NULL) 2133 return (0); 2134 2135 AS_LOCK_ENTER(as, RW_READER); 2136 2137 /* 2138 * Free all mapping resources associated with the address 2139 * space. The segment-level swapout routines capitalize 2140 * on this unmapping by scavenging pages that have become 2141 * unmapped here. 2142 */ 2143 hat_swapout(as->a_hat); 2144 2145 /* 2146 * Call the swapout routines of all segments in the address 2147 * space to do the actual work, accumulating the amount of 2148 * space reclaimed. 2149 */ 2150 for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) { 2151 struct seg_ops *ov = seg->s_ops; 2152 2153 /* 2154 * We have to check to see if the seg has 2155 * an ops vector because the seg may have 2156 * been in the middle of being set up when 2157 * the process was picked for swapout. 2158 */ 2159 if ((ov != NULL) && (ov->swapout != NULL)) 2160 swpcnt += SEGOP_SWAPOUT(seg); 2161 } 2162 AS_LOCK_EXIT(as); 2163 return (swpcnt); 2164 } 2165 2166 /* 2167 * Determine whether data from the mappings in interval [addr, addr + size) 2168 * are in the primary memory (core) cache. 2169 */ 2170 int 2171 as_incore(struct as *as, caddr_t addr, 2172 size_t size, char *vec, size_t *sizep) 2173 { 2174 struct seg *seg; 2175 size_t ssize; 2176 caddr_t raddr; /* rounded down addr */ 2177 size_t rsize; /* rounded up size */ 2178 size_t isize; /* iteration size */ 2179 int error = 0; /* result, assume success */ 2180 2181 *sizep = 0; 2182 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); 2183 rsize = ((((size_t)addr + size) + PAGEOFFSET) & PAGEMASK) - 2184 (size_t)raddr; 2185 2186 if (raddr + rsize < raddr) /* check for wraparound */ 2187 return (ENOMEM); 2188 2189 AS_LOCK_ENTER(as, RW_READER); 2190 seg = as_segat(as, raddr); 2191 if (seg == NULL) { 2192 AS_LOCK_EXIT(as); 2193 return (-1); 2194 } 2195 2196 for (; rsize != 0; rsize -= ssize, raddr += ssize) { 2197 if (raddr >= seg->s_base + seg->s_size) { 2198 seg = AS_SEGNEXT(as, seg); 2199 if (seg == NULL || raddr != seg->s_base) { 2200 error = -1; 2201 break; 2202 } 2203 } 2204 if ((raddr + rsize) > (seg->s_base + seg->s_size)) 2205 ssize = seg->s_base + seg->s_size - raddr; 2206 else 2207 ssize = rsize; 2208 *sizep += isize = SEGOP_INCORE(seg, raddr, ssize, vec); 2209 if (isize != ssize) { 2210 error = -1; 2211 break; 2212 } 2213 vec += btopr(ssize); 2214 } 2215 AS_LOCK_EXIT(as); 2216 return (error); 2217 } 2218 2219 static void 2220 as_segunlock(struct seg *seg, caddr_t addr, int attr, 2221 ulong_t *bitmap, size_t position, size_t npages) 2222 { 2223 caddr_t range_start; 2224 size_t pos1 = position; 2225 size_t pos2; 2226 size_t size; 2227 size_t end_pos = npages + position; 2228 2229 while (bt_range(bitmap, &pos1, &pos2, end_pos)) { 2230 size = ptob((pos2 - pos1)); 2231 range_start = (caddr_t)((uintptr_t)addr + 2232 ptob(pos1 - position)); 2233 2234 (void) SEGOP_LOCKOP(seg, range_start, size, attr, MC_UNLOCK, 2235 (ulong_t *)NULL, (size_t)NULL); 2236 pos1 = pos2; 2237 } 2238 } 2239 2240 static void 2241 as_unlockerr(struct as *as, int attr, ulong_t *mlock_map, 2242 caddr_t raddr, size_t rsize) 2243 { 2244 struct seg
*seg = as_segat(as, raddr); 2245 size_t ssize; 2246 2247 while (rsize != 0) { 2248 if (raddr >= seg->s_base + seg->s_size) 2249 seg = AS_SEGNEXT(as, seg); 2250 2251 if ((raddr + rsize) > (seg->s_base + seg->s_size)) 2252 ssize = seg->s_base + seg->s_size - raddr; 2253 else 2254 ssize = rsize; 2255 2256 as_segunlock(seg, raddr, attr, mlock_map, 0, btopr(ssize)); 2257 2258 rsize -= ssize; 2259 raddr += ssize; 2260 } 2261 } 2262 2263 /* 2264 * Cache control operations over the interval [addr, addr + size) in 2265 * address space "as". 2266 */ 2267 /*ARGSUSED*/ 2268 int 2269 as_ctl(struct as *as, caddr_t addr, size_t size, int func, int attr, 2270 uintptr_t arg, ulong_t *lock_map, size_t pos) 2271 { 2272 struct seg *seg; /* working segment */ 2273 caddr_t raddr; /* rounded down addr */ 2274 caddr_t initraddr; /* saved initial rounded down addr */ 2275 size_t rsize; /* rounded up size */ 2276 size_t initrsize; /* saved initial rounded up size */ 2277 size_t ssize; /* size of seg */ 2278 int error = 0; /* result */ 2279 size_t mlock_size; /* size of bitmap */ 2280 ulong_t *mlock_map; /* pointer to bitmap used */ 2281 /* to represent the locked */ 2282 /* pages. */ 2283 2284 mlock_size = 0; 2285 mlock_map = NULL; 2286 retry: 2287 if (error == IE_RETRY) 2288 AS_LOCK_ENTER(as, RW_WRITER); 2289 else 2290 AS_LOCK_ENTER(as, RW_READER); 2291 2292 /* 2293 * If these are address space lock/unlock operations, loop over 2294 * all segments in the address space, as appropriate. 2295 */ 2296 if (func == MC_LOCKAS) { 2297 size_t npages, idx; 2298 size_t rlen = 0; /* rounded as length */ 2299 2300 idx = pos; 2301 2302 if (arg & MCL_FUTURE) { 2303 mutex_enter(&as->a_contents); 2304 AS_SETPGLCK(as); 2305 mutex_exit(&as->a_contents); 2306 } 2307 if ((arg & MCL_CURRENT) == 0) { 2308 AS_LOCK_EXIT(as); 2309 return (0); 2310 } 2311 2312 seg = AS_SEGFIRST(as); 2313 if (seg == NULL) { 2314 AS_LOCK_EXIT(as); 2315 return (0); 2316 } 2317 2318 do { 2319 raddr = (caddr_t)((uintptr_t)seg->s_base & 2320 (uintptr_t)PAGEMASK); 2321 rlen += (((uintptr_t)(seg->s_base + seg->s_size) + 2322 PAGEOFFSET) & PAGEMASK) - (uintptr_t)raddr; 2323 } while ((seg = AS_SEGNEXT(as, seg)) != NULL); 2324 2325 mlock_size = BT_BITOUL(btopr(rlen)); 2326 if ((mlock_map = (ulong_t *)kmem_zalloc(mlock_size * 2327 sizeof (ulong_t), KM_NOSLEEP)) == NULL) { 2328 AS_LOCK_EXIT(as); 2329 return (EAGAIN); 2330 } 2331 2332 for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) { 2333 if ((seg->s_flags & S_HOLE) != 0) { 2334 continue; 2335 } 2336 error = SEGOP_LOCKOP(seg, seg->s_base, 2337 seg->s_size, attr, MC_LOCK, mlock_map, pos); 2338 if (error != 0) 2339 break; 2340 pos += seg_pages(seg); 2341 } 2342 2343 if (error) { 2344 for (seg = AS_SEGFIRST(as); seg != NULL; 2345 seg = AS_SEGNEXT(as, seg)) { 2346 2347 raddr = (caddr_t)((uintptr_t)seg->s_base & 2348 (uintptr_t)PAGEMASK); 2349 npages = seg_pages(seg); 2350 as_segunlock(seg, raddr, attr, mlock_map, 2351 idx, npages); 2352 idx += npages; 2353 } 2354 } 2355 2356 kmem_free(mlock_map, mlock_size * sizeof (ulong_t)); 2357 AS_LOCK_EXIT(as); 2358 goto lockerr; 2359 } else if (func == MC_UNLOCKAS) { 2360 mutex_enter(&as->a_contents); 2361 AS_CLRPGLCK(as); 2362 mutex_exit(&as->a_contents); 2363 2364 for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) { 2365 if ((seg->s_flags & S_HOLE) != 0) { 2366 continue; 2367 } 2368 error = SEGOP_LOCKOP(seg, seg->s_base, 2369 seg->s_size, attr, MC_UNLOCK, NULL, 0); 2370 if (error != 0) 2371 break; 2372 } 2373 2374 AS_LOCK_EXIT(as); 2375 goto lockerr; 2376 } 2377 
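/*
 * Illustrative note (not part of the original code): the per-range path
 * below is what as_map() reaches for an address space that has MCL_FUTURE
 * in effect (AS_ISPGLCK), via
 *
 *	error = as_ctl(as, addr, size, MC_LOCK, 0, 0, NULL, 0);
 *
 * For MC_LOCK, mlock_map is a bitmap with one bit per page of the rounded
 * range, recording which pages have been locked so far, so that
 * as_unlockerr() can back out exactly those pages if a later segment
 * fails or the range turns out to have a hole.
 */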
2378 /* 2379 * Normalize addresses and sizes. 2380 */ 2381 initraddr = raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); 2382 initrsize = rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) - 2383 (size_t)raddr; 2384 2385 if (raddr + rsize < raddr) { /* check for wraparound */ 2386 AS_LOCK_EXIT(as); 2387 return (ENOMEM); 2388 } 2389 2390 /* 2391 * Get initial segment. 2392 */ 2393 if ((seg = as_segat(as, raddr)) == NULL) { 2394 AS_LOCK_EXIT(as); 2395 return (ENOMEM); 2396 } 2397 2398 if (func == MC_LOCK) { 2399 mlock_size = BT_BITOUL(btopr(rsize)); 2400 if ((mlock_map = (ulong_t *)kmem_zalloc(mlock_size * 2401 sizeof (ulong_t), KM_NOSLEEP)) == NULL) { 2402 AS_LOCK_EXIT(as); 2403 return (EAGAIN); 2404 } 2405 } 2406 2407 /* 2408 * Loop over all segments. If a hole in the address range is 2409 * discovered, then fail. For each segment, perform the appropriate 2410 * control operation. 2411 */ 2412 while (rsize != 0) { 2413 2414 /* 2415 * Make sure there's no hole, calculate the portion 2416 * of the next segment to be operated over. 2417 */ 2418 if (raddr >= seg->s_base + seg->s_size) { 2419 seg = AS_SEGNEXT(as, seg); 2420 if (seg == NULL || raddr != seg->s_base) { 2421 if (func == MC_LOCK) { 2422 as_unlockerr(as, attr, mlock_map, 2423 initraddr, initrsize - rsize); 2424 kmem_free(mlock_map, 2425 mlock_size * sizeof (ulong_t)); 2426 } 2427 AS_LOCK_EXIT(as); 2428 return (ENOMEM); 2429 } 2430 } 2431 if ((raddr + rsize) > (seg->s_base + seg->s_size)) 2432 ssize = seg->s_base + seg->s_size - raddr; 2433 else 2434 ssize = rsize; 2435 2436 /* 2437 * Dispatch on specific function. 2438 */ 2439 switch (func) { 2440 2441 /* 2442 * Synchronize cached data from mappings with backing 2443 * objects. 2444 */ 2445 case MC_SYNC: 2446 if (error = SEGOP_SYNC(seg, raddr, ssize, 2447 attr, (uint_t)arg)) { 2448 AS_LOCK_EXIT(as); 2449 return (error); 2450 } 2451 break; 2452 2453 /* 2454 * Lock pages in memory. 2455 */ 2456 case MC_LOCK: 2457 if (error = SEGOP_LOCKOP(seg, raddr, ssize, 2458 attr, func, mlock_map, pos)) { 2459 as_unlockerr(as, attr, mlock_map, initraddr, 2460 initrsize - rsize + ssize); 2461 kmem_free(mlock_map, mlock_size * 2462 sizeof (ulong_t)); 2463 AS_LOCK_EXIT(as); 2464 goto lockerr; 2465 } 2466 break; 2467 2468 /* 2469 * Unlock mapped pages. 2470 */ 2471 case MC_UNLOCK: 2472 (void) SEGOP_LOCKOP(seg, raddr, ssize, attr, func, 2473 (ulong_t *)NULL, (size_t)NULL); 2474 break; 2475 2476 /* 2477 * Store VM advise for mapped pages in segment layer. 
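 * (Illustrative note, not in the original comment: this is the path a
 * user-level madvise(3C) call takes, arriving via memcntl() with
 * func == MC_ADVISE and the advice value, e.g. MADV_WILLNEED, in arg.)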
2478 */ 2479 case MC_ADVISE: 2480 error = SEGOP_ADVISE(seg, raddr, ssize, (uint_t)arg); 2481 2482 /* 2483 * Check for regular errors and special retry error 2484 */ 2485 if (error) { 2486 if (error == IE_RETRY) { 2487 /* 2488 * Need to acquire writers lock, so 2489 * have to drop readers lock and start 2490 * all over again 2491 */ 2492 AS_LOCK_EXIT(as); 2493 goto retry; 2494 } else if (error == IE_REATTACH) { 2495 /* 2496 * Find segment for current address 2497 * because current segment just got 2498 * split or concatenated 2499 */ 2500 seg = as_segat(as, raddr); 2501 if (seg == NULL) { 2502 AS_LOCK_EXIT(as); 2503 return (ENOMEM); 2504 } 2505 } else { 2506 /* 2507 * Regular error 2508 */ 2509 AS_LOCK_EXIT(as); 2510 return (error); 2511 } 2512 } 2513 break; 2514 2515 case MC_INHERIT_ZERO: 2516 if (seg->s_ops->inherit == NULL) { 2517 error = ENOTSUP; 2518 } else { 2519 error = SEGOP_INHERIT(seg, raddr, ssize, 2520 SEGP_INH_ZERO); 2521 } 2522 if (error != 0) { 2523 AS_LOCK_EXIT(as); 2524 return (error); 2525 } 2526 break; 2527 2528 /* 2529 * Can't happen. 2530 */ 2531 default: 2532 panic("as_ctl: bad operation %d", func); 2533 /*NOTREACHED*/ 2534 } 2535 2536 rsize -= ssize; 2537 raddr += ssize; 2538 } 2539 2540 if (func == MC_LOCK) 2541 kmem_free(mlock_map, mlock_size * sizeof (ulong_t)); 2542 AS_LOCK_EXIT(as); 2543 return (0); 2544 lockerr: 2545 2546 /* 2547 * If the lower levels returned EDEADLK for a segment lockop, 2548 * it means that we should retry the operation. Let's wait 2549 * a bit also to let the deadlock causing condition clear. 2550 * This is part of a gross hack to work around a design flaw 2551 * in the ufs/sds logging code and should go away when the 2552 * logging code is re-designed to fix the problem. See bug 2553 * 4125102 for details of the problem. 2554 */ 2555 if (error == EDEADLK) { 2556 delay(deadlk_wait); 2557 error = 0; 2558 goto retry; 2559 } 2560 return (error); 2561 } 2562 2563 int 2564 fc_decode(faultcode_t fault_err) 2565 { 2566 int error = 0; 2567 2568 switch (FC_CODE(fault_err)) { 2569 case FC_OBJERR: 2570 error = FC_ERRNO(fault_err); 2571 break; 2572 case FC_PROT: 2573 error = EACCES; 2574 break; 2575 default: 2576 error = EFAULT; 2577 break; 2578 } 2579 return (error); 2580 } 2581 2582 /* 2583 * Pagelock pages from a range that spans more than 1 segment. Obtain shadow 2584 * lists from each segment and copy them to one contiguous shadow list (plist) 2585 * as expected by the caller. Save pointers to per segment shadow lists at 2586 * the tail of plist so that they can be used during as_pageunlock(). 2587 */ 2588 static int 2589 as_pagelock_segs(struct as *as, struct seg *seg, struct page ***ppp, 2590 caddr_t addr, size_t size, enum seg_rw rw) 2591 { 2592 caddr_t sv_addr = addr; 2593 size_t sv_size = size; 2594 struct seg *sv_seg = seg; 2595 ulong_t segcnt = 1; 2596 ulong_t cnt; 2597 size_t ssize; 2598 pgcnt_t npages = btop(size); 2599 page_t **plist; 2600 page_t **pl; 2601 int error; 2602 caddr_t eaddr; 2603 faultcode_t fault_err = 0; 2604 pgcnt_t pl_off; 2605 extern struct seg_ops segspt_shmops; 2606 2607 ASSERT(AS_LOCK_HELD(as)); 2608 ASSERT(seg != NULL); 2609 ASSERT(addr >= seg->s_base && addr < seg->s_base + seg->s_size); 2610 ASSERT(addr + size > seg->s_base + seg->s_size); 2611 ASSERT(IS_P2ALIGNED(size, PAGESIZE)); 2612 ASSERT(IS_P2ALIGNED(addr, PAGESIZE)); 2613 2614 /* 2615 * Count the number of segments covered by the range we are about to 2616 * lock. The segment count is used to size the shadow list we return 2617 * back to the caller. 
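 *
 * Illustrative layout sketch (not in the original comment): for a range
 * of npages pages spanning segcnt segments, the list handed back through
 * ppp looks like
 *
 *	plist[0 .. npages - 1]			the flat shadow list
 *	plist[npages .. npages + segcnt - 1]	per-segment list pointers
 *
 * so that as_pageunlock_segs() can hand each segment back its own shadow
 * list when the pages are unlocked.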
2618 */ 2619 for (; size != 0; size -= ssize, addr += ssize) { 2620 if (addr >= seg->s_base + seg->s_size) { 2621 2622 seg = AS_SEGNEXT(as, seg); 2623 if (seg == NULL || addr != seg->s_base) { 2624 AS_LOCK_EXIT(as); 2625 return (EFAULT); 2626 } 2627 /* 2628 * Do a quick check if subsequent segments 2629 * will most likely support pagelock. 2630 */ 2631 if (seg->s_ops == &segvn_ops) { 2632 vnode_t *vp; 2633 2634 if (SEGOP_GETVP(seg, addr, &vp) != 0 || 2635 vp != NULL) { 2636 AS_LOCK_EXIT(as); 2637 goto slow; 2638 } 2639 } else if (seg->s_ops != &segspt_shmops) { 2640 AS_LOCK_EXIT(as); 2641 goto slow; 2642 } 2643 segcnt++; 2644 } 2645 if (addr + size > seg->s_base + seg->s_size) { 2646 ssize = seg->s_base + seg->s_size - addr; 2647 } else { 2648 ssize = size; 2649 } 2650 } 2651 ASSERT(segcnt > 1); 2652 2653 plist = kmem_zalloc((npages + segcnt) * sizeof (page_t *), KM_SLEEP); 2654 2655 addr = sv_addr; 2656 size = sv_size; 2657 seg = sv_seg; 2658 2659 for (cnt = 0, pl_off = 0; size != 0; size -= ssize, addr += ssize) { 2660 if (addr >= seg->s_base + seg->s_size) { 2661 seg = AS_SEGNEXT(as, seg); 2662 ASSERT(seg != NULL && addr == seg->s_base); 2663 cnt++; 2664 ASSERT(cnt < segcnt); 2665 } 2666 if (addr + size > seg->s_base + seg->s_size) { 2667 ssize = seg->s_base + seg->s_size - addr; 2668 } else { 2669 ssize = size; 2670 } 2671 pl = &plist[npages + cnt]; 2672 error = SEGOP_PAGELOCK(seg, addr, ssize, (page_t ***)pl, 2673 L_PAGELOCK, rw); 2674 if (error) { 2675 break; 2676 } 2677 ASSERT(plist[npages + cnt] != NULL); 2678 ASSERT(pl_off + btop(ssize) <= npages); 2679 bcopy(plist[npages + cnt], &plist[pl_off], 2680 btop(ssize) * sizeof (page_t *)); 2681 pl_off += btop(ssize); 2682 } 2683 2684 if (size == 0) { 2685 AS_LOCK_EXIT(as); 2686 ASSERT(cnt == segcnt - 1); 2687 *ppp = plist; 2688 return (0); 2689 } 2690 2691 /* 2692 * one of pagelock calls failed. The error type is in error variable. 2693 * Unlock what we've locked so far and retry with F_SOFTLOCK if error 2694 * type is either EFAULT or ENOTSUP. Otherwise just return the error 2695 * back to the caller. 2696 */ 2697 2698 eaddr = addr; 2699 seg = sv_seg; 2700 2701 for (cnt = 0, addr = sv_addr; addr < eaddr; addr += ssize) { 2702 if (addr >= seg->s_base + seg->s_size) { 2703 seg = AS_SEGNEXT(as, seg); 2704 ASSERT(seg != NULL && addr == seg->s_base); 2705 cnt++; 2706 ASSERT(cnt < segcnt); 2707 } 2708 if (eaddr > seg->s_base + seg->s_size) { 2709 ssize = seg->s_base + seg->s_size - addr; 2710 } else { 2711 ssize = eaddr - addr; 2712 } 2713 pl = &plist[npages + cnt]; 2714 ASSERT(*pl != NULL); 2715 (void) SEGOP_PAGELOCK(seg, addr, ssize, (page_t ***)pl, 2716 L_PAGEUNLOCK, rw); 2717 } 2718 2719 AS_LOCK_EXIT(as); 2720 2721 kmem_free(plist, (npages + segcnt) * sizeof (page_t *)); 2722 2723 if (error != ENOTSUP && error != EFAULT) { 2724 return (error); 2725 } 2726 2727 slow: 2728 /* 2729 * If we are here because pagelock failed due to the need to cow fault 2730 * in the pages we want to lock F_SOFTLOCK will do this job and in 2731 * next as_pagelock() call for this address range pagelock will 2732 * hopefully succeed. 2733 */ 2734 fault_err = as_fault(as->a_hat, as, sv_addr, sv_size, F_SOFTLOCK, rw); 2735 if (fault_err != 0) { 2736 return (fc_decode(fault_err)); 2737 } 2738 *ppp = NULL; 2739 2740 return (0); 2741 } 2742 2743 /* 2744 * lock pages in a given address space. Return shadow list. If 2745 * the list is NULL, the MMU mapping is also locked. 
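 *
 * Illustrative pairing sketch (the caller below is hypothetical, not part
 * of this file):
 *
 *	struct page **pplist;
 *
 *	if (as_pagelock(as, &pplist, uaddr, ulen, S_WRITE) == 0) {
 *		do the I/O to [uaddr, uaddr + ulen)
 *		as_pageunlock(as, pplist, uaddr, ulen, S_WRITE);
 *	}
 *
 * A NULL shadow list on success means the pages were SOFTLOCKed through
 * as_fault() instead; as_pageunlock() recognizes that case and undoes it
 * with F_SOFTUNLOCK.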
2746 */ 2747 int 2748 as_pagelock(struct as *as, struct page ***ppp, caddr_t addr, 2749 size_t size, enum seg_rw rw) 2750 { 2751 size_t rsize; 2752 caddr_t raddr; 2753 faultcode_t fault_err; 2754 struct seg *seg; 2755 int err; 2756 2757 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_AS_LOCK_START, 2758 "as_pagelock_start: addr %p size %ld", addr, size); 2759 2760 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); 2761 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) - 2762 (size_t)raddr; 2763 2764 /* 2765 * if the request crosses two segments let 2766 * as_fault handle it. 2767 */ 2768 AS_LOCK_ENTER(as, RW_READER); 2769 2770 seg = as_segat(as, raddr); 2771 if (seg == NULL) { 2772 AS_LOCK_EXIT(as); 2773 return (EFAULT); 2774 } 2775 ASSERT(raddr >= seg->s_base && raddr < seg->s_base + seg->s_size); 2776 if (raddr + rsize > seg->s_base + seg->s_size) { 2777 return (as_pagelock_segs(as, seg, ppp, raddr, rsize, rw)); 2778 } 2779 if (raddr + rsize <= raddr) { 2780 AS_LOCK_EXIT(as); 2781 return (EFAULT); 2782 } 2783 2784 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEG_LOCK_START, 2785 "seg_lock_1_start: raddr %p rsize %ld", raddr, rsize); 2786 2787 /* 2788 * try to lock pages and pass back shadow list 2789 */ 2790 err = SEGOP_PAGELOCK(seg, raddr, rsize, ppp, L_PAGELOCK, rw); 2791 2792 TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_SEG_LOCK_END, "seg_lock_1_end"); 2793 2794 AS_LOCK_EXIT(as); 2795 2796 if (err == 0 || (err != ENOTSUP && err != EFAULT)) { 2797 return (err); 2798 } 2799 2800 /* 2801 * Use F_SOFTLOCK to lock the pages because pagelock failed either due 2802 * to no pagelock support for this segment or pages need to be cow 2803 * faulted in. If fault is needed, F_SOFTLOCK will do this job for 2804 * this as_pagelock() call and in the next as_pagelock() call for the 2805 * same address range the pagelock call will hopefully succeed. 2806 */ 2807 fault_err = as_fault(as->a_hat, as, addr, size, F_SOFTLOCK, rw); 2808 if (fault_err != 0) { 2809 return (fc_decode(fault_err)); 2810 } 2811 *ppp = NULL; 2812 2813 TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_AS_LOCK_END, "as_pagelock_end"); 2814 return (0); 2815 } 2816 2817 /* 2818 * unlock pages locked by as_pagelock_segs(). Retrieve per segment shadow 2819 * lists from the end of plist and call pageunlock interface for each segment. 2820 * Drop as lock and free plist.
2821 */ 2822 static void 2823 as_pageunlock_segs(struct as *as, struct seg *seg, caddr_t addr, size_t size, 2824 struct page **plist, enum seg_rw rw) 2825 { 2826 ulong_t cnt; 2827 caddr_t eaddr = addr + size; 2828 pgcnt_t npages = btop(size); 2829 size_t ssize; 2830 page_t **pl; 2831 2832 ASSERT(AS_LOCK_HELD(as)); 2833 ASSERT(seg != NULL); 2834 ASSERT(addr >= seg->s_base && addr < seg->s_base + seg->s_size); 2835 ASSERT(addr + size > seg->s_base + seg->s_size); 2836 ASSERT(IS_P2ALIGNED(size, PAGESIZE)); 2837 ASSERT(IS_P2ALIGNED(addr, PAGESIZE)); 2838 ASSERT(plist != NULL); 2839 2840 for (cnt = 0; addr < eaddr; addr += ssize) { 2841 if (addr >= seg->s_base + seg->s_size) { 2842 seg = AS_SEGNEXT(as, seg); 2843 ASSERT(seg != NULL && addr == seg->s_base); 2844 cnt++; 2845 } 2846 if (eaddr > seg->s_base + seg->s_size) { 2847 ssize = seg->s_base + seg->s_size - addr; 2848 } else { 2849 ssize = eaddr - addr; 2850 } 2851 pl = &plist[npages + cnt]; 2852 ASSERT(*pl != NULL); 2853 (void) SEGOP_PAGELOCK(seg, addr, ssize, (page_t ***)pl, 2854 L_PAGEUNLOCK, rw); 2855 } 2856 ASSERT(cnt > 0); 2857 AS_LOCK_EXIT(as); 2858 2859 cnt++; 2860 kmem_free(plist, (npages + cnt) * sizeof (page_t *)); 2861 } 2862 2863 /* 2864 * unlock pages in a given address range 2865 */ 2866 void 2867 as_pageunlock(struct as *as, struct page **pp, caddr_t addr, size_t size, 2868 enum seg_rw rw) 2869 { 2870 struct seg *seg; 2871 size_t rsize; 2872 caddr_t raddr; 2873 2874 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_AS_UNLOCK_START, 2875 "as_pageunlock_start: addr %p size %ld", addr, size); 2876 2877 /* 2878 * if the shadow list is NULL, as_pagelock was 2879 * falling back to as_fault 2880 */ 2881 if (pp == NULL) { 2882 (void) as_fault(as->a_hat, as, addr, size, F_SOFTUNLOCK, rw); 2883 return; 2884 } 2885 2886 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); 2887 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) - 2888 (size_t)raddr; 2889 2890 AS_LOCK_ENTER(as, RW_READER); 2891 seg = as_segat(as, raddr); 2892 ASSERT(seg != NULL); 2893 2894 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEG_UNLOCK_START, 2895 "seg_unlock_start: raddr %p rsize %ld", raddr, rsize); 2896 2897 ASSERT(raddr >= seg->s_base && raddr < seg->s_base + seg->s_size); 2898 if (raddr + rsize <= seg->s_base + seg->s_size) { 2899 SEGOP_PAGELOCK(seg, raddr, rsize, &pp, L_PAGEUNLOCK, rw); 2900 } else { 2901 as_pageunlock_segs(as, seg, raddr, rsize, pp, rw); 2902 return; 2903 } 2904 AS_LOCK_EXIT(as); 2905 TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_AS_UNLOCK_END, "as_pageunlock_end"); 2906 } 2907 2908 int 2909 as_setpagesize(struct as *as, caddr_t addr, size_t size, uint_t szc, 2910 boolean_t wait) 2911 { 2912 struct seg *seg; 2913 size_t ssize; 2914 caddr_t raddr; /* rounded down addr */ 2915 size_t rsize; /* rounded up size */ 2916 int error = 0; 2917 size_t pgsz = page_get_pagesize(szc); 2918 2919 setpgsz_top: 2920 if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(size, pgsz)) { 2921 return (EINVAL); 2922 } 2923 2924 raddr = addr; 2925 rsize = size; 2926 2927 if (raddr + rsize < raddr) /* check for wraparound */ 2928 return (ENOMEM); 2929 2930 AS_LOCK_ENTER(as, RW_WRITER); 2931 as_clearwatchprot(as, raddr, rsize); 2932 seg = as_segat(as, raddr); 2933 if (seg == NULL) { 2934 as_setwatch(as); 2935 AS_LOCK_EXIT(as); 2936 return (ENOMEM); 2937 } 2938 2939 for (; rsize != 0; rsize -= ssize, raddr += ssize) { 2940 if (raddr >= seg->s_base + seg->s_size) { 2941 seg = AS_SEGNEXT(as, seg); 2942 if (seg == NULL || raddr != seg->s_base) { 2943 error = ENOMEM; 2944 break; 2945 } 2946 } 2947 if 
((raddr + rsize) > (seg->s_base + seg->s_size)) { 2948 ssize = seg->s_base + seg->s_size - raddr; 2949 } else { 2950 ssize = rsize; 2951 } 2952 2953 retry: 2954 error = SEGOP_SETPAGESIZE(seg, raddr, ssize, szc); 2955 2956 if (error == IE_NOMEM) { 2957 error = EAGAIN; 2958 break; 2959 } 2960 2961 if (error == IE_RETRY) { 2962 AS_LOCK_EXIT(as); 2963 goto setpgsz_top; 2964 } 2965 2966 if (error == ENOTSUP) { 2967 error = EINVAL; 2968 break; 2969 } 2970 2971 if (wait && (error == EAGAIN)) { 2972 /* 2973 * Memory is currently locked. It must be unlocked 2974 * before this operation can succeed through a retry. 2975 * The possible reasons for locked memory and 2976 * corresponding strategies for unlocking are: 2977 * (1) Normal I/O 2978 * wait for a signal that the I/O operation 2979 * has completed and the memory is unlocked. 2980 * (2) Asynchronous I/O 2981 * The aio subsystem does not unlock pages when 2982 * the I/O is completed. Those pages are unlocked 2983 * when the application calls aiowait/aioerror. 2984 * So, to prevent blocking forever, cv_broadcast() 2985 * is done to wake up aio_cleanup_thread. 2986 * Subsequently, segvn_reclaim will be called, and 2987 * that will do AS_CLRUNMAPWAIT() and wake us up. 2988 * (3) Long term page locking: 2989 * This is not relevant for as_setpagesize() 2990 * because we cannot change the page size for 2991 * driver memory. The attempt to do so will 2992 * fail with a different error than EAGAIN so 2993 * there's no need to trigger as callbacks like 2994 * as_unmap, as_setprot or as_free would do. 2995 */ 2996 mutex_enter(&as->a_contents); 2997 if (!AS_ISNOUNMAPWAIT(as)) { 2998 if (AS_ISUNMAPWAIT(as) == 0) { 2999 cv_broadcast(&as->a_cv); 3000 } 3001 AS_SETUNMAPWAIT(as); 3002 AS_LOCK_EXIT(as); 3003 while (AS_ISUNMAPWAIT(as)) { 3004 cv_wait(&as->a_cv, &as->a_contents); 3005 } 3006 } else { 3007 /* 3008 * We may have raced with 3009 * segvn_reclaim()/segspt_reclaim(). In this 3010 * case clean nounmapwait flag and retry since 3011 * softlockcnt in this segment may be already 3012 * 0. We don't drop as writer lock so our 3013 * number of retries without sleeping should 3014 * be very small. See segvn_reclaim() for 3015 * more comments. 3016 */ 3017 AS_CLRNOUNMAPWAIT(as); 3018 mutex_exit(&as->a_contents); 3019 goto retry; 3020 } 3021 mutex_exit(&as->a_contents); 3022 goto setpgsz_top; 3023 } else if (error != 0) { 3024 break; 3025 } 3026 } 3027 as_setwatch(as); 3028 AS_LOCK_EXIT(as); 3029 return (error); 3030 } 3031 3032 /* 3033 * as_iset3_default_lpsize() just calls SEGOP_SETPAGESIZE() on all segments 3034 * in its chunk where s_szc is less than the szc we want to set. 3035 */ 3036 static int 3037 as_iset3_default_lpsize(struct as *as, caddr_t raddr, size_t rsize, uint_t szc, 3038 int *retry) 3039 { 3040 struct seg *seg; 3041 size_t ssize; 3042 int error; 3043 3044 ASSERT(AS_WRITE_HELD(as)); 3045 3046 seg = as_segat(as, raddr); 3047 if (seg == NULL) { 3048 panic("as_iset3_default_lpsize: no seg"); 3049 } 3050 3051 for (; rsize != 0; rsize -= ssize, raddr += ssize) { 3052 if (raddr >= seg->s_base + seg->s_size) { 3053 seg = AS_SEGNEXT(as, seg); 3054 if (seg == NULL || raddr != seg->s_base) { 3055 panic("as_iset3_default_lpsize: as changed"); 3056 } 3057 } 3058 if ((raddr + rsize) > (seg->s_base + seg->s_size)) { 3059 ssize = seg->s_base + seg->s_size - raddr; 3060 } else { 3061 ssize = rsize; 3062 } 3063 3064 if (szc > seg->s_szc) { 3065 error = SEGOP_SETPAGESIZE(seg, raddr, ssize, szc); 3066 /* Only retry on EINVAL segments that have no vnode. 
*/ 3067 if (error == EINVAL) { 3068 vnode_t *vp = NULL; 3069 if ((SEGOP_GETTYPE(seg, raddr) & MAP_SHARED) && 3070 (SEGOP_GETVP(seg, raddr, &vp) != 0 || 3071 vp == NULL)) { 3072 *retry = 1; 3073 } else { 3074 *retry = 0; 3075 } 3076 } 3077 if (error) { 3078 return (error); 3079 } 3080 } 3081 } 3082 return (0); 3083 } 3084 3085 /* 3086 * as_iset2_default_lpsize() calls as_iset3_default_lpsize() to set the 3087 * pagesize on each segment in its range, but if any fails with EINVAL, 3088 * then it reduces the pagesizes to the next size in the bitmap and 3089 * retries as_iset3_default_lpsize(). The reason why the code retries 3090 * smaller allowed sizes on EINVAL is because (a) the anon offset may not 3091 * match the bigger sizes, and (b) it's hard to get this offset (to begin 3092 * with) to pass to map_pgszcvec(). 3093 */ 3094 static int 3095 as_iset2_default_lpsize(struct as *as, caddr_t addr, size_t size, uint_t szc, 3096 uint_t szcvec) 3097 { 3098 int error; 3099 int retry; 3100 3101 ASSERT(AS_WRITE_HELD(as)); 3102 3103 for (;;) { 3104 error = as_iset3_default_lpsize(as, addr, size, szc, &retry); 3105 if (error == EINVAL && retry) { 3106 szcvec &= ~(1 << szc); 3107 if (szcvec <= 1) { 3108 return (EINVAL); 3109 } 3110 szc = highbit(szcvec) - 1; 3111 } else { 3112 return (error); 3113 } 3114 } 3115 } 3116 3117 /* 3118 * as_iset1_default_lpsize() breaks its chunk into areas where existing 3119 * segments have a smaller szc than we want to set. For each such area, 3120 * it calls as_iset2_default_lpsize() 3121 */ 3122 static int 3123 as_iset1_default_lpsize(struct as *as, caddr_t raddr, size_t rsize, uint_t szc, 3124 uint_t szcvec) 3125 { 3126 struct seg *seg; 3127 size_t ssize; 3128 caddr_t setaddr = raddr; 3129 size_t setsize = 0; 3130 int set; 3131 int error; 3132 3133 ASSERT(AS_WRITE_HELD(as)); 3134 3135 seg = as_segat(as, raddr); 3136 if (seg == NULL) { 3137 panic("as_iset1_default_lpsize: no seg"); 3138 } 3139 if (seg->s_szc < szc) { 3140 set = 1; 3141 } else { 3142 set = 0; 3143 } 3144 3145 for (; rsize != 0; rsize -= ssize, raddr += ssize, setsize += ssize) { 3146 if (raddr >= seg->s_base + seg->s_size) { 3147 seg = AS_SEGNEXT(as, seg); 3148 if (seg == NULL || raddr != seg->s_base) { 3149 panic("as_iset1_default_lpsize: as changed"); 3150 } 3151 if (seg->s_szc >= szc && set) { 3152 ASSERT(setsize != 0); 3153 error = as_iset2_default_lpsize(as, 3154 setaddr, setsize, szc, szcvec); 3155 if (error) { 3156 return (error); 3157 } 3158 set = 0; 3159 } else if (seg->s_szc < szc && !set) { 3160 setaddr = raddr; 3161 setsize = 0; 3162 set = 1; 3163 } 3164 } 3165 if ((raddr + rsize) > (seg->s_base + seg->s_size)) { 3166 ssize = seg->s_base + seg->s_size - raddr; 3167 } else { 3168 ssize = rsize; 3169 } 3170 } 3171 error = 0; 3172 if (set) { 3173 ASSERT(setsize != 0); 3174 error = as_iset2_default_lpsize(as, setaddr, setsize, 3175 szc, szcvec); 3176 } 3177 return (error); 3178 } 3179 3180 /* 3181 * as_iset_default_lpsize() breaks its chunk according to the size code bitmap 3182 * returned by map_pgszcvec() (similar to as_map_segvn_segs()), and passes each 3183 * chunk to as_iset1_default_lpsize(). 3184 */ 3185 static int 3186 as_iset_default_lpsize(struct as *as, caddr_t addr, size_t size, int flags, 3187 int type) 3188 { 3189 int rtype = (type & MAP_SHARED) ? 
MAPPGSZC_SHM : MAPPGSZC_PRIVM; 3190 uint_t szcvec = map_pgszcvec(addr, size, (uintptr_t)addr, 3191 flags, rtype, 1); 3192 uint_t szc; 3193 uint_t nszc; 3194 int error; 3195 caddr_t a; 3196 caddr_t eaddr; 3197 size_t segsize; 3198 size_t pgsz; 3199 uint_t save_szcvec; 3200 3201 ASSERT(AS_WRITE_HELD(as)); 3202 ASSERT(IS_P2ALIGNED(addr, PAGESIZE)); 3203 ASSERT(IS_P2ALIGNED(size, PAGESIZE)); 3204 3205 szcvec &= ~1; 3206 if (szcvec <= 1) { /* skip if base page size */ 3207 return (0); 3208 } 3209 3210 /* Get the pagesize of the first larger page size. */ 3211 szc = lowbit(szcvec) - 1; 3212 pgsz = page_get_pagesize(szc); 3213 eaddr = addr + size; 3214 addr = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz); 3215 eaddr = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz); 3216 3217 save_szcvec = szcvec; 3218 szcvec >>= (szc + 1); 3219 nszc = szc; 3220 while (szcvec) { 3221 if ((szcvec & 0x1) == 0) { 3222 nszc++; 3223 szcvec >>= 1; 3224 continue; 3225 } 3226 nszc++; 3227 pgsz = page_get_pagesize(nszc); 3228 a = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz); 3229 if (a != addr) { 3230 ASSERT(szc > 0); 3231 ASSERT(a < eaddr); 3232 segsize = a - addr; 3233 error = as_iset1_default_lpsize(as, addr, segsize, szc, 3234 save_szcvec); 3235 if (error) { 3236 return (error); 3237 } 3238 addr = a; 3239 } 3240 szc = nszc; 3241 szcvec >>= 1; 3242 } 3243 3244 ASSERT(addr < eaddr); 3245 szcvec = save_szcvec; 3246 while (szcvec) { 3247 a = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz); 3248 ASSERT(a >= addr); 3249 if (a != addr) { 3250 ASSERT(szc > 0); 3251 segsize = a - addr; 3252 error = as_iset1_default_lpsize(as, addr, segsize, szc, 3253 save_szcvec); 3254 if (error) { 3255 return (error); 3256 } 3257 addr = a; 3258 } 3259 szcvec &= ~(1 << szc); 3260 if (szcvec) { 3261 szc = highbit(szcvec) - 1; 3262 pgsz = page_get_pagesize(szc); 3263 } 3264 } 3265 ASSERT(addr == eaddr); 3266 3267 return (0); 3268 } 3269 3270 /* 3271 * Set the default large page size for the range. Called via memcntl with 3272 * page size set to 0. as_set_default_lpsize breaks the range down into 3273 * chunks with the same type/flags, ignores non-segvn segments, and passes 3274 * each chunk to as_iset_default_lpsize().
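 *
 * Illustrative path sketch (the user-level call below is hypothetical and
 * only shows roughly how this routine is reached): a process asking for
 * the default large page size on a range does something like
 *
 *	struct memcntl_mha mha;
 *
 *	mha.mha_cmd = MHA_MAPSIZE_VA;
 *	mha.mha_flags = 0;
 *	mha.mha_pagesize = 0;		(zero selects the default size)
 *	(void) memcntl(addr, len, MC_HAT_ADVISE, (caddr_t)&mha, 0, 0);
 *
 * and the zero page size is what routes the request to this function.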
3275 */ 3276 int 3277 as_set_default_lpsize(struct as *as, caddr_t addr, size_t size) 3278 { 3279 struct seg *seg; 3280 caddr_t raddr; 3281 size_t rsize; 3282 size_t ssize; 3283 int rtype, rflags; 3284 int stype, sflags; 3285 int error; 3286 caddr_t setaddr; 3287 size_t setsize; 3288 int segvn; 3289 3290 if (size == 0) 3291 return (0); 3292 3293 AS_LOCK_ENTER(as, RW_WRITER); 3294 again: 3295 error = 0; 3296 3297 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); 3298 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) - 3299 (size_t)raddr; 3300 3301 if (raddr + rsize < raddr) { /* check for wraparound */ 3302 AS_LOCK_EXIT(as); 3303 return (ENOMEM); 3304 } 3305 as_clearwatchprot(as, raddr, rsize); 3306 seg = as_segat(as, raddr); 3307 if (seg == NULL) { 3308 as_setwatch(as); 3309 AS_LOCK_EXIT(as); 3310 return (ENOMEM); 3311 } 3312 if (seg->s_ops == &segvn_ops) { 3313 rtype = SEGOP_GETTYPE(seg, addr); 3314 rflags = rtype & (MAP_TEXT | MAP_INITDATA); 3315 rtype = rtype & (MAP_SHARED | MAP_PRIVATE); 3316 segvn = 1; 3317 } else { 3318 segvn = 0; 3319 } 3320 setaddr = raddr; 3321 setsize = 0; 3322 3323 for (; rsize != 0; rsize -= ssize, raddr += ssize, setsize += ssize) { 3324 if (raddr >= (seg->s_base + seg->s_size)) { 3325 seg = AS_SEGNEXT(as, seg); 3326 if (seg == NULL || raddr != seg->s_base) { 3327 error = ENOMEM; 3328 break; 3329 } 3330 if (seg->s_ops == &segvn_ops) { 3331 stype = SEGOP_GETTYPE(seg, raddr); 3332 sflags = stype & (MAP_TEXT | MAP_INITDATA); 3333 stype &= (MAP_SHARED | MAP_PRIVATE); 3334 if (segvn && (rflags != sflags || 3335 rtype != stype)) { 3336 /* 3337 * The next segment is also segvn but 3338 * has different flags and/or type. 3339 */ 3340 ASSERT(setsize != 0); 3341 error = as_iset_default_lpsize(as, 3342 setaddr, setsize, rflags, rtype); 3343 if (error) { 3344 break; 3345 } 3346 rflags = sflags; 3347 rtype = stype; 3348 setaddr = raddr; 3349 setsize = 0; 3350 } else if (!segvn) { 3351 rflags = sflags; 3352 rtype = stype; 3353 setaddr = raddr; 3354 setsize = 0; 3355 segvn = 1; 3356 } 3357 } else if (segvn) { 3358 /* The next segment is not segvn. */ 3359 ASSERT(setsize != 0); 3360 error = as_iset_default_lpsize(as, 3361 setaddr, setsize, rflags, rtype); 3362 if (error) { 3363 break; 3364 } 3365 segvn = 0; 3366 } 3367 } 3368 if ((raddr + rsize) > (seg->s_base + seg->s_size)) { 3369 ssize = seg->s_base + seg->s_size - raddr; 3370 } else { 3371 ssize = rsize; 3372 } 3373 } 3374 if (error == 0 && segvn) { 3375 /* The last chunk when rsize == 0. */ 3376 ASSERT(setsize != 0); 3377 error = as_iset_default_lpsize(as, setaddr, setsize, 3378 rflags, rtype); 3379 } 3380 3381 if (error == IE_RETRY) { 3382 goto again; 3383 } else if (error == IE_NOMEM) { 3384 error = EAGAIN; 3385 } else if (error == ENOTSUP) { 3386 error = EINVAL; 3387 } else if (error == EAGAIN) { 3388 mutex_enter(&as->a_contents); 3389 if (!AS_ISNOUNMAPWAIT(as)) { 3390 if (AS_ISUNMAPWAIT(as) == 0) { 3391 cv_broadcast(&as->a_cv); 3392 } 3393 AS_SETUNMAPWAIT(as); 3394 AS_LOCK_EXIT(as); 3395 while (AS_ISUNMAPWAIT(as)) { 3396 cv_wait(&as->a_cv, &as->a_contents); 3397 } 3398 mutex_exit(&as->a_contents); 3399 AS_LOCK_ENTER(as, RW_WRITER); 3400 } else { 3401 /* 3402 * We may have raced with 3403 * segvn_reclaim()/segspt_reclaim(). In this case 3404 * clean nounmapwait flag and retry since softlockcnt 3405 * in this segment may be already 0. We don't drop as 3406 * writer lock so our number of retries without 3407 * sleeping should be very small. See segvn_reclaim() 3408 * for more comments. 
3409 */ 3410 AS_CLRNOUNMAPWAIT(as); 3411 mutex_exit(&as->a_contents); 3412 } 3413 goto again; 3414 } 3415 3416 as_setwatch(as); 3417 AS_LOCK_EXIT(as); 3418 return (error); 3419 } 3420 3421 /* 3422 * Setup all of the uninitialized watched pages that we can. 3423 */ 3424 void 3425 as_setwatch(struct as *as) 3426 { 3427 struct watched_page *pwp; 3428 struct seg *seg; 3429 caddr_t vaddr; 3430 uint_t prot; 3431 int err, retrycnt; 3432 3433 if (avl_numnodes(&as->a_wpage) == 0) 3434 return; 3435 3436 ASSERT(AS_WRITE_HELD(as)); 3437 3438 for (pwp = avl_first(&as->a_wpage); pwp != NULL; 3439 pwp = AVL_NEXT(&as->a_wpage, pwp)) { 3440 retrycnt = 0; 3441 retry: 3442 vaddr = pwp->wp_vaddr; 3443 if (pwp->wp_oprot != 0 || /* already set up */ 3444 (seg = as_segat(as, vaddr)) == NULL || 3445 SEGOP_GETPROT(seg, vaddr, 0, &prot) != 0) 3446 continue; 3447 3448 pwp->wp_oprot = prot; 3449 if (pwp->wp_read) 3450 prot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC); 3451 if (pwp->wp_write) 3452 prot &= ~PROT_WRITE; 3453 if (pwp->wp_exec) 3454 prot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC); 3455 if (!(pwp->wp_flags & WP_NOWATCH) && prot != pwp->wp_oprot) { 3456 err = SEGOP_SETPROT(seg, vaddr, PAGESIZE, prot); 3457 if (err == IE_RETRY) { 3458 pwp->wp_oprot = 0; 3459 ASSERT(retrycnt == 0); 3460 retrycnt++; 3461 goto retry; 3462 } 3463 } 3464 pwp->wp_prot = prot; 3465 } 3466 } 3467 3468 /* 3469 * Clear all of the watched pages in the address space. 3470 */ 3471 void 3472 as_clearwatch(struct as *as) 3473 { 3474 struct watched_page *pwp; 3475 struct seg *seg; 3476 caddr_t vaddr; 3477 uint_t prot; 3478 int err, retrycnt; 3479 3480 if (avl_numnodes(&as->a_wpage) == 0) 3481 return; 3482 3483 ASSERT(AS_WRITE_HELD(as)); 3484 3485 for (pwp = avl_first(&as->a_wpage); pwp != NULL; 3486 pwp = AVL_NEXT(&as->a_wpage, pwp)) { 3487 retrycnt = 0; 3488 retry: 3489 vaddr = pwp->wp_vaddr; 3490 if (pwp->wp_oprot == 0 || /* not set up */ 3491 (seg = as_segat(as, vaddr)) == NULL) 3492 continue; 3493 3494 if ((prot = pwp->wp_oprot) != pwp->wp_prot) { 3495 err = SEGOP_SETPROT(seg, vaddr, PAGESIZE, prot); 3496 if (err == IE_RETRY) { 3497 ASSERT(retrycnt == 0); 3498 retrycnt++; 3499 goto retry; 3500 } 3501 } 3502 pwp->wp_oprot = 0; 3503 pwp->wp_prot = 0; 3504 } 3505 } 3506 3507 /* 3508 * Force a new setup for all the watched pages in the range. 
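 *
 * Illustrative example (not in the original comment): if the new segment
 * protection is PROT_READ|PROT_WRITE and the page has a write watchpoint
 * (wp_write set), the page is actually mapped with PROT_READ only, so the
 * first write to it faults and the watchpoint can be reported; wp_prot
 * records the reduced protection and wp_oprot the protection requested
 * here.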
3509 */ 3510 static void 3511 as_setwatchprot(struct as *as, caddr_t addr, size_t size, uint_t prot) 3512 { 3513 struct watched_page *pwp; 3514 struct watched_page tpw; 3515 caddr_t eaddr = addr + size; 3516 caddr_t vaddr; 3517 struct seg *seg; 3518 int err, retrycnt; 3519 uint_t wprot; 3520 avl_index_t where; 3521 3522 if (avl_numnodes(&as->a_wpage) == 0) 3523 return; 3524 3525 ASSERT(AS_WRITE_HELD(as)); 3526 3527 tpw.wp_vaddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); 3528 if ((pwp = avl_find(&as->a_wpage, &tpw, &where)) == NULL) 3529 pwp = avl_nearest(&as->a_wpage, where, AVL_AFTER); 3530 3531 while (pwp != NULL && pwp->wp_vaddr < eaddr) { 3532 retrycnt = 0; 3533 vaddr = pwp->wp_vaddr; 3534 3535 wprot = prot; 3536 if (pwp->wp_read) 3537 wprot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC); 3538 if (pwp->wp_write) 3539 wprot &= ~PROT_WRITE; 3540 if (pwp->wp_exec) 3541 wprot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC); 3542 if (!(pwp->wp_flags & WP_NOWATCH) && wprot != pwp->wp_oprot) { 3543 retry: 3544 seg = as_segat(as, vaddr); 3545 if (seg == NULL) { 3546 panic("as_setwatchprot: no seg"); 3547 /*NOTREACHED*/ 3548 } 3549 err = SEGOP_SETPROT(seg, vaddr, PAGESIZE, wprot); 3550 if (err == IE_RETRY) { 3551 ASSERT(retrycnt == 0); 3552 retrycnt++; 3553 goto retry; 3554 } 3555 } 3556 pwp->wp_oprot = prot; 3557 pwp->wp_prot = wprot; 3558 3559 pwp = AVL_NEXT(&as->a_wpage, pwp); 3560 } 3561 } 3562 3563 /* 3564 * Clear all of the watched pages in the range. 3565 */ 3566 static void 3567 as_clearwatchprot(struct as *as, caddr_t addr, size_t size) 3568 { 3569 caddr_t eaddr = addr + size; 3570 struct watched_page *pwp; 3571 struct watched_page tpw; 3572 uint_t prot; 3573 struct seg *seg; 3574 int err, retrycnt; 3575 avl_index_t where; 3576 3577 if (avl_numnodes(&as->a_wpage) == 0) 3578 return; 3579 3580 tpw.wp_vaddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); 3581 if ((pwp = avl_find(&as->a_wpage, &tpw, &where)) == NULL) 3582 pwp = avl_nearest(&as->a_wpage, where, AVL_AFTER); 3583 3584 ASSERT(AS_WRITE_HELD(as)); 3585 3586 while (pwp != NULL && pwp->wp_vaddr < eaddr) { 3587 3588 if ((prot = pwp->wp_oprot) != 0) { 3589 retrycnt = 0; 3590 3591 if (prot != pwp->wp_prot) { 3592 retry: 3593 seg = as_segat(as, pwp->wp_vaddr); 3594 if (seg == NULL) 3595 continue; 3596 err = SEGOP_SETPROT(seg, pwp->wp_vaddr, 3597 PAGESIZE, prot); 3598 if (err == IE_RETRY) { 3599 ASSERT(retrycnt == 0); 3600 retrycnt++; 3601 goto retry; 3602 3603 } 3604 } 3605 pwp->wp_oprot = 0; 3606 pwp->wp_prot = 0; 3607 } 3608 3609 pwp = AVL_NEXT(&as->a_wpage, pwp); 3610 } 3611 } 3612 3613 void 3614 as_signal_proc(struct as *as, k_siginfo_t *siginfo) 3615 { 3616 struct proc *p; 3617 3618 mutex_enter(&pidlock); 3619 for (p = practive; p; p = p->p_next) { 3620 if (p->p_as == as) { 3621 mutex_enter(&p->p_lock); 3622 if (p->p_as == as) 3623 sigaddq(p, NULL, siginfo, KM_NOSLEEP); 3624 mutex_exit(&p->p_lock); 3625 } 3626 } 3627 mutex_exit(&pidlock); 3628 } 3629 3630 /* 3631 * return memory object ID 3632 */ 3633 int 3634 as_getmemid(struct as *as, caddr_t addr, memid_t *memidp) 3635 { 3636 struct seg *seg; 3637 int sts; 3638 3639 AS_LOCK_ENTER(as, RW_READER); 3640 seg = as_segat(as, addr); 3641 if (seg == NULL) { 3642 AS_LOCK_EXIT(as); 3643 return (EFAULT); 3644 } 3645 /* 3646 * catch old drivers which may not support getmemid 3647 */ 3648 if (seg->s_ops->getmemid == NULL) { 3649 AS_LOCK_EXIT(as); 3650 return (ENODEV); 3651 } 3652 3653 sts = SEGOP_GETMEMID(seg, addr, memidp); 3654 3655 AS_LOCK_EXIT(as); 3656 return (sts); 3657 } 3658
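/*
 * Illustrative sketch (the caller below is hypothetical, not part of this
 * file): a kernel client that wants a stable identity for the object
 * backing a user address, for example to key a cache, can use
 *
 *	memid_t memid;
 *
 *	if (as_getmemid(curproc->p_as, uaddr, &memid) == 0) {
 *		use memid as the key for the backing object
 *	}
 *
 * ENODEV indicates that the segment driver backing the address does not
 * implement a getmemid operation.
 */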