/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 * Copyright 2018 Joyent, Inc.
 * Copyright (c) 2016 by Delphix. All rights reserved.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */

/*
 * VM - address spaces.
 */

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/errno.h>
#include <sys/systm.h>
#include <sys/mman.h>
#include <sys/sysmacros.h>
#include <sys/cpuvar.h>
#include <sys/sysinfo.h>
#include <sys/kmem.h>
#include <sys/vnode.h>
#include <sys/vmsystm.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/vtrace.h>

#include <vm/hat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_vn.h>
#include <vm/seg_dev.h>
#include <vm/seg_kmem.h>
#include <vm/seg_map.h>
#include <vm/seg_spt.h>
#include <vm/seg_hole.h>
#include <vm/page.h>

clock_t deadlk_wait = 1; /* number of ticks to wait before retrying */

static struct kmem_cache *as_cache;

static void as_setwatchprot(struct as *, caddr_t, size_t, uint_t);
static void as_clearwatchprot(struct as *, caddr_t, size_t);


/*
 * Verifying the segment lists is very time-consuming; it may not be
 * desirable always to define VERIFY_SEGLIST when DEBUG is set.
 */
#ifdef DEBUG
#define	VERIFY_SEGLIST
int do_as_verify = 0;
#endif

/*
 * Allocate a new callback data structure entry and fill in the events of
 * interest, the address range of interest, and the callback argument.
 * Link the entry on the as->a_callbacks list. A callback entry for the
 * entire address space may be specified with vaddr = 0 and size = -1.
 *
 * CALLERS RESPONSIBILITY: If not calling from within the process context for
 * the specified as, the caller must guarantee persistence of the specified as
 * for the duration of this function (eg. pages being locked within the as
 * will guarantee persistence).
 */
int
as_add_callback(struct as *as, void (*cb_func)(), void *arg, uint_t events,
    caddr_t vaddr, size_t size, int sleepflag)
{
	struct as_callback	*current_head, *cb;
	caddr_t			saddr;
	size_t			rsize;

	/* callback function and an event are mandatory */
	if ((cb_func == NULL) || ((events & AS_ALL_EVENT) == 0))
		return (EINVAL);

	/* Adding a callback after as_free has been called is not allowed */
	if (as == &kas)
		return (ENOMEM);

	/*
	 * vaddr = 0 and size = -1 is used to indicate that the callback range
	 * is the entire address space so no rounding is done in that case.
	 */
	if (size != -1) {
		saddr = (caddr_t)((uintptr_t)vaddr & (uintptr_t)PAGEMASK);
		rsize = (((size_t)(vaddr + size) + PAGEOFFSET) & PAGEMASK) -
		    (size_t)saddr;
		/* check for wraparound */
		if (saddr + rsize < saddr)
			return (ENOMEM);
	} else {
		if (vaddr != 0)
			return (EINVAL);
		saddr = vaddr;
		rsize = size;
	}

	/* Allocate and initialize a callback entry */
	cb = kmem_zalloc(sizeof (struct as_callback), sleepflag);
	if (cb == NULL)
		return (EAGAIN);

	cb->ascb_func = cb_func;
	cb->ascb_arg = arg;
	cb->ascb_events = events;
	cb->ascb_saddr = saddr;
	cb->ascb_len = rsize;

	/* Add the entry to the list */
	mutex_enter(&as->a_contents);
	current_head = as->a_callbacks;
	as->a_callbacks = cb;
	cb->ascb_next = current_head;

	/*
	 * The call to this function may lose in a race with
	 * a pertinent event - eg. a thread does long term memory locking
	 * but before the callback is added another thread executes as_unmap.
	 * A broadcast here resolves that.
	 */
	if ((cb->ascb_events & AS_UNMAPWAIT_EVENT) && AS_ISUNMAPWAIT(as)) {
		AS_CLRUNMAPWAIT(as);
		cv_broadcast(&as->a_cv);
	}

	mutex_exit(&as->a_contents);
	return (0);
}

/*
 * Search the callback list for an entry which pertains to arg.
 *
 * This is called from within the client upon completion of the callback.
 * RETURN VALUES:
 *	AS_CALLBACK_DELETED  (callback entry found and deleted)
 *	AS_CALLBACK_NOTFOUND (no callback entry found - this is ok)
 *	AS_CALLBACK_DELETE_DEFERRED (callback is in process, delete of this
 *			entry will be made in as_do_callbacks)
 *
 * If as_delete_callback encounters a matching entry with AS_CALLBACK_CALLED
 * set, it indicates that as_do_callbacks is processing this entry.  The
 * AS_ALL_EVENT events are cleared in the entry, and a broadcast is made
 * to unblock as_do_callbacks, in case it is blocked.
 *
 * CALLERS RESPONSIBILITY: If not calling from within the process context for
 * the specified as, the caller must guarantee persistence of the specified as
 * for the duration of this function (eg. pages being locked within the as
 * will guarantee persistence).
 */
uint_t
as_delete_callback(struct as *as, void *arg)
{
	struct as_callback **prevcb = &as->a_callbacks;
	struct as_callback *cb;
	uint_t rc = AS_CALLBACK_NOTFOUND;

	mutex_enter(&as->a_contents);
	for (cb = as->a_callbacks; cb; prevcb = &cb->ascb_next, cb = *prevcb) {
		if (cb->ascb_arg != arg)
			continue;

		/*
		 * If the events indicate AS_CALLBACK_CALLED, just clear
		 * AS_ALL_EVENT in the events field and wakeup the thread
		 * that may be waiting in as_do_callbacks.  as_do_callbacks
		 * will take care of removing this entry from the list.  In
		 * that case, return AS_CALLBACK_DELETE_DEFERRED.  Otherwise
		 * (AS_CALLBACK_CALLED not set), just remove it from the
		 * list, return the memory and return AS_CALLBACK_DELETED.
		 */
		if ((cb->ascb_events & AS_CALLBACK_CALLED) != 0) {
			/* leave AS_CALLBACK_CALLED */
			cb->ascb_events &= ~AS_ALL_EVENT;
			rc = AS_CALLBACK_DELETE_DEFERRED;
			cv_broadcast(&as->a_cv);
		} else {
			*prevcb = cb->ascb_next;
			kmem_free(cb, sizeof (struct as_callback));
			rc = AS_CALLBACK_DELETED;
		}
		break;
	}
	mutex_exit(&as->a_contents);
	return (rc);
}

/*
 * Searches the as callback list for a matching entry.
 * Returns a pointer to the first matching callback, or NULL if
 * nothing is found.
 * This function never sleeps so it is ok to call it with more
 * locks held than the (required) a_contents mutex.
 *
 * See also comment on as_do_callbacks below.
 */
static struct as_callback *
as_find_callback(struct as *as, uint_t events, caddr_t event_addr,
    size_t event_len)
{
	struct as_callback	*cb;

	ASSERT(MUTEX_HELD(&as->a_contents));
	for (cb = as->a_callbacks; cb != NULL; cb = cb->ascb_next) {
		/*
		 * If the callback has not already been called, then
		 * check if events or address range pertains.  An event_len
		 * of zero means do an unconditional callback.
		 */
		if (((cb->ascb_events & AS_CALLBACK_CALLED) != 0) ||
		    ((event_len != 0) && (((cb->ascb_events & events) == 0) ||
		    (event_addr + event_len < cb->ascb_saddr) ||
		    (event_addr > (cb->ascb_saddr + cb->ascb_len))))) {
			continue;
		}
		break;
	}
	return (cb);
}

/*
 * Executes a given callback and removes it from the callback list for
 * this address space.
 * This function may sleep so the caller must drop all locks except
 * a_contents before calling this func.
 *
 * See also comments on as_do_callbacks below.
 */
static void
as_execute_callback(struct as *as, struct as_callback *cb,
    uint_t events)
{
	struct as_callback **prevcb;
	void	*cb_arg;

	ASSERT(MUTEX_HELD(&as->a_contents) && (cb->ascb_events & events));
	cb->ascb_events |= AS_CALLBACK_CALLED;
	mutex_exit(&as->a_contents);
	(*cb->ascb_func)(as, cb->ascb_arg, events);
	mutex_enter(&as->a_contents);
	/*
	 * the callback function is required to delete the callback
	 * when the callback function determines it is OK for
	 * this thread to continue. as_delete_callback will clear
	 * the AS_ALL_EVENT in the events field when it is deleted.
	 * If the callback function called as_delete_callback,
	 * events will already be cleared and there will be no blocking.
	 */
	while ((cb->ascb_events & events) != 0) {
		cv_wait(&as->a_cv, &as->a_contents);
	}
	/*
	 * This entry needs to be taken off the list. Normally, the
	 * callback func itself does that, but unfortunately the list
	 * may have changed while the callback was running because the
	 * a_contents mutex was dropped and someone else other than the
	 * callback func itself could have called as_delete_callback,
	 * so we have to search to find this entry again.  The entry
	 * must have AS_CALLBACK_CALLED, and have the same 'arg'.
	 */
	cb_arg = cb->ascb_arg;
	prevcb = &as->a_callbacks;
	for (cb = as->a_callbacks; cb != NULL;
	    prevcb = &cb->ascb_next, cb = *prevcb) {
		if (((cb->ascb_events & AS_CALLBACK_CALLED) == 0) ||
		    (cb_arg != cb->ascb_arg)) {
			continue;
		}
		*prevcb = cb->ascb_next;
		kmem_free(cb, sizeof (struct as_callback));
		break;
	}
}

/*
 * Check the callback list for a matching event and intersection of
 * address range. If there is a match invoke the callback.  Skip an entry if:
 *    - a callback is already in progress for this entry (AS_CALLBACK_CALLED)
 *    - not an event of interest
 *    - not an address range of interest
 *
 * An event_len of zero indicates a request for an unconditional callback
 * (regardless of event), only the AS_CALLBACK_CALLED is checked.  The
 * a_contents lock must be dropped before a callback, so only one callback
 * can be done before returning. Return -1 (true) if a callback was
 * executed and removed from the list, else return 0 (false).
 *
 * The logically separate parts, i.e. finding a matching callback and
 * executing a given callback have been separated into two functions
 * so that they can be called with different sets of locks held beyond
 * the always-required a_contents. as_find_callback does not sleep so
 * it is ok to call it if more locks than a_contents (i.e. the a_lock
 * rwlock) are held. as_execute_callback on the other hand may sleep
 * so all locks beyond a_contents must be dropped by the caller if one
 * does not want to end comatose.
 */
static int
as_do_callbacks(struct as *as, uint_t events, caddr_t event_addr,
    size_t event_len)
{
	struct as_callback *cb;

	if ((cb = as_find_callback(as, events, event_addr, event_len))) {
		as_execute_callback(as, cb, events);
		return (-1);
	}
	return (0);
}

/*
 * Search for the segment containing addr. If a segment containing addr
 * exists, that segment is returned.  If no such segment exists, and
 * the list spans addresses greater than addr, then the first segment
 * whose base is greater than addr is returned; otherwise, NULL is
 * returned unless tail is true, in which case the last element of the
 * list is returned.
 *
 * a_seglast is used to cache the last found segment for repeated
 * searches to the same addr (which happens frequently).
 */
struct seg *
as_findseg(struct as *as, caddr_t addr, int tail)
{
	struct seg *seg = as->a_seglast;
	avl_index_t where;

	ASSERT(AS_LOCK_HELD(as));

	if (seg != NULL &&
	    seg->s_base <= addr &&
	    addr < seg->s_base + seg->s_size)
		return (seg);

	seg = avl_find(&as->a_segtree, &addr, &where);
	if (seg != NULL)
		return (as->a_seglast = seg);

	seg = avl_nearest(&as->a_segtree, where, AVL_AFTER);
	if (seg == NULL && tail)
		seg = avl_last(&as->a_segtree);
	return (as->a_seglast = seg);
}

#ifdef VERIFY_SEGLIST
/*
 * verify that the linked list is coherent
 */
static void
as_verify(struct as *as)
{
	struct seg *seg, *seglast, *p, *n;
	uint_t nsegs = 0;

	if (do_as_verify == 0)
		return;

	seglast = as->a_seglast;

	for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
		ASSERT(seg->s_as == as);
		p = AS_SEGPREV(as, seg);
		n = AS_SEGNEXT(as, seg);
		ASSERT(p == NULL || p->s_as == as);
		ASSERT(p == NULL || p->s_base < seg->s_base);
		ASSERT(n == NULL || n->s_base > seg->s_base);
		ASSERT(n != NULL || seg == avl_last(&as->a_segtree));
		if (seg == seglast)
			seglast = NULL;
		nsegs++;
	}
	ASSERT(seglast == NULL);
	ASSERT(avl_numnodes(&as->a_segtree) == nsegs);
}
#endif /* VERIFY_SEGLIST */

/*
 * Add a new segment to the address space. The avl_find()
 * may be expensive so we attempt to use last segment accessed
 * in as_gap() as an insertion point.
 */
int
as_addseg(struct as *as, struct seg *newseg)
{
	struct seg *seg;
	caddr_t addr;
	caddr_t eaddr;
	avl_index_t where;

	ASSERT(AS_WRITE_HELD(as));

	as->a_updatedir = 1;	/* inform /proc */
	gethrestime(&as->a_updatetime);

	if (as->a_lastgaphl != NULL) {
		struct seg *hseg = NULL;
		struct seg *lseg = NULL;

		if (as->a_lastgaphl->s_base > newseg->s_base) {
			hseg = as->a_lastgaphl;
			lseg = AVL_PREV(&as->a_segtree, hseg);
		} else {
			lseg = as->a_lastgaphl;
			hseg = AVL_NEXT(&as->a_segtree, lseg);
		}

		if (hseg && lseg && lseg->s_base < newseg->s_base &&
		    hseg->s_base > newseg->s_base) {
			avl_insert_here(&as->a_segtree, newseg, lseg,
			    AVL_AFTER);
			as->a_lastgaphl = NULL;
			as->a_seglast = newseg;
			return (0);
		}
		as->a_lastgaphl = NULL;
	}

	addr = newseg->s_base;
	eaddr = addr + newseg->s_size;

	seg = avl_find(&as->a_segtree, &addr, &where);

	if (seg == NULL)
		seg = avl_nearest(&as->a_segtree, where, AVL_AFTER);

	if (seg == NULL)
		seg = avl_last(&as->a_segtree);

	if (seg != NULL) {
		caddr_t base = seg->s_base;

		/*
		 * If top of seg is below the requested address, then
		 * the insertion point is at the end of the linked list,
		 * and seg points to the tail of the list.  Otherwise,
		 * the insertion point is immediately before seg.
		 */
		if (base + seg->s_size > addr) {
			if (addr >= base || eaddr > base) {
				return (-1);	/* overlapping segment */
			}
		}
	}
	as->a_seglast = newseg;
	avl_insert(&as->a_segtree, newseg, where);

#ifdef VERIFY_SEGLIST
	as_verify(as);
#endif
	return (0);
}

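/*
 * Remove a segment from the address space segment tree.  The a_seglast
 * and a_lastgap/a_lastgaphl hints are invalidated or advanced as needed,
 * and /proc is notified via a_updatedir.  Returns the segment removed,
 * or NULL if seg is NULL.
 */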
struct seg *
as_removeseg(struct as *as, struct seg *seg)
{
	avl_tree_t *t;

	ASSERT(AS_WRITE_HELD(as));

	as->a_updatedir = 1;	/* inform /proc */
	gethrestime(&as->a_updatetime);

	if (seg == NULL)
		return (NULL);

	t = &as->a_segtree;
	if (as->a_seglast == seg)
		as->a_seglast = NULL;
	as->a_lastgaphl = NULL;

	/*
	 * if this segment is at an address higher than
	 * a_lastgap, set a_lastgap to the next segment (NULL if last segment)
	 */
	if (as->a_lastgap &&
	    (seg == as->a_lastgap || seg->s_base > as->a_lastgap->s_base))
		as->a_lastgap = AVL_NEXT(t, seg);

	/*
	 * remove the segment from the seg tree
	 */
	avl_remove(t, seg);

#ifdef VERIFY_SEGLIST
	as_verify(as);
#endif
	return (seg);
}

/*
 * Find a segment containing addr.
 */
struct seg *
as_segat(struct as *as, caddr_t addr)
{
	struct seg *seg = as->a_seglast;

	ASSERT(AS_LOCK_HELD(as));

	if (seg != NULL && seg->s_base <= addr &&
	    addr < seg->s_base + seg->s_size)
		return (seg);

	seg = avl_find(&as->a_segtree, &addr, NULL);
	return (seg);
}

/*
 * Serialize all searches for holes in an address space to
 * prevent two or more threads from allocating the same virtual
 * address range.  The address space must not be "read/write"
 * locked by the caller since we may block.
 */
void
as_rangelock(struct as *as)
{
	mutex_enter(&as->a_contents);
	while (AS_ISCLAIMGAP(as))
		cv_wait(&as->a_cv, &as->a_contents);
	AS_SETCLAIMGAP(as);
	mutex_exit(&as->a_contents);
}

/*
 * Release hold on a_state & AS_CLAIMGAP and signal any other blocked threads.
 */
void
as_rangeunlock(struct as *as)
{
	mutex_enter(&as->a_contents);
	AS_CLRCLAIMGAP(as);
	cv_signal(&as->a_cv);
	mutex_exit(&as->a_contents);
}

/*
 * compare segments (or just an address) by segment address range
 */
static int
as_segcompar(const void *x, const void *y)
{
	struct seg *a = (struct seg *)x;
	struct seg *b = (struct seg *)y;

	if (a->s_base < b->s_base)
		return (-1);
	if (a->s_base >= b->s_base + b->s_size)
		return (1);
	return (0);
}

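/*
 * Initialize the AVL trees backing an address space: the segment tree,
 * ordered by base address via as_segcompar, and the watched page tree
 * used for /proc watchpoints.
 */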
void
as_avlinit(struct as *as)
{
	avl_create(&as->a_segtree, as_segcompar, sizeof (struct seg),
	    offsetof(struct seg, s_tree));
	avl_create(&as->a_wpage, wp_compare, sizeof (struct watched_page),
	    offsetof(struct watched_page, wp_link));
}

/*ARGSUSED*/
static int
as_constructor(void *buf, void *cdrarg, int kmflags)
{
	struct as *as = buf;

	mutex_init(&as->a_contents, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&as->a_cv, NULL, CV_DEFAULT, NULL);
	rw_init(&as->a_lock, NULL, RW_DEFAULT, NULL);
	as_avlinit(as);
	return (0);
}

/*ARGSUSED1*/
static void
as_destructor(void *buf, void *cdrarg)
{
	struct as *as = buf;

	avl_destroy(&as->a_segtree);
	mutex_destroy(&as->a_contents);
	cv_destroy(&as->a_cv);
	rw_destroy(&as->a_lock);
}

void
as_init(void)
{
	as_cache = kmem_cache_create("as_cache", sizeof (struct as), 0,
	    as_constructor, as_destructor, NULL, NULL, NULL, 0);
}

/*
 * Allocate and initialize an address space data structure.
 * We call hat_alloc to allow any machine dependent
 * information in the hat structure to be initialized.
 */
struct as *
as_alloc(void)
{
	struct as *as;

	as = kmem_cache_alloc(as_cache, KM_SLEEP);

	as->a_flags = 0;
	as->a_vbits = 0;
	as->a_hrm = NULL;
	as->a_seglast = NULL;
	as->a_size = 0;
	as->a_resvsize = 0;
	as->a_updatedir = 0;
	gethrestime(&as->a_updatetime);
	as->a_objectdir = NULL;
	as->a_sizedir = 0;
	as->a_userlimit = (caddr_t)USERLIMIT;
	as->a_lastgap = NULL;
	as->a_lastgaphl = NULL;
	as->a_callbacks = NULL;
	as->a_proc = NULL;

	AS_LOCK_ENTER(as, RW_WRITER);
	as->a_hat = hat_alloc(as);	/* create hat for default system mmu */
	AS_LOCK_EXIT(as);

	return (as);
}

/*
 * Free an address space data structure.
 * Need to free the hat first and then
 * all the segments on this as and finally
 * the space for the as struct itself.
 */
void
as_free(struct as *as)
{
	struct hat *hat = as->a_hat;
	struct seg *seg, *next;
	boolean_t free_started = B_FALSE;

top:
	/*
	 * Invoke ALL callbacks. as_do_callbacks will do one callback
	 * per call, and not return (-1) until the callback has completed.
	 * When as_do_callbacks returns zero, all callbacks have completed.
	 */
	mutex_enter(&as->a_contents);
	while (as->a_callbacks && as_do_callbacks(as, AS_ALL_EVENT, 0, 0))
		;

	mutex_exit(&as->a_contents);
	AS_LOCK_ENTER(as, RW_WRITER);

	if (!free_started) {
		free_started = B_TRUE;
		hat_free_start(hat);
	}
	for (seg = AS_SEGFIRST(as); seg != NULL; seg = next) {
		int err;

		next = AS_SEGNEXT(as, seg);
retry:
		err = SEGOP_UNMAP(seg, seg->s_base, seg->s_size);
		if (err == EAGAIN) {
			mutex_enter(&as->a_contents);
			if (as->a_callbacks) {
				AS_LOCK_EXIT(as);
			} else if (!AS_ISNOUNMAPWAIT(as)) {
				/*
				 * Memory is currently locked. Wait for a
				 * cv_signal that it has been unlocked, then
				 * try the operation again.
				 */
				if (AS_ISUNMAPWAIT(as) == 0)
					cv_broadcast(&as->a_cv);
				AS_SETUNMAPWAIT(as);
				AS_LOCK_EXIT(as);
				while (AS_ISUNMAPWAIT(as))
					cv_wait(&as->a_cv, &as->a_contents);
			} else {
				/*
				 * We may have raced with
				 * segvn_reclaim()/segspt_reclaim(). In this
				 * case clean nounmapwait flag and retry since
				 * softlockcnt in this segment may be already
				 * 0.  We don't drop as writer lock so our
				 * number of retries without sleeping should
				 * be very small. See segvn_reclaim() for
				 * more comments.
				 */
				AS_CLRNOUNMAPWAIT(as);
				mutex_exit(&as->a_contents);
				goto retry;
			}
			mutex_exit(&as->a_contents);
			goto top;
		} else {
			/*
			 * We do not expect any other error return at this
			 * time. This is similar to an ASSERT in seg_unmap()
			 */
			ASSERT(err == 0);
		}
	}
	hat_free_end(hat);
	AS_LOCK_EXIT(as);

	/* /proc stuff */
	ASSERT(avl_numnodes(&as->a_wpage) == 0);
	if (as->a_objectdir) {
		kmem_free(as->a_objectdir, as->a_sizedir * sizeof (vnode_t *));
		as->a_objectdir = NULL;
		as->a_sizedir = 0;
	}

	/*
	 * Free the struct as back to kmem.  Assert it has no segments.
	 */
	ASSERT(avl_numnodes(&as->a_segtree) == 0);
	kmem_cache_free(as_cache, as);
}

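/*
 * Duplicate address space "as" into a new address space for "forkedproc"
 * (i.e. at fork time): allocate a new as, duplicate the hat, and dup each
 * segment.  Segments marked S_PURGE are not duplicated; their size is
 * instead deducted from the new address space's a_resvsize.  On success
 * the new as is installed as forkedproc->p_as.
 */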
int
as_dup(struct as *as, struct proc *forkedproc)
{
	struct as *newas;
	struct seg *seg, *newseg;
	size_t	purgesize = 0;
	int error;

	AS_LOCK_ENTER(as, RW_WRITER);
	as_clearwatch(as);
	newas = as_alloc();
	newas->a_userlimit = as->a_userlimit;
	newas->a_proc = forkedproc;

	AS_LOCK_ENTER(newas, RW_WRITER);

	(void) hat_dup(as->a_hat, newas->a_hat, NULL, 0, HAT_DUP_SRD);

	for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {

		if (seg->s_flags & S_PURGE) {
			purgesize += seg->s_size;
			continue;
		}

		newseg = seg_alloc(newas, seg->s_base, seg->s_size);
		if (newseg == NULL) {
			AS_LOCK_EXIT(newas);
			as_setwatch(as);
			AS_LOCK_EXIT(as);
			as_free(newas);
			return (-1);
		}
		if ((error = SEGOP_DUP(seg, newseg)) != 0) {
			/*
			 * We call seg_free() on the new seg
			 * because the segment is not set up
			 * completely; i.e. it has no ops.
			 */
			as_setwatch(as);
			AS_LOCK_EXIT(as);
			seg_free(newseg);
			AS_LOCK_EXIT(newas);
			as_free(newas);
			return (error);
		}
		if ((newseg->s_flags & S_HOLE) == 0) {
			newas->a_size += seg->s_size;
		}
	}
	newas->a_resvsize = as->a_resvsize - purgesize;

	error = hat_dup(as->a_hat, newas->a_hat, NULL, 0, HAT_DUP_ALL);

	AS_LOCK_EXIT(newas);

	as_setwatch(as);
	AS_LOCK_EXIT(as);
	if (error != 0) {
		as_free(newas);
		return (error);
	}
	forkedproc->p_as = newas;
	return (0);
}

/*
 * Handle a ``fault'' at addr for size bytes.
 */
faultcode_t
as_fault(struct hat *hat, struct as *as, caddr_t addr, size_t size,
    enum fault_type type, enum seg_rw rw)
{
	struct seg *seg;
	caddr_t raddr;			/* rounded down addr */
	size_t rsize;			/* rounded up size */
	size_t ssize;
	faultcode_t res = 0;
	caddr_t addrsav;
	struct seg *segsav;
	int as_lock_held;
	klwp_t *lwp = ttolwp(curthread);


retry:
	/*
	 * Indicate that the lwp is not to be stopped while waiting for a
	 * pagefault.  This is to avoid deadlock while debugging a process
	 * via /proc over NFS (in particular).
	 */
	if (lwp != NULL)
		lwp->lwp_nostop++;

	/*
	 * same length must be used when we softlock and softunlock.  We
	 * don't support softunlocking lengths less than the original length
	 * when there is largepage support.  See seg_dev.c for more
	 * comments.
	 */
	switch (type) {

	case F_SOFTLOCK:
		CPU_STATS_ADD_K(vm, softlock, 1);
		break;

	case F_SOFTUNLOCK:
		break;

	case F_PROT:
		CPU_STATS_ADD_K(vm, prot_fault, 1);
		break;

	case F_INVAL:
		CPU_STATS_ENTER_K();
		CPU_STATS_ADDQ(CPU, vm, as_fault, 1);
		if (as == &kas)
			CPU_STATS_ADDQ(CPU, vm, kernel_asflt, 1);
		CPU_STATS_EXIT_K();
		break;
	}

	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
	    (size_t)raddr;

	/*
	 * XXX -- Don't grab the as lock for segkmap. We should grab it for
	 * correctness, but then we could be stuck holding this lock for
	 * a LONG time if the fault needs to be resolved on a slow
	 * filesystem, and then no-one will be able to exec new commands,
	 * as exec'ing requires the write lock on the as.
	 */
	if (as == &kas && segkmap && segkmap->s_base <= raddr &&
	    raddr + size < segkmap->s_base + segkmap->s_size) {
		seg = segkmap;
		as_lock_held = 0;
	} else {
		AS_LOCK_ENTER(as, RW_READER);

		seg = as_segat(as, raddr);
		if (seg == NULL) {
			AS_LOCK_EXIT(as);
			if (lwp != NULL)
				lwp->lwp_nostop--;
			return (FC_NOMAP);
		}

		as_lock_held = 1;
	}

	addrsav = raddr;
	segsav = seg;

	for (; rsize != 0; rsize -= ssize, raddr += ssize) {
		if (raddr >= seg->s_base + seg->s_size) {
			seg = AS_SEGNEXT(as, seg);
			if (seg == NULL || raddr != seg->s_base) {
				res = FC_NOMAP;
				break;
			}
		}
		if (raddr + rsize > seg->s_base + seg->s_size)
			ssize = seg->s_base + seg->s_size - raddr;
		else
			ssize = rsize;

		res = SEGOP_FAULT(hat, seg, raddr, ssize, type, rw);
		if (res != 0)
			break;
	}

	/*
	 * If we were SOFTLOCKing and encountered a failure,
	 * we must SOFTUNLOCK the range we already did. (Maybe we
	 * should just panic if we are SOFTLOCKing or even SOFTUNLOCKing
	 * right here...)
	 */
	if (res != 0 && type == F_SOFTLOCK) {
		for (seg = segsav; addrsav < raddr; addrsav += ssize) {
			if (addrsav >= seg->s_base + seg->s_size)
				seg = AS_SEGNEXT(as, seg);
			ASSERT(seg != NULL);
			/*
			 * Now call the fault routine again to perform the
			 * unlock using S_OTHER instead of the rw variable
			 * since we never got a chance to touch the pages.
			 */
			if (raddr > seg->s_base + seg->s_size)
				ssize = seg->s_base + seg->s_size - addrsav;
			else
				ssize = raddr - addrsav;
			(void) SEGOP_FAULT(hat, seg, addrsav, ssize,
			    F_SOFTUNLOCK, S_OTHER);
		}
	}
	if (as_lock_held)
		AS_LOCK_EXIT(as);
	if (lwp != NULL)
		lwp->lwp_nostop--;

	/*
	 * If the lower levels returned EDEADLK for a fault,
	 * It means that we should retry the fault.  Let's wait
	 * a bit also to let the deadlock causing condition clear.
	 * This is part of a gross hack to work around a design flaw
	 * in the ufs/sds logging code and should go away when the
	 * logging code is re-designed to fix the problem. See bug
	 * 4125102 for details of the problem.
	 */
	if (FC_ERRNO(res) == EDEADLK) {
		delay(deadlk_wait);
		res = 0;
		goto retry;
	}
	return (res);
}


/*
 * Asynchronous ``fault'' at addr for size bytes.
 */
faultcode_t
as_faulta(struct as *as, caddr_t addr, size_t size)
{
	struct seg *seg;
	caddr_t raddr;			/* rounded down addr */
	size_t rsize;			/* rounded up size */
	faultcode_t res = 0;
	klwp_t *lwp = ttolwp(curthread);

retry:
	/*
	 * Indicate that the lwp is not to be stopped while waiting
	 * for a pagefault.  This is to avoid deadlock while debugging
	 * a process via /proc over NFS (in particular).
	 */
	if (lwp != NULL)
		lwp->lwp_nostop++;

	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
	    (size_t)raddr;

	AS_LOCK_ENTER(as, RW_READER);
	seg = as_segat(as, raddr);
	if (seg == NULL) {
		AS_LOCK_EXIT(as);
		if (lwp != NULL)
			lwp->lwp_nostop--;
		return (FC_NOMAP);
	}

	for (; rsize != 0; rsize -= PAGESIZE, raddr += PAGESIZE) {
		if (raddr >= seg->s_base + seg->s_size) {
			seg = AS_SEGNEXT(as, seg);
			if (seg == NULL || raddr != seg->s_base) {
				res = FC_NOMAP;
				break;
			}
		}
		res = SEGOP_FAULTA(seg, raddr);
		if (res != 0)
			break;
	}
	AS_LOCK_EXIT(as);
	if (lwp != NULL)
		lwp->lwp_nostop--;
	/*
	 * If the lower levels returned EDEADLK for a fault,
	 * It means that we should retry the fault.  Let's wait
	 * a bit also to let the deadlock causing condition clear.
	 * This is part of a gross hack to work around a design flaw
	 * in the ufs/sds logging code and should go away when the
	 * logging code is re-designed to fix the problem. See bug
	 * 4125102 for details of the problem.
	 */
	if (FC_ERRNO(res) == EDEADLK) {
		delay(deadlk_wait);
		res = 0;
		goto retry;
	}
	return (res);
}

/*
 * Set the virtual mapping for the interval from [addr : addr + size)
 * in address space `as' to have the specified protection.
 * It is ok for the range to cross over several segments,
 * as long as they are contiguous.
 */
int
as_setprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
{
	struct seg *seg;
	struct as_callback *cb;
	size_t ssize;
	caddr_t raddr;			/* rounded down addr */
	size_t rsize;			/* rounded up size */
	int error = 0, writer = 0;
	caddr_t saveraddr;
	size_t saversize;

setprot_top:
	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
	    (size_t)raddr;

	if (raddr + rsize < raddr)		/* check for wraparound */
		return (ENOMEM);

	saveraddr = raddr;
	saversize = rsize;

	/*
	 * Normally we only lock the as as a reader. But
	 * if due to setprot the segment driver needs to split
	 * a segment it will return IE_RETRY. Therefore we re-acquire
	 * the as lock as a writer so the segment driver can change
	 * the seg list. Also the segment driver will return IE_RETRY
	 * after it has changed the segment list so we therefore keep
	 * locking as a writer. Since these operations should be rare
	 * we only want to lock as a writer when necessary.
	 */
	if (writer || avl_numnodes(&as->a_wpage) != 0) {
		AS_LOCK_ENTER(as, RW_WRITER);
	} else {
		AS_LOCK_ENTER(as, RW_READER);
	}

	as_clearwatchprot(as, raddr, rsize);
	seg = as_segat(as, raddr);
	if (seg == NULL) {
		as_setwatch(as);
		AS_LOCK_EXIT(as);
		return (ENOMEM);
	}

	for (; rsize != 0; rsize -= ssize, raddr += ssize) {
		if (raddr >= seg->s_base + seg->s_size) {
			seg = AS_SEGNEXT(as, seg);
			if (seg == NULL || raddr != seg->s_base) {
				error = ENOMEM;
				break;
			}
		}
		if ((raddr + rsize) > (seg->s_base + seg->s_size))
			ssize = seg->s_base + seg->s_size - raddr;
		else
			ssize = rsize;
retry:
		error = SEGOP_SETPROT(seg, raddr, ssize, prot);

		if (error == IE_NOMEM) {
			error = EAGAIN;
			break;
		}

		if (error == IE_RETRY) {
			AS_LOCK_EXIT(as);
			writer = 1;
			goto setprot_top;
		}

		if (error == EAGAIN) {
			/*
			 * Make sure we have a_lock as writer.
			 */
			if (writer == 0) {
				AS_LOCK_EXIT(as);
				writer = 1;
				goto setprot_top;
			}

			/*
			 * Memory is currently locked.  It must be unlocked
			 * before this operation can succeed through a retry.
			 * The possible reasons for locked memory and
			 * corresponding strategies for unlocking are:
			 * (1) Normal I/O
			 *	wait for a signal that the I/O operation
			 *	has completed and the memory is unlocked.
			 * (2) Asynchronous I/O
			 *	The aio subsystem does not unlock pages when
			 *	the I/O is completed. Those pages are unlocked
			 *	when the application calls aiowait/aioerror.
			 *	So, to prevent blocking forever, cv_broadcast()
			 *	is done to wake up aio_cleanup_thread.
			 *	Subsequently, segvn_reclaim will be called, and
			 *	that will do AS_CLRUNMAPWAIT() and wake us up.
			 * (3) Long term page locking:
			 *	Drivers intending to have pages locked for a
			 *	period considerably longer than for normal I/O
			 *	(essentially forever) may have registered for a
			 *	callback so they may unlock these pages on
			 *	request. This is needed to allow this operation
			 *	to succeed. Each entry on the callback list is
			 *	examined. If the event or address range pertains
			 *	the callback is invoked (unless it already is in
			 *	progress). The a_contents lock must be dropped
			 *	before the callback, so only one callback can
			 *	be done at a time. Go to the top and do more
			 *	until zero is returned. If zero is returned,
			 *	either there were no callbacks for this event
			 *	or they were already in progress.
			 */
			mutex_enter(&as->a_contents);
			if (as->a_callbacks &&
			    (cb = as_find_callback(as, AS_SETPROT_EVENT,
			    seg->s_base, seg->s_size))) {
				AS_LOCK_EXIT(as);
				as_execute_callback(as, cb, AS_SETPROT_EVENT);
			} else if (!AS_ISNOUNMAPWAIT(as)) {
				if (AS_ISUNMAPWAIT(as) == 0)
					cv_broadcast(&as->a_cv);
				AS_SETUNMAPWAIT(as);
				AS_LOCK_EXIT(as);
				while (AS_ISUNMAPWAIT(as))
					cv_wait(&as->a_cv, &as->a_contents);
			} else {
				/*
				 * We may have raced with
				 * segvn_reclaim()/segspt_reclaim(). In this
				 * case clean nounmapwait flag and retry since
				 * softlockcnt in this segment may be already
				 * 0.  We don't drop as writer lock so our
				 * number of retries without sleeping should
				 * be very small. See segvn_reclaim() for
				 * more comments.
				 */
				AS_CLRNOUNMAPWAIT(as);
				mutex_exit(&as->a_contents);
				goto retry;
			}
			mutex_exit(&as->a_contents);
			goto setprot_top;
		} else if (error != 0)
			break;
	}
	if (error != 0) {
		as_setwatch(as);
	} else {
		as_setwatchprot(as, saveraddr, saversize, prot);
	}
	AS_LOCK_EXIT(as);
	return (error);
}

/*
 * Check to make sure that the interval [addr, addr + size)
 * in address space `as' has at least the specified protection.
 * It is ok for the range to cross over several segments, as long
 * as they are contiguous.
 */
int
as_checkprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
{
	struct seg *seg;
	size_t ssize;
	caddr_t raddr;			/* rounded down addr */
	size_t rsize;			/* rounded up size */
	int error = 0;

	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
	    (size_t)raddr;

	if (raddr + rsize < raddr)		/* check for wraparound */
		return (ENOMEM);

	/*
	 * This is ugly as sin...
	 * Normally, we only acquire the address space readers lock.
	 * However, if the address space has watchpoints present,
	 * we must acquire the writer lock on the address space for
	 * the benefit of as_clearwatchprot() and as_setwatchprot().
	 */
	if (avl_numnodes(&as->a_wpage) != 0)
		AS_LOCK_ENTER(as, RW_WRITER);
	else
		AS_LOCK_ENTER(as, RW_READER);
	as_clearwatchprot(as, raddr, rsize);
	seg = as_segat(as, raddr);
	if (seg == NULL) {
		as_setwatch(as);
		AS_LOCK_EXIT(as);
		return (ENOMEM);
	}

	for (; rsize != 0; rsize -= ssize, raddr += ssize) {
		if (raddr >= seg->s_base + seg->s_size) {
			seg = AS_SEGNEXT(as, seg);
			if (seg == NULL || raddr != seg->s_base) {
				error = ENOMEM;
				break;
			}
		}
		if ((raddr + rsize) > (seg->s_base + seg->s_size))
			ssize = seg->s_base + seg->s_size - raddr;
		else
			ssize = rsize;

		error = SEGOP_CHECKPROT(seg, raddr, ssize, prot);
		if (error != 0)
			break;
	}
	as_setwatch(as);
	AS_LOCK_EXIT(as);
	return (error);
}

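/*
 * Unmap the range [addr, addr + size) from address space "as", walking
 * each overlapping segment and calling its unmap routine.  EAGAIN (locked
 * memory) is handled by running AS_UNMAP_EVENT callbacks or waiting for
 * the memory to be unlocked; IE_RETRY restarts the walk from the top.
 */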
int
as_unmap(struct as *as, caddr_t addr, size_t size)
{
	struct seg *seg, *seg_next;
	struct as_callback *cb;
	caddr_t raddr, eaddr;
	size_t ssize, rsize = 0;
	int err;

top:
	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	eaddr = (caddr_t)(((uintptr_t)(addr + size) + PAGEOFFSET) &
	    (uintptr_t)PAGEMASK);

	AS_LOCK_ENTER(as, RW_WRITER);

	as->a_updatedir = 1;	/* inform /proc */
	gethrestime(&as->a_updatetime);

	/*
	 * Use as_findseg to find the first segment in the range, then
	 * step through the segments in order, following s_next.
	 */
	as_clearwatchprot(as, raddr, eaddr - raddr);

	for (seg = as_findseg(as, raddr, 0); seg != NULL; seg = seg_next) {
		const boolean_t is_hole = ((seg->s_flags & S_HOLE) != 0);

		if (eaddr <= seg->s_base)
			break;		/* eaddr was in a gap; all done */

		/* this is implied by the test above */
		ASSERT(raddr < eaddr);

		if (raddr < seg->s_base)
			raddr = seg->s_base;	/* raddr was in a gap */

		if (eaddr > (seg->s_base + seg->s_size))
			ssize = seg->s_base + seg->s_size - raddr;
		else
			ssize = eaddr - raddr;

		/*
		 * Save next segment pointer since seg can be
		 * destroyed during the segment unmap operation.
		 */
		seg_next = AS_SEGNEXT(as, seg);

		/*
		 * We didn't count /dev/null mappings, so ignore them here.
		 * We'll handle MAP_NORESERVE cases in segvn_unmap(). (Again,
		 * we have to do this check here while we have seg.)
		 */
		rsize = 0;
		if (!SEG_IS_DEVNULL_MAPPING(seg) &&
		    !SEG_IS_PARTIAL_RESV(seg))
			rsize = ssize;

retry:
		err = SEGOP_UNMAP(seg, raddr, ssize);
		if (err == EAGAIN) {
			/*
			 * Memory is currently locked.  It must be unlocked
			 * before this operation can succeed through a retry.
			 * The possible reasons for locked memory and
			 * corresponding strategies for unlocking are:
			 * (1) Normal I/O
			 *	wait for a signal that the I/O operation
			 *	has completed and the memory is unlocked.
			 * (2) Asynchronous I/O
			 *	The aio subsystem does not unlock pages when
			 *	the I/O is completed. Those pages are unlocked
			 *	when the application calls aiowait/aioerror.
			 *	So, to prevent blocking forever, cv_broadcast()
			 *	is done to wake up aio_cleanup_thread.
			 *	Subsequently, segvn_reclaim will be called, and
			 *	that will do AS_CLRUNMAPWAIT() and wake us up.
			 * (3) Long term page locking:
			 *	Drivers intending to have pages locked for a
			 *	period considerably longer than for normal I/O
			 *	(essentially forever) may have registered for a
			 *	callback so they may unlock these pages on
			 *	request. This is needed to allow this operation
			 *	to succeed. Each entry on the callback list is
			 *	examined. If the event or address range pertains
			 *	the callback is invoked (unless it already is in
			 *	progress). The a_contents lock must be dropped
			 *	before the callback, so only one callback can
			 *	be done at a time. Go to the top and do more
			 *	until zero is returned. If zero is returned,
			 *	either there were no callbacks for this event
			 *	or they were already in progress.
			 */
			mutex_enter(&as->a_contents);
			if (as->a_callbacks &&
			    (cb = as_find_callback(as, AS_UNMAP_EVENT,
			    seg->s_base, seg->s_size))) {
				AS_LOCK_EXIT(as);
				as_execute_callback(as, cb, AS_UNMAP_EVENT);
			} else if (!AS_ISNOUNMAPWAIT(as)) {
				if (AS_ISUNMAPWAIT(as) == 0)
					cv_broadcast(&as->a_cv);
				AS_SETUNMAPWAIT(as);
				AS_LOCK_EXIT(as);
				while (AS_ISUNMAPWAIT(as))
					cv_wait(&as->a_cv, &as->a_contents);
			} else {
				/*
				 * We may have raced with
				 * segvn_reclaim()/segspt_reclaim(). In this
				 * case clean nounmapwait flag and retry since
				 * softlockcnt in this segment may be already
				 * 0.  We don't drop as writer lock so our
				 * number of retries without sleeping should
				 * be very small. See segvn_reclaim() for
				 * more comments.
				 */
				AS_CLRNOUNMAPWAIT(as);
				mutex_exit(&as->a_contents);
				goto retry;
			}
			mutex_exit(&as->a_contents);
			goto top;
		} else if (err == IE_RETRY) {
			AS_LOCK_EXIT(as);
			goto top;
		} else if (err) {
			as_setwatch(as);
			AS_LOCK_EXIT(as);
			return (-1);
		}

		if (!is_hole) {
			as->a_size -= ssize;
			if (rsize)
				as->a_resvsize -= rsize;
		}
		raddr += ssize;
	}
	AS_LOCK_EXIT(as);
	return (0);
}

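/*
 * Create one or more segvn segments covering [addr, addr + size),
 * splitting the range on the page size boundaries described by szcvec
 * and invoking crfp for each piece.  *segcreated is set once any segment
 * has been created so that the caller can unmap the partial mapping if a
 * later piece fails.
 */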
static int
as_map_segvn_segs(struct as *as, caddr_t addr, size_t size, uint_t szcvec,
    segcreate_func_t crfp, struct segvn_crargs *vn_a, boolean_t *segcreated)
{
	uint_t szc, nszc, save_szcvec;
	int error;
	caddr_t a, eaddr;
	size_t pgsz = 0;
	const boolean_t do_off = (vn_a->vp != NULL || vn_a->amp != NULL);

	ASSERT(AS_WRITE_HELD(as));
	ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
	ASSERT(IS_P2ALIGNED(size, PAGESIZE));
	ASSERT(vn_a->vp == NULL || vn_a->amp == NULL);

	if (!do_off) {
		vn_a->offset = 0;
	}

	if (szcvec <= 1) {
		struct seg *seg, *segref;

		seg = segref = seg_alloc(as, addr, size);
		if (seg == NULL) {
			return (ENOMEM);
		}
		vn_a->szc = 0;
		error = (*crfp)(&seg, vn_a);
		if (error != 0) {
			VERIFY3P(seg, ==, segref);
			seg_free(seg);
		} else {
			as->a_size += size;
			as->a_resvsize += size;
		}
		return (error);
	}

	eaddr = addr + size;
	save_szcvec = szcvec;
	szcvec >>= 1;
	szc = 0;
	nszc = 0;
	while (szcvec) {
		if ((szcvec & 0x1) == 0) {
			nszc++;
			szcvec >>= 1;
			continue;
		}
		nszc++;
		pgsz = page_get_pagesize(nszc);
		a = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
		if (a != addr) {
			struct seg *seg, *segref;
			size_t segsize;

			ASSERT(a < eaddr);

			segsize = a - addr;
			seg = segref = seg_alloc(as, addr, segsize);
			if (seg == NULL) {
				return (ENOMEM);
			}
			vn_a->szc = szc;
			error = (*crfp)(&seg, vn_a);
			if (error != 0) {
				VERIFY3P(seg, ==, segref);
				seg_free(seg);
				return (error);
			}
			as->a_size += segsize;
			as->a_resvsize += segsize;
			*segcreated = B_TRUE;
			if (do_off) {
				vn_a->offset += segsize;
			}
			addr = a;
		}
		szc = nszc;
		szcvec >>= 1;
	}

	ASSERT(addr < eaddr);
	szcvec = save_szcvec | 1; /* add 8K pages */
	while (szcvec) {
		a = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
		ASSERT(a >= addr);
		if (a != addr) {
			struct seg *seg, *segref;
			size_t segsize;

			segsize = a - addr;
			seg = segref = seg_alloc(as, addr, segsize);
			if (seg == NULL) {
				return (ENOMEM);
			}
			vn_a->szc = szc;
			error = (*crfp)(&seg, vn_a);
			if (error != 0) {
				VERIFY3P(seg, ==, segref);
				seg_free(seg);
				return (error);
			}
			as->a_size += segsize;
			as->a_resvsize += segsize;
			*segcreated = B_TRUE;
			if (do_off) {
				vn_a->offset += segsize;
			}
			addr = a;
		}
		szcvec &= ~(1 << szc);
		if (szcvec) {
			szc = highbit(szcvec) - 1;
			pgsz = page_get_pagesize(szc);
		}
	}
	ASSERT(addr == eaddr);

	return (0);
}

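/*
 * Map a vnode-backed range: pick a page size vector with map_pgszcvec(),
 * trim it against the vnode's size so large pages are not used past
 * end-of-file (the remainder is mapped with szcvec = 0), and hand the
 * pieces to as_map_segvn_segs().  Mappings larger than
 * textrepl_size_thresh get _MAP_TEXTREPL set.
 */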
static int
as_map_vnsegs(struct as *as, caddr_t addr, size_t size,
    segcreate_func_t crfp, struct segvn_crargs *vn_a, boolean_t *segcreated)
{
	uint_t mapflags = vn_a->flags & (MAP_TEXT | MAP_INITDATA);
	int type = (vn_a->type == MAP_SHARED) ? MAPPGSZC_SHM : MAPPGSZC_PRIVM;
	uint_t szcvec = map_pgszcvec(addr, size, (uintptr_t)addr, mapflags,
	    type, 0);
	int error;
	struct vattr va;
	u_offset_t eoff;
	size_t save_size = 0;
	extern size_t textrepl_size_thresh;

	ASSERT(AS_WRITE_HELD(as));
	ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
	ASSERT(IS_P2ALIGNED(size, PAGESIZE));
	ASSERT(vn_a->vp != NULL);
	ASSERT(vn_a->amp == NULL);

again:
	if (szcvec <= 1) {
		struct seg *seg, *segref;

		seg = segref = seg_alloc(as, addr, size);
		if (seg == NULL) {
			return (ENOMEM);
		}
		vn_a->szc = 0;
		error = (*crfp)(&seg, vn_a);
		if (error != 0) {
			VERIFY3P(seg, ==, segref);
			seg_free(seg);
		} else {
			as->a_size += size;
			as->a_resvsize += size;
		}
		return (error);
	}

	va.va_mask = AT_SIZE;
	if (VOP_GETATTR(vn_a->vp, &va, ATTR_HINT, vn_a->cred, NULL) != 0) {
		szcvec = 0;
		goto again;
	}
	eoff = vn_a->offset & PAGEMASK;
	if (eoff >= va.va_size) {
		szcvec = 0;
		goto again;
	}
	eoff += size;
	if (btopr(va.va_size) < btopr(eoff)) {
		save_size = size;
		size = va.va_size - (vn_a->offset & PAGEMASK);
		size = P2ROUNDUP_TYPED(size, PAGESIZE, size_t);
		szcvec = map_pgszcvec(addr, size, (uintptr_t)addr, mapflags,
		    type, 0);
		if (szcvec <= 1) {
			size = save_size;
			goto again;
		}
	}

	if (size > textrepl_size_thresh) {
		vn_a->flags |= _MAP_TEXTREPL;
	}
	error = as_map_segvn_segs(as, addr, size, szcvec, crfp, vn_a,
	    segcreated);
	if (error != 0) {
		return (error);
	}
	if (save_size) {
		addr += size;
		size = save_size - size;
		szcvec = 0;
		goto again;
	}
	return (0);
}

/*
 * as_map_ansegs: shared or private anonymous memory.  Note that the flags
 * passed to map_pgszcvec cannot be MAP_INITDATA, for anon.
 */
static int
as_map_ansegs(struct as *as, caddr_t addr, size_t size,
    segcreate_func_t crfp, struct segvn_crargs *vn_a, boolean_t *segcreated)
{
	uint_t szcvec;
	uchar_t type = 0;

	ASSERT(vn_a->type == MAP_SHARED || vn_a->type == MAP_PRIVATE);
	if (vn_a->type == MAP_SHARED) {
		type = MAPPGSZC_SHM;
	} else if (vn_a->type == MAP_PRIVATE) {
		if (vn_a->szc == AS_MAP_HEAP) {
			type = MAPPGSZC_HEAP;
		} else if (vn_a->szc == AS_MAP_STACK) {
			type = MAPPGSZC_STACK;
		} else {
			type = MAPPGSZC_PRIVM;
		}
	}
	szcvec = map_pgszcvec(addr, size, vn_a->amp == NULL ?
	    (uintptr_t)addr : (uintptr_t)P2ROUNDUP(vn_a->offset, PAGESIZE),
	    (vn_a->flags & MAP_TEXT), type, 0);
	ASSERT(AS_WRITE_HELD(as));
	ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
	ASSERT(IS_P2ALIGNED(size, PAGESIZE));
	ASSERT(vn_a->vp == NULL);

	return (as_map_segvn_segs(as, addr, size, szcvec,
	    crfp, vn_a, segcreated));
}

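/*
 * Map a range into address space "as" by calling the segment creation
 * routine crfp with argsp.  as_map() takes the address space write lock
 * and delegates to as_map_locked(), which expects that lock to be held
 * on entry and drops it before returning.
 */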
int
as_map(struct as *as, caddr_t addr, size_t size, segcreate_func_t crfp,
    void *argsp)
{
	AS_LOCK_ENTER(as, RW_WRITER);
	return (as_map_locked(as, addr, size, crfp, argsp));
}

int
as_map_locked(struct as *as, caddr_t addr, size_t size, segcreate_func_t crfp,
    void *argsp)
{
	caddr_t raddr;			/* rounded down addr */
	size_t rsize;			/* rounded up size */
	int error;
	boolean_t is_hole = B_FALSE;
	/*
	 * The use of a_proc is preferred to handle the case where curproc is
	 * a door_call server and is allocating memory in the client's (a_proc)
	 * address space.
	 * When creating a shared memory segment a_proc will be NULL so we
	 * fallback to curproc in that case.
	 */
	struct proc *p = (as->a_proc == NULL) ? curproc : as->a_proc;
	struct segvn_crargs crargs;

	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
	    (size_t)raddr;

	/*
	 * check for wrap around
	 */
	if ((raddr + rsize < raddr) || (as->a_size > (ULONG_MAX - size))) {
		AS_LOCK_EXIT(as);
		return (ENOMEM);
	}

	as->a_updatedir = 1;	/* inform /proc */
	gethrestime(&as->a_updatetime);

	if (as != &kas) {
		/*
		 * Ensure that the virtual size of the process will not exceed
		 * the configured limit.  Since seg_hole segments will later
		 * set the S_HOLE flag indicating their status as a hole in the
		 * AS, they are excluded from this check.
		 */
		if (as->a_size + rsize > (size_t)p->p_vmem_ctl &&
		    !AS_MAP_CHECK_SEGHOLE(crfp)) {
			AS_LOCK_EXIT(as);

			(void) rctl_action(rctlproc_legacy[RLIMIT_VMEM],
			    p->p_rctls, p, RCA_UNSAFE_ALL);
			return (ENOMEM);
		}
	}

	if (AS_MAP_CHECK_VNODE_LPOOB(crfp, argsp)) {
		boolean_t do_unmap = B_FALSE;

		crargs = *(struct segvn_crargs *)argsp;
		error = as_map_vnsegs(as, raddr, rsize, crfp, &crargs,
		    &do_unmap);
		if (error != 0) {
			AS_LOCK_EXIT(as);
			if (do_unmap) {
				(void) as_unmap(as, addr, size);
			}
			return (error);
		}
	} else if (AS_MAP_CHECK_ANON_LPOOB(crfp, argsp)) {
		boolean_t do_unmap = B_FALSE;

		crargs = *(struct segvn_crargs *)argsp;
		error = as_map_ansegs(as, raddr, rsize, crfp, &crargs,
		    &do_unmap);
		if (error != 0) {
			AS_LOCK_EXIT(as);
			if (do_unmap) {
				(void) as_unmap(as, addr, size);
			}
			return (error);
		}
	} else {
		struct seg *seg, *segref;

		seg = segref = seg_alloc(as, addr, size);
		if (seg == NULL) {
			AS_LOCK_EXIT(as);
			return (ENOMEM);
		}

		/*
		 * It is possible that the segment creation routine will free
		 * 'seg' as part of a more advanced operation, such as when
		 * segvn concatenates adjacent segments together.  When this
		 * occurs, the seg*_create routine must communicate the
		 * resulting segment out via the 'struct seg **' parameter.
		 *
		 * If segment creation fails, it must not free the passed-in
		 * segment, nor alter the argument pointer.
		 */
		error = (*crfp)(&seg, argsp);
		if (error != 0) {
			VERIFY3P(seg, ==, segref);
			seg_free(seg);
			AS_LOCK_EXIT(as);
			return (error);
		}

		/*
		 * Check if the resulting segment represents a hole in the
		 * address space, rather than contributing to the AS size.
		 */
		is_hole = ((seg->s_flags & S_HOLE) != 0);

		/* Add size now so as_unmap will work if as_ctl fails. */
		if (!is_hole) {
			as->a_size += rsize;
			as->a_resvsize += rsize;
		}
	}

	as_setwatch(as);

	/*
	 * Establish memory locks for the segment if the address space is
	 * locked, provided it's not an explicit hole in the AS.
	 */
	mutex_enter(&as->a_contents);
	if (AS_ISPGLCK(as) && !is_hole) {
		mutex_exit(&as->a_contents);
		AS_LOCK_EXIT(as);
		error = as_ctl(as, addr, size, MC_LOCK, 0, 0, NULL, 0);
		if (error != 0)
			(void) as_unmap(as, addr, size);
	} else {
		mutex_exit(&as->a_contents);
		AS_LOCK_EXIT(as);
	}
	return (error);
}


/*
 * Delete all segments in the address space marked with S_PURGE.
 * This is currently used for Sparc V9 nofault ASI segments (seg_nf.c).
 * These segments are deleted as a first step before calls to as_gap(), so
 * that they don't affect mmap() or shmat().
 */
void
as_purge(struct as *as)
{
	struct seg *seg;
	struct seg *next_seg;

	/*
	 * the setting of NEEDSPURGE is protected by as_rangelock(), so
	 * no need to grab a_contents mutex for this check
	 */
	if ((as->a_flags & AS_NEEDSPURGE) == 0)
		return;

	AS_LOCK_ENTER(as, RW_WRITER);
	next_seg = NULL;
	seg = AS_SEGFIRST(as);
	while (seg != NULL) {
		next_seg = AS_SEGNEXT(as, seg);
		if (seg->s_flags & S_PURGE)
			SEGOP_UNMAP(seg, seg->s_base, seg->s_size);
		seg = next_seg;
	}
	AS_LOCK_EXIT(as);

	mutex_enter(&as->a_contents);
	as->a_flags &= ~AS_NEEDSPURGE;
	mutex_exit(&as->a_contents);
}

/*
 * Find a hole within [*basep, *basep + *lenp), which contains a mappable
 * range of addresses at least "minlen" long, where the base of the range is
 * at "off" phase from an "align" boundary and there is space for a
 * "redzone"-sized redzone on either side of the range.  Thus,
 * if align was 4M and off was 16k, the user wants a hole which will start
 * 16k into a 4M page.
 *
 * If flags specifies AH_HI, the hole will have the highest possible address
 * in the range.  We use the as->a_lastgap field to figure out where to
 * start looking for a gap.
 *
 * Otherwise, the gap will have the lowest possible address.
 *
 * If flags specifies AH_CONTAIN, the hole will contain the address addr.
 *
 * If an adequate hole is found, *basep and *lenp are set to reflect the part of
 * the hole that is within range, and 0 is returned.  On failure, -1 is returned.
 *
 * NOTE: This routine is not correct when base+len overflows caddr_t.
 */
int
as_gap_aligned(struct as *as, size_t minlen, caddr_t *basep, size_t *lenp,
    uint_t flags, caddr_t addr, size_t align, size_t redzone, size_t off)
{
	caddr_t lobound = *basep;
	caddr_t hibound = lobound + *lenp;
	struct seg *lseg, *hseg;
	caddr_t lo, hi;
	int forward;
	caddr_t save_base;
	size_t save_len;
	size_t save_minlen;
	size_t save_redzone;
	int fast_path = 1;

	save_base = *basep;
	save_len = *lenp;
	save_minlen = minlen;
	save_redzone = redzone;

	/*
	 * For the first pass/fast_path, just add align and redzone into
	 * minlen since if we get an allocation, we can guarantee that it
	 * will fit the alignment and redzone requested.
	 * This increases the chance that hibound will be adjusted to
	 * a_lastgap->s_base which will likely allow us to find an
	 * acceptable hole in the address space quicker.
	 * If we can't find a hole with this fast_path, then we look for
	 * smaller holes in which the alignment and offset may allow
	 * the allocation to fit.
	 */
	minlen += align;
	minlen += 2 * redzone;
	redzone = 0;

	AS_LOCK_ENTER(as, RW_READER);
	if (AS_SEGFIRST(as) == NULL) {
		if (valid_va_range_aligned(basep, lenp, minlen, flags & AH_DIR,
		    align, redzone, off)) {
			AS_LOCK_EXIT(as);
			return (0);
		} else {
			AS_LOCK_EXIT(as);
			*basep = save_base;
			*lenp = save_len;
			return (-1);
		}
	}

retry:
	/*
	 * Set up to iterate over all the inter-segment holes in the given
	 * direction.  lseg is NULL for the lowest-addressed hole and hseg is
	 * NULL for the highest-addressed hole.  If moving backwards, we reset
	 * sseg to denote the highest-addressed segment.
	 */
	forward = (flags & AH_DIR) == AH_LO;
	if (forward) {
		hseg = as_findseg(as, lobound, 1);
		lseg = AS_SEGPREV(as, hseg);
	} else {

		/*
		 * If allocating at least as much as the last allocation,
		 * use a_lastgap's base as a better estimate of hibound.
		 */
		if (as->a_lastgap &&
		    minlen >= as->a_lastgap->s_size &&
		    hibound >= as->a_lastgap->s_base)
			hibound = as->a_lastgap->s_base;

		hseg = as_findseg(as, hibound, 1);
		if (hseg->s_base + hseg->s_size < hibound) {
			lseg = hseg;
			hseg = NULL;
		} else {
			lseg = AS_SEGPREV(as, hseg);
		}
	}

	for (;;) {
		/*
		 * Set lo and hi to the hole's boundaries.  (We should really
		 * use MAXADDR in place of hibound in the expression below,
		 * but can't express it easily; using hibound in its place is
		 * harmless.)
		 */
		lo = (lseg == NULL) ? 0 : lseg->s_base + lseg->s_size;
		hi = (hseg == NULL) ? hibound : hseg->s_base;
		/*
		 * If the iteration has moved past the interval from lobound
		 * to hibound it's pointless to continue.
		 */
		if ((forward && lo > hibound) || (!forward && hi < lobound))
			break;
		else if (lo > hibound || hi < lobound)
			goto cont;
		/*
		 * Candidate hole lies at least partially within the allowable
		 * range.  Restrict it to fall completely within that range,
		 * i.e., to [max(lo, lobound), min(hi, hibound)].
		 */
		if (lo < lobound)
			lo = lobound;
		if (hi > hibound)
			hi = hibound;
		/*
		 * Verify that the candidate hole is big enough and meets
		 * hardware constraints.  If the hole is too small, no need
		 * to do the further checks since they will fail.
		 */
		*basep = lo;
		*lenp = hi - lo;
		if (*lenp >= minlen && valid_va_range_aligned(basep, lenp,
		    minlen, forward ? AH_LO : AH_HI, align, redzone, off) &&
		    ((flags & AH_CONTAIN) == 0 ||
		    (*basep <= addr && *basep + *lenp > addr))) {
			if (!forward)
				as->a_lastgap = hseg;
			if (hseg != NULL)
				as->a_lastgaphl = hseg;
			else
				as->a_lastgaphl = lseg;
			AS_LOCK_EXIT(as);
			return (0);
		}
	cont:
		/*
		 * Move to the next hole.
		 */
		if (forward) {
			lseg = hseg;
			if (lseg == NULL)
				break;
			hseg = AS_SEGNEXT(as, hseg);
		} else {
			hseg = lseg;
			if (hseg == NULL)
				break;
			lseg = AS_SEGPREV(as, lseg);
		}
	}
	if (fast_path && (align != 0 || save_redzone != 0)) {
		fast_path = 0;
		minlen = save_minlen;
		redzone = save_redzone;
		goto retry;
	}
	*basep = save_base;
	*lenp = save_len;
	AS_LOCK_EXIT(as);
	return (-1);
}

/*
 * Find a hole of at least size minlen within [*basep, *basep + *lenp).
 *
 * If flags specifies AH_HI, the hole will have the highest possible address
 * in the range.  We use the as->a_lastgap field to figure out where to
 * start looking for a gap.
 *
 * Otherwise, the gap will have the lowest possible address.
 *
 * If flags specifies AH_CONTAIN, the hole will contain the address addr.
 *
 * If an adequate hole is found, base and len are set to reflect the part of
 * the hole that is within range, and 0 is returned, otherwise,
 * -1 is returned.
 *
 * NOTE: This routine is not correct when base+len overflows caddr_t.
2021 */ 2022 int 2023 as_gap(struct as *as, size_t minlen, caddr_t *basep, size_t *lenp, uint_t flags, 2024 caddr_t addr) 2025 { 2026 2027 return (as_gap_aligned(as, minlen, basep, lenp, flags, addr, 0, 0, 0)); 2028 } 2029 2030 /* 2031 * Return the next range within [base, base + len) that is backed 2032 * with "real memory". Skip holes and non-seg_vn segments. 2033 * We're lazy and only return one segment at a time. 2034 */ 2035 int 2036 as_memory(struct as *as, caddr_t *basep, size_t *lenp) 2037 { 2038 extern struct seg_ops segspt_shmops; /* needs a header file */ 2039 struct seg *seg; 2040 caddr_t addr, eaddr; 2041 caddr_t segend; 2042 2043 AS_LOCK_ENTER(as, RW_READER); 2044 2045 addr = *basep; 2046 eaddr = addr + *lenp; 2047 2048 seg = as_findseg(as, addr, 0); 2049 if (seg != NULL) 2050 addr = MAX(seg->s_base, addr); 2051 2052 for (;;) { 2053 if (seg == NULL || addr >= eaddr || eaddr <= seg->s_base) { 2054 AS_LOCK_EXIT(as); 2055 return (EINVAL); 2056 } 2057 2058 if (seg->s_ops == &segvn_ops) { 2059 segend = seg->s_base + seg->s_size; 2060 break; 2061 } 2062 2063 /* 2064 * We do ISM by looking into the private data 2065 * to determine the real size of the segment. 2066 */ 2067 if (seg->s_ops == &segspt_shmops) { 2068 segend = seg->s_base + spt_realsize(seg); 2069 if (addr < segend) 2070 break; 2071 } 2072 2073 seg = AS_SEGNEXT(as, seg); 2074 2075 if (seg != NULL) 2076 addr = seg->s_base; 2077 } 2078 2079 *basep = addr; 2080 2081 if (segend > eaddr) 2082 *lenp = eaddr - addr; 2083 else 2084 *lenp = segend - addr; 2085 2086 AS_LOCK_EXIT(as); 2087 return (0); 2088 } 2089 2090 /* 2091 * Swap the pages associated with the address space as out to 2092 * secondary storage, returning the number of bytes actually 2093 * swapped. 2094 * 2095 * The value returned is intended to correlate well with the process's 2096 * memory requirements. Its usefulness for this purpose depends on 2097 * how well the segment-level routines do at returning accurate 2098 * information. 2099 */ 2100 size_t 2101 as_swapout(struct as *as) 2102 { 2103 struct seg *seg; 2104 size_t swpcnt = 0; 2105 2106 /* 2107 * Kernel-only processes have given up their address 2108 * spaces. Of course, we shouldn't be attempting to 2109 * swap out such processes in the first place... 2110 */ 2111 if (as == NULL) 2112 return (0); 2113 2114 AS_LOCK_ENTER(as, RW_READER); 2115 2116 /* 2117 * Free all mapping resources associated with the address 2118 * space. The segment-level swapout routines capitalize 2119 * on this unmapping by scavenging pages that have become 2120 * unmapped here. 2121 */ 2122 hat_swapout(as->a_hat); 2123 2124 /* 2125 * Call the swapout routines of all segments in the address 2126 * space to do the actual work, accumulating the amount of 2127 * space reclaimed. 2128 */ 2129 for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) { 2130 struct seg_ops *ov = seg->s_ops; 2131 2132 /* 2133 * We have to check to see if the seg has 2134 * an ops vector because the seg may have 2135 * been in the middle of being set up when 2136 * the process was picked for swapout. 2137 */ 2138 if ((ov != NULL) && (ov->swapout != NULL)) 2139 swpcnt += SEGOP_SWAPOUT(seg); 2140 } 2141 AS_LOCK_EXIT(as); 2142 return (swpcnt); 2143 } 2144 2145 /* 2146 * Determine whether data from the mappings in interval [addr, addr + size) 2147 * are in the primary memory (core) cache.
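 *
 * Hedged sketch of the calling convention (modeled on a mincore(2)-style
 * consumer; the local names are hypothetical):
 *
 *	char *vec = kmem_alloc(btopr(size), KM_SLEEP);
 *	size_t checked;
 *
 *	if (as_incore(as, addr, size, vec, &checked) == 0) {
 *		one byte per page was filled in; SEG_PAGE_INCORE is set
 *		in vec[i] when page i of the range is resident
 *	} else {
 *		a hole or short answer stopped the walk; checked holds
 *		the number of bytes examined before that point
 *	}
 *	kmem_free(vec, btopr(size));
 *
 * Note the mixed return convention below: ENOMEM for a wrapping range,
 * otherwise 0 on success and -1 (not an errno) when the walk stops early.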
2148 */ 2149 int 2150 as_incore(struct as *as, caddr_t addr, 2151 size_t size, char *vec, size_t *sizep) 2152 { 2153 struct seg *seg; 2154 size_t ssize; 2155 caddr_t raddr; /* rounded down addr */ 2156 size_t rsize; /* rounded up size */ 2157 size_t isize; /* iteration size */ 2158 int error = 0; /* result, assume success */ 2159 2160 *sizep = 0; 2161 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); 2162 rsize = ((((size_t)addr + size) + PAGEOFFSET) & PAGEMASK) - 2163 (size_t)raddr; 2164 2165 if (raddr + rsize < raddr) /* check for wraparound */ 2166 return (ENOMEM); 2167 2168 AS_LOCK_ENTER(as, RW_READER); 2169 seg = as_segat(as, raddr); 2170 if (seg == NULL) { 2171 AS_LOCK_EXIT(as); 2172 return (-1); 2173 } 2174 2175 for (; rsize != 0; rsize -= ssize, raddr += ssize) { 2176 if (raddr >= seg->s_base + seg->s_size) { 2177 seg = AS_SEGNEXT(as, seg); 2178 if (seg == NULL || raddr != seg->s_base) { 2179 error = -1; 2180 break; 2181 } 2182 } 2183 if ((raddr + rsize) > (seg->s_base + seg->s_size)) 2184 ssize = seg->s_base + seg->s_size - raddr; 2185 else 2186 ssize = rsize; 2187 *sizep += isize = SEGOP_INCORE(seg, raddr, ssize, vec); 2188 if (isize != ssize) { 2189 error = -1; 2190 break; 2191 } 2192 vec += btopr(ssize); 2193 } 2194 AS_LOCK_EXIT(as); 2195 return (error); 2196 } 2197 2198 static void 2199 as_segunlock(struct seg *seg, caddr_t addr, int attr, 2200 ulong_t *bitmap, size_t position, size_t npages) 2201 { 2202 caddr_t range_start; 2203 size_t pos1 = position; 2204 size_t pos2; 2205 size_t size; 2206 size_t end_pos = npages + position; 2207 2208 while (bt_range(bitmap, &pos1, &pos2, end_pos)) { 2209 size = ptob((pos2 - pos1)); 2210 range_start = (caddr_t)((uintptr_t)addr + 2211 ptob(pos1 - position)); 2212 2213 (void) SEGOP_LOCKOP(seg, range_start, size, attr, MC_UNLOCK, 2214 (ulong_t *)NULL, (size_t)NULL); 2215 pos1 = pos2; 2216 } 2217 } 2218 2219 static void 2220 as_unlockerr(struct as *as, int attr, ulong_t *mlock_map, 2221 caddr_t raddr, size_t rsize) 2222 { 2223 struct seg *seg = as_segat(as, raddr); 2224 size_t ssize; 2225 2226 while (rsize != 0) { 2227 if (raddr >= seg->s_base + seg->s_size) 2228 seg = AS_SEGNEXT(as, seg); 2229 2230 if ((raddr + rsize) > (seg->s_base + seg->s_size)) 2231 ssize = seg->s_base + seg->s_size - raddr; 2232 else 2233 ssize = rsize; 2234 2235 as_segunlock(seg, raddr, attr, mlock_map, 0, btopr(ssize)); 2236 2237 rsize -= ssize; 2238 raddr += ssize; 2239 } 2240 } 2241 2242 /* 2243 * Cache control operations over the interval [addr, addr + size) in 2244 * address space "as". 2245 */ 2246 /*ARGSUSED*/ 2247 int 2248 as_ctl(struct as *as, caddr_t addr, size_t size, int func, int attr, 2249 uintptr_t arg, ulong_t *lock_map, size_t pos) 2250 { 2251 struct seg *seg; /* working segment */ 2252 caddr_t raddr; /* rounded down addr */ 2253 caddr_t initraddr; /* saved initial rounded down addr */ 2254 size_t rsize; /* rounded up size */ 2255 size_t initrsize; /* saved initial rounded up size */ 2256 size_t ssize; /* size of seg */ 2257 int error = 0; /* result */ 2258 size_t mlock_size; /* size of bitmap */ 2259 ulong_t *mlock_map; /* pointer to bitmap used */ 2260 /* to represent the locked */ 2261 /* pages. */ 2262 2263 mlock_size = 0; 2264 mlock_map = NULL; 2265 retry: 2266 if (error == IE_RETRY) 2267 AS_LOCK_ENTER(as, RW_WRITER); 2268 else 2269 AS_LOCK_ENTER(as, RW_READER); 2270 2271 /* 2272 * If these are address space lock/unlock operations, loop over 2273 * all segments in the address space, as appropriate. 
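 *
 * (Clarifying note, added: the whole-address-space cases below are the
 * ones a memcntl(2) MC_LOCKAS / MC_UNLOCKAS request reaches, e.g. a
 * hypothetical direct call of the form
 *
 *	error = as_ctl(as, NULL, 0, MC_LOCKAS, 0, MCL_CURRENT, NULL, 0);
 *
 * where addr and size are ignored and arg carries the MCL_* flags.
 * Per-range MC_LOCK, MC_UNLOCK, MC_SYNC, MC_ADVISE and MC_INHERIT_ZERO
 * requests fall through to the segment-by-segment walk further down.)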
2274 */ 2275 if (func == MC_LOCKAS) { 2276 size_t npages, idx; 2277 size_t rlen = 0; /* rounded as length */ 2278 2279 idx = pos; 2280 2281 if (arg & MCL_FUTURE) { 2282 mutex_enter(&as->a_contents); 2283 AS_SETPGLCK(as); 2284 mutex_exit(&as->a_contents); 2285 } 2286 if ((arg & MCL_CURRENT) == 0) { 2287 AS_LOCK_EXIT(as); 2288 return (0); 2289 } 2290 2291 seg = AS_SEGFIRST(as); 2292 if (seg == NULL) { 2293 AS_LOCK_EXIT(as); 2294 return (0); 2295 } 2296 2297 do { 2298 raddr = (caddr_t)((uintptr_t)seg->s_base & 2299 (uintptr_t)PAGEMASK); 2300 rlen += (((uintptr_t)(seg->s_base + seg->s_size) + 2301 PAGEOFFSET) & PAGEMASK) - (uintptr_t)raddr; 2302 } while ((seg = AS_SEGNEXT(as, seg)) != NULL); 2303 2304 mlock_size = BT_BITOUL(btopr(rlen)); 2305 if ((mlock_map = (ulong_t *)kmem_zalloc(mlock_size * 2306 sizeof (ulong_t), KM_NOSLEEP)) == NULL) { 2307 AS_LOCK_EXIT(as); 2308 return (EAGAIN); 2309 } 2310 2311 for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) { 2312 if ((seg->s_flags & S_HOLE) != 0) { 2313 continue; 2314 } 2315 error = SEGOP_LOCKOP(seg, seg->s_base, 2316 seg->s_size, attr, MC_LOCK, mlock_map, pos); 2317 if (error != 0) 2318 break; 2319 pos += seg_pages(seg); 2320 } 2321 2322 if (error) { 2323 for (seg = AS_SEGFIRST(as); seg != NULL; 2324 seg = AS_SEGNEXT(as, seg)) { 2325 2326 raddr = (caddr_t)((uintptr_t)seg->s_base & 2327 (uintptr_t)PAGEMASK); 2328 npages = seg_pages(seg); 2329 as_segunlock(seg, raddr, attr, mlock_map, 2330 idx, npages); 2331 idx += npages; 2332 } 2333 } 2334 2335 kmem_free(mlock_map, mlock_size * sizeof (ulong_t)); 2336 AS_LOCK_EXIT(as); 2337 goto lockerr; 2338 } else if (func == MC_UNLOCKAS) { 2339 mutex_enter(&as->a_contents); 2340 AS_CLRPGLCK(as); 2341 mutex_exit(&as->a_contents); 2342 2343 for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) { 2344 if ((seg->s_flags & S_HOLE) != 0) { 2345 continue; 2346 } 2347 error = SEGOP_LOCKOP(seg, seg->s_base, 2348 seg->s_size, attr, MC_UNLOCK, NULL, 0); 2349 if (error != 0) 2350 break; 2351 } 2352 2353 AS_LOCK_EXIT(as); 2354 goto lockerr; 2355 } 2356 2357 /* 2358 * Normalize addresses and sizes. 2359 */ 2360 initraddr = raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); 2361 initrsize = rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) - 2362 (size_t)raddr; 2363 2364 if (raddr + rsize < raddr) { /* check for wraparound */ 2365 AS_LOCK_EXIT(as); 2366 return (ENOMEM); 2367 } 2368 2369 /* 2370 * Get initial segment. 2371 */ 2372 if ((seg = as_segat(as, raddr)) == NULL) { 2373 AS_LOCK_EXIT(as); 2374 return (ENOMEM); 2375 } 2376 2377 if (func == MC_LOCK) { 2378 mlock_size = BT_BITOUL(btopr(rsize)); 2379 if ((mlock_map = (ulong_t *)kmem_zalloc(mlock_size * 2380 sizeof (ulong_t), KM_NOSLEEP)) == NULL) { 2381 AS_LOCK_EXIT(as); 2382 return (EAGAIN); 2383 } 2384 } 2385 2386 /* 2387 * Loop over all segments. If a hole in the address range is 2388 * discovered, then fail. For each segment, perform the appropriate 2389 * control operation. 2390 */ 2391 while (rsize != 0) { 2392 2393 /* 2394 * Make sure there's no hole, calculate the portion 2395 * of the next segment to be operated over. 
2396 */ 2397 if (raddr >= seg->s_base + seg->s_size) { 2398 seg = AS_SEGNEXT(as, seg); 2399 if (seg == NULL || raddr != seg->s_base) { 2400 if (func == MC_LOCK) { 2401 as_unlockerr(as, attr, mlock_map, 2402 initraddr, initrsize - rsize); 2403 kmem_free(mlock_map, 2404 mlock_size * sizeof (ulong_t)); 2405 } 2406 AS_LOCK_EXIT(as); 2407 return (ENOMEM); 2408 } 2409 } 2410 if ((raddr + rsize) > (seg->s_base + seg->s_size)) 2411 ssize = seg->s_base + seg->s_size - raddr; 2412 else 2413 ssize = rsize; 2414 2415 /* 2416 * Dispatch on specific function. 2417 */ 2418 switch (func) { 2419 2420 /* 2421 * Synchronize cached data from mappings with backing 2422 * objects. 2423 */ 2424 case MC_SYNC: 2425 if (error = SEGOP_SYNC(seg, raddr, ssize, 2426 attr, (uint_t)arg)) { 2427 AS_LOCK_EXIT(as); 2428 return (error); 2429 } 2430 break; 2431 2432 /* 2433 * Lock pages in memory. 2434 */ 2435 case MC_LOCK: 2436 if (error = SEGOP_LOCKOP(seg, raddr, ssize, 2437 attr, func, mlock_map, pos)) { 2438 as_unlockerr(as, attr, mlock_map, initraddr, 2439 initrsize - rsize + ssize); 2440 kmem_free(mlock_map, mlock_size * 2441 sizeof (ulong_t)); 2442 AS_LOCK_EXIT(as); 2443 goto lockerr; 2444 } 2445 break; 2446 2447 /* 2448 * Unlock mapped pages. 2449 */ 2450 case MC_UNLOCK: 2451 (void) SEGOP_LOCKOP(seg, raddr, ssize, attr, func, 2452 (ulong_t *)NULL, (size_t)NULL); 2453 break; 2454 2455 /* 2456 * Store VM advise for mapped pages in segment layer. 2457 */ 2458 case MC_ADVISE: 2459 error = SEGOP_ADVISE(seg, raddr, ssize, (uint_t)arg); 2460 2461 /* 2462 * Check for regular errors and special retry error 2463 */ 2464 if (error) { 2465 if (error == IE_RETRY) { 2466 /* 2467 * Need to acquire writers lock, so 2468 * have to drop readers lock and start 2469 * all over again 2470 */ 2471 AS_LOCK_EXIT(as); 2472 goto retry; 2473 } else if (error == IE_REATTACH) { 2474 /* 2475 * Find segment for current address 2476 * because current segment just got 2477 * split or concatenated 2478 */ 2479 seg = as_segat(as, raddr); 2480 if (seg == NULL) { 2481 AS_LOCK_EXIT(as); 2482 return (ENOMEM); 2483 } 2484 } else { 2485 /* 2486 * Regular error 2487 */ 2488 AS_LOCK_EXIT(as); 2489 return (error); 2490 } 2491 } 2492 break; 2493 2494 case MC_INHERIT_ZERO: 2495 if (seg->s_ops->inherit == NULL) { 2496 error = ENOTSUP; 2497 } else { 2498 error = SEGOP_INHERIT(seg, raddr, ssize, 2499 SEGP_INH_ZERO); 2500 } 2501 if (error != 0) { 2502 AS_LOCK_EXIT(as); 2503 return (error); 2504 } 2505 break; 2506 2507 /* 2508 * Can't happen. 2509 */ 2510 default: 2511 panic("as_ctl: bad operation %d", func); 2512 /*NOTREACHED*/ 2513 } 2514 2515 rsize -= ssize; 2516 raddr += ssize; 2517 } 2518 2519 if (func == MC_LOCK) 2520 kmem_free(mlock_map, mlock_size * sizeof (ulong_t)); 2521 AS_LOCK_EXIT(as); 2522 return (0); 2523 lockerr: 2524 2525 /* 2526 * If the lower levels returned EDEADLK for a segment lockop, 2527 * it means that we should retry the operation. Let's wait 2528 * a bit also to let the deadlock causing condition clear. 2529 * This is part of a gross hack to work around a design flaw 2530 * in the ufs/sds logging code and should go away when the 2531 * logging code is re-designed to fix the problem. See bug 2532 * 4125102 for details of the problem. 
2533 */ 2534 if (error == EDEADLK) { 2535 delay(deadlk_wait); 2536 error = 0; 2537 goto retry; 2538 } 2539 return (error); 2540 } 2541 2542 int 2543 fc_decode(faultcode_t fault_err) 2544 { 2545 int error = 0; 2546 2547 switch (FC_CODE(fault_err)) { 2548 case FC_OBJERR: 2549 error = FC_ERRNO(fault_err); 2550 break; 2551 case FC_PROT: 2552 error = EACCES; 2553 break; 2554 default: 2555 error = EFAULT; 2556 break; 2557 } 2558 return (error); 2559 } 2560 2561 /* 2562 * Pagelock pages from a range that spans more than 1 segment. Obtain shadow 2563 * lists from each segment and copy them to one contiguous shadow list (plist) 2564 * as expected by the caller. Save pointers to per segment shadow lists at 2565 * the tail of plist so that they can be used during as_pageunlock(). 2566 */ 2567 static int 2568 as_pagelock_segs(struct as *as, struct seg *seg, struct page ***ppp, 2569 caddr_t addr, size_t size, enum seg_rw rw) 2570 { 2571 caddr_t sv_addr = addr; 2572 size_t sv_size = size; 2573 struct seg *sv_seg = seg; 2574 ulong_t segcnt = 1; 2575 ulong_t cnt; 2576 size_t ssize; 2577 pgcnt_t npages = btop(size); 2578 page_t **plist; 2579 page_t **pl; 2580 int error; 2581 caddr_t eaddr; 2582 faultcode_t fault_err = 0; 2583 pgcnt_t pl_off; 2584 extern struct seg_ops segspt_shmops; 2585 2586 ASSERT(AS_LOCK_HELD(as)); 2587 ASSERT(seg != NULL); 2588 ASSERT(addr >= seg->s_base && addr < seg->s_base + seg->s_size); 2589 ASSERT(addr + size > seg->s_base + seg->s_size); 2590 ASSERT(IS_P2ALIGNED(size, PAGESIZE)); 2591 ASSERT(IS_P2ALIGNED(addr, PAGESIZE)); 2592 2593 /* 2594 * Count the number of segments covered by the range we are about to 2595 * lock. The segment count is used to size the shadow list we return 2596 * back to the caller. 2597 */ 2598 for (; size != 0; size -= ssize, addr += ssize) { 2599 if (addr >= seg->s_base + seg->s_size) { 2600 2601 seg = AS_SEGNEXT(as, seg); 2602 if (seg == NULL || addr != seg->s_base) { 2603 AS_LOCK_EXIT(as); 2604 return (EFAULT); 2605 } 2606 /* 2607 * Do a quick check if subsequent segments 2608 * will most likely support pagelock. 
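 *
 * (Clarifying note, added: per the checks that follow, only anonymous
 * segvn segments, i.e. those for which SEGOP_GETVP() reports no vnode,
 * and SPT/ISM segments are expected to pagelock successfully; any other
 * segment type sends the whole request to the as_fault()/F_SOFTLOCK
 * slow path at the "slow" label below.)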
2609 */ 2610 if (seg->s_ops == &segvn_ops) { 2611 vnode_t *vp; 2612 2613 if (SEGOP_GETVP(seg, addr, &vp) != 0 || 2614 vp != NULL) { 2615 AS_LOCK_EXIT(as); 2616 goto slow; 2617 } 2618 } else if (seg->s_ops != &segspt_shmops) { 2619 AS_LOCK_EXIT(as); 2620 goto slow; 2621 } 2622 segcnt++; 2623 } 2624 if (addr + size > seg->s_base + seg->s_size) { 2625 ssize = seg->s_base + seg->s_size - addr; 2626 } else { 2627 ssize = size; 2628 } 2629 } 2630 ASSERT(segcnt > 1); 2631 2632 plist = kmem_zalloc((npages + segcnt) * sizeof (page_t *), KM_SLEEP); 2633 2634 addr = sv_addr; 2635 size = sv_size; 2636 seg = sv_seg; 2637 2638 for (cnt = 0, pl_off = 0; size != 0; size -= ssize, addr += ssize) { 2639 if (addr >= seg->s_base + seg->s_size) { 2640 seg = AS_SEGNEXT(as, seg); 2641 ASSERT(seg != NULL && addr == seg->s_base); 2642 cnt++; 2643 ASSERT(cnt < segcnt); 2644 } 2645 if (addr + size > seg->s_base + seg->s_size) { 2646 ssize = seg->s_base + seg->s_size - addr; 2647 } else { 2648 ssize = size; 2649 } 2650 pl = &plist[npages + cnt]; 2651 error = SEGOP_PAGELOCK(seg, addr, ssize, (page_t ***)pl, 2652 L_PAGELOCK, rw); 2653 if (error) { 2654 break; 2655 } 2656 ASSERT(plist[npages + cnt] != NULL); 2657 ASSERT(pl_off + btop(ssize) <= npages); 2658 bcopy(plist[npages + cnt], &plist[pl_off], 2659 btop(ssize) * sizeof (page_t *)); 2660 pl_off += btop(ssize); 2661 } 2662 2663 if (size == 0) { 2664 AS_LOCK_EXIT(as); 2665 ASSERT(cnt == segcnt - 1); 2666 *ppp = plist; 2667 return (0); 2668 } 2669 2670 /* 2671 * one of pagelock calls failed. The error type is in error variable. 2672 * Unlock what we've locked so far and retry with F_SOFTLOCK if error 2673 * type is either EFAULT or ENOTSUP. Otherwise just return the error 2674 * back to the caller. 2675 */ 2676 2677 eaddr = addr; 2678 seg = sv_seg; 2679 2680 for (cnt = 0, addr = sv_addr; addr < eaddr; addr += ssize) { 2681 if (addr >= seg->s_base + seg->s_size) { 2682 seg = AS_SEGNEXT(as, seg); 2683 ASSERT(seg != NULL && addr == seg->s_base); 2684 cnt++; 2685 ASSERT(cnt < segcnt); 2686 } 2687 if (eaddr > seg->s_base + seg->s_size) { 2688 ssize = seg->s_base + seg->s_size - addr; 2689 } else { 2690 ssize = eaddr - addr; 2691 } 2692 pl = &plist[npages + cnt]; 2693 ASSERT(*pl != NULL); 2694 (void) SEGOP_PAGELOCK(seg, addr, ssize, (page_t ***)pl, 2695 L_PAGEUNLOCK, rw); 2696 } 2697 2698 AS_LOCK_EXIT(as); 2699 2700 kmem_free(plist, (npages + segcnt) * sizeof (page_t *)); 2701 2702 if (error != ENOTSUP && error != EFAULT) { 2703 return (error); 2704 } 2705 2706 slow: 2707 /* 2708 * If we are here because pagelock failed due to the need to cow fault 2709 * in the pages we want to lock F_SOFTLOCK will do this job and in 2710 * next as_pagelock() call for this address range pagelock will 2711 * hopefully succeed. 2712 */ 2713 fault_err = as_fault(as->a_hat, as, sv_addr, sv_size, F_SOFTLOCK, rw); 2714 if (fault_err != 0) { 2715 return (fc_decode(fault_err)); 2716 } 2717 *ppp = NULL; 2718 2719 return (0); 2720 } 2721 2722 /* 2723 * lock pages in a given address space. Return shadow list. If 2724 * the list is NULL, the MMU mapping is also locked. 
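 *
 * Hedged usage sketch (a hypothetical physio-style consumer, not copied
 * from any real caller):
 *
 *	struct page **pplist;
 *	int error;
 *
 *	error = as_pagelock(as, &pplist, uaddr, len, S_WRITE);
 *	if (error != 0)
 *		return (error);
 *	... perform I/O against the locked user range ...
 *	as_pageunlock(as, pplist, uaddr, len, S_WRITE);
 *
 * A NULL shadow list is a normal outcome: it means the pages were held
 * via as_fault() with F_SOFTLOCK instead, and as_pageunlock() undoes
 * that with F_SOFTUNLOCK.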
2725 */ 2726 int 2727 as_pagelock(struct as *as, struct page ***ppp, caddr_t addr, 2728 size_t size, enum seg_rw rw) 2729 { 2730 size_t rsize; 2731 caddr_t raddr; 2732 faultcode_t fault_err; 2733 struct seg *seg; 2734 int err; 2735 2736 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_AS_LOCK_START, 2737 "as_pagelock_start: addr %p size %ld", addr, size); 2738 2739 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); 2740 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) - 2741 (size_t)raddr; 2742 2743 /* 2744 * if the request crosses two segments let 2745 * as_fault handle it. 2746 */ 2747 AS_LOCK_ENTER(as, RW_READER); 2748 2749 seg = as_segat(as, raddr); 2750 if (seg == NULL) { 2751 AS_LOCK_EXIT(as); 2752 return (EFAULT); 2753 } 2754 ASSERT(raddr >= seg->s_base && raddr < seg->s_base + seg->s_size); 2755 if (raddr + rsize > seg->s_base + seg->s_size) { 2756 return (as_pagelock_segs(as, seg, ppp, raddr, rsize, rw)); 2757 } 2758 if (raddr + rsize <= raddr) { 2759 AS_LOCK_EXIT(as); 2760 return (EFAULT); 2761 } 2762 2763 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEG_LOCK_START, 2764 "seg_lock_1_start: raddr %p rsize %ld", raddr, rsize); 2765 2766 /* 2767 * try to lock pages and pass back shadow list 2768 */ 2769 err = SEGOP_PAGELOCK(seg, raddr, rsize, ppp, L_PAGELOCK, rw); 2770 2771 TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_SEG_LOCK_END, "seg_lock_1_end"); 2772 2773 AS_LOCK_EXIT(as); 2774 2775 if (err == 0 || (err != ENOTSUP && err != EFAULT)) { 2776 return (err); 2777 } 2778 2779 /* 2780 * Use F_SOFTLOCK to lock the pages because pagelock failed either due 2781 * to no pagelock support for this segment or pages need to be cow 2782 * faulted in. If fault is needed F_SOFTLOCK will do this job for 2783 * this as_pagelock() call and in the next as_pagelock() call for the 2784 * same address range the pagelock call will hopefully succeed. 2785 */ 2786 fault_err = as_fault(as->a_hat, as, addr, size, F_SOFTLOCK, rw); 2787 if (fault_err != 0) { 2788 return (fc_decode(fault_err)); 2789 } 2790 *ppp = NULL; 2791 2792 TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_AS_LOCK_END, "as_pagelock_end"); 2793 return (0); 2794 } 2795 2796 /* 2797 * unlock pages locked by as_pagelock_segs(). Retrieve per segment shadow 2798 * lists from the end of plist and call pageunlock interface for each segment. 2799 * Drop as lock and free plist.
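 *
 * Illustrative layout of plist as built by as_pagelock_segs() above
 * (added for clarity; npages = btop(size), segcnt = number of segments
 * spanned by the range):
 *
 *	plist[0 .. npages-1]               flattened page pointers, in
 *	                                   address order, returned to the
 *	                                   caller
 *	plist[npages .. npages+segcnt-1]   the per-segment shadow lists
 *	                                   from SEGOP_PAGELOCK(), consumed
 *	                                   here to unlock each segment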
2800 */ 2801 static void 2802 as_pageunlock_segs(struct as *as, struct seg *seg, caddr_t addr, size_t size, 2803 struct page **plist, enum seg_rw rw) 2804 { 2805 ulong_t cnt; 2806 caddr_t eaddr = addr + size; 2807 pgcnt_t npages = btop(size); 2808 size_t ssize; 2809 page_t **pl; 2810 2811 ASSERT(AS_LOCK_HELD(as)); 2812 ASSERT(seg != NULL); 2813 ASSERT(addr >= seg->s_base && addr < seg->s_base + seg->s_size); 2814 ASSERT(addr + size > seg->s_base + seg->s_size); 2815 ASSERT(IS_P2ALIGNED(size, PAGESIZE)); 2816 ASSERT(IS_P2ALIGNED(addr, PAGESIZE)); 2817 ASSERT(plist != NULL); 2818 2819 for (cnt = 0; addr < eaddr; addr += ssize) { 2820 if (addr >= seg->s_base + seg->s_size) { 2821 seg = AS_SEGNEXT(as, seg); 2822 ASSERT(seg != NULL && addr == seg->s_base); 2823 cnt++; 2824 } 2825 if (eaddr > seg->s_base + seg->s_size) { 2826 ssize = seg->s_base + seg->s_size - addr; 2827 } else { 2828 ssize = eaddr - addr; 2829 } 2830 pl = &plist[npages + cnt]; 2831 ASSERT(*pl != NULL); 2832 (void) SEGOP_PAGELOCK(seg, addr, ssize, (page_t ***)pl, 2833 L_PAGEUNLOCK, rw); 2834 } 2835 ASSERT(cnt > 0); 2836 AS_LOCK_EXIT(as); 2837 2838 cnt++; 2839 kmem_free(plist, (npages + cnt) * sizeof (page_t *)); 2840 } 2841 2842 /* 2843 * unlock pages in a given address range 2844 */ 2845 void 2846 as_pageunlock(struct as *as, struct page **pp, caddr_t addr, size_t size, 2847 enum seg_rw rw) 2848 { 2849 struct seg *seg; 2850 size_t rsize; 2851 caddr_t raddr; 2852 2853 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_AS_UNLOCK_START, 2854 "as_pageunlock_start: addr %p size %ld", addr, size); 2855 2856 /* 2857 * if the shadow list is NULL, as_pagelock was 2858 * falling back to as_fault 2859 */ 2860 if (pp == NULL) { 2861 (void) as_fault(as->a_hat, as, addr, size, F_SOFTUNLOCK, rw); 2862 return; 2863 } 2864 2865 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); 2866 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) - 2867 (size_t)raddr; 2868 2869 AS_LOCK_ENTER(as, RW_READER); 2870 seg = as_segat(as, raddr); 2871 ASSERT(seg != NULL); 2872 2873 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEG_UNLOCK_START, 2874 "seg_unlock_start: raddr %p rsize %ld", raddr, rsize); 2875 2876 ASSERT(raddr >= seg->s_base && raddr < seg->s_base + seg->s_size); 2877 if (raddr + rsize <= seg->s_base + seg->s_size) { 2878 SEGOP_PAGELOCK(seg, raddr, rsize, &pp, L_PAGEUNLOCK, rw); 2879 } else { 2880 as_pageunlock_segs(as, seg, raddr, rsize, pp, rw); 2881 return; 2882 } 2883 AS_LOCK_EXIT(as); 2884 TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_AS_UNLOCK_END, "as_pageunlock_end"); 2885 } 2886 2887 int 2888 as_setpagesize(struct as *as, caddr_t addr, size_t size, uint_t szc, 2889 boolean_t wait) 2890 { 2891 struct seg *seg; 2892 size_t ssize; 2893 caddr_t raddr; /* rounded down addr */ 2894 size_t rsize; /* rounded up size */ 2895 int error = 0; 2896 size_t pgsz = page_get_pagesize(szc); 2897 2898 setpgsz_top: 2899 if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(size, pgsz)) { 2900 return (EINVAL); 2901 } 2902 2903 raddr = addr; 2904 rsize = size; 2905 2906 if (raddr + rsize < raddr) /* check for wraparound */ 2907 return (ENOMEM); 2908 2909 AS_LOCK_ENTER(as, RW_WRITER); 2910 as_clearwatchprot(as, raddr, rsize); 2911 seg = as_segat(as, raddr); 2912 if (seg == NULL) { 2913 as_setwatch(as); 2914 AS_LOCK_EXIT(as); 2915 return (ENOMEM); 2916 } 2917 2918 for (; rsize != 0; rsize -= ssize, raddr += ssize) { 2919 if (raddr >= seg->s_base + seg->s_size) { 2920 seg = AS_SEGNEXT(as, seg); 2921 if (seg == NULL || raddr != seg->s_base) { 2922 error = ENOMEM; 2923 break; 2924 } 2925 } 2926 if 
((raddr + rsize) > (seg->s_base + seg->s_size)) { 2927 ssize = seg->s_base + seg->s_size - raddr; 2928 } else { 2929 ssize = rsize; 2930 } 2931 2932 retry: 2933 error = SEGOP_SETPAGESIZE(seg, raddr, ssize, szc); 2934 2935 if (error == IE_NOMEM) { 2936 error = EAGAIN; 2937 break; 2938 } 2939 2940 if (error == IE_RETRY) { 2941 AS_LOCK_EXIT(as); 2942 goto setpgsz_top; 2943 } 2944 2945 if (error == ENOTSUP) { 2946 error = EINVAL; 2947 break; 2948 } 2949 2950 if (wait && (error == EAGAIN)) { 2951 /* 2952 * Memory is currently locked. It must be unlocked 2953 * before this operation can succeed through a retry. 2954 * The possible reasons for locked memory and 2955 * corresponding strategies for unlocking are: 2956 * (1) Normal I/O 2957 * wait for a signal that the I/O operation 2958 * has completed and the memory is unlocked. 2959 * (2) Asynchronous I/O 2960 * The aio subsystem does not unlock pages when 2961 * the I/O is completed. Those pages are unlocked 2962 * when the application calls aiowait/aioerror. 2963 * So, to prevent blocking forever, cv_broadcast() 2964 * is done to wake up aio_cleanup_thread. 2965 * Subsequently, segvn_reclaim will be called, and 2966 * that will do AS_CLRUNMAPWAIT() and wake us up. 2967 * (3) Long term page locking: 2968 * This is not relevant for as_setpagesize() 2969 * because we cannot change the page size for 2970 * driver memory. The attempt to do so will 2971 * fail with a different error than EAGAIN so 2972 * there's no need to trigger as callbacks like 2973 * as_unmap, as_setprot or as_free would do. 2974 */ 2975 mutex_enter(&as->a_contents); 2976 if (!AS_ISNOUNMAPWAIT(as)) { 2977 if (AS_ISUNMAPWAIT(as) == 0) { 2978 cv_broadcast(&as->a_cv); 2979 } 2980 AS_SETUNMAPWAIT(as); 2981 AS_LOCK_EXIT(as); 2982 while (AS_ISUNMAPWAIT(as)) { 2983 cv_wait(&as->a_cv, &as->a_contents); 2984 } 2985 } else { 2986 /* 2987 * We may have raced with 2988 * segvn_reclaim()/segspt_reclaim(). In this 2989 * case clean nounmapwait flag and retry since 2990 * softlockcnt in this segment may be already 2991 * 0. We don't drop as writer lock so our 2992 * number of retries without sleeping should 2993 * be very small. See segvn_reclaim() for 2994 * more comments. 2995 */ 2996 AS_CLRNOUNMAPWAIT(as); 2997 mutex_exit(&as->a_contents); 2998 goto retry; 2999 } 3000 mutex_exit(&as->a_contents); 3001 goto setpgsz_top; 3002 } else if (error != 0) { 3003 break; 3004 } 3005 } 3006 as_setwatch(as); 3007 AS_LOCK_EXIT(as); 3008 return (error); 3009 } 3010 3011 /* 3012 * as_iset3_default_lpsize() just calls SEGOP_SETPAGESIZE() on all segments 3013 * in its chunk where s_szc is less than the szc we want to set. 3014 */ 3015 static int 3016 as_iset3_default_lpsize(struct as *as, caddr_t raddr, size_t rsize, uint_t szc, 3017 int *retry) 3018 { 3019 struct seg *seg; 3020 size_t ssize; 3021 int error; 3022 3023 ASSERT(AS_WRITE_HELD(as)); 3024 3025 seg = as_segat(as, raddr); 3026 if (seg == NULL) { 3027 panic("as_iset3_default_lpsize: no seg"); 3028 } 3029 3030 for (; rsize != 0; rsize -= ssize, raddr += ssize) { 3031 if (raddr >= seg->s_base + seg->s_size) { 3032 seg = AS_SEGNEXT(as, seg); 3033 if (seg == NULL || raddr != seg->s_base) { 3034 panic("as_iset3_default_lpsize: as changed"); 3035 } 3036 } 3037 if ((raddr + rsize) > (seg->s_base + seg->s_size)) { 3038 ssize = seg->s_base + seg->s_size - raddr; 3039 } else { 3040 ssize = rsize; 3041 } 3042 3043 if (szc > seg->s_szc) { 3044 error = SEGOP_SETPAGESIZE(seg, raddr, ssize, szc); 3045 /* Only retry on EINVAL segments that have no vnode. 
*/ 3046 if (error == EINVAL) { 3047 vnode_t *vp = NULL; 3048 if ((SEGOP_GETTYPE(seg, raddr) & MAP_SHARED) && 3049 (SEGOP_GETVP(seg, raddr, &vp) != 0 || 3050 vp == NULL)) { 3051 *retry = 1; 3052 } else { 3053 *retry = 0; 3054 } 3055 } 3056 if (error) { 3057 return (error); 3058 } 3059 } 3060 } 3061 return (0); 3062 } 3063 3064 /* 3065 * as_iset2_default_lpsize() calls as_iset3_default_lpsize() to set the 3066 * pagesize on each segment in its range, but if any fails with EINVAL, 3067 * then it reduces the pagesizes to the next size in the bitmap and 3068 * retries as_iset3_default_lpsize(). The reason why the code retries 3069 * smaller allowed sizes on EINVAL is because (a) the anon offset may not 3070 * match the bigger sizes, and (b) it's hard to get this offset (to begin 3071 * with) to pass to map_pgszcvec(). 3072 */ 3073 static int 3074 as_iset2_default_lpsize(struct as *as, caddr_t addr, size_t size, uint_t szc, 3075 uint_t szcvec) 3076 { 3077 int error; 3078 int retry; 3079 3080 ASSERT(AS_WRITE_HELD(as)); 3081 3082 for (;;) { 3083 error = as_iset3_default_lpsize(as, addr, size, szc, &retry); 3084 if (error == EINVAL && retry) { 3085 szcvec &= ~(1 << szc); 3086 if (szcvec <= 1) { 3087 return (EINVAL); 3088 } 3089 szc = highbit(szcvec) - 1; 3090 } else { 3091 return (error); 3092 } 3093 } 3094 } 3095 3096 /* 3097 * as_iset1_default_lpsize() breaks its chunk into areas where existing 3098 * segments have a smaller szc than we want to set. For each such area, 3099 * it calls as_iset2_default_lpsize() 3100 */ 3101 static int 3102 as_iset1_default_lpsize(struct as *as, caddr_t raddr, size_t rsize, uint_t szc, 3103 uint_t szcvec) 3104 { 3105 struct seg *seg; 3106 size_t ssize; 3107 caddr_t setaddr = raddr; 3108 size_t setsize = 0; 3109 int set; 3110 int error; 3111 3112 ASSERT(AS_WRITE_HELD(as)); 3113 3114 seg = as_segat(as, raddr); 3115 if (seg == NULL) { 3116 panic("as_iset1_default_lpsize: no seg"); 3117 } 3118 if (seg->s_szc < szc) { 3119 set = 1; 3120 } else { 3121 set = 0; 3122 } 3123 3124 for (; rsize != 0; rsize -= ssize, raddr += ssize, setsize += ssize) { 3125 if (raddr >= seg->s_base + seg->s_size) { 3126 seg = AS_SEGNEXT(as, seg); 3127 if (seg == NULL || raddr != seg->s_base) { 3128 panic("as_iset1_default_lpsize: as changed"); 3129 } 3130 if (seg->s_szc >= szc && set) { 3131 ASSERT(setsize != 0); 3132 error = as_iset2_default_lpsize(as, 3133 setaddr, setsize, szc, szcvec); 3134 if (error) { 3135 return (error); 3136 } 3137 set = 0; 3138 } else if (seg->s_szc < szc && !set) { 3139 setaddr = raddr; 3140 setsize = 0; 3141 set = 1; 3142 } 3143 } 3144 if ((raddr + rsize) > (seg->s_base + seg->s_size)) { 3145 ssize = seg->s_base + seg->s_size - raddr; 3146 } else { 3147 ssize = rsize; 3148 } 3149 } 3150 error = 0; 3151 if (set) { 3152 ASSERT(setsize != 0); 3153 error = as_iset2_default_lpsize(as, setaddr, setsize, 3154 szc, szcvec); 3155 } 3156 return (error); 3157 } 3158 3159 /* 3160 * as_iset_default_lpsize() breaks its chunk according to the size code bitmap 3161 * returned by map_pgszcvec() (similar to as_map_segvn_segs()), and passes each 3162 * chunk to as_iset1_default_lpsize(). 3163 */ 3164 static int 3165 as_iset_default_lpsize(struct as *as, caddr_t addr, size_t size, int flags, 3166 int type) 3167 { 3168 int rtype = (type & MAP_SHARED) ? 
MAPPGSZC_SHM : MAPPGSZC_PRIVM; 3169 uint_t szcvec = map_pgszcvec(addr, size, (uintptr_t)addr, 3170 flags, rtype, 1); 3171 uint_t szc; 3172 uint_t nszc; 3173 int error; 3174 caddr_t a; 3175 caddr_t eaddr; 3176 size_t segsize; 3177 size_t pgsz; 3178 uint_t save_szcvec; 3179 3180 ASSERT(AS_WRITE_HELD(as)); 3181 ASSERT(IS_P2ALIGNED(addr, PAGESIZE)); 3182 ASSERT(IS_P2ALIGNED(size, PAGESIZE)); 3183 3184 szcvec &= ~1; 3185 if (szcvec <= 1) { /* skip if base page size */ 3186 return (0); 3187 } 3188 3189 /* Get the pagesize of the first larger page size. */ 3190 szc = lowbit(szcvec) - 1; 3191 pgsz = page_get_pagesize(szc); 3192 eaddr = addr + size; 3193 addr = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz); 3194 eaddr = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz); 3195 3196 save_szcvec = szcvec; 3197 szcvec >>= (szc + 1); 3198 nszc = szc; 3199 while (szcvec) { 3200 if ((szcvec & 0x1) == 0) { 3201 nszc++; 3202 szcvec >>= 1; 3203 continue; 3204 } 3205 nszc++; 3206 pgsz = page_get_pagesize(nszc); 3207 a = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz); 3208 if (a != addr) { 3209 ASSERT(szc > 0); 3210 ASSERT(a < eaddr); 3211 segsize = a - addr; 3212 error = as_iset1_default_lpsize(as, addr, segsize, szc, 3213 save_szcvec); 3214 if (error) { 3215 return (error); 3216 } 3217 addr = a; 3218 } 3219 szc = nszc; 3220 szcvec >>= 1; 3221 } 3222 3223 ASSERT(addr < eaddr); 3224 szcvec = save_szcvec; 3225 while (szcvec) { 3226 a = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz); 3227 ASSERT(a >= addr); 3228 if (a != addr) { 3229 ASSERT(szc > 0); 3230 segsize = a - addr; 3231 error = as_iset1_default_lpsize(as, addr, segsize, szc, 3232 save_szcvec); 3233 if (error) { 3234 return (error); 3235 } 3236 addr = a; 3237 } 3238 szcvec &= ~(1 << szc); 3239 if (szcvec) { 3240 szc = highbit(szcvec) - 1; 3241 pgsz = page_get_pagesize(szc); 3242 } 3243 } 3244 ASSERT(addr == eaddr); 3245 3246 return (0); 3247 } 3248 3249 /* 3250 * Set the default large page size for the range. Called via memcntl with 3251 * page size set to 0. as_set_default_lpsize breaks the range down into 3252 * chunks with the same type/flags, ignores non-segvn segments, and passes 3253 * each chunk to as_iset_default_lpsize().
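 *
 * Hedged example of how this is typically reached (a user-level
 * memcntl(2) MC_HAT_ADVISE request with the page size left at zero;
 * the snippet is illustrative, not lifted from any caller):
 *
 *	struct memcntl_mha mha;
 *
 *	mha.mha_cmd = MHA_MAPSIZE_VA;
 *	mha.mha_flags = 0;
 *	mha.mha_pagesize = 0;	zero means "pick a default large pagesize"
 *	(void) memcntl(addr, len, MC_HAT_ADVISE, (caddr_t)&mha, 0, 0);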
3254 */ 3255 int 3256 as_set_default_lpsize(struct as *as, caddr_t addr, size_t size) 3257 { 3258 struct seg *seg; 3259 caddr_t raddr; 3260 size_t rsize; 3261 size_t ssize; 3262 int rtype, rflags; 3263 int stype, sflags; 3264 int error; 3265 caddr_t setaddr; 3266 size_t setsize; 3267 int segvn; 3268 3269 if (size == 0) 3270 return (0); 3271 3272 AS_LOCK_ENTER(as, RW_WRITER); 3273 again: 3274 error = 0; 3275 3276 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); 3277 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) - 3278 (size_t)raddr; 3279 3280 if (raddr + rsize < raddr) { /* check for wraparound */ 3281 AS_LOCK_EXIT(as); 3282 return (ENOMEM); 3283 } 3284 as_clearwatchprot(as, raddr, rsize); 3285 seg = as_segat(as, raddr); 3286 if (seg == NULL) { 3287 as_setwatch(as); 3288 AS_LOCK_EXIT(as); 3289 return (ENOMEM); 3290 } 3291 if (seg->s_ops == &segvn_ops) { 3292 rtype = SEGOP_GETTYPE(seg, addr); 3293 rflags = rtype & (MAP_TEXT | MAP_INITDATA); 3294 rtype = rtype & (MAP_SHARED | MAP_PRIVATE); 3295 segvn = 1; 3296 } else { 3297 segvn = 0; 3298 } 3299 setaddr = raddr; 3300 setsize = 0; 3301 3302 for (; rsize != 0; rsize -= ssize, raddr += ssize, setsize += ssize) { 3303 if (raddr >= (seg->s_base + seg->s_size)) { 3304 seg = AS_SEGNEXT(as, seg); 3305 if (seg == NULL || raddr != seg->s_base) { 3306 error = ENOMEM; 3307 break; 3308 } 3309 if (seg->s_ops == &segvn_ops) { 3310 stype = SEGOP_GETTYPE(seg, raddr); 3311 sflags = stype & (MAP_TEXT | MAP_INITDATA); 3312 stype &= (MAP_SHARED | MAP_PRIVATE); 3313 if (segvn && (rflags != sflags || 3314 rtype != stype)) { 3315 /* 3316 * The next segment is also segvn but 3317 * has different flags and/or type. 3318 */ 3319 ASSERT(setsize != 0); 3320 error = as_iset_default_lpsize(as, 3321 setaddr, setsize, rflags, rtype); 3322 if (error) { 3323 break; 3324 } 3325 rflags = sflags; 3326 rtype = stype; 3327 setaddr = raddr; 3328 setsize = 0; 3329 } else if (!segvn) { 3330 rflags = sflags; 3331 rtype = stype; 3332 setaddr = raddr; 3333 setsize = 0; 3334 segvn = 1; 3335 } 3336 } else if (segvn) { 3337 /* The next segment is not segvn. */ 3338 ASSERT(setsize != 0); 3339 error = as_iset_default_lpsize(as, 3340 setaddr, setsize, rflags, rtype); 3341 if (error) { 3342 break; 3343 } 3344 segvn = 0; 3345 } 3346 } 3347 if ((raddr + rsize) > (seg->s_base + seg->s_size)) { 3348 ssize = seg->s_base + seg->s_size - raddr; 3349 } else { 3350 ssize = rsize; 3351 } 3352 } 3353 if (error == 0 && segvn) { 3354 /* The last chunk when rsize == 0. */ 3355 ASSERT(setsize != 0); 3356 error = as_iset_default_lpsize(as, setaddr, setsize, 3357 rflags, rtype); 3358 } 3359 3360 if (error == IE_RETRY) { 3361 goto again; 3362 } else if (error == IE_NOMEM) { 3363 error = EAGAIN; 3364 } else if (error == ENOTSUP) { 3365 error = EINVAL; 3366 } else if (error == EAGAIN) { 3367 mutex_enter(&as->a_contents); 3368 if (!AS_ISNOUNMAPWAIT(as)) { 3369 if (AS_ISUNMAPWAIT(as) == 0) { 3370 cv_broadcast(&as->a_cv); 3371 } 3372 AS_SETUNMAPWAIT(as); 3373 AS_LOCK_EXIT(as); 3374 while (AS_ISUNMAPWAIT(as)) { 3375 cv_wait(&as->a_cv, &as->a_contents); 3376 } 3377 mutex_exit(&as->a_contents); 3378 AS_LOCK_ENTER(as, RW_WRITER); 3379 } else { 3380 /* 3381 * We may have raced with 3382 * segvn_reclaim()/segspt_reclaim(). In this case 3383 * clean nounmapwait flag and retry since softlockcnt 3384 * in this segment may be already 0. We don't drop as 3385 * writer lock so our number of retries without 3386 * sleeping should be very small. See segvn_reclaim() 3387 * for more comments. 
3388 */ 3389 AS_CLRNOUNMAPWAIT(as); 3390 mutex_exit(&as->a_contents); 3391 } 3392 goto again; 3393 } 3394 3395 as_setwatch(as); 3396 AS_LOCK_EXIT(as); 3397 return (error); 3398 } 3399 3400 /* 3401 * Setup all of the uninitialized watched pages that we can. 3402 */ 3403 void 3404 as_setwatch(struct as *as) 3405 { 3406 struct watched_page *pwp; 3407 struct seg *seg; 3408 caddr_t vaddr; 3409 uint_t prot; 3410 int err, retrycnt; 3411 3412 if (avl_numnodes(&as->a_wpage) == 0) 3413 return; 3414 3415 ASSERT(AS_WRITE_HELD(as)); 3416 3417 for (pwp = avl_first(&as->a_wpage); pwp != NULL; 3418 pwp = AVL_NEXT(&as->a_wpage, pwp)) { 3419 retrycnt = 0; 3420 retry: 3421 vaddr = pwp->wp_vaddr; 3422 if (pwp->wp_oprot != 0 || /* already set up */ 3423 (seg = as_segat(as, vaddr)) == NULL || 3424 SEGOP_GETPROT(seg, vaddr, 0, &prot) != 0) 3425 continue; 3426 3427 pwp->wp_oprot = prot; 3428 if (pwp->wp_read) 3429 prot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC); 3430 if (pwp->wp_write) 3431 prot &= ~PROT_WRITE; 3432 if (pwp->wp_exec) 3433 prot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC); 3434 if (!(pwp->wp_flags & WP_NOWATCH) && prot != pwp->wp_oprot) { 3435 err = SEGOP_SETPROT(seg, vaddr, PAGESIZE, prot); 3436 if (err == IE_RETRY) { 3437 pwp->wp_oprot = 0; 3438 ASSERT(retrycnt == 0); 3439 retrycnt++; 3440 goto retry; 3441 } 3442 } 3443 pwp->wp_prot = prot; 3444 } 3445 } 3446 3447 /* 3448 * Clear all of the watched pages in the address space. 3449 */ 3450 void 3451 as_clearwatch(struct as *as) 3452 { 3453 struct watched_page *pwp; 3454 struct seg *seg; 3455 caddr_t vaddr; 3456 uint_t prot; 3457 int err, retrycnt; 3458 3459 if (avl_numnodes(&as->a_wpage) == 0) 3460 return; 3461 3462 ASSERT(AS_WRITE_HELD(as)); 3463 3464 for (pwp = avl_first(&as->a_wpage); pwp != NULL; 3465 pwp = AVL_NEXT(&as->a_wpage, pwp)) { 3466 retrycnt = 0; 3467 retry: 3468 vaddr = pwp->wp_vaddr; 3469 if (pwp->wp_oprot == 0 || /* not set up */ 3470 (seg = as_segat(as, vaddr)) == NULL) 3471 continue; 3472 3473 if ((prot = pwp->wp_oprot) != pwp->wp_prot) { 3474 err = SEGOP_SETPROT(seg, vaddr, PAGESIZE, prot); 3475 if (err == IE_RETRY) { 3476 ASSERT(retrycnt == 0); 3477 retrycnt++; 3478 goto retry; 3479 } 3480 } 3481 pwp->wp_oprot = 0; 3482 pwp->wp_prot = 0; 3483 } 3484 } 3485 3486 /* 3487 * Force a new setup for all the watched pages in the range. 
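 *
 * Worked example (added for clarity): for a page watched for writes
 * (wp_write set) in a range being set to PROT_READ|PROT_WRITE, the loop
 * below installs
 *
 *	wprot = (PROT_READ|PROT_WRITE) & ~PROT_WRITE, i.e. PROT_READ
 *
 * so the next write to the page faults and the watchpoint can be
 * reported, while wp_oprot remembers the protections the caller
 * actually requested.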
3488 */ 3489 static void 3490 as_setwatchprot(struct as *as, caddr_t addr, size_t size, uint_t prot) 3491 { 3492 struct watched_page *pwp; 3493 struct watched_page tpw; 3494 caddr_t eaddr = addr + size; 3495 caddr_t vaddr; 3496 struct seg *seg; 3497 int err, retrycnt; 3498 uint_t wprot; 3499 avl_index_t where; 3500 3501 if (avl_numnodes(&as->a_wpage) == 0) 3502 return; 3503 3504 ASSERT(AS_WRITE_HELD(as)); 3505 3506 tpw.wp_vaddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); 3507 if ((pwp = avl_find(&as->a_wpage, &tpw, &where)) == NULL) 3508 pwp = avl_nearest(&as->a_wpage, where, AVL_AFTER); 3509 3510 while (pwp != NULL && pwp->wp_vaddr < eaddr) { 3511 retrycnt = 0; 3512 vaddr = pwp->wp_vaddr; 3513 3514 wprot = prot; 3515 if (pwp->wp_read) 3516 wprot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC); 3517 if (pwp->wp_write) 3518 wprot &= ~PROT_WRITE; 3519 if (pwp->wp_exec) 3520 wprot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC); 3521 if (!(pwp->wp_flags & WP_NOWATCH) && wprot != pwp->wp_oprot) { 3522 retry: 3523 seg = as_segat(as, vaddr); 3524 if (seg == NULL) { 3525 panic("as_setwatchprot: no seg"); 3526 /*NOTREACHED*/ 3527 } 3528 err = SEGOP_SETPROT(seg, vaddr, PAGESIZE, wprot); 3529 if (err == IE_RETRY) { 3530 ASSERT(retrycnt == 0); 3531 retrycnt++; 3532 goto retry; 3533 } 3534 } 3535 pwp->wp_oprot = prot; 3536 pwp->wp_prot = wprot; 3537 3538 pwp = AVL_NEXT(&as->a_wpage, pwp); 3539 } 3540 } 3541 3542 /* 3543 * Clear all of the watched pages in the range. 3544 */ 3545 static void 3546 as_clearwatchprot(struct as *as, caddr_t addr, size_t size) 3547 { 3548 caddr_t eaddr = addr + size; 3549 struct watched_page *pwp; 3550 struct watched_page tpw; 3551 uint_t prot; 3552 struct seg *seg; 3553 int err, retrycnt; 3554 avl_index_t where; 3555 3556 if (avl_numnodes(&as->a_wpage) == 0) 3557 return; 3558 3559 tpw.wp_vaddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); 3560 if ((pwp = avl_find(&as->a_wpage, &tpw, &where)) == NULL) 3561 pwp = avl_nearest(&as->a_wpage, where, AVL_AFTER); 3562 3563 ASSERT(AS_WRITE_HELD(as)); 3564 3565 while (pwp != NULL && pwp->wp_vaddr < eaddr) { 3566 3567 if ((prot = pwp->wp_oprot) != 0) { 3568 retrycnt = 0; 3569 3570 if (prot != pwp->wp_prot) { 3571 retry: 3572 seg = as_segat(as, pwp->wp_vaddr); 3573 if (seg == NULL) 3574 continue; 3575 err = SEGOP_SETPROT(seg, pwp->wp_vaddr, 3576 PAGESIZE, prot); 3577 if (err == IE_RETRY) { 3578 ASSERT(retrycnt == 0); 3579 retrycnt++; 3580 goto retry; 3581 3582 } 3583 } 3584 pwp->wp_oprot = 0; 3585 pwp->wp_prot = 0; 3586 } 3587 3588 pwp = AVL_NEXT(&as->a_wpage, pwp); 3589 } 3590 } 3591 3592 void 3593 as_signal_proc(struct as *as, k_siginfo_t *siginfo) 3594 { 3595 struct proc *p; 3596 3597 mutex_enter(&pidlock); 3598 for (p = practive; p; p = p->p_next) { 3599 if (p->p_as == as) { 3600 mutex_enter(&p->p_lock); 3601 if (p->p_as == as) 3602 sigaddq(p, NULL, siginfo, KM_NOSLEEP); 3603 mutex_exit(&p->p_lock); 3604 } 3605 } 3606 mutex_exit(&pidlock); 3607 } 3608 3609 /* 3610 * return memory object ID 3611 */ 3612 int 3613 as_getmemid(struct as *as, caddr_t addr, memid_t *memidp) 3614 { 3615 struct seg *seg; 3616 int sts; 3617 3618 AS_LOCK_ENTER(as, RW_READER); 3619 seg = as_segat(as, addr); 3620 if (seg == NULL) { 3621 AS_LOCK_EXIT(as); 3622 return (EFAULT); 3623 } 3624 /* 3625 * catch old drivers which may not support getmemid 3626 */ 3627 if (seg->s_ops->getmemid == NULL) { 3628 AS_LOCK_EXIT(as); 3629 return (ENODEV); 3630 } 3631 3632 sts = SEGOP_GETMEMID(seg, addr, memidp); 3633 3634 AS_LOCK_EXIT(as); 3635 return (sts); 3636 } 3637