1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ 27 /* All Rights Reserved */ 28 29 /* 30 * University Copyright- Copyright (c) 1982, 1986, 1988 31 * The Regents of the University of California 32 * All Rights Reserved 33 * 34 * University Acknowledgment- Portions of this document are derived from 35 * software developed by the University of California, Berkeley, and its 36 * contributors. 37 */ 38 39 #pragma ident "%Z%%M% %I% %E% SMI" 40 41 /* 42 * VM - address spaces. 43 */ 44 45 #include <sys/types.h> 46 #include <sys/t_lock.h> 47 #include <sys/param.h> 48 #include <sys/errno.h> 49 #include <sys/systm.h> 50 #include <sys/mman.h> 51 #include <sys/sysmacros.h> 52 #include <sys/cpuvar.h> 53 #include <sys/sysinfo.h> 54 #include <sys/kmem.h> 55 #include <sys/vnode.h> 56 #include <sys/vmsystm.h> 57 #include <sys/cmn_err.h> 58 #include <sys/debug.h> 59 #include <sys/tnf_probe.h> 60 #include <sys/vtrace.h> 61 62 #include <vm/hat.h> 63 #include <vm/xhat.h> 64 #include <vm/as.h> 65 #include <vm/seg.h> 66 #include <vm/seg_vn.h> 67 #include <vm/seg_dev.h> 68 #include <vm/seg_kmem.h> 69 #include <vm/seg_map.h> 70 #include <vm/seg_spt.h> 71 #include <vm/page.h> 72 73 clock_t deadlk_wait = 1; /* number of ticks to wait before retrying */ 74 75 static struct kmem_cache *as_cache; 76 77 static void as_setwatchprot(struct as *, caddr_t, size_t, uint_t); 78 static void as_clearwatchprot(struct as *, caddr_t, size_t); 79 int as_map_locked(struct as *, caddr_t, size_t, int ((*)()), void *); 80 81 82 /* 83 * Verifying the segment lists is very time-consuming; it may not be 84 * desirable always to define VERIFY_SEGLIST when DEBUG is set. 85 */ 86 #ifdef DEBUG 87 #define VERIFY_SEGLIST 88 int do_as_verify = 0; 89 #endif 90 91 /* 92 * Allocate a new callback data structure entry and fill in the events of 93 * interest, the address range of interest, and the callback argument. 94 * Link the entry on the as->a_callbacks list. A callback entry for the 95 * entire address space may be specified with vaddr = 0 and size = -1. 96 * 97 * CALLERS RESPONSIBILITY: If not calling from within the process context for 98 * the specified as, the caller must guarantee persistence of the specified as 99 * for the duration of this function (eg. pages being locked within the as 100 * will guarantee persistence). 101 */ 102 int 103 as_add_callback(struct as *as, void (*cb_func)(), void *arg, uint_t events, 104 caddr_t vaddr, size_t size, int sleepflag) 105 { 106 struct as_callback *current_head, *cb; 107 caddr_t saddr; 108 size_t rsize; 109 110 /* callback function and an event are mandatory */ 111 if ((cb_func == NULL) || ((events & AS_ALL_EVENT) == 0)) 112 return (EINVAL); 113 114 /* Adding a callback after as_free has been called is not allowed */ 115 if (as == &kas) 116 return (ENOMEM); 117 118 /* 119 * vaddr = 0 and size = -1 is used to indicate that the callback range 120 * is the entire address space so no rounding is done in that case. 121 */ 122 if (size != -1) { 123 saddr = (caddr_t)((uintptr_t)vaddr & (uintptr_t)PAGEMASK); 124 rsize = (((size_t)(vaddr + size) + PAGEOFFSET) & PAGEMASK) - 125 (size_t)saddr; 126 /* check for wraparound */ 127 if (saddr + rsize < saddr) 128 return (ENOMEM); 129 } else { 130 if (vaddr != 0) 131 return (EINVAL); 132 saddr = vaddr; 133 rsize = size; 134 } 135 136 /* Allocate and initialize a callback entry */ 137 cb = kmem_zalloc(sizeof (struct as_callback), sleepflag); 138 if (cb == NULL) 139 return (EAGAIN); 140 141 cb->ascb_func = cb_func; 142 cb->ascb_arg = arg; 143 cb->ascb_events = events; 144 cb->ascb_saddr = saddr; 145 cb->ascb_len = rsize; 146 147 /* Add the entry to the list */ 148 mutex_enter(&as->a_contents); 149 current_head = as->a_callbacks; 150 as->a_callbacks = cb; 151 cb->ascb_next = current_head; 152 153 /* 154 * The call to this function may lose in a race with 155 * a pertinent event - eg. a thread does long term memory locking 156 * but before the callback is added another thread executes as_unmap. 157 * A broadcast here resolves that. 158 */ 159 if ((cb->ascb_events & AS_UNMAPWAIT_EVENT) && AS_ISUNMAPWAIT(as)) { 160 AS_CLRUNMAPWAIT(as); 161 cv_broadcast(&as->a_cv); 162 } 163 164 mutex_exit(&as->a_contents); 165 return (0); 166 } 167 168 /* 169 * Search the callback list for an entry which pertains to arg. 170 * 171 * This is called from within the client upon completion of the callback. 172 * RETURN VALUES: 173 * AS_CALLBACK_DELETED (callback entry found and deleted) 174 * AS_CALLBACK_NOTFOUND (no callback entry found - this is ok) 175 * AS_CALLBACK_DELETE_DEFERRED (callback is in process, delete of this 176 * entry will be made in as_do_callbacks) 177 * 178 * If as_delete_callback encounters a matching entry with AS_CALLBACK_CALLED 179 * set, it indicates that as_do_callbacks is processing this entry. The 180 * AS_ALL_EVENT events are cleared in the entry, and a broadcast is made 181 * to unblock as_do_callbacks, in case it is blocked. 182 * 183 * CALLERS RESPONSIBILITY: If not calling from within the process context for 184 * the specified as, the caller must guarantee persistence of the specified as 185 * for the duration of this function (eg. pages being locked within the as 186 * will guarantee persistence). 187 */ 188 uint_t 189 as_delete_callback(struct as *as, void *arg) 190 { 191 struct as_callback **prevcb = &as->a_callbacks; 192 struct as_callback *cb; 193 uint_t rc = AS_CALLBACK_NOTFOUND; 194 195 mutex_enter(&as->a_contents); 196 for (cb = as->a_callbacks; cb; prevcb = &cb->ascb_next, cb = *prevcb) { 197 if (cb->ascb_arg != arg) 198 continue; 199 200 /* 201 * If the events indicate AS_CALLBACK_CALLED, just clear 202 * AS_ALL_EVENT in the events field and wakeup the thread 203 * that may be waiting in as_do_callbacks. as_do_callbacks 204 * will take care of removing this entry from the list. In 205 * that case, return AS_CALLBACK_DELETE_DEFERRED. Otherwise 206 * (AS_CALLBACK_CALLED not set), just remove it from the 207 * list, return the memory and return AS_CALLBACK_DELETED. 208 */ 209 if ((cb->ascb_events & AS_CALLBACK_CALLED) != 0) { 210 /* leave AS_CALLBACK_CALLED */ 211 cb->ascb_events &= ~AS_ALL_EVENT; 212 rc = AS_CALLBACK_DELETE_DEFERRED; 213 cv_broadcast(&as->a_cv); 214 } else { 215 *prevcb = cb->ascb_next; 216 kmem_free(cb, sizeof (struct as_callback)); 217 rc = AS_CALLBACK_DELETED; 218 } 219 break; 220 } 221 mutex_exit(&as->a_contents); 222 return (rc); 223 } 224 225 /* 226 * Searches the as callback list for a matching entry. 227 * Returns a pointer to the first matching callback, or NULL if 228 * nothing is found. 229 * This function never sleeps so it is ok to call it with more 230 * locks held but the (required) a_contents mutex. 231 * 232 * See also comment on as_do_callbacks below. 233 */ 234 static struct as_callback * 235 as_find_callback(struct as *as, uint_t events, caddr_t event_addr, 236 size_t event_len) 237 { 238 struct as_callback *cb; 239 240 ASSERT(MUTEX_HELD(&as->a_contents)); 241 for (cb = as->a_callbacks; cb != NULL; cb = cb->ascb_next) { 242 /* 243 * If the callback has not already been called, then 244 * check if events or address range pertains. An event_len 245 * of zero means do an unconditional callback. 246 */ 247 if (((cb->ascb_events & AS_CALLBACK_CALLED) != 0) || 248 ((event_len != 0) && (((cb->ascb_events & events) == 0) || 249 (event_addr + event_len < cb->ascb_saddr) || 250 (event_addr > (cb->ascb_saddr + cb->ascb_len))))) { 251 continue; 252 } 253 break; 254 } 255 return (cb); 256 } 257 258 /* 259 * Executes a given callback and removes it from the callback list for 260 * this address space. 261 * This function may sleep so the caller must drop all locks except 262 * a_contents before calling this func. 263 * 264 * See also comments on as_do_callbacks below. 265 */ 266 static void 267 as_execute_callback(struct as *as, struct as_callback *cb, 268 uint_t events) 269 { 270 struct as_callback **prevcb; 271 void *cb_arg; 272 273 ASSERT(MUTEX_HELD(&as->a_contents) && (cb->ascb_events & events)); 274 cb->ascb_events |= AS_CALLBACK_CALLED; 275 mutex_exit(&as->a_contents); 276 (*cb->ascb_func)(as, cb->ascb_arg, events); 277 mutex_enter(&as->a_contents); 278 /* 279 * the callback function is required to delete the callback 280 * when the callback function determines it is OK for 281 * this thread to continue. as_delete_callback will clear 282 * the AS_ALL_EVENT in the events field when it is deleted. 283 * If the callback function called as_delete_callback, 284 * events will already be cleared and there will be no blocking. 285 */ 286 while ((cb->ascb_events & events) != 0) { 287 cv_wait(&as->a_cv, &as->a_contents); 288 } 289 /* 290 * This entry needs to be taken off the list. Normally, the 291 * callback func itself does that, but unfortunately the list 292 * may have changed while the callback was running because the 293 * a_contents mutex was dropped and someone else other than the 294 * callback func itself could have called as_delete_callback, 295 * so we have to search to find this entry again. The entry 296 * must have AS_CALLBACK_CALLED, and have the same 'arg'. 297 */ 298 cb_arg = cb->ascb_arg; 299 prevcb = &as->a_callbacks; 300 for (cb = as->a_callbacks; cb != NULL; 301 prevcb = &cb->ascb_next, cb = *prevcb) { 302 if (((cb->ascb_events & AS_CALLBACK_CALLED) == 0) || 303 (cb_arg != cb->ascb_arg)) { 304 continue; 305 } 306 *prevcb = cb->ascb_next; 307 kmem_free(cb, sizeof (struct as_callback)); 308 break; 309 } 310 } 311 312 /* 313 * Check the callback list for a matching event and intersection of 314 * address range. If there is a match invoke the callback. Skip an entry if: 315 * - a callback is already in progress for this entry (AS_CALLBACK_CALLED) 316 * - not event of interest 317 * - not address range of interest 318 * 319 * An event_len of zero indicates a request for an unconditional callback 320 * (regardless of event), only the AS_CALLBACK_CALLED is checked. The 321 * a_contents lock must be dropped before a callback, so only one callback 322 * can be done before returning. Return -1 (true) if a callback was 323 * executed and removed from the list, else return 0 (false). 324 * 325 * The logically separate parts, i.e. finding a matching callback and 326 * executing a given callback have been separated into two functions 327 * so that they can be called with different sets of locks held beyond 328 * the always-required a_contents. as_find_callback does not sleep so 329 * it is ok to call it if more locks than a_contents (i.e. the a_lock 330 * rwlock) are held. as_execute_callback on the other hand may sleep 331 * so all locks beyond a_contents must be dropped by the caller if one 332 * does not want to end comatose. 333 */ 334 static int 335 as_do_callbacks(struct as *as, uint_t events, caddr_t event_addr, 336 size_t event_len) 337 { 338 struct as_callback *cb; 339 340 if ((cb = as_find_callback(as, events, event_addr, event_len))) { 341 as_execute_callback(as, cb, events); 342 return (-1); 343 } 344 return (0); 345 } 346 347 /* 348 * Search for the segment containing addr. If a segment containing addr 349 * exists, that segment is returned. If no such segment exists, and 350 * the list spans addresses greater than addr, then the first segment 351 * whose base is greater than addr is returned; otherwise, NULL is 352 * returned unless tail is true, in which case the last element of the 353 * list is returned. 354 * 355 * a_seglast is used to cache the last found segment for repeated 356 * searches to the same addr (which happens frequently). 357 */ 358 struct seg * 359 as_findseg(struct as *as, caddr_t addr, int tail) 360 { 361 struct seg *seg = as->a_seglast; 362 avl_index_t where; 363 364 ASSERT(AS_LOCK_HELD(as, &as->a_lock)); 365 366 if (seg != NULL && 367 seg->s_base <= addr && 368 addr < seg->s_base + seg->s_size) 369 return (seg); 370 371 seg = avl_find(&as->a_segtree, &addr, &where); 372 if (seg != NULL) 373 return (as->a_seglast = seg); 374 375 seg = avl_nearest(&as->a_segtree, where, AVL_AFTER); 376 if (seg == NULL && tail) 377 seg = avl_last(&as->a_segtree); 378 return (as->a_seglast = seg); 379 } 380 381 #ifdef VERIFY_SEGLIST 382 /* 383 * verify that the linked list is coherent 384 */ 385 static void 386 as_verify(struct as *as) 387 { 388 struct seg *seg, *seglast, *p, *n; 389 uint_t nsegs = 0; 390 391 if (do_as_verify == 0) 392 return; 393 394 seglast = as->a_seglast; 395 396 for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) { 397 ASSERT(seg->s_as == as); 398 p = AS_SEGPREV(as, seg); 399 n = AS_SEGNEXT(as, seg); 400 ASSERT(p == NULL || p->s_as == as); 401 ASSERT(p == NULL || p->s_base < seg->s_base); 402 ASSERT(n == NULL || n->s_base > seg->s_base); 403 ASSERT(n != NULL || seg == avl_last(&as->a_segtree)); 404 if (seg == seglast) 405 seglast = NULL; 406 nsegs++; 407 } 408 ASSERT(seglast == NULL); 409 ASSERT(avl_numnodes(&as->a_segtree) == nsegs); 410 } 411 #endif /* VERIFY_SEGLIST */ 412 413 /* 414 * Add a new segment to the address space. The avl_find() 415 * may be expensive so we attempt to use last segment accessed 416 * in as_gap() as an insertion point. 417 */ 418 int 419 as_addseg(struct as *as, struct seg *newseg) 420 { 421 struct seg *seg; 422 caddr_t addr; 423 caddr_t eaddr; 424 avl_index_t where; 425 426 ASSERT(AS_WRITE_HELD(as, &as->a_lock)); 427 428 as->a_updatedir = 1; /* inform /proc */ 429 gethrestime(&as->a_updatetime); 430 431 if (as->a_lastgaphl != NULL) { 432 struct seg *hseg = NULL; 433 struct seg *lseg = NULL; 434 435 if (as->a_lastgaphl->s_base > newseg->s_base) { 436 hseg = as->a_lastgaphl; 437 lseg = AVL_PREV(&as->a_segtree, hseg); 438 } else { 439 lseg = as->a_lastgaphl; 440 hseg = AVL_NEXT(&as->a_segtree, lseg); 441 } 442 443 if (hseg && lseg && lseg->s_base < newseg->s_base && 444 hseg->s_base > newseg->s_base) { 445 avl_insert_here(&as->a_segtree, newseg, lseg, 446 AVL_AFTER); 447 as->a_lastgaphl = NULL; 448 as->a_seglast = newseg; 449 return (0); 450 } 451 as->a_lastgaphl = NULL; 452 } 453 454 addr = newseg->s_base; 455 eaddr = addr + newseg->s_size; 456 again: 457 458 seg = avl_find(&as->a_segtree, &addr, &where); 459 460 if (seg == NULL) 461 seg = avl_nearest(&as->a_segtree, where, AVL_AFTER); 462 463 if (seg == NULL) 464 seg = avl_last(&as->a_segtree); 465 466 if (seg != NULL) { 467 caddr_t base = seg->s_base; 468 469 /* 470 * If top of seg is below the requested address, then 471 * the insertion point is at the end of the linked list, 472 * and seg points to the tail of the list. Otherwise, 473 * the insertion point is immediately before seg. 474 */ 475 if (base + seg->s_size > addr) { 476 if (addr >= base || eaddr > base) { 477 #ifdef __sparc 478 extern struct seg_ops segnf_ops; 479 480 /* 481 * no-fault segs must disappear if overlaid. 482 * XXX need new segment type so 483 * we don't have to check s_ops 484 */ 485 if (seg->s_ops == &segnf_ops) { 486 seg_unmap(seg); 487 goto again; 488 } 489 #endif 490 return (-1); /* overlapping segment */ 491 } 492 } 493 } 494 as->a_seglast = newseg; 495 avl_insert(&as->a_segtree, newseg, where); 496 497 #ifdef VERIFY_SEGLIST 498 as_verify(as); 499 #endif 500 return (0); 501 } 502 503 struct seg * 504 as_removeseg(struct as *as, struct seg *seg) 505 { 506 avl_tree_t *t; 507 508 ASSERT(AS_WRITE_HELD(as, &as->a_lock)); 509 510 as->a_updatedir = 1; /* inform /proc */ 511 gethrestime(&as->a_updatetime); 512 513 if (seg == NULL) 514 return (NULL); 515 516 t = &as->a_segtree; 517 if (as->a_seglast == seg) 518 as->a_seglast = NULL; 519 as->a_lastgaphl = NULL; 520 521 /* 522 * if this segment is at an address higher than 523 * a_lastgap, set a_lastgap to the next segment (NULL if last segment) 524 */ 525 if (as->a_lastgap && 526 (seg == as->a_lastgap || seg->s_base > as->a_lastgap->s_base)) 527 as->a_lastgap = AVL_NEXT(t, seg); 528 529 /* 530 * remove the segment from the seg tree 531 */ 532 avl_remove(t, seg); 533 534 #ifdef VERIFY_SEGLIST 535 as_verify(as); 536 #endif 537 return (seg); 538 } 539 540 /* 541 * Find a segment containing addr. 542 */ 543 struct seg * 544 as_segat(struct as *as, caddr_t addr) 545 { 546 struct seg *seg = as->a_seglast; 547 548 ASSERT(AS_LOCK_HELD(as, &as->a_lock)); 549 550 if (seg != NULL && seg->s_base <= addr && 551 addr < seg->s_base + seg->s_size) 552 return (seg); 553 554 seg = avl_find(&as->a_segtree, &addr, NULL); 555 return (seg); 556 } 557 558 /* 559 * Serialize all searches for holes in an address space to 560 * prevent two or more threads from allocating the same virtual 561 * address range. The address space must not be "read/write" 562 * locked by the caller since we may block. 563 */ 564 void 565 as_rangelock(struct as *as) 566 { 567 mutex_enter(&as->a_contents); 568 while (AS_ISCLAIMGAP(as)) 569 cv_wait(&as->a_cv, &as->a_contents); 570 AS_SETCLAIMGAP(as); 571 mutex_exit(&as->a_contents); 572 } 573 574 /* 575 * Release hold on a_state & AS_CLAIMGAP and signal any other blocked threads. 576 */ 577 void 578 as_rangeunlock(struct as *as) 579 { 580 mutex_enter(&as->a_contents); 581 AS_CLRCLAIMGAP(as); 582 cv_signal(&as->a_cv); 583 mutex_exit(&as->a_contents); 584 } 585 586 /* 587 * compar segments (or just an address) by segment address range 588 */ 589 static int 590 as_segcompar(const void *x, const void *y) 591 { 592 struct seg *a = (struct seg *)x; 593 struct seg *b = (struct seg *)y; 594 595 if (a->s_base < b->s_base) 596 return (-1); 597 if (a->s_base >= b->s_base + b->s_size) 598 return (1); 599 return (0); 600 } 601 602 603 void 604 as_avlinit(struct as *as) 605 { 606 avl_create(&as->a_segtree, as_segcompar, sizeof (struct seg), 607 offsetof(struct seg, s_tree)); 608 avl_create(&as->a_wpage, wp_compare, sizeof (struct watched_page), 609 offsetof(struct watched_page, wp_link)); 610 } 611 612 /*ARGSUSED*/ 613 static int 614 as_constructor(void *buf, void *cdrarg, int kmflags) 615 { 616 struct as *as = buf; 617 618 mutex_init(&as->a_contents, NULL, MUTEX_DEFAULT, NULL); 619 cv_init(&as->a_cv, NULL, CV_DEFAULT, NULL); 620 rw_init(&as->a_lock, NULL, RW_DEFAULT, NULL); 621 as_avlinit(as); 622 return (0); 623 } 624 625 /*ARGSUSED1*/ 626 static void 627 as_destructor(void *buf, void *cdrarg) 628 { 629 struct as *as = buf; 630 631 avl_destroy(&as->a_segtree); 632 mutex_destroy(&as->a_contents); 633 cv_destroy(&as->a_cv); 634 rw_destroy(&as->a_lock); 635 } 636 637 void 638 as_init(void) 639 { 640 as_cache = kmem_cache_create("as_cache", sizeof (struct as), 0, 641 as_constructor, as_destructor, NULL, NULL, NULL, 0); 642 } 643 644 /* 645 * Allocate and initialize an address space data structure. 646 * We call hat_alloc to allow any machine dependent 647 * information in the hat structure to be initialized. 648 */ 649 struct as * 650 as_alloc(void) 651 { 652 struct as *as; 653 654 as = kmem_cache_alloc(as_cache, KM_SLEEP); 655 656 as->a_flags = 0; 657 as->a_vbits = 0; 658 as->a_hrm = NULL; 659 as->a_seglast = NULL; 660 as->a_size = 0; 661 as->a_updatedir = 0; 662 gethrestime(&as->a_updatetime); 663 as->a_objectdir = NULL; 664 as->a_sizedir = 0; 665 as->a_userlimit = (caddr_t)USERLIMIT; 666 as->a_lastgap = NULL; 667 as->a_lastgaphl = NULL; 668 as->a_callbacks = NULL; 669 670 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER); 671 as->a_hat = hat_alloc(as); /* create hat for default system mmu */ 672 AS_LOCK_EXIT(as, &as->a_lock); 673 674 as->a_xhat = NULL; 675 676 return (as); 677 } 678 679 /* 680 * Free an address space data structure. 681 * Need to free the hat first and then 682 * all the segments on this as and finally 683 * the space for the as struct itself. 684 */ 685 void 686 as_free(struct as *as) 687 { 688 struct hat *hat = as->a_hat; 689 struct seg *seg, *next; 690 int called = 0; 691 692 top: 693 /* 694 * Invoke ALL callbacks. as_do_callbacks will do one callback 695 * per call, and not return (-1) until the callback has completed. 696 * When as_do_callbacks returns zero, all callbacks have completed. 697 */ 698 mutex_enter(&as->a_contents); 699 while (as->a_callbacks && as_do_callbacks(as, AS_ALL_EVENT, 0, 0)); 700 701 /* This will prevent new XHATs from attaching to as */ 702 if (!called) 703 AS_SETBUSY(as); 704 mutex_exit(&as->a_contents); 705 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER); 706 707 if (!called) { 708 called = 1; 709 hat_free_start(hat); 710 if (as->a_xhat != NULL) 711 xhat_free_start_all(as); 712 } 713 for (seg = AS_SEGFIRST(as); seg != NULL; seg = next) { 714 int err; 715 716 next = AS_SEGNEXT(as, seg); 717 err = SEGOP_UNMAP(seg, seg->s_base, seg->s_size); 718 if (err == EAGAIN) { 719 mutex_enter(&as->a_contents); 720 if (as->a_callbacks) { 721 AS_LOCK_EXIT(as, &as->a_lock); 722 } else { 723 /* 724 * Memory is currently locked. Wait for a 725 * cv_signal that it has been unlocked, then 726 * try the operation again. 727 */ 728 if (AS_ISUNMAPWAIT(as) == 0) 729 cv_broadcast(&as->a_cv); 730 AS_SETUNMAPWAIT(as); 731 AS_LOCK_EXIT(as, &as->a_lock); 732 while (AS_ISUNMAPWAIT(as)) 733 cv_wait(&as->a_cv, &as->a_contents); 734 } 735 mutex_exit(&as->a_contents); 736 goto top; 737 } else { 738 /* 739 * We do not expect any other error return at this 740 * time. This is similar to an ASSERT in seg_unmap() 741 */ 742 ASSERT(err == 0); 743 } 744 } 745 hat_free_end(hat); 746 if (as->a_xhat != NULL) 747 xhat_free_end_all(as); 748 AS_LOCK_EXIT(as, &as->a_lock); 749 750 /* /proc stuff */ 751 ASSERT(avl_numnodes(&as->a_wpage) == 0); 752 if (as->a_objectdir) { 753 kmem_free(as->a_objectdir, as->a_sizedir * sizeof (vnode_t *)); 754 as->a_objectdir = NULL; 755 as->a_sizedir = 0; 756 } 757 758 /* 759 * Free the struct as back to kmem. Assert it has no segments. 760 */ 761 ASSERT(avl_numnodes(&as->a_segtree) == 0); 762 kmem_cache_free(as_cache, as); 763 } 764 765 int 766 as_dup(struct as *as, struct as **outas) 767 { 768 struct as *newas; 769 struct seg *seg, *newseg; 770 int error; 771 772 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER); 773 as_clearwatch(as); 774 newas = as_alloc(); 775 newas->a_userlimit = as->a_userlimit; 776 AS_LOCK_ENTER(newas, &newas->a_lock, RW_WRITER); 777 778 /* This will prevent new XHATs from attaching */ 779 mutex_enter(&as->a_contents); 780 AS_SETBUSY(as); 781 mutex_exit(&as->a_contents); 782 mutex_enter(&newas->a_contents); 783 AS_SETBUSY(newas); 784 mutex_exit(&newas->a_contents); 785 786 787 for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) { 788 789 if (seg->s_flags & S_PURGE) 790 continue; 791 792 newseg = seg_alloc(newas, seg->s_base, seg->s_size); 793 if (newseg == NULL) { 794 AS_LOCK_EXIT(newas, &newas->a_lock); 795 as_setwatch(as); 796 mutex_enter(&as->a_contents); 797 AS_CLRBUSY(as); 798 mutex_exit(&as->a_contents); 799 AS_LOCK_EXIT(as, &as->a_lock); 800 as_free(newas); 801 return (-1); 802 } 803 if ((error = SEGOP_DUP(seg, newseg)) != 0) { 804 /* 805 * We call seg_free() on the new seg 806 * because the segment is not set up 807 * completely; i.e. it has no ops. 808 */ 809 as_setwatch(as); 810 mutex_enter(&as->a_contents); 811 AS_CLRBUSY(as); 812 mutex_exit(&as->a_contents); 813 AS_LOCK_EXIT(as, &as->a_lock); 814 seg_free(newseg); 815 AS_LOCK_EXIT(newas, &newas->a_lock); 816 as_free(newas); 817 return (error); 818 } 819 newas->a_size += seg->s_size; 820 } 821 822 error = hat_dup(as->a_hat, newas->a_hat, NULL, 0, HAT_DUP_ALL); 823 if (as->a_xhat != NULL) 824 error |= xhat_dup_all(as, newas, NULL, 0, HAT_DUP_ALL); 825 826 mutex_enter(&newas->a_contents); 827 AS_CLRBUSY(newas); 828 mutex_exit(&newas->a_contents); 829 AS_LOCK_EXIT(newas, &newas->a_lock); 830 831 as_setwatch(as); 832 mutex_enter(&as->a_contents); 833 AS_CLRBUSY(as); 834 mutex_exit(&as->a_contents); 835 AS_LOCK_EXIT(as, &as->a_lock); 836 if (error != 0) { 837 as_free(newas); 838 return (error); 839 } 840 *outas = newas; 841 return (0); 842 } 843 844 /* 845 * Handle a ``fault'' at addr for size bytes. 846 */ 847 faultcode_t 848 as_fault(struct hat *hat, struct as *as, caddr_t addr, size_t size, 849 enum fault_type type, enum seg_rw rw) 850 { 851 struct seg *seg; 852 caddr_t raddr; /* rounded down addr */ 853 size_t rsize; /* rounded up size */ 854 size_t ssize; 855 faultcode_t res = 0; 856 caddr_t addrsav; 857 struct seg *segsav; 858 int as_lock_held; 859 klwp_t *lwp = ttolwp(curthread); 860 int is_xhat = 0; 861 int holding_wpage = 0; 862 extern struct seg_ops segdev_ops; 863 864 865 866 if (as->a_hat != hat) { 867 /* This must be an XHAT then */ 868 is_xhat = 1; 869 870 if ((type != F_INVAL) || (as == &kas)) 871 return (FC_NOSUPPORT); 872 } 873 874 retry: 875 if (!is_xhat) { 876 /* 877 * Indicate that the lwp is not to be stopped while waiting 878 * for a pagefault. This is to avoid deadlock while debugging 879 * a process via /proc over NFS (in particular). 880 */ 881 if (lwp != NULL) 882 lwp->lwp_nostop++; 883 884 /* 885 * same length must be used when we softlock and softunlock. 886 * We don't support softunlocking lengths less than 887 * the original length when there is largepage support. 888 * See seg_dev.c for more comments. 889 */ 890 switch (type) { 891 892 case F_SOFTLOCK: 893 CPU_STATS_ADD_K(vm, softlock, 1); 894 break; 895 896 case F_SOFTUNLOCK: 897 break; 898 899 case F_PROT: 900 CPU_STATS_ADD_K(vm, prot_fault, 1); 901 break; 902 903 case F_INVAL: 904 CPU_STATS_ENTER_K(); 905 CPU_STATS_ADDQ(CPU, vm, as_fault, 1); 906 if (as == &kas) 907 CPU_STATS_ADDQ(CPU, vm, kernel_asflt, 1); 908 CPU_STATS_EXIT_K(); 909 break; 910 } 911 } 912 913 /* Kernel probe */ 914 TNF_PROBE_3(address_fault, "vm pagefault", /* CSTYLED */, 915 tnf_opaque, address, addr, 916 tnf_fault_type, fault_type, type, 917 tnf_seg_access, access, rw); 918 919 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); 920 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) - 921 (size_t)raddr; 922 923 /* 924 * XXX -- Don't grab the as lock for segkmap. We should grab it for 925 * correctness, but then we could be stuck holding this lock for 926 * a LONG time if the fault needs to be resolved on a slow 927 * filesystem, and then no-one will be able to exec new commands, 928 * as exec'ing requires the write lock on the as. 929 */ 930 if (as == &kas && segkmap && segkmap->s_base <= raddr && 931 raddr + size < segkmap->s_base + segkmap->s_size) { 932 /* 933 * if (as==&kas), this can't be XHAT: we've already returned 934 * FC_NOSUPPORT. 935 */ 936 seg = segkmap; 937 as_lock_held = 0; 938 } else { 939 AS_LOCK_ENTER(as, &as->a_lock, RW_READER); 940 if (is_xhat && avl_numnodes(&as->a_wpage) != 0) { 941 /* 942 * Grab and hold the writers' lock on the as 943 * if the fault is to a watched page. 944 * This will keep CPUs from "peeking" at the 945 * address range while we're temporarily boosting 946 * the permissions for the XHAT device to 947 * resolve the fault in the segment layer. 948 * 949 * We could check whether faulted address 950 * is within a watched page and only then grab 951 * the writer lock, but this is simpler. 952 */ 953 AS_LOCK_EXIT(as, &as->a_lock); 954 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER); 955 } 956 957 seg = as_segat(as, raddr); 958 if (seg == NULL) { 959 AS_LOCK_EXIT(as, &as->a_lock); 960 if ((lwp != NULL) && (!is_xhat)) 961 lwp->lwp_nostop--; 962 return (FC_NOMAP); 963 } 964 965 as_lock_held = 1; 966 } 967 968 addrsav = raddr; 969 segsav = seg; 970 971 for (; rsize != 0; rsize -= ssize, raddr += ssize) { 972 if (raddr >= seg->s_base + seg->s_size) { 973 seg = AS_SEGNEXT(as, seg); 974 if (seg == NULL || raddr != seg->s_base) { 975 res = FC_NOMAP; 976 break; 977 } 978 } 979 if (raddr + rsize > seg->s_base + seg->s_size) 980 ssize = seg->s_base + seg->s_size - raddr; 981 else 982 ssize = rsize; 983 984 if (!is_xhat || (seg->s_ops != &segdev_ops)) { 985 986 if (is_xhat && avl_numnodes(&as->a_wpage) != 0 && 987 pr_is_watchpage_as(raddr, rw, as)) { 988 /* 989 * Handle watch pages. If we're faulting on a 990 * watched page from an X-hat, we have to 991 * restore the original permissions while we 992 * handle the fault. 993 */ 994 as_clearwatch(as); 995 holding_wpage = 1; 996 } 997 998 res = SEGOP_FAULT(hat, seg, raddr, ssize, type, rw); 999 1000 /* Restore watchpoints */ 1001 if (holding_wpage) { 1002 as_setwatch(as); 1003 holding_wpage = 0; 1004 } 1005 1006 if (res != 0) 1007 break; 1008 } else { 1009 /* XHAT does not support seg_dev */ 1010 res = FC_NOSUPPORT; 1011 break; 1012 } 1013 } 1014 1015 /* 1016 * If we were SOFTLOCKing and encountered a failure, 1017 * we must SOFTUNLOCK the range we already did. (Maybe we 1018 * should just panic if we are SOFTLOCKing or even SOFTUNLOCKing 1019 * right here...) 1020 */ 1021 if (res != 0 && type == F_SOFTLOCK) { 1022 for (seg = segsav; addrsav < raddr; addrsav += ssize) { 1023 if (addrsav >= seg->s_base + seg->s_size) 1024 seg = AS_SEGNEXT(as, seg); 1025 ASSERT(seg != NULL); 1026 /* 1027 * Now call the fault routine again to perform the 1028 * unlock using S_OTHER instead of the rw variable 1029 * since we never got a chance to touch the pages. 1030 */ 1031 if (raddr > seg->s_base + seg->s_size) 1032 ssize = seg->s_base + seg->s_size - addrsav; 1033 else 1034 ssize = raddr - addrsav; 1035 (void) SEGOP_FAULT(hat, seg, addrsav, ssize, 1036 F_SOFTUNLOCK, S_OTHER); 1037 } 1038 } 1039 if (as_lock_held) 1040 AS_LOCK_EXIT(as, &as->a_lock); 1041 if ((lwp != NULL) && (!is_xhat)) 1042 lwp->lwp_nostop--; 1043 1044 /* 1045 * If the lower levels returned EDEADLK for a fault, 1046 * It means that we should retry the fault. Let's wait 1047 * a bit also to let the deadlock causing condition clear. 1048 * This is part of a gross hack to work around a design flaw 1049 * in the ufs/sds logging code and should go away when the 1050 * logging code is re-designed to fix the problem. See bug 1051 * 4125102 for details of the problem. 1052 */ 1053 if (FC_ERRNO(res) == EDEADLK) { 1054 delay(deadlk_wait); 1055 res = 0; 1056 goto retry; 1057 } 1058 return (res); 1059 } 1060 1061 1062 1063 /* 1064 * Asynchronous ``fault'' at addr for size bytes. 1065 */ 1066 faultcode_t 1067 as_faulta(struct as *as, caddr_t addr, size_t size) 1068 { 1069 struct seg *seg; 1070 caddr_t raddr; /* rounded down addr */ 1071 size_t rsize; /* rounded up size */ 1072 faultcode_t res = 0; 1073 klwp_t *lwp = ttolwp(curthread); 1074 1075 retry: 1076 /* 1077 * Indicate that the lwp is not to be stopped while waiting 1078 * for a pagefault. This is to avoid deadlock while debugging 1079 * a process via /proc over NFS (in particular). 1080 */ 1081 if (lwp != NULL) 1082 lwp->lwp_nostop++; 1083 1084 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); 1085 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) - 1086 (size_t)raddr; 1087 1088 AS_LOCK_ENTER(as, &as->a_lock, RW_READER); 1089 seg = as_segat(as, raddr); 1090 if (seg == NULL) { 1091 AS_LOCK_EXIT(as, &as->a_lock); 1092 if (lwp != NULL) 1093 lwp->lwp_nostop--; 1094 return (FC_NOMAP); 1095 } 1096 1097 for (; rsize != 0; rsize -= PAGESIZE, raddr += PAGESIZE) { 1098 if (raddr >= seg->s_base + seg->s_size) { 1099 seg = AS_SEGNEXT(as, seg); 1100 if (seg == NULL || raddr != seg->s_base) { 1101 res = FC_NOMAP; 1102 break; 1103 } 1104 } 1105 res = SEGOP_FAULTA(seg, raddr); 1106 if (res != 0) 1107 break; 1108 } 1109 AS_LOCK_EXIT(as, &as->a_lock); 1110 if (lwp != NULL) 1111 lwp->lwp_nostop--; 1112 /* 1113 * If the lower levels returned EDEADLK for a fault, 1114 * It means that we should retry the fault. Let's wait 1115 * a bit also to let the deadlock causing condition clear. 1116 * This is part of a gross hack to work around a design flaw 1117 * in the ufs/sds logging code and should go away when the 1118 * logging code is re-designed to fix the problem. See bug 1119 * 4125102 for details of the problem. 1120 */ 1121 if (FC_ERRNO(res) == EDEADLK) { 1122 delay(deadlk_wait); 1123 res = 0; 1124 goto retry; 1125 } 1126 return (res); 1127 } 1128 1129 /* 1130 * Set the virtual mapping for the interval from [addr : addr + size) 1131 * in address space `as' to have the specified protection. 1132 * It is ok for the range to cross over several segments, 1133 * as long as they are contiguous. 1134 */ 1135 int 1136 as_setprot(struct as *as, caddr_t addr, size_t size, uint_t prot) 1137 { 1138 struct seg *seg; 1139 struct as_callback *cb; 1140 size_t ssize; 1141 caddr_t raddr; /* rounded down addr */ 1142 size_t rsize; /* rounded up size */ 1143 int error = 0, writer = 0; 1144 caddr_t saveraddr; 1145 size_t saversize; 1146 1147 setprot_top: 1148 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); 1149 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) - 1150 (size_t)raddr; 1151 1152 if (raddr + rsize < raddr) /* check for wraparound */ 1153 return (ENOMEM); 1154 1155 saveraddr = raddr; 1156 saversize = rsize; 1157 1158 /* 1159 * Normally we only lock the as as a reader. But 1160 * if due to setprot the segment driver needs to split 1161 * a segment it will return IE_RETRY. Therefore we re-aquire 1162 * the as lock as a writer so the segment driver can change 1163 * the seg list. Also the segment driver will return IE_RETRY 1164 * after it has changed the segment list so we therefore keep 1165 * locking as a writer. Since these opeartions should be rare 1166 * want to only lock as a writer when necessary. 1167 */ 1168 if (writer || avl_numnodes(&as->a_wpage) != 0) { 1169 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER); 1170 } else { 1171 AS_LOCK_ENTER(as, &as->a_lock, RW_READER); 1172 } 1173 1174 as_clearwatchprot(as, raddr, rsize); 1175 seg = as_segat(as, raddr); 1176 if (seg == NULL) { 1177 as_setwatch(as); 1178 AS_LOCK_EXIT(as, &as->a_lock); 1179 return (ENOMEM); 1180 } 1181 1182 for (; rsize != 0; rsize -= ssize, raddr += ssize) { 1183 if (raddr >= seg->s_base + seg->s_size) { 1184 seg = AS_SEGNEXT(as, seg); 1185 if (seg == NULL || raddr != seg->s_base) { 1186 error = ENOMEM; 1187 break; 1188 } 1189 } 1190 if ((raddr + rsize) > (seg->s_base + seg->s_size)) 1191 ssize = seg->s_base + seg->s_size - raddr; 1192 else 1193 ssize = rsize; 1194 error = SEGOP_SETPROT(seg, raddr, ssize, prot); 1195 1196 if (error == IE_NOMEM) { 1197 error = EAGAIN; 1198 break; 1199 } 1200 1201 if (error == IE_RETRY) { 1202 AS_LOCK_EXIT(as, &as->a_lock); 1203 writer = 1; 1204 goto setprot_top; 1205 } 1206 1207 if (error == EAGAIN) { 1208 /* 1209 * Make sure we have a_lock as writer. 1210 */ 1211 if (writer == 0) { 1212 AS_LOCK_EXIT(as, &as->a_lock); 1213 writer = 1; 1214 goto setprot_top; 1215 } 1216 1217 /* 1218 * Memory is currently locked. It must be unlocked 1219 * before this operation can succeed through a retry. 1220 * The possible reasons for locked memory and 1221 * corresponding strategies for unlocking are: 1222 * (1) Normal I/O 1223 * wait for a signal that the I/O operation 1224 * has completed and the memory is unlocked. 1225 * (2) Asynchronous I/O 1226 * The aio subsystem does not unlock pages when 1227 * the I/O is completed. Those pages are unlocked 1228 * when the application calls aiowait/aioerror. 1229 * So, to prevent blocking forever, cv_broadcast() 1230 * is done to wake up aio_cleanup_thread. 1231 * Subsequently, segvn_reclaim will be called, and 1232 * that will do AS_CLRUNMAPWAIT() and wake us up. 1233 * (3) Long term page locking: 1234 * Drivers intending to have pages locked for a 1235 * period considerably longer than for normal I/O 1236 * (essentially forever) may have registered for a 1237 * callback so they may unlock these pages on 1238 * request. This is needed to allow this operation 1239 * to succeed. Each entry on the callback list is 1240 * examined. If the event or address range pertains 1241 * the callback is invoked (unless it already is in 1242 * progress). The a_contents lock must be dropped 1243 * before the callback, so only one callback can 1244 * be done at a time. Go to the top and do more 1245 * until zero is returned. If zero is returned, 1246 * either there were no callbacks for this event 1247 * or they were already in progress. 1248 */ 1249 mutex_enter(&as->a_contents); 1250 if (as->a_callbacks && 1251 (cb = as_find_callback(as, AS_SETPROT_EVENT, 1252 seg->s_base, seg->s_size))) { 1253 AS_LOCK_EXIT(as, &as->a_lock); 1254 as_execute_callback(as, cb, AS_SETPROT_EVENT); 1255 } else { 1256 if (AS_ISUNMAPWAIT(as) == 0) 1257 cv_broadcast(&as->a_cv); 1258 AS_SETUNMAPWAIT(as); 1259 AS_LOCK_EXIT(as, &as->a_lock); 1260 while (AS_ISUNMAPWAIT(as)) 1261 cv_wait(&as->a_cv, &as->a_contents); 1262 } 1263 mutex_exit(&as->a_contents); 1264 goto setprot_top; 1265 } else if (error != 0) 1266 break; 1267 } 1268 if (error != 0) { 1269 as_setwatch(as); 1270 } else { 1271 as_setwatchprot(as, saveraddr, saversize, prot); 1272 } 1273 AS_LOCK_EXIT(as, &as->a_lock); 1274 return (error); 1275 } 1276 1277 /* 1278 * Check to make sure that the interval [addr, addr + size) 1279 * in address space `as' has at least the specified protection. 1280 * It is ok for the range to cross over several segments, as long 1281 * as they are contiguous. 1282 */ 1283 int 1284 as_checkprot(struct as *as, caddr_t addr, size_t size, uint_t prot) 1285 { 1286 struct seg *seg; 1287 size_t ssize; 1288 caddr_t raddr; /* rounded down addr */ 1289 size_t rsize; /* rounded up size */ 1290 int error = 0; 1291 1292 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); 1293 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) - 1294 (size_t)raddr; 1295 1296 if (raddr + rsize < raddr) /* check for wraparound */ 1297 return (ENOMEM); 1298 1299 /* 1300 * This is ugly as sin... 1301 * Normally, we only acquire the address space readers lock. 1302 * However, if the address space has watchpoints present, 1303 * we must acquire the writer lock on the address space for 1304 * the benefit of as_clearwatchprot() and as_setwatchprot(). 1305 */ 1306 if (avl_numnodes(&as->a_wpage) != 0) 1307 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER); 1308 else 1309 AS_LOCK_ENTER(as, &as->a_lock, RW_READER); 1310 as_clearwatchprot(as, raddr, rsize); 1311 seg = as_segat(as, raddr); 1312 if (seg == NULL) { 1313 as_setwatch(as); 1314 AS_LOCK_EXIT(as, &as->a_lock); 1315 return (ENOMEM); 1316 } 1317 1318 for (; rsize != 0; rsize -= ssize, raddr += ssize) { 1319 if (raddr >= seg->s_base + seg->s_size) { 1320 seg = AS_SEGNEXT(as, seg); 1321 if (seg == NULL || raddr != seg->s_base) { 1322 error = ENOMEM; 1323 break; 1324 } 1325 } 1326 if ((raddr + rsize) > (seg->s_base + seg->s_size)) 1327 ssize = seg->s_base + seg->s_size - raddr; 1328 else 1329 ssize = rsize; 1330 1331 error = SEGOP_CHECKPROT(seg, raddr, ssize, prot); 1332 if (error != 0) 1333 break; 1334 } 1335 as_setwatch(as); 1336 AS_LOCK_EXIT(as, &as->a_lock); 1337 return (error); 1338 } 1339 1340 int 1341 as_unmap(struct as *as, caddr_t addr, size_t size) 1342 { 1343 struct seg *seg, *seg_next; 1344 struct as_callback *cb; 1345 caddr_t raddr, eaddr; 1346 size_t ssize; 1347 int err; 1348 1349 top: 1350 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); 1351 eaddr = (caddr_t)(((uintptr_t)(addr + size) + PAGEOFFSET) & 1352 (uintptr_t)PAGEMASK); 1353 1354 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER); 1355 1356 as->a_updatedir = 1; /* inform /proc */ 1357 gethrestime(&as->a_updatetime); 1358 1359 /* 1360 * Use as_findseg to find the first segment in the range, then 1361 * step through the segments in order, following s_next. 1362 */ 1363 as_clearwatchprot(as, raddr, eaddr - raddr); 1364 1365 for (seg = as_findseg(as, raddr, 0); seg != NULL; seg = seg_next) { 1366 if (eaddr <= seg->s_base) 1367 break; /* eaddr was in a gap; all done */ 1368 1369 /* this is implied by the test above */ 1370 ASSERT(raddr < eaddr); 1371 1372 if (raddr < seg->s_base) 1373 raddr = seg->s_base; /* raddr was in a gap */ 1374 1375 if (eaddr > (seg->s_base + seg->s_size)) 1376 ssize = seg->s_base + seg->s_size - raddr; 1377 else 1378 ssize = eaddr - raddr; 1379 1380 /* 1381 * Save next segment pointer since seg can be 1382 * destroyed during the segment unmap operation. 1383 */ 1384 seg_next = AS_SEGNEXT(as, seg); 1385 1386 err = SEGOP_UNMAP(seg, raddr, ssize); 1387 if (err == EAGAIN) { 1388 /* 1389 * Memory is currently locked. It must be unlocked 1390 * before this operation can succeed through a retry. 1391 * The possible reasons for locked memory and 1392 * corresponding strategies for unlocking are: 1393 * (1) Normal I/O 1394 * wait for a signal that the I/O operation 1395 * has completed and the memory is unlocked. 1396 * (2) Asynchronous I/O 1397 * The aio subsystem does not unlock pages when 1398 * the I/O is completed. Those pages are unlocked 1399 * when the application calls aiowait/aioerror. 1400 * So, to prevent blocking forever, cv_broadcast() 1401 * is done to wake up aio_cleanup_thread. 1402 * Subsequently, segvn_reclaim will be called, and 1403 * that will do AS_CLRUNMAPWAIT() and wake us up. 1404 * (3) Long term page locking: 1405 * Drivers intending to have pages locked for a 1406 * period considerably longer than for normal I/O 1407 * (essentially forever) may have registered for a 1408 * callback so they may unlock these pages on 1409 * request. This is needed to allow this operation 1410 * to succeed. Each entry on the callback list is 1411 * examined. If the event or address range pertains 1412 * the callback is invoked (unless it already is in 1413 * progress). The a_contents lock must be dropped 1414 * before the callback, so only one callback can 1415 * be done at a time. Go to the top and do more 1416 * until zero is returned. If zero is returned, 1417 * either there were no callbacks for this event 1418 * or they were already in progress. 1419 */ 1420 as_setwatch(as); 1421 mutex_enter(&as->a_contents); 1422 if (as->a_callbacks && 1423 (cb = as_find_callback(as, AS_UNMAP_EVENT, 1424 seg->s_base, seg->s_size))) { 1425 AS_LOCK_EXIT(as, &as->a_lock); 1426 as_execute_callback(as, cb, AS_UNMAP_EVENT); 1427 } else { 1428 if (AS_ISUNMAPWAIT(as) == 0) 1429 cv_broadcast(&as->a_cv); 1430 AS_SETUNMAPWAIT(as); 1431 AS_LOCK_EXIT(as, &as->a_lock); 1432 while (AS_ISUNMAPWAIT(as)) 1433 cv_wait(&as->a_cv, &as->a_contents); 1434 } 1435 mutex_exit(&as->a_contents); 1436 goto top; 1437 } else if (err == IE_RETRY) { 1438 as_setwatch(as); 1439 AS_LOCK_EXIT(as, &as->a_lock); 1440 goto top; 1441 } else if (err) { 1442 as_setwatch(as); 1443 AS_LOCK_EXIT(as, &as->a_lock); 1444 return (-1); 1445 } 1446 1447 as->a_size -= ssize; 1448 raddr += ssize; 1449 } 1450 AS_LOCK_EXIT(as, &as->a_lock); 1451 return (0); 1452 } 1453 1454 static int 1455 as_map_segvn_segs(struct as *as, caddr_t addr, size_t size, uint_t szcvec, 1456 int (*crfp)(), struct segvn_crargs *vn_a, int *segcreated) 1457 { 1458 uint_t szc; 1459 uint_t nszc; 1460 int error; 1461 caddr_t a; 1462 caddr_t eaddr; 1463 size_t segsize; 1464 struct seg *seg; 1465 size_t pgsz; 1466 int do_off = (vn_a->vp != NULL || vn_a->amp != NULL); 1467 uint_t save_szcvec; 1468 1469 ASSERT(AS_WRITE_HELD(as, &as->a_lock)); 1470 ASSERT(IS_P2ALIGNED(addr, PAGESIZE)); 1471 ASSERT(IS_P2ALIGNED(size, PAGESIZE)); 1472 ASSERT(vn_a->vp == NULL || vn_a->amp == NULL); 1473 if (!do_off) { 1474 vn_a->offset = 0; 1475 } 1476 1477 if (szcvec <= 1) { 1478 seg = seg_alloc(as, addr, size); 1479 if (seg == NULL) { 1480 return (ENOMEM); 1481 } 1482 vn_a->szc = 0; 1483 error = (*crfp)(seg, vn_a); 1484 if (error != 0) { 1485 seg_free(seg); 1486 } 1487 return (error); 1488 } 1489 1490 eaddr = addr + size; 1491 save_szcvec = szcvec; 1492 szcvec >>= 1; 1493 szc = 0; 1494 nszc = 0; 1495 while (szcvec) { 1496 if ((szcvec & 0x1) == 0) { 1497 nszc++; 1498 szcvec >>= 1; 1499 continue; 1500 } 1501 nszc++; 1502 pgsz = page_get_pagesize(nszc); 1503 a = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz); 1504 if (a != addr) { 1505 ASSERT(a < eaddr); 1506 segsize = a - addr; 1507 seg = seg_alloc(as, addr, segsize); 1508 if (seg == NULL) { 1509 return (ENOMEM); 1510 } 1511 vn_a->szc = szc; 1512 error = (*crfp)(seg, vn_a); 1513 if (error != 0) { 1514 seg_free(seg); 1515 return (error); 1516 } 1517 *segcreated = 1; 1518 if (do_off) { 1519 vn_a->offset += segsize; 1520 } 1521 addr = a; 1522 } 1523 szc = nszc; 1524 szcvec >>= 1; 1525 } 1526 1527 ASSERT(addr < eaddr); 1528 szcvec = save_szcvec | 1; /* add 8K pages */ 1529 while (szcvec) { 1530 a = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz); 1531 ASSERT(a >= addr); 1532 if (a != addr) { 1533 segsize = a - addr; 1534 seg = seg_alloc(as, addr, segsize); 1535 if (seg == NULL) { 1536 return (ENOMEM); 1537 } 1538 vn_a->szc = szc; 1539 error = (*crfp)(seg, vn_a); 1540 if (error != 0) { 1541 seg_free(seg); 1542 return (error); 1543 } 1544 *segcreated = 1; 1545 if (do_off) { 1546 vn_a->offset += segsize; 1547 } 1548 addr = a; 1549 } 1550 szcvec &= ~(1 << szc); 1551 if (szcvec) { 1552 szc = highbit(szcvec) - 1; 1553 pgsz = page_get_pagesize(szc); 1554 } 1555 } 1556 ASSERT(addr == eaddr); 1557 1558 return (0); 1559 } 1560 1561 static int 1562 as_map_vnsegs(struct as *as, caddr_t addr, size_t size, 1563 int (*crfp)(), struct segvn_crargs *vn_a, int *segcreated) 1564 { 1565 uint_t mapflags = vn_a->flags & (MAP_TEXT | MAP_INITDATA); 1566 int type = (vn_a->type == MAP_SHARED) ? MAPPGSZC_SHM : MAPPGSZC_PRIVM; 1567 uint_t szcvec = map_pgszcvec(addr, size, (uintptr_t)addr, mapflags, 1568 type, 0); 1569 int error; 1570 struct seg *seg; 1571 struct vattr va; 1572 u_offset_t eoff; 1573 size_t save_size = 0; 1574 1575 ASSERT(AS_WRITE_HELD(as, &as->a_lock)); 1576 ASSERT(IS_P2ALIGNED(addr, PAGESIZE)); 1577 ASSERT(IS_P2ALIGNED(size, PAGESIZE)); 1578 ASSERT(vn_a->vp != NULL); 1579 ASSERT(vn_a->amp == NULL); 1580 1581 again: 1582 if (szcvec <= 1) { 1583 seg = seg_alloc(as, addr, size); 1584 if (seg == NULL) { 1585 return (ENOMEM); 1586 } 1587 vn_a->szc = 0; 1588 error = (*crfp)(seg, vn_a); 1589 if (error != 0) { 1590 seg_free(seg); 1591 } 1592 return (error); 1593 } 1594 1595 va.va_mask = AT_SIZE; 1596 if (VOP_GETATTR(vn_a->vp, &va, ATTR_HINT, vn_a->cred) != 0) { 1597 szcvec = 0; 1598 goto again; 1599 } 1600 eoff = vn_a->offset & PAGEMASK; 1601 if (eoff >= va.va_size) { 1602 szcvec = 0; 1603 goto again; 1604 } 1605 eoff += size; 1606 if (btopr(va.va_size) < btopr(eoff)) { 1607 save_size = size; 1608 size = va.va_size - (vn_a->offset & PAGEMASK); 1609 size = P2ROUNDUP_TYPED(size, PAGESIZE, size_t); 1610 szcvec = map_pgszcvec(addr, size, (uintptr_t)addr, mapflags, 1611 type, 0); 1612 if (szcvec <= 1) { 1613 size = save_size; 1614 goto again; 1615 } 1616 } 1617 1618 error = as_map_segvn_segs(as, addr, size, szcvec, crfp, vn_a, 1619 segcreated); 1620 if (error != 0) { 1621 return (error); 1622 } 1623 if (save_size) { 1624 addr += size; 1625 size = save_size - size; 1626 szcvec = 0; 1627 goto again; 1628 } 1629 return (0); 1630 } 1631 1632 /* 1633 * as_map_ansegs: shared or private anonymous memory. Note that the flags 1634 * passed to map_pgszvec cannot be MAP_INITDATA, for anon. 1635 */ 1636 static int 1637 as_map_ansegs(struct as *as, caddr_t addr, size_t size, 1638 int (*crfp)(), struct segvn_crargs *vn_a, int *segcreated) 1639 { 1640 uint_t szcvec; 1641 uchar_t type; 1642 1643 ASSERT(vn_a->type == MAP_SHARED || vn_a->type == MAP_PRIVATE); 1644 if (vn_a->type == MAP_SHARED) { 1645 type = MAPPGSZC_SHM; 1646 } else if (vn_a->type == MAP_PRIVATE) { 1647 if (vn_a->szc == AS_MAP_HEAP) { 1648 type = MAPPGSZC_HEAP; 1649 } else if (vn_a->szc == AS_MAP_STACK) { 1650 type = MAPPGSZC_STACK; 1651 } else { 1652 type = MAPPGSZC_PRIVM; 1653 } 1654 } 1655 szcvec = map_pgszcvec(addr, size, vn_a->amp == NULL ? 1656 (uintptr_t)addr : (uintptr_t)P2ROUNDUP(vn_a->offset, PAGESIZE), 1657 (vn_a->flags & MAP_TEXT), type, 0); 1658 ASSERT(AS_WRITE_HELD(as, &as->a_lock)); 1659 ASSERT(IS_P2ALIGNED(addr, PAGESIZE)); 1660 ASSERT(IS_P2ALIGNED(size, PAGESIZE)); 1661 ASSERT(vn_a->vp == NULL); 1662 1663 return (as_map_segvn_segs(as, addr, size, szcvec, 1664 crfp, vn_a, segcreated)); 1665 } 1666 1667 int 1668 as_map(struct as *as, caddr_t addr, size_t size, int (*crfp)(), void *argsp) 1669 { 1670 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER); 1671 return (as_map_locked(as, addr, size, crfp, argsp)); 1672 } 1673 1674 int 1675 as_map_locked(struct as *as, caddr_t addr, size_t size, int (*crfp)(), 1676 void *argsp) 1677 { 1678 struct seg *seg = NULL; 1679 caddr_t raddr; /* rounded down addr */ 1680 size_t rsize; /* rounded up size */ 1681 int error; 1682 int unmap = 0; 1683 struct proc *p = curproc; 1684 struct segvn_crargs crargs; 1685 1686 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); 1687 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) - 1688 (size_t)raddr; 1689 1690 /* 1691 * check for wrap around 1692 */ 1693 if ((raddr + rsize < raddr) || (as->a_size > (ULONG_MAX - size))) { 1694 AS_LOCK_EXIT(as, &as->a_lock); 1695 return (ENOMEM); 1696 } 1697 1698 as->a_updatedir = 1; /* inform /proc */ 1699 gethrestime(&as->a_updatetime); 1700 1701 if (as != &kas && as->a_size + rsize > (size_t)p->p_vmem_ctl) { 1702 AS_LOCK_EXIT(as, &as->a_lock); 1703 1704 (void) rctl_action(rctlproc_legacy[RLIMIT_VMEM], p->p_rctls, p, 1705 RCA_UNSAFE_ALL); 1706 1707 return (ENOMEM); 1708 } 1709 1710 if (AS_MAP_CHECK_VNODE_LPOOB(crfp, argsp)) { 1711 crargs = *(struct segvn_crargs *)argsp; 1712 error = as_map_vnsegs(as, raddr, rsize, crfp, &crargs, &unmap); 1713 if (error != 0) { 1714 AS_LOCK_EXIT(as, &as->a_lock); 1715 if (unmap) { 1716 (void) as_unmap(as, addr, size); 1717 } 1718 return (error); 1719 } 1720 } else if (AS_MAP_CHECK_ANON_LPOOB(crfp, argsp)) { 1721 crargs = *(struct segvn_crargs *)argsp; 1722 error = as_map_ansegs(as, raddr, rsize, crfp, &crargs, &unmap); 1723 if (error != 0) { 1724 AS_LOCK_EXIT(as, &as->a_lock); 1725 if (unmap) { 1726 (void) as_unmap(as, addr, size); 1727 } 1728 return (error); 1729 } 1730 } else { 1731 seg = seg_alloc(as, addr, size); 1732 if (seg == NULL) { 1733 AS_LOCK_EXIT(as, &as->a_lock); 1734 return (ENOMEM); 1735 } 1736 1737 error = (*crfp)(seg, argsp); 1738 if (error != 0) { 1739 seg_free(seg); 1740 AS_LOCK_EXIT(as, &as->a_lock); 1741 return (error); 1742 } 1743 } 1744 1745 /* 1746 * Add size now so as_unmap will work if as_ctl fails. 1747 */ 1748 as->a_size += rsize; 1749 1750 as_setwatch(as); 1751 1752 /* 1753 * If the address space is locked, 1754 * establish memory locks for the new segment. 1755 */ 1756 mutex_enter(&as->a_contents); 1757 if (AS_ISPGLCK(as)) { 1758 mutex_exit(&as->a_contents); 1759 AS_LOCK_EXIT(as, &as->a_lock); 1760 error = as_ctl(as, addr, size, MC_LOCK, 0, 0, NULL, 0); 1761 if (error != 0) 1762 (void) as_unmap(as, addr, size); 1763 } else { 1764 mutex_exit(&as->a_contents); 1765 AS_LOCK_EXIT(as, &as->a_lock); 1766 } 1767 return (error); 1768 } 1769 1770 1771 /* 1772 * Delete all segments in the address space marked with S_PURGE. 1773 * This is currently used for Sparc V9 nofault ASI segments (seg_nf.c). 1774 * These segments are deleted as a first step before calls to as_gap(), so 1775 * that they don't affect mmap() or shmat(). 1776 */ 1777 void 1778 as_purge(struct as *as) 1779 { 1780 struct seg *seg; 1781 struct seg *next_seg; 1782 1783 /* 1784 * the setting of NEEDSPURGE is protect by as_rangelock(), so 1785 * no need to grab a_contents mutex for this check 1786 */ 1787 if ((as->a_flags & AS_NEEDSPURGE) == 0) 1788 return; 1789 1790 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER); 1791 next_seg = NULL; 1792 seg = AS_SEGFIRST(as); 1793 while (seg != NULL) { 1794 next_seg = AS_SEGNEXT(as, seg); 1795 if (seg->s_flags & S_PURGE) 1796 SEGOP_UNMAP(seg, seg->s_base, seg->s_size); 1797 seg = next_seg; 1798 } 1799 AS_LOCK_EXIT(as, &as->a_lock); 1800 1801 mutex_enter(&as->a_contents); 1802 as->a_flags &= ~AS_NEEDSPURGE; 1803 mutex_exit(&as->a_contents); 1804 } 1805 1806 /* 1807 * Find a hole of at least size minlen within [base, base + len). 1808 * 1809 * If flags specifies AH_HI, the hole will have the highest possible address 1810 * in the range. We use the as->a_lastgap field to figure out where to 1811 * start looking for a gap. 1812 * 1813 * Otherwise, the gap will have the lowest possible address. 1814 * 1815 * If flags specifies AH_CONTAIN, the hole will contain the address addr. 1816 * 1817 * If an adequate hole is found, base and len are set to reflect the part of 1818 * the hole that is within range, and 0 is returned, otherwise, 1819 * -1 is returned. 1820 * 1821 * NOTE: This routine is not correct when base+len overflows caddr_t. 1822 */ 1823 int 1824 as_gap(struct as *as, size_t minlen, caddr_t *basep, size_t *lenp, uint_t flags, 1825 caddr_t addr) 1826 { 1827 caddr_t lobound = *basep; 1828 caddr_t hibound = lobound + *lenp; 1829 struct seg *lseg, *hseg; 1830 caddr_t lo, hi; 1831 int forward; 1832 caddr_t save_base; 1833 size_t save_len; 1834 1835 save_base = *basep; 1836 save_len = *lenp; 1837 AS_LOCK_ENTER(as, &as->a_lock, RW_READER); 1838 if (AS_SEGFIRST(as) == NULL) { 1839 if (valid_va_range(basep, lenp, minlen, flags & AH_DIR)) { 1840 AS_LOCK_EXIT(as, &as->a_lock); 1841 return (0); 1842 } else { 1843 AS_LOCK_EXIT(as, &as->a_lock); 1844 *basep = save_base; 1845 *lenp = save_len; 1846 return (-1); 1847 } 1848 } 1849 1850 /* 1851 * Set up to iterate over all the inter-segment holes in the given 1852 * direction. lseg is NULL for the lowest-addressed hole and hseg is 1853 * NULL for the highest-addressed hole. If moving backwards, we reset 1854 * sseg to denote the highest-addressed segment. 1855 */ 1856 forward = (flags & AH_DIR) == AH_LO; 1857 if (forward) { 1858 hseg = as_findseg(as, lobound, 1); 1859 lseg = AS_SEGPREV(as, hseg); 1860 } else { 1861 1862 /* 1863 * If allocating at least as much as the last allocation, 1864 * use a_lastgap's base as a better estimate of hibound. 1865 */ 1866 if (as->a_lastgap && 1867 minlen >= as->a_lastgap->s_size && 1868 hibound >= as->a_lastgap->s_base) 1869 hibound = as->a_lastgap->s_base; 1870 1871 hseg = as_findseg(as, hibound, 1); 1872 if (hseg->s_base + hseg->s_size < hibound) { 1873 lseg = hseg; 1874 hseg = NULL; 1875 } else { 1876 lseg = AS_SEGPREV(as, hseg); 1877 } 1878 } 1879 1880 for (;;) { 1881 /* 1882 * Set lo and hi to the hole's boundaries. (We should really 1883 * use MAXADDR in place of hibound in the expression below, 1884 * but can't express it easily; using hibound in its place is 1885 * harmless.) 1886 */ 1887 lo = (lseg == NULL) ? 0 : lseg->s_base + lseg->s_size; 1888 hi = (hseg == NULL) ? hibound : hseg->s_base; 1889 /* 1890 * If the iteration has moved past the interval from lobound 1891 * to hibound it's pointless to continue. 1892 */ 1893 if ((forward && lo > hibound) || (!forward && hi < lobound)) 1894 break; 1895 else if (lo > hibound || hi < lobound) 1896 goto cont; 1897 /* 1898 * Candidate hole lies at least partially within the allowable 1899 * range. Restrict it to fall completely within that range, 1900 * i.e., to [max(lo, lobound), min(hi, hibound)]. 1901 */ 1902 if (lo < lobound) 1903 lo = lobound; 1904 if (hi > hibound) 1905 hi = hibound; 1906 /* 1907 * Verify that the candidate hole is big enough and meets 1908 * hardware constraints. 1909 */ 1910 *basep = lo; 1911 *lenp = hi - lo; 1912 if (valid_va_range(basep, lenp, minlen, 1913 forward ? AH_LO : AH_HI) && 1914 ((flags & AH_CONTAIN) == 0 || 1915 (*basep <= addr && *basep + *lenp > addr))) { 1916 if (!forward) 1917 as->a_lastgap = hseg; 1918 if (hseg != NULL) 1919 as->a_lastgaphl = hseg; 1920 else 1921 as->a_lastgaphl = lseg; 1922 AS_LOCK_EXIT(as, &as->a_lock); 1923 return (0); 1924 } 1925 cont: 1926 /* 1927 * Move to the next hole. 1928 */ 1929 if (forward) { 1930 lseg = hseg; 1931 if (lseg == NULL) 1932 break; 1933 hseg = AS_SEGNEXT(as, hseg); 1934 } else { 1935 hseg = lseg; 1936 if (hseg == NULL) 1937 break; 1938 lseg = AS_SEGPREV(as, lseg); 1939 } 1940 } 1941 *basep = save_base; 1942 *lenp = save_len; 1943 AS_LOCK_EXIT(as, &as->a_lock); 1944 return (-1); 1945 } 1946 1947 /* 1948 * Return the next range within [base, base + len) that is backed 1949 * with "real memory". Skip holes and non-seg_vn segments. 1950 * We're lazy and only return one segment at a time. 1951 */ 1952 int 1953 as_memory(struct as *as, caddr_t *basep, size_t *lenp) 1954 { 1955 extern struct seg_ops segspt_shmops; /* needs a header file */ 1956 struct seg *seg; 1957 caddr_t addr, eaddr; 1958 caddr_t segend; 1959 1960 AS_LOCK_ENTER(as, &as->a_lock, RW_READER); 1961 1962 addr = *basep; 1963 eaddr = addr + *lenp; 1964 1965 seg = as_findseg(as, addr, 0); 1966 if (seg != NULL) 1967 addr = MAX(seg->s_base, addr); 1968 1969 for (;;) { 1970 if (seg == NULL || addr >= eaddr || eaddr <= seg->s_base) { 1971 AS_LOCK_EXIT(as, &as->a_lock); 1972 return (EINVAL); 1973 } 1974 1975 if (seg->s_ops == &segvn_ops) { 1976 segend = seg->s_base + seg->s_size; 1977 break; 1978 } 1979 1980 /* 1981 * We do ISM by looking into the private data 1982 * to determine the real size of the segment. 1983 */ 1984 if (seg->s_ops == &segspt_shmops) { 1985 segend = seg->s_base + spt_realsize(seg); 1986 if (addr < segend) 1987 break; 1988 } 1989 1990 seg = AS_SEGNEXT(as, seg); 1991 1992 if (seg != NULL) 1993 addr = seg->s_base; 1994 } 1995 1996 *basep = addr; 1997 1998 if (segend > eaddr) 1999 *lenp = eaddr - addr; 2000 else 2001 *lenp = segend - addr; 2002 2003 AS_LOCK_EXIT(as, &as->a_lock); 2004 return (0); 2005 } 2006 2007 /* 2008 * Swap the pages associated with the address space as out to 2009 * secondary storage, returning the number of bytes actually 2010 * swapped. 2011 * 2012 * The value returned is intended to correlate well with the process's 2013 * memory requirements. Its usefulness for this purpose depends on 2014 * how well the segment-level routines do at returning accurate 2015 * information. 2016 */ 2017 size_t 2018 as_swapout(struct as *as) 2019 { 2020 struct seg *seg; 2021 size_t swpcnt = 0; 2022 2023 /* 2024 * Kernel-only processes have given up their address 2025 * spaces. Of course, we shouldn't be attempting to 2026 * swap out such processes in the first place... 2027 */ 2028 if (as == NULL) 2029 return (0); 2030 2031 AS_LOCK_ENTER(as, &as->a_lock, RW_READER); 2032 2033 /* Prevent XHATs from attaching */ 2034 mutex_enter(&as->a_contents); 2035 AS_SETBUSY(as); 2036 mutex_exit(&as->a_contents); 2037 2038 2039 /* 2040 * Free all mapping resources associated with the address 2041 * space. The segment-level swapout routines capitalize 2042 * on this unmapping by scavanging pages that have become 2043 * unmapped here. 2044 */ 2045 hat_swapout(as->a_hat); 2046 if (as->a_xhat != NULL) 2047 xhat_swapout_all(as); 2048 2049 mutex_enter(&as->a_contents); 2050 AS_CLRBUSY(as); 2051 mutex_exit(&as->a_contents); 2052 2053 /* 2054 * Call the swapout routines of all segments in the address 2055 * space to do the actual work, accumulating the amount of 2056 * space reclaimed. 2057 */ 2058 for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) { 2059 struct seg_ops *ov = seg->s_ops; 2060 2061 /* 2062 * We have to check to see if the seg has 2063 * an ops vector because the seg may have 2064 * been in the middle of being set up when 2065 * the process was picked for swapout. 2066 */ 2067 if ((ov != NULL) && (ov->swapout != NULL)) 2068 swpcnt += SEGOP_SWAPOUT(seg); 2069 } 2070 AS_LOCK_EXIT(as, &as->a_lock); 2071 return (swpcnt); 2072 } 2073 2074 /* 2075 * Determine whether data from the mappings in interval [addr, addr + size) 2076 * are in the primary memory (core) cache. 2077 */ 2078 int 2079 as_incore(struct as *as, caddr_t addr, 2080 size_t size, char *vec, size_t *sizep) 2081 { 2082 struct seg *seg; 2083 size_t ssize; 2084 caddr_t raddr; /* rounded down addr */ 2085 size_t rsize; /* rounded up size */ 2086 size_t isize; /* iteration size */ 2087 int error = 0; /* result, assume success */ 2088 2089 *sizep = 0; 2090 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); 2091 rsize = ((((size_t)addr + size) + PAGEOFFSET) & PAGEMASK) - 2092 (size_t)raddr; 2093 2094 if (raddr + rsize < raddr) /* check for wraparound */ 2095 return (ENOMEM); 2096 2097 AS_LOCK_ENTER(as, &as->a_lock, RW_READER); 2098 seg = as_segat(as, raddr); 2099 if (seg == NULL) { 2100 AS_LOCK_EXIT(as, &as->a_lock); 2101 return (-1); 2102 } 2103 2104 for (; rsize != 0; rsize -= ssize, raddr += ssize) { 2105 if (raddr >= seg->s_base + seg->s_size) { 2106 seg = AS_SEGNEXT(as, seg); 2107 if (seg == NULL || raddr != seg->s_base) { 2108 error = -1; 2109 break; 2110 } 2111 } 2112 if ((raddr + rsize) > (seg->s_base + seg->s_size)) 2113 ssize = seg->s_base + seg->s_size - raddr; 2114 else 2115 ssize = rsize; 2116 *sizep += isize = SEGOP_INCORE(seg, raddr, ssize, vec); 2117 if (isize != ssize) { 2118 error = -1; 2119 break; 2120 } 2121 vec += btopr(ssize); 2122 } 2123 AS_LOCK_EXIT(as, &as->a_lock); 2124 return (error); 2125 } 2126 2127 static void 2128 as_segunlock(struct seg *seg, caddr_t addr, int attr, 2129 ulong_t *bitmap, size_t position, size_t npages) 2130 { 2131 caddr_t range_start; 2132 size_t pos1 = position; 2133 size_t pos2; 2134 size_t size; 2135 size_t end_pos = npages + position; 2136 2137 while (bt_range(bitmap, &pos1, &pos2, end_pos)) { 2138 size = ptob((pos2 - pos1)); 2139 range_start = (caddr_t)((uintptr_t)addr + 2140 ptob(pos1 - position)); 2141 2142 (void) SEGOP_LOCKOP(seg, range_start, size, attr, MC_UNLOCK, 2143 (ulong_t *)NULL, (size_t)NULL); 2144 pos1 = pos2; 2145 } 2146 } 2147 2148 static void 2149 as_unlockerr(struct as *as, int attr, ulong_t *mlock_map, 2150 caddr_t raddr, size_t rsize) 2151 { 2152 struct seg *seg = as_segat(as, raddr); 2153 size_t ssize; 2154 2155 while (rsize != 0) { 2156 if (raddr >= seg->s_base + seg->s_size) 2157 seg = AS_SEGNEXT(as, seg); 2158 2159 if ((raddr + rsize) > (seg->s_base + seg->s_size)) 2160 ssize = seg->s_base + seg->s_size - raddr; 2161 else 2162 ssize = rsize; 2163 2164 as_segunlock(seg, raddr, attr, mlock_map, 0, btopr(ssize)); 2165 2166 rsize -= ssize; 2167 raddr += ssize; 2168 } 2169 } 2170 2171 /* 2172 * Cache control operations over the interval [addr, addr + size) in 2173 * address space "as". 2174 */ 2175 /*ARGSUSED*/ 2176 int 2177 as_ctl(struct as *as, caddr_t addr, size_t size, int func, int attr, 2178 uintptr_t arg, ulong_t *lock_map, size_t pos) 2179 { 2180 struct seg *seg; /* working segment */ 2181 caddr_t raddr; /* rounded down addr */ 2182 caddr_t initraddr; /* saved initial rounded down addr */ 2183 size_t rsize; /* rounded up size */ 2184 size_t initrsize; /* saved initial rounded up size */ 2185 size_t ssize; /* size of seg */ 2186 int error = 0; /* result */ 2187 size_t mlock_size; /* size of bitmap */ 2188 ulong_t *mlock_map; /* pointer to bitmap used */ 2189 /* to represent the locked */ 2190 /* pages. */ 2191 retry: 2192 if (error == IE_RETRY) 2193 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER); 2194 else 2195 AS_LOCK_ENTER(as, &as->a_lock, RW_READER); 2196 2197 /* 2198 * If these are address space lock/unlock operations, loop over 2199 * all segments in the address space, as appropriate. 2200 */ 2201 if (func == MC_LOCKAS) { 2202 size_t npages, idx; 2203 size_t rlen = 0; /* rounded as length */ 2204 2205 idx = pos; 2206 2207 if (arg & MCL_FUTURE) { 2208 mutex_enter(&as->a_contents); 2209 AS_SETPGLCK(as); 2210 mutex_exit(&as->a_contents); 2211 } 2212 if ((arg & MCL_CURRENT) == 0) { 2213 AS_LOCK_EXIT(as, &as->a_lock); 2214 return (0); 2215 } 2216 2217 seg = AS_SEGFIRST(as); 2218 if (seg == NULL) { 2219 AS_LOCK_EXIT(as, &as->a_lock); 2220 return (0); 2221 } 2222 2223 do { 2224 raddr = (caddr_t)((uintptr_t)seg->s_base & 2225 (uintptr_t)PAGEMASK); 2226 rlen += (((uintptr_t)(seg->s_base + seg->s_size) + 2227 PAGEOFFSET) & PAGEMASK) - (uintptr_t)raddr; 2228 } while ((seg = AS_SEGNEXT(as, seg)) != NULL); 2229 2230 mlock_size = BT_BITOUL(btopr(rlen)); 2231 if ((mlock_map = (ulong_t *)kmem_zalloc(mlock_size * 2232 sizeof (ulong_t), KM_NOSLEEP)) == NULL) { 2233 AS_LOCK_EXIT(as, &as->a_lock); 2234 return (EAGAIN); 2235 } 2236 2237 for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) { 2238 error = SEGOP_LOCKOP(seg, seg->s_base, 2239 seg->s_size, attr, MC_LOCK, mlock_map, pos); 2240 if (error != 0) 2241 break; 2242 pos += seg_pages(seg); 2243 } 2244 2245 if (error) { 2246 for (seg = AS_SEGFIRST(as); seg != NULL; 2247 seg = AS_SEGNEXT(as, seg)) { 2248 2249 raddr = (caddr_t)((uintptr_t)seg->s_base & 2250 (uintptr_t)PAGEMASK); 2251 npages = seg_pages(seg); 2252 as_segunlock(seg, raddr, attr, mlock_map, 2253 idx, npages); 2254 idx += npages; 2255 } 2256 } 2257 2258 kmem_free(mlock_map, mlock_size * sizeof (ulong_t)); 2259 AS_LOCK_EXIT(as, &as->a_lock); 2260 goto lockerr; 2261 } else if (func == MC_UNLOCKAS) { 2262 mutex_enter(&as->a_contents); 2263 AS_CLRPGLCK(as); 2264 mutex_exit(&as->a_contents); 2265 2266 for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) { 2267 error = SEGOP_LOCKOP(seg, seg->s_base, 2268 seg->s_size, attr, MC_UNLOCK, NULL, 0); 2269 if (error != 0) 2270 break; 2271 } 2272 2273 AS_LOCK_EXIT(as, &as->a_lock); 2274 goto lockerr; 2275 } 2276 2277 /* 2278 * Normalize addresses and sizes. 2279 */ 2280 initraddr = raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); 2281 initrsize = rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) - 2282 (size_t)raddr; 2283 2284 if (raddr + rsize < raddr) { /* check for wraparound */ 2285 AS_LOCK_EXIT(as, &as->a_lock); 2286 return (ENOMEM); 2287 } 2288 2289 /* 2290 * Get initial segment. 2291 */ 2292 if ((seg = as_segat(as, raddr)) == NULL) { 2293 AS_LOCK_EXIT(as, &as->a_lock); 2294 return (ENOMEM); 2295 } 2296 2297 if (func == MC_LOCK) { 2298 mlock_size = BT_BITOUL(btopr(rsize)); 2299 if ((mlock_map = (ulong_t *)kmem_zalloc(mlock_size * 2300 sizeof (ulong_t), KM_NOSLEEP)) == NULL) { 2301 AS_LOCK_EXIT(as, &as->a_lock); 2302 return (EAGAIN); 2303 } 2304 } 2305 2306 /* 2307 * Loop over all segments. If a hole in the address range is 2308 * discovered, then fail. For each segment, perform the appropriate 2309 * control operation. 2310 */ 2311 while (rsize != 0) { 2312 2313 /* 2314 * Make sure there's no hole, calculate the portion 2315 * of the next segment to be operated over. 2316 */ 2317 if (raddr >= seg->s_base + seg->s_size) { 2318 seg = AS_SEGNEXT(as, seg); 2319 if (seg == NULL || raddr != seg->s_base) { 2320 if (func == MC_LOCK) { 2321 as_unlockerr(as, attr, mlock_map, 2322 initraddr, initrsize - rsize); 2323 kmem_free(mlock_map, 2324 mlock_size * sizeof (ulong_t)); 2325 } 2326 AS_LOCK_EXIT(as, &as->a_lock); 2327 return (ENOMEM); 2328 } 2329 } 2330 if ((raddr + rsize) > (seg->s_base + seg->s_size)) 2331 ssize = seg->s_base + seg->s_size - raddr; 2332 else 2333 ssize = rsize; 2334 2335 /* 2336 * Dispatch on specific function. 2337 */ 2338 switch (func) { 2339 2340 /* 2341 * Synchronize cached data from mappings with backing 2342 * objects. 2343 */ 2344 case MC_SYNC: 2345 if (error = SEGOP_SYNC(seg, raddr, ssize, 2346 attr, (uint_t)arg)) { 2347 AS_LOCK_EXIT(as, &as->a_lock); 2348 return (error); 2349 } 2350 break; 2351 2352 /* 2353 * Lock pages in memory. 2354 */ 2355 case MC_LOCK: 2356 if (error = SEGOP_LOCKOP(seg, raddr, ssize, 2357 attr, func, mlock_map, pos)) { 2358 as_unlockerr(as, attr, mlock_map, initraddr, 2359 initrsize - rsize + ssize); 2360 kmem_free(mlock_map, mlock_size * 2361 sizeof (ulong_t)); 2362 AS_LOCK_EXIT(as, &as->a_lock); 2363 goto lockerr; 2364 } 2365 break; 2366 2367 /* 2368 * Unlock mapped pages. 2369 */ 2370 case MC_UNLOCK: 2371 (void) SEGOP_LOCKOP(seg, raddr, ssize, attr, func, 2372 (ulong_t *)NULL, (size_t)NULL); 2373 break; 2374 2375 /* 2376 * Store VM advise for mapped pages in segment layer. 2377 */ 2378 case MC_ADVISE: 2379 error = SEGOP_ADVISE(seg, raddr, ssize, (uint_t)arg); 2380 2381 /* 2382 * Check for regular errors and special retry error 2383 */ 2384 if (error) { 2385 if (error == IE_RETRY) { 2386 /* 2387 * Need to acquire writers lock, so 2388 * have to drop readers lock and start 2389 * all over again 2390 */ 2391 AS_LOCK_EXIT(as, &as->a_lock); 2392 goto retry; 2393 } else if (error == IE_REATTACH) { 2394 /* 2395 * Find segment for current address 2396 * because current segment just got 2397 * split or concatenated 2398 */ 2399 seg = as_segat(as, raddr); 2400 if (seg == NULL) { 2401 AS_LOCK_EXIT(as, &as->a_lock); 2402 return (ENOMEM); 2403 } 2404 } else { 2405 /* 2406 * Regular error 2407 */ 2408 AS_LOCK_EXIT(as, &as->a_lock); 2409 return (error); 2410 } 2411 } 2412 break; 2413 2414 /* 2415 * Can't happen. 2416 */ 2417 default: 2418 panic("as_ctl: bad operation %d", func); 2419 /*NOTREACHED*/ 2420 } 2421 2422 rsize -= ssize; 2423 raddr += ssize; 2424 } 2425 2426 if (func == MC_LOCK) 2427 kmem_free(mlock_map, mlock_size * sizeof (ulong_t)); 2428 AS_LOCK_EXIT(as, &as->a_lock); 2429 return (0); 2430 lockerr: 2431 2432 /* 2433 * If the lower levels returned EDEADLK for a segment lockop, 2434 * it means that we should retry the operation. Let's wait 2435 * a bit also to let the deadlock causing condition clear. 2436 * This is part of a gross hack to work around a design flaw 2437 * in the ufs/sds logging code and should go away when the 2438 * logging code is re-designed to fix the problem. See bug 2439 * 4125102 for details of the problem. 2440 */ 2441 if (error == EDEADLK) { 2442 delay(deadlk_wait); 2443 error = 0; 2444 goto retry; 2445 } 2446 return (error); 2447 } 2448 2449 /* 2450 * Special code for exec to move the stack segment from its interim 2451 * place in the old address to the right place in the new address space. 2452 */ 2453 /*ARGSUSED*/ 2454 int 2455 as_exec(struct as *oas, caddr_t ostka, size_t stksz, 2456 struct as *nas, caddr_t nstka, uint_t hatflag) 2457 { 2458 struct seg *stkseg; 2459 2460 AS_LOCK_ENTER(oas, &oas->a_lock, RW_WRITER); 2461 stkseg = as_segat(oas, ostka); 2462 stkseg = as_removeseg(oas, stkseg); 2463 ASSERT(stkseg != NULL); 2464 ASSERT(stkseg->s_base == ostka && stkseg->s_size == stksz); 2465 stkseg->s_as = nas; 2466 stkseg->s_base = nstka; 2467 2468 /* 2469 * It's ok to lock the address space we are about to exec to. 2470 */ 2471 AS_LOCK_ENTER(nas, &nas->a_lock, RW_WRITER); 2472 ASSERT(avl_numnodes(&nas->a_wpage) == 0); 2473 nas->a_size += stkseg->s_size; 2474 oas->a_size -= stkseg->s_size; 2475 (void) as_addseg(nas, stkseg); 2476 AS_LOCK_EXIT(nas, &nas->a_lock); 2477 AS_LOCK_EXIT(oas, &oas->a_lock); 2478 return (0); 2479 } 2480 2481 static int 2482 f_decode(faultcode_t fault_err) 2483 { 2484 int error = 0; 2485 2486 switch (FC_CODE(fault_err)) { 2487 case FC_OBJERR: 2488 error = FC_ERRNO(fault_err); 2489 break; 2490 case FC_PROT: 2491 error = EACCES; 2492 break; 2493 default: 2494 error = EFAULT; 2495 break; 2496 } 2497 return (error); 2498 } 2499 2500 /* 2501 * lock pages in a given address space. Return shadow list. If 2502 * the list is NULL, the MMU mapping is also locked. 2503 */ 2504 int 2505 as_pagelock(struct as *as, struct page ***ppp, caddr_t addr, 2506 size_t size, enum seg_rw rw) 2507 { 2508 size_t rsize; 2509 caddr_t base; 2510 caddr_t raddr; 2511 faultcode_t fault_err; 2512 struct seg *seg; 2513 int res; 2514 int prefaulted = 0; 2515 2516 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_AS_LOCK_START, 2517 "as_pagelock_start: addr %p size %ld", addr, size); 2518 2519 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); 2520 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) - 2521 (size_t)raddr; 2522 top: 2523 /* 2524 * if the request crosses two segments let 2525 * as_fault handle it. 2526 */ 2527 AS_LOCK_ENTER(as, &as->a_lock, RW_READER); 2528 seg = as_findseg(as, addr, 0); 2529 if ((seg == NULL) || ((base = seg->s_base) > addr) || 2530 (addr + size) > base + seg->s_size) { 2531 AS_LOCK_EXIT(as, &as->a_lock); 2532 goto slow; 2533 } 2534 2535 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEG_LOCK_START, 2536 "seg_lock_1_start: raddr %p rsize %ld", raddr, rsize); 2537 2538 /* 2539 * try to lock pages and pass back shadow list 2540 */ 2541 res = SEGOP_PAGELOCK(seg, raddr, rsize, ppp, L_PAGELOCK, rw); 2542 2543 TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_SEG_LOCK_END, "seg_lock_1_end"); 2544 AS_LOCK_EXIT(as, &as->a_lock); 2545 if (res == 0) { 2546 return (0); 2547 } else if (res == ENOTSUP || prefaulted) { 2548 /* 2549 * (1) segment driver doesn't support PAGELOCK fastpath, or 2550 * (2) we've already tried fast path unsuccessfully after 2551 * faulting in the addr range below; system might be 2552 * thrashing or there may not be enough availrmem. 2553 */ 2554 goto slow; 2555 } 2556 2557 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_AS_FAULT_START, 2558 "as_fault_start: addr %p size %ld", addr, size); 2559 2560 /* 2561 * we might get here because of some COW fault or non 2562 * existing page. Let as_fault deal with it. Just load 2563 * the page, don't lock the MMU mapping. 2564 */ 2565 fault_err = as_fault(as->a_hat, as, addr, size, F_INVAL, rw); 2566 if (fault_err != 0) { 2567 return (f_decode(fault_err)); 2568 } 2569 2570 prefaulted = 1; 2571 2572 /* 2573 * try fast path again; since we've dropped a_lock, 2574 * we need to try the dance from the start to see if 2575 * the addr range is still valid. 2576 */ 2577 goto top; 2578 slow: 2579 /* 2580 * load the page and lock the MMU mapping. 2581 */ 2582 fault_err = as_fault(as->a_hat, as, addr, size, F_SOFTLOCK, rw); 2583 if (fault_err != 0) { 2584 return (f_decode(fault_err)); 2585 } 2586 *ppp = NULL; 2587 2588 TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_AS_LOCK_END, "as_pagelock_end"); 2589 return (0); 2590 } 2591 2592 /* 2593 * unlock pages in a given address range 2594 */ 2595 void 2596 as_pageunlock(struct as *as, struct page **pp, caddr_t addr, size_t size, 2597 enum seg_rw rw) 2598 { 2599 struct seg *seg; 2600 size_t rsize; 2601 caddr_t raddr; 2602 2603 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_AS_UNLOCK_START, 2604 "as_pageunlock_start: addr %p size %ld", addr, size); 2605 2606 /* 2607 * if the shadow list is NULL, as_pagelock was 2608 * falling back to as_fault 2609 */ 2610 if (pp == NULL) { 2611 (void) as_fault(as->a_hat, as, addr, size, F_SOFTUNLOCK, rw); 2612 return; 2613 } 2614 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); 2615 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) - 2616 (size_t)raddr; 2617 AS_LOCK_ENTER(as, &as->a_lock, RW_READER); 2618 seg = as_findseg(as, addr, 0); 2619 ASSERT(seg); 2620 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEG_UNLOCK_START, 2621 "seg_unlock_start: raddr %p rsize %ld", raddr, rsize); 2622 SEGOP_PAGELOCK(seg, raddr, rsize, &pp, L_PAGEUNLOCK, rw); 2623 AS_LOCK_EXIT(as, &as->a_lock); 2624 TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_AS_UNLOCK_END, "as_pageunlock_end"); 2625 } 2626 2627 /* 2628 * reclaim cached pages in a given address range 2629 */ 2630 void 2631 as_pagereclaim(struct as *as, struct page **pp, caddr_t addr, 2632 size_t size, enum seg_rw rw) 2633 { 2634 struct seg *seg; 2635 size_t rsize; 2636 caddr_t raddr; 2637 2638 ASSERT(AS_READ_HELD(as, &as->a_lock)); 2639 ASSERT(pp != NULL); 2640 2641 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); 2642 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) - 2643 (size_t)raddr; 2644 seg = as_findseg(as, addr, 0); 2645 ASSERT(seg); 2646 SEGOP_PAGELOCK(seg, raddr, rsize, &pp, L_PAGERECLAIM, rw); 2647 } 2648 2649 #define MAXPAGEFLIP 4 2650 #define MAXPAGEFLIPSIZ MAXPAGEFLIP*PAGESIZE 2651 2652 int 2653 as_setpagesize(struct as *as, caddr_t addr, size_t size, uint_t szc, 2654 boolean_t wait) 2655 { 2656 struct seg *seg; 2657 size_t ssize; 2658 caddr_t raddr; /* rounded down addr */ 2659 size_t rsize; /* rounded up size */ 2660 int error = 0; 2661 size_t pgsz = page_get_pagesize(szc); 2662 2663 setpgsz_top: 2664 if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(size, pgsz)) { 2665 return (EINVAL); 2666 } 2667 2668 raddr = addr; 2669 rsize = size; 2670 2671 if (raddr + rsize < raddr) /* check for wraparound */ 2672 return (ENOMEM); 2673 2674 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER); 2675 as_clearwatchprot(as, raddr, rsize); 2676 seg = as_segat(as, raddr); 2677 if (seg == NULL) { 2678 as_setwatch(as); 2679 AS_LOCK_EXIT(as, &as->a_lock); 2680 return (ENOMEM); 2681 } 2682 2683 for (; rsize != 0; rsize -= ssize, raddr += ssize) { 2684 if (raddr >= seg->s_base + seg->s_size) { 2685 seg = AS_SEGNEXT(as, seg); 2686 if (seg == NULL || raddr != seg->s_base) { 2687 error = ENOMEM; 2688 break; 2689 } 2690 } 2691 if ((raddr + rsize) > (seg->s_base + seg->s_size)) { 2692 ssize = seg->s_base + seg->s_size - raddr; 2693 } else { 2694 ssize = rsize; 2695 } 2696 2697 error = SEGOP_SETPAGESIZE(seg, raddr, ssize, szc); 2698 2699 if (error == IE_NOMEM) { 2700 error = EAGAIN; 2701 break; 2702 } 2703 2704 if (error == IE_RETRY) { 2705 AS_LOCK_EXIT(as, &as->a_lock); 2706 goto setpgsz_top; 2707 } 2708 2709 if (error == ENOTSUP) { 2710 error = EINVAL; 2711 break; 2712 } 2713 2714 if (wait && (error == EAGAIN)) { 2715 /* 2716 * Memory is currently locked. It must be unlocked 2717 * before this operation can succeed through a retry. 2718 * The possible reasons for locked memory and 2719 * corresponding strategies for unlocking are: 2720 * (1) Normal I/O 2721 * wait for a signal that the I/O operation 2722 * has completed and the memory is unlocked. 2723 * (2) Asynchronous I/O 2724 * The aio subsystem does not unlock pages when 2725 * the I/O is completed. Those pages are unlocked 2726 * when the application calls aiowait/aioerror. 2727 * So, to prevent blocking forever, cv_broadcast() 2728 * is done to wake up aio_cleanup_thread. 2729 * Subsequently, segvn_reclaim will be called, and 2730 * that will do AS_CLRUNMAPWAIT() and wake us up. 2731 * (3) Long term page locking: 2732 * This is not relevant for as_setpagesize() 2733 * because we cannot change the page size for 2734 * driver memory. The attempt to do so will 2735 * fail with a different error than EAGAIN so 2736 * there's no need to trigger as callbacks like 2737 * as_unmap, as_setprot or as_free would do. 2738 */ 2739 mutex_enter(&as->a_contents); 2740 if (AS_ISUNMAPWAIT(as) == 0) { 2741 cv_broadcast(&as->a_cv); 2742 } 2743 AS_SETUNMAPWAIT(as); 2744 AS_LOCK_EXIT(as, &as->a_lock); 2745 while (AS_ISUNMAPWAIT(as)) { 2746 cv_wait(&as->a_cv, &as->a_contents); 2747 } 2748 mutex_exit(&as->a_contents); 2749 goto setpgsz_top; 2750 } else if (error != 0) { 2751 break; 2752 } 2753 } 2754 as_setwatch(as); 2755 AS_LOCK_EXIT(as, &as->a_lock); 2756 return (error); 2757 } 2758 2759 /* 2760 * as_iset3_default_lpsize() just calls SEGOP_SETPAGESIZE() on all segments 2761 * in its chunk where s_szc is less than the szc we want to set. 2762 */ 2763 static int 2764 as_iset3_default_lpsize(struct as *as, caddr_t raddr, size_t rsize, uint_t szc, 2765 int *retry) 2766 { 2767 struct seg *seg; 2768 size_t ssize; 2769 int error; 2770 2771 seg = as_segat(as, raddr); 2772 if (seg == NULL) { 2773 panic("as_iset3_default_lpsize: no seg"); 2774 } 2775 2776 for (; rsize != 0; rsize -= ssize, raddr += ssize) { 2777 if (raddr >= seg->s_base + seg->s_size) { 2778 seg = AS_SEGNEXT(as, seg); 2779 if (seg == NULL || raddr != seg->s_base) { 2780 panic("as_iset3_default_lpsize: as changed"); 2781 } 2782 } 2783 if ((raddr + rsize) > (seg->s_base + seg->s_size)) { 2784 ssize = seg->s_base + seg->s_size - raddr; 2785 } else { 2786 ssize = rsize; 2787 } 2788 2789 if (szc > seg->s_szc) { 2790 error = SEGOP_SETPAGESIZE(seg, raddr, ssize, szc); 2791 /* Only retry on EINVAL segments that have no vnode. */ 2792 if (error == EINVAL) { 2793 vnode_t *vp = NULL; 2794 if ((SEGOP_GETTYPE(seg, raddr) & MAP_SHARED) && 2795 (SEGOP_GETVP(seg, raddr, &vp) != 0 || 2796 vp == NULL)) { 2797 *retry = 1; 2798 } else { 2799 *retry = 0; 2800 } 2801 } 2802 if (error) { 2803 return (error); 2804 } 2805 } 2806 } 2807 return (0); 2808 } 2809 2810 /* 2811 * as_iset2_default_lpsize() calls as_iset3_default_lpsize() to set the 2812 * pagesize on each segment in its range, but if any fails with EINVAL, 2813 * then it reduces the pagesizes to the next size in the bitmap and 2814 * retries as_iset3_default_lpsize(). The reason why the code retries 2815 * smaller allowed sizes on EINVAL is because (a) the anon offset may not 2816 * match the bigger sizes, and (b) it's hard to get this offset (to begin 2817 * with) to pass to map_pgszcvec(). 2818 */ 2819 static int 2820 as_iset2_default_lpsize(struct as *as, caddr_t addr, size_t size, uint_t szc, 2821 uint_t szcvec) 2822 { 2823 int error; 2824 int retry; 2825 2826 for (;;) { 2827 error = as_iset3_default_lpsize(as, addr, size, szc, &retry); 2828 if (error == EINVAL && retry) { 2829 szcvec &= ~(1 << szc); 2830 if (szcvec <= 1) { 2831 return (EINVAL); 2832 } 2833 szc = highbit(szcvec) - 1; 2834 } else { 2835 return (error); 2836 } 2837 } 2838 } 2839 2840 /* 2841 * as_iset1_default_lpsize() breaks its chunk into areas where existing 2842 * segments have a smaller szc than we want to set. For each such area, 2843 * it calls as_iset2_default_lpsize() 2844 */ 2845 static int 2846 as_iset1_default_lpsize(struct as *as, caddr_t raddr, size_t rsize, uint_t szc, 2847 uint_t szcvec) 2848 { 2849 struct seg *seg; 2850 size_t ssize; 2851 caddr_t setaddr = raddr; 2852 size_t setsize = 0; 2853 int set; 2854 int error; 2855 2856 ASSERT(AS_WRITE_HELD(as, &as->a_lock)); 2857 2858 seg = as_segat(as, raddr); 2859 if (seg == NULL) { 2860 panic("as_iset1_default_lpsize: no seg"); 2861 } 2862 if (seg->s_szc < szc) { 2863 set = 1; 2864 } else { 2865 set = 0; 2866 } 2867 2868 for (; rsize != 0; rsize -= ssize, raddr += ssize, setsize += ssize) { 2869 if (raddr >= seg->s_base + seg->s_size) { 2870 seg = AS_SEGNEXT(as, seg); 2871 if (seg == NULL || raddr != seg->s_base) { 2872 panic("as_iset1_default_lpsize: as changed"); 2873 } 2874 if (seg->s_szc >= szc && set) { 2875 ASSERT(setsize != 0); 2876 error = as_iset2_default_lpsize(as, 2877 setaddr, setsize, szc, szcvec); 2878 if (error) { 2879 return (error); 2880 } 2881 set = 0; 2882 } else if (seg->s_szc < szc && !set) { 2883 setaddr = raddr; 2884 setsize = 0; 2885 set = 1; 2886 } 2887 } 2888 if ((raddr + rsize) > (seg->s_base + seg->s_size)) { 2889 ssize = seg->s_base + seg->s_size - raddr; 2890 } else { 2891 ssize = rsize; 2892 } 2893 } 2894 error = 0; 2895 if (set) { 2896 ASSERT(setsize != 0); 2897 error = as_iset2_default_lpsize(as, setaddr, setsize, 2898 szc, szcvec); 2899 } 2900 return (error); 2901 } 2902 2903 /* 2904 * as_iset_default_lpsize() breaks its chunk according to the size code bitmap 2905 * returned by map_pgszcvec() (similar to as_map_segvn_segs()), and passes each 2906 * chunk to as_iset1_default_lpsize(). 2907 */ 2908 static int 2909 as_iset_default_lpsize(struct as *as, caddr_t addr, size_t size, int flags, 2910 int type) 2911 { 2912 int rtype = (type & MAP_SHARED) ? MAPPGSZC_SHM : MAPPGSZC_PRIVM; 2913 uint_t szcvec = map_pgszcvec(addr, size, (uintptr_t)addr, 2914 flags, rtype, 1); 2915 uint_t szc; 2916 uint_t nszc; 2917 int error; 2918 caddr_t a; 2919 caddr_t eaddr; 2920 size_t segsize; 2921 size_t pgsz; 2922 uint_t save_szcvec; 2923 2924 ASSERT(AS_WRITE_HELD(as, &as->a_lock)); 2925 ASSERT(IS_P2ALIGNED(addr, PAGESIZE)); 2926 ASSERT(IS_P2ALIGNED(size, PAGESIZE)); 2927 2928 szcvec &= ~1; 2929 if (szcvec <= 1) { /* skip if base page size */ 2930 return (0); 2931 } 2932 2933 /* Get the pagesize of the first larger page size. */ 2934 szc = lowbit(szcvec) - 1; 2935 pgsz = page_get_pagesize(szc); 2936 eaddr = addr + size; 2937 addr = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz); 2938 eaddr = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz); 2939 2940 save_szcvec = szcvec; 2941 szcvec >>= (szc + 1); 2942 nszc = szc; 2943 while (szcvec) { 2944 if ((szcvec & 0x1) == 0) { 2945 nszc++; 2946 szcvec >>= 1; 2947 continue; 2948 } 2949 nszc++; 2950 pgsz = page_get_pagesize(nszc); 2951 a = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz); 2952 if (a != addr) { 2953 ASSERT(szc > 0); 2954 ASSERT(a < eaddr); 2955 segsize = a - addr; 2956 error = as_iset1_default_lpsize(as, addr, segsize, szc, 2957 save_szcvec); 2958 if (error) { 2959 return (error); 2960 } 2961 addr = a; 2962 } 2963 szc = nszc; 2964 szcvec >>= 1; 2965 } 2966 2967 ASSERT(addr < eaddr); 2968 szcvec = save_szcvec; 2969 while (szcvec) { 2970 a = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz); 2971 ASSERT(a >= addr); 2972 if (a != addr) { 2973 ASSERT(szc > 0); 2974 segsize = a - addr; 2975 error = as_iset1_default_lpsize(as, addr, segsize, szc, 2976 save_szcvec); 2977 if (error) { 2978 return (error); 2979 } 2980 addr = a; 2981 } 2982 szcvec &= ~(1 << szc); 2983 if (szcvec) { 2984 szc = highbit(szcvec) - 1; 2985 pgsz = page_get_pagesize(szc); 2986 } 2987 } 2988 ASSERT(addr == eaddr); 2989 2990 return (0); 2991 } 2992 2993 /* 2994 * Set the default large page size for the range. Called via memcntl with 2995 * page size set to 0. as_set_default_lpsize breaks the range down into 2996 * chunks with the same type/flags, ignores-non segvn segments, and passes 2997 * each chunk to as_iset_default_lpsize(). 2998 */ 2999 int 3000 as_set_default_lpsize(struct as *as, caddr_t addr, size_t size) 3001 { 3002 struct seg *seg; 3003 caddr_t raddr; 3004 size_t rsize; 3005 size_t ssize; 3006 int rtype, rflags; 3007 int stype, sflags; 3008 int error; 3009 caddr_t setaddr; 3010 size_t setsize; 3011 int segvn; 3012 3013 if (size == 0) 3014 return (0); 3015 3016 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER); 3017 again: 3018 error = 0; 3019 3020 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); 3021 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) - 3022 (size_t)raddr; 3023 3024 if (raddr + rsize < raddr) { /* check for wraparound */ 3025 AS_LOCK_EXIT(as, &as->a_lock); 3026 return (ENOMEM); 3027 } 3028 as_clearwatchprot(as, raddr, rsize); 3029 seg = as_segat(as, raddr); 3030 if (seg == NULL) { 3031 as_setwatch(as); 3032 AS_LOCK_EXIT(as, &as->a_lock); 3033 return (ENOMEM); 3034 } 3035 if (seg->s_ops == &segvn_ops) { 3036 rtype = SEGOP_GETTYPE(seg, addr); 3037 rflags = rtype & (MAP_TEXT | MAP_INITDATA); 3038 rtype = rtype & (MAP_SHARED | MAP_PRIVATE); 3039 segvn = 1; 3040 } else { 3041 segvn = 0; 3042 } 3043 setaddr = raddr; 3044 setsize = 0; 3045 3046 for (; rsize != 0; rsize -= ssize, raddr += ssize, setsize += ssize) { 3047 if (raddr >= (seg->s_base + seg->s_size)) { 3048 seg = AS_SEGNEXT(as, seg); 3049 if (seg == NULL || raddr != seg->s_base) { 3050 error = ENOMEM; 3051 break; 3052 } 3053 if (seg->s_ops == &segvn_ops) { 3054 stype = SEGOP_GETTYPE(seg, raddr); 3055 sflags = stype & (MAP_TEXT | MAP_INITDATA); 3056 stype &= (MAP_SHARED | MAP_PRIVATE); 3057 if (segvn && (rflags != sflags || 3058 rtype != stype)) { 3059 /* 3060 * The next segment is also segvn but 3061 * has different flags and/or type. 3062 */ 3063 ASSERT(setsize != 0); 3064 error = as_iset_default_lpsize(as, 3065 setaddr, setsize, rflags, rtype); 3066 if (error) { 3067 break; 3068 } 3069 rflags = sflags; 3070 rtype = stype; 3071 setaddr = raddr; 3072 setsize = 0; 3073 } else if (!segvn) { 3074 rflags = sflags; 3075 rtype = stype; 3076 setaddr = raddr; 3077 setsize = 0; 3078 segvn = 1; 3079 } 3080 } else if (segvn) { 3081 /* The next segment is not segvn. */ 3082 ASSERT(setsize != 0); 3083 error = as_iset_default_lpsize(as, 3084 setaddr, setsize, rflags, rtype); 3085 if (error) { 3086 break; 3087 } 3088 segvn = 0; 3089 } 3090 } 3091 if ((raddr + rsize) > (seg->s_base + seg->s_size)) { 3092 ssize = seg->s_base + seg->s_size - raddr; 3093 } else { 3094 ssize = rsize; 3095 } 3096 } 3097 if (error == 0 && segvn) { 3098 /* The last chunk when rsize == 0. */ 3099 ASSERT(setsize != 0); 3100 error = as_iset_default_lpsize(as, setaddr, setsize, 3101 rflags, rtype); 3102 } 3103 3104 if (error == IE_RETRY) { 3105 goto again; 3106 } else if (error == IE_NOMEM) { 3107 error = EAGAIN; 3108 } else if (error == ENOTSUP) { 3109 error = EINVAL; 3110 } else if (error == EAGAIN) { 3111 mutex_enter(&as->a_contents); 3112 if (AS_ISUNMAPWAIT(as) == 0) { 3113 cv_broadcast(&as->a_cv); 3114 } 3115 AS_SETUNMAPWAIT(as); 3116 AS_LOCK_EXIT(as, &as->a_lock); 3117 while (AS_ISUNMAPWAIT(as)) { 3118 cv_wait(&as->a_cv, &as->a_contents); 3119 } 3120 mutex_exit(&as->a_contents); 3121 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER); 3122 goto again; 3123 } 3124 3125 as_setwatch(as); 3126 AS_LOCK_EXIT(as, &as->a_lock); 3127 return (error); 3128 } 3129 3130 /* 3131 * Setup all of the uninitialized watched pages that we can. 3132 */ 3133 void 3134 as_setwatch(struct as *as) 3135 { 3136 struct watched_page *pwp; 3137 struct seg *seg; 3138 caddr_t vaddr; 3139 uint_t prot; 3140 int err, retrycnt; 3141 3142 if (avl_numnodes(&as->a_wpage) == 0) 3143 return; 3144 3145 ASSERT(AS_WRITE_HELD(as, &as->a_lock)); 3146 3147 for (pwp = avl_first(&as->a_wpage); pwp != NULL; 3148 pwp = AVL_NEXT(&as->a_wpage, pwp)) { 3149 retrycnt = 0; 3150 retry: 3151 vaddr = pwp->wp_vaddr; 3152 if (pwp->wp_oprot != 0 || /* already set up */ 3153 (seg = as_segat(as, vaddr)) == NULL || 3154 SEGOP_GETPROT(seg, vaddr, 0, &prot) != 0) 3155 continue; 3156 3157 pwp->wp_oprot = prot; 3158 if (pwp->wp_read) 3159 prot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC); 3160 if (pwp->wp_write) 3161 prot &= ~PROT_WRITE; 3162 if (pwp->wp_exec) 3163 prot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC); 3164 if (!(pwp->wp_flags & WP_NOWATCH) && prot != pwp->wp_oprot) { 3165 err = SEGOP_SETPROT(seg, vaddr, PAGESIZE, prot); 3166 if (err == IE_RETRY) { 3167 pwp->wp_oprot = 0; 3168 ASSERT(retrycnt == 0); 3169 retrycnt++; 3170 goto retry; 3171 } 3172 } 3173 pwp->wp_prot = prot; 3174 } 3175 } 3176 3177 /* 3178 * Clear all of the watched pages in the address space. 3179 */ 3180 void 3181 as_clearwatch(struct as *as) 3182 { 3183 struct watched_page *pwp; 3184 struct seg *seg; 3185 caddr_t vaddr; 3186 uint_t prot; 3187 int err, retrycnt; 3188 3189 if (avl_numnodes(&as->a_wpage) == 0) 3190 return; 3191 3192 ASSERT(AS_WRITE_HELD(as, &as->a_lock)); 3193 3194 for (pwp = avl_first(&as->a_wpage); pwp != NULL; 3195 pwp = AVL_NEXT(&as->a_wpage, pwp)) { 3196 retrycnt = 0; 3197 retry: 3198 vaddr = pwp->wp_vaddr; 3199 if (pwp->wp_oprot == 0 || /* not set up */ 3200 (seg = as_segat(as, vaddr)) == NULL) 3201 continue; 3202 3203 if ((prot = pwp->wp_oprot) != pwp->wp_prot) { 3204 err = SEGOP_SETPROT(seg, vaddr, PAGESIZE, prot); 3205 if (err == IE_RETRY) { 3206 ASSERT(retrycnt == 0); 3207 retrycnt++; 3208 goto retry; 3209 } 3210 } 3211 pwp->wp_oprot = 0; 3212 pwp->wp_prot = 0; 3213 } 3214 } 3215 3216 /* 3217 * Force a new setup for all the watched pages in the range. 3218 */ 3219 static void 3220 as_setwatchprot(struct as *as, caddr_t addr, size_t size, uint_t prot) 3221 { 3222 struct watched_page *pwp; 3223 struct watched_page tpw; 3224 caddr_t eaddr = addr + size; 3225 caddr_t vaddr; 3226 struct seg *seg; 3227 int err, retrycnt; 3228 uint_t wprot; 3229 avl_index_t where; 3230 3231 if (avl_numnodes(&as->a_wpage) == 0) 3232 return; 3233 3234 ASSERT(AS_WRITE_HELD(as, &as->a_lock)); 3235 3236 tpw.wp_vaddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); 3237 if ((pwp = avl_find(&as->a_wpage, &tpw, &where)) == NULL) 3238 pwp = avl_nearest(&as->a_wpage, where, AVL_AFTER); 3239 3240 while (pwp != NULL && pwp->wp_vaddr < eaddr) { 3241 retrycnt = 0; 3242 vaddr = pwp->wp_vaddr; 3243 3244 wprot = prot; 3245 if (pwp->wp_read) 3246 wprot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC); 3247 if (pwp->wp_write) 3248 wprot &= ~PROT_WRITE; 3249 if (pwp->wp_exec) 3250 wprot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC); 3251 if (!(pwp->wp_flags & WP_NOWATCH) && wprot != pwp->wp_oprot) { 3252 retry: 3253 seg = as_segat(as, vaddr); 3254 if (seg == NULL) { 3255 panic("as_setwatchprot: no seg"); 3256 /*NOTREACHED*/ 3257 } 3258 err = SEGOP_SETPROT(seg, vaddr, PAGESIZE, wprot); 3259 if (err == IE_RETRY) { 3260 ASSERT(retrycnt == 0); 3261 retrycnt++; 3262 goto retry; 3263 } 3264 } 3265 pwp->wp_oprot = prot; 3266 pwp->wp_prot = wprot; 3267 3268 pwp = AVL_NEXT(&as->a_wpage, pwp); 3269 } 3270 } 3271 3272 /* 3273 * Clear all of the watched pages in the range. 3274 */ 3275 static void 3276 as_clearwatchprot(struct as *as, caddr_t addr, size_t size) 3277 { 3278 caddr_t eaddr = addr + size; 3279 struct watched_page *pwp; 3280 struct watched_page tpw; 3281 uint_t prot; 3282 struct seg *seg; 3283 int err, retrycnt; 3284 avl_index_t where; 3285 3286 if (avl_numnodes(&as->a_wpage) == 0) 3287 return; 3288 3289 tpw.wp_vaddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); 3290 if ((pwp = avl_find(&as->a_wpage, &tpw, &where)) == NULL) 3291 pwp = avl_nearest(&as->a_wpage, where, AVL_AFTER); 3292 3293 ASSERT(AS_WRITE_HELD(as, &as->a_lock)); 3294 3295 while (pwp != NULL && pwp->wp_vaddr < eaddr) { 3296 3297 if ((prot = pwp->wp_oprot) != 0) { 3298 retrycnt = 0; 3299 3300 if (prot != pwp->wp_prot) { 3301 retry: 3302 seg = as_segat(as, pwp->wp_vaddr); 3303 if (seg == NULL) 3304 continue; 3305 err = SEGOP_SETPROT(seg, pwp->wp_vaddr, 3306 PAGESIZE, prot); 3307 if (err == IE_RETRY) { 3308 ASSERT(retrycnt == 0); 3309 retrycnt++; 3310 goto retry; 3311 3312 } 3313 } 3314 pwp->wp_oprot = 0; 3315 pwp->wp_prot = 0; 3316 } 3317 3318 pwp = AVL_NEXT(&as->a_wpage, pwp); 3319 } 3320 } 3321 3322 void 3323 as_signal_proc(struct as *as, k_siginfo_t *siginfo) 3324 { 3325 struct proc *p; 3326 3327 mutex_enter(&pidlock); 3328 for (p = practive; p; p = p->p_next) { 3329 if (p->p_as == as) { 3330 mutex_enter(&p->p_lock); 3331 if (p->p_as == as) 3332 sigaddq(p, NULL, siginfo, KM_NOSLEEP); 3333 mutex_exit(&p->p_lock); 3334 } 3335 } 3336 mutex_exit(&pidlock); 3337 } 3338 3339 /* 3340 * return memory object ID 3341 */ 3342 int 3343 as_getmemid(struct as *as, caddr_t addr, memid_t *memidp) 3344 { 3345 struct seg *seg; 3346 int sts; 3347 3348 AS_LOCK_ENTER(as, &as->a_lock, RW_READER); 3349 seg = as_segat(as, addr); 3350 if (seg == NULL) { 3351 AS_LOCK_EXIT(as, &as->a_lock); 3352 return (EFAULT); 3353 } 3354 /* 3355 * catch old drivers which may not support getmemid 3356 */ 3357 if (seg->s_ops->getmemid == NULL) { 3358 AS_LOCK_EXIT(as, &as->a_lock); 3359 return (ENODEV); 3360 } 3361 3362 sts = SEGOP_GETMEMID(seg, addr, memidp); 3363 3364 AS_LOCK_EXIT(as, &as->a_lock); 3365 return (sts); 3366 } 3367