1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 * Copyright 2018 Joyent, Inc.
25 * Copyright (c) 2016 by Delphix. All rights reserved.
26 */
27
28 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
29 /* All Rights Reserved */
30
31 /*
32 * University Copyright- Copyright (c) 1982, 1986, 1988
33 * The Regents of the University of California
34 * All Rights Reserved
35 *
36 * University Acknowledgment- Portions of this document are derived from
37 * software developed by the University of California, Berkeley, and its
38 * contributors.
39 */
40
41 /*
42 * VM - address spaces.
43 */
44
45 #include <sys/types.h>
46 #include <sys/t_lock.h>
47 #include <sys/param.h>
48 #include <sys/errno.h>
49 #include <sys/systm.h>
50 #include <sys/mman.h>
51 #include <sys/sysmacros.h>
52 #include <sys/cpuvar.h>
53 #include <sys/sysinfo.h>
54 #include <sys/kmem.h>
55 #include <sys/vnode.h>
56 #include <sys/vmsystm.h>
57 #include <sys/cmn_err.h>
58 #include <sys/debug.h>
59 #include <sys/vtrace.h>
60
61 #include <vm/hat.h>
62 #include <vm/as.h>
63 #include <vm/seg.h>
64 #include <vm/seg_vn.h>
65 #include <vm/seg_dev.h>
66 #include <vm/seg_kmem.h>
67 #include <vm/seg_map.h>
68 #include <vm/seg_spt.h>
69 #include <vm/seg_hole.h>
70 #include <vm/page.h>
71
72 clock_t deadlk_wait = 1; /* number of ticks to wait before retrying */
73
74 static struct kmem_cache *as_cache;
75
76 static void as_setwatchprot(struct as *, caddr_t, size_t, uint_t);
77 static void as_clearwatchprot(struct as *, caddr_t, size_t);
78
79
80 /*
81 * Verifying the segment lists is very time-consuming; it may not be
82 * desirable to always define VERIFY_SEGLIST when DEBUG is set.
83 */
84 #ifdef DEBUG
85 #define VERIFY_SEGLIST
86 int do_as_verify = 0;
87 #endif
88
89 /*
90 * Allocate a new callback data structure entry and fill in the events of
91 * interest, the address range of interest, and the callback argument.
92 * Link the entry on the as->a_callbacks list. A callback entry for the
93 * entire address space may be specified with vaddr = 0 and size = -1.
94 *
95 * CALLERS RESPONSIBILITY: If not calling from within the process context for
96 * the specified as, the caller must guarantee persistence of the specified as
97 * for the duration of this function (eg. pages being locked within the as
98 * will guarantee persistence).
99 */
100 int
101 as_add_callback(struct as *as, void (*cb_func)(), void *arg, uint_t events,
102 caddr_t vaddr, size_t size, int sleepflag)
103 {
104 struct as_callback *current_head, *cb;
105 caddr_t saddr;
106 size_t rsize;
107
108 /* callback function and an event are mandatory */
109 if ((cb_func == NULL) || ((events & AS_ALL_EVENT) == 0))
110 return (EINVAL);
111
112 /* Adding a callback after as_free has been called is not allowed */
113 if (as == &kas)
114 return (ENOMEM);
115
116 /*
117 * vaddr = 0 and size = -1 is used to indicate that the callback range
118 * is the entire address space so no rounding is done in that case.
119 */
120 if (size != -1) {
121 saddr = (caddr_t)((uintptr_t)vaddr & (uintptr_t)PAGEMASK);
122 rsize = (((size_t)(vaddr + size) + PAGEOFFSET) & PAGEMASK) -
123 (size_t)saddr;
124 /* check for wraparound */
125 if (saddr + rsize < saddr)
126 return (ENOMEM);
127 } else {
128 if (vaddr != 0)
129 return (EINVAL);
130 saddr = vaddr;
131 rsize = size;
132 }
133
134 /* Allocate and initialize a callback entry */
135 cb = kmem_zalloc(sizeof (struct as_callback), sleepflag);
136 if (cb == NULL)
137 return (EAGAIN);
138
139 cb->ascb_func = cb_func;
140 cb->ascb_arg = arg;
141 cb->ascb_events = events;
142 cb->ascb_saddr = saddr;
143 cb->ascb_len = rsize;
144
145 /* Add the entry to the list */
146 mutex_enter(&as->a_contents);
147 current_head = as->a_callbacks;
148 as->a_callbacks = cb;
149 cb->ascb_next = current_head;
150
151 /*
152 * The call to this function may lose a race with
153 * a pertinent event - eg. a thread does long term memory locking
154 * but before the callback is added another thread executes as_unmap.
155 * A broadcast here resolves that.
156 */
157 if ((cb->ascb_events & AS_UNMAPWAIT_EVENT) && AS_ISUNMAPWAIT(as)) {
158 AS_CLRUNMAPWAIT(as);
159 cv_broadcast(&as->a_cv);
160 }
161
162 mutex_exit(&as->a_contents);
163 return (0);
164 }
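/*
 * Illustrative sketch (not part of the original interface documentation):
 * a driver that has long-term locked pages in "as" might register for
 * events on the locked range roughly as follows. The xx_unlock_cb and
 * xx_cookie names are hypothetical placeholders supplied by the caller,
 * not symbols defined in this file.
 *
 *	error = as_add_callback(as, xx_unlock_cb, xx_cookie,
 *	    AS_UNMAP_EVENT | AS_SETPROT_EVENT, vaddr, size, KM_SLEEP);
 *	if (error != 0)
 *		(undo the long-term page locking)
 */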
165
166 /*
167 * Search the callback list for an entry which pertains to arg.
168 *
169 * This is called from within the client upon completion of the callback.
170 * RETURN VALUES:
171 * AS_CALLBACK_DELETED (callback entry found and deleted)
172 * AS_CALLBACK_NOTFOUND (no callback entry found - this is ok)
173 * AS_CALLBACK_DELETE_DEFERRED (callback is in progress; this
174 * entry will be deleted in as_do_callbacks)
175 *
176 * If as_delete_callback encounters a matching entry with AS_CALLBACK_CALLED
177 * set, it indicates that as_do_callbacks is processing this entry. The
178 * AS_ALL_EVENT events are cleared in the entry, and a broadcast is made
179 * to unblock as_do_callbacks, in case it is blocked.
180 *
181 * CALLERS RESPONSIBILITY: If not calling from within the process context for
182 * the specified as, the caller must guarantee persistence of the specified as
183 * for the duration of this function (eg. pages being locked within the as
184 * will guarantee persistence).
185 */
186 uint_t
187 as_delete_callback(struct as *as, void *arg)
188 {
189 struct as_callback **prevcb = &as->a_callbacks;
190 struct as_callback *cb;
191 uint_t rc = AS_CALLBACK_NOTFOUND;
192
193 mutex_enter(&as->a_contents);
194 for (cb = as->a_callbacks; cb; prevcb = &cb->ascb_next, cb = *prevcb) {
195 if (cb->ascb_arg != arg)
196 continue;
197
198 /*
199 * If the events indicate AS_CALLBACK_CALLED, just clear
200 * AS_ALL_EVENT in the events field and wakeup the thread
201 * that may be waiting in as_do_callbacks. as_do_callbacks
202 * will take care of removing this entry from the list. In
203 * that case, return AS_CALLBACK_DELETE_DEFERRED. Otherwise
204 * (AS_CALLBACK_CALLED not set), just remove it from the
205 * list, return the memory and return AS_CALLBACK_DELETED.
206 */
207 if ((cb->ascb_events & AS_CALLBACK_CALLED) != 0) {
208 /* leave AS_CALLBACK_CALLED */
209 cb->ascb_events &= ~AS_ALL_EVENT;
210 rc = AS_CALLBACK_DELETE_DEFERRED;
211 cv_broadcast(&as->a_cv);
212 } else {
213 *prevcb = cb->ascb_next;
214 kmem_free(cb, sizeof (struct as_callback));
215 rc = AS_CALLBACK_DELETED;
216 }
217 break;
218 }
219 mutex_exit(&as->a_contents);
220 return (rc);
221 }
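/*
 * Illustrative sketch: when tearing down the state tied to a callback
 * argument (xx_cookie is a hypothetical placeholder), callers usually
 * distinguish the deferred case, since that entry is freed later on the
 * as_do_callbacks() path rather than here:
 *
 *	if (as_delete_callback(as, xx_cookie) == AS_CALLBACK_DELETE_DEFERRED)
 *		(the callback is running; the entry is removed after it ends)
 */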
222
223 /*
224 * Searches the as callback list for a matching entry.
225 * Returns a pointer to the first matching callback, or NULL if
226 * nothing is found.
227 * This function never sleeps, so it is ok to call it with locks
228 * held beyond the (required) a_contents mutex.
229 *
230 * See also comment on as_do_callbacks below.
231 */
232 static struct as_callback *
233 as_find_callback(struct as *as, uint_t events, caddr_t event_addr,
234 size_t event_len)
235 {
236 struct as_callback *cb;
237
238 ASSERT(MUTEX_HELD(&as->a_contents));
239 for (cb = as->a_callbacks; cb != NULL; cb = cb->ascb_next) {
240 /*
241 * If the callback has not already been called, then
242 * check if events or address range pertains. An event_len
243 * of zero means do an unconditional callback.
244 */
245 if (((cb->ascb_events & AS_CALLBACK_CALLED) != 0) ||
246 ((event_len != 0) && (((cb->ascb_events & events) == 0) ||
247 (event_addr + event_len < cb->ascb_saddr) ||
248 (event_addr > (cb->ascb_saddr + cb->ascb_len))))) {
249 continue;
250 }
251 break;
252 }
253 return (cb);
254 }
255
256 /*
257 * Executes a given callback and removes it from the callback list for
258 * this address space.
259 * This function may sleep so the caller must drop all locks except
260 * a_contents before calling this func.
261 *
262 * See also comments on as_do_callbacks below.
263 */
264 static void
265 as_execute_callback(struct as *as, struct as_callback *cb,
266 uint_t events)
267 {
268 struct as_callback **prevcb;
269 void *cb_arg;
270
271 ASSERT(MUTEX_HELD(&as->a_contents) && (cb->ascb_events & events));
272 cb->ascb_events |= AS_CALLBACK_CALLED;
273 mutex_exit(&as->a_contents);
274 (*cb->ascb_func)(as, cb->ascb_arg, events);
275 mutex_enter(&as->a_contents);
276 /*
277 * the callback function is required to delete the callback
278 * when the callback function determines it is OK for
279 * this thread to continue. as_delete_callback will clear
280 * the AS_ALL_EVENT in the events field when it is deleted.
281 * If the callback function called as_delete_callback,
282 * events will already be cleared and there will be no blocking.
283 */
284 while ((cb->ascb_events & events) != 0) {
285 cv_wait(&as->a_cv, &as->a_contents);
286 }
287 /*
288 * This entry needs to be taken off the list. Normally, the
289 * callback func itself does that, but unfortunately the list
290 * may have changed while the callback was running because the
291 * a_contents mutex was dropped and someone else other than the
292 * callback func itself could have called as_delete_callback,
293 * so we have to search to find this entry again. The entry
294 * must have AS_CALLBACK_CALLED, and have the same 'arg'.
295 */
296 cb_arg = cb->ascb_arg;
297 prevcb = &as->a_callbacks;
298 for (cb = as->a_callbacks; cb != NULL;
299 prevcb = &cb->ascb_next, cb = *prevcb) {
300 if (((cb->ascb_events & AS_CALLBACK_CALLED) == 0) ||
301 (cb_arg != cb->ascb_arg)) {
302 continue;
303 }
304 *prevcb = cb->ascb_next;
305 kmem_free(cb, sizeof (struct as_callback));
306 break;
307 }
308 }
309
310 /*
311 * Check the callback list for a matching event and intersection of
312 * address range. If there is a match invoke the callback. Skip an entry if:
313 * - a callback is already in progress for this entry (AS_CALLBACK_CALLED)
314 * - the event is not of interest
315 * - the address range is not of interest
316 *
317 * An event_len of zero indicates a request for an unconditional callback
318 * (regardless of event); only the AS_CALLBACK_CALLED flag is checked. The
319 * a_contents lock must be dropped before a callback, so only one callback
320 * can be done before returning. Return -1 (true) if a callback was
321 * executed and removed from the list, else return 0 (false).
322 *
323 * The logically separate parts, i.e. finding a matching callback and
324 * executing a given callback have been separated into two functions
325 * so that they can be called with different sets of locks held beyond
326 * the always-required a_contents. as_find_callback does not sleep so
327 * it is ok to call it if more locks than a_contents (i.e. the a_lock
328 * rwlock) are held. as_execute_callback on the other hand may sleep
329 * so all locks beyond a_contents must be dropped by the caller if one
330 * does not want to end up comatose.
331 */
332 static int
333 as_do_callbacks(struct as *as, uint_t events, caddr_t event_addr,
334 size_t event_len)
335 {
336 struct as_callback *cb;
337
338 if ((cb = as_find_callback(as, events, event_addr, event_len))) {
339 as_execute_callback(as, cb, events);
340 return (-1);
341 }
342 return (0);
343 }
344
345 /*
346 * Search for the segment containing addr. If a segment containing addr
347 * exists, that segment is returned. If no such segment exists, and
348 * the list spans addresses greater than addr, then the first segment
349 * whose base is greater than addr is returned; otherwise, NULL is
350 * returned unless tail is true, in which case the last element of the
351 * list is returned.
352 *
353 * a_seglast is used to cache the last found segment for repeated
354 * searches to the same addr (which happens frequently).
355 */
356 struct seg *
357 as_findseg(struct as *as, caddr_t addr, int tail)
358 {
359 struct seg *seg = as->a_seglast;
360 avl_index_t where;
361
362 ASSERT(AS_LOCK_HELD(as));
363
364 if (seg != NULL &&
365 seg->s_base <= addr &&
366 addr < seg->s_base + seg->s_size)
367 return (seg);
368
369 seg = avl_find(&as->a_segtree, &addr, &where);
370 if (seg != NULL)
371 return (as->a_seglast = seg);
372
373 seg = avl_nearest(&as->a_segtree, where, AVL_AFTER);
374 if (seg == NULL && tail)
375 seg = avl_last(&as->a_segtree);
376 return (as->a_seglast = seg);
377 }
378
379 #ifdef VERIFY_SEGLIST
380 /*
381 * verify that the linked list is coherent
382 */
383 static void
384 as_verify(struct as *as)
385 {
386 struct seg *seg, *seglast, *p, *n;
387 uint_t nsegs = 0;
388
389 if (do_as_verify == 0)
390 return;
391
392 seglast = as->a_seglast;
393
394 for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
395 ASSERT(seg->s_as == as);
396 p = AS_SEGPREV(as, seg);
397 n = AS_SEGNEXT(as, seg);
398 ASSERT(p == NULL || p->s_as == as);
399 ASSERT(p == NULL || p->s_base < seg->s_base);
400 ASSERT(n == NULL || n->s_base > seg->s_base);
401 ASSERT(n != NULL || seg == avl_last(&as->a_segtree));
402 if (seg == seglast)
403 seglast = NULL;
404 nsegs++;
405 }
406 ASSERT(seglast == NULL);
407 ASSERT(avl_numnodes(&as->a_segtree) == nsegs);
408 }
409 #endif /* VERIFY_SEGLIST */
410
411 /*
412 * Add a new segment to the address space. The avl_find()
413 * may be expensive, so we attempt to use the last segment accessed
414 * in as_gap() as an insertion point.
415 */
416 int
417 as_addseg(struct as *as, struct seg *newseg)
418 {
419 struct seg *seg;
420 caddr_t addr;
421 caddr_t eaddr;
422 avl_index_t where;
423
424 ASSERT(AS_WRITE_HELD(as));
425
426 as->a_updatedir = 1; /* inform /proc */
427 gethrestime(&as->a_updatetime);
428
429 if (as->a_lastgaphl != NULL) {
430 struct seg *hseg = NULL;
431 struct seg *lseg = NULL;
432
433 if (as->a_lastgaphl->s_base > newseg->s_base) {
434 hseg = as->a_lastgaphl;
435 lseg = AVL_PREV(&as->a_segtree, hseg);
436 } else {
437 lseg = as->a_lastgaphl;
438 hseg = AVL_NEXT(&as->a_segtree, lseg);
439 }
440
441 if (hseg && lseg && lseg->s_base < newseg->s_base &&
442 hseg->s_base > newseg->s_base) {
443 avl_insert_here(&as->a_segtree, newseg, lseg,
444 AVL_AFTER);
445 as->a_lastgaphl = NULL;
446 as->a_seglast = newseg;
447 return (0);
448 }
449 as->a_lastgaphl = NULL;
450 }
451
452 addr = newseg->s_base;
453 eaddr = addr + newseg->s_size;
454
455 seg = avl_find(&as->a_segtree, &addr, &where);
456
457 if (seg == NULL)
458 seg = avl_nearest(&as->a_segtree, where, AVL_AFTER);
459
460 if (seg == NULL)
461 seg = avl_last(&as->a_segtree);
462
463 if (seg != NULL) {
464 caddr_t base = seg->s_base;
465
466 /*
467 * If top of seg is below the requested address, then
468 * the insertion point is at the end of the linked list,
469 * and seg points to the tail of the list. Otherwise,
470 * the insertion point is immediately before seg.
471 */
472 if (base + seg->s_size > addr) {
473 if (addr >= base || eaddr > base) {
474 return (-1); /* overlapping segment */
475 }
476 }
477 }
478 as->a_seglast = newseg;
479 avl_insert(&as->a_segtree, newseg, where);
480
481 #ifdef VERIFY_SEGLIST
482 as_verify(as);
483 #endif
484 return (0);
485 }
486
487 struct seg *
488 as_removeseg(struct as *as, struct seg *seg)
489 {
490 avl_tree_t *t;
491
492 ASSERT(AS_WRITE_HELD(as));
493
494 as->a_updatedir = 1; /* inform /proc */
495 gethrestime(&as->a_updatetime);
496
497 if (seg == NULL)
498 return (NULL);
499
500 t = &as->a_segtree;
501 if (as->a_seglast == seg)
502 as->a_seglast = NULL;
503 as->a_lastgaphl = NULL;
504
505 /*
506 * if this segment is at an address higher than
507 * a_lastgap, set a_lastgap to the next segment (NULL if last segment)
508 */
509 if (as->a_lastgap &&
510 (seg == as->a_lastgap || seg->s_base > as->a_lastgap->s_base))
511 as->a_lastgap = AVL_NEXT(t, seg);
512
513 /*
514 * remove the segment from the seg tree
515 */
516 avl_remove(t, seg);
517
518 #ifdef VERIFY_SEGLIST
519 as_verify(as);
520 #endif
521 return (seg);
522 }
523
524 /*
525 * Find a segment containing addr.
526 */
527 struct seg *
528 as_segat(struct as *as, caddr_t addr)
529 {
530 struct seg *seg = as->a_seglast;
531
532 ASSERT(AS_LOCK_HELD(as));
533
534 if (seg != NULL && seg->s_base <= addr &&
535 addr < seg->s_base + seg->s_size)
536 return (seg);
537
538 seg = avl_find(&as->a_segtree, &addr, NULL);
539 return (seg);
540 }
541
542 /*
543 * Serialize all searches for holes in an address space to
544 * prevent two or more threads from allocating the same virtual
545 * address range. The address space must not be "read/write"
546 * locked by the caller since we may block.
547 */
548 void
549 as_rangelock(struct as *as)
550 {
551 mutex_enter(&as->a_contents);
552 while (AS_ISCLAIMGAP(as))
553 cv_wait(&as->a_cv, &as->a_contents);
554 AS_SETCLAIMGAP(as);
555 mutex_exit(&as->a_contents);
556 }
557
558 /*
559 * Release hold on a_state & AS_CLAIMGAP and signal any other blocked threads.
560 */
561 void
562 as_rangeunlock(struct as *as)
563 {
564 mutex_enter(&as->a_contents);
565 AS_CLRCLAIMGAP(as);
566 cv_signal(&as->a_cv);
567 mutex_exit(&as->a_contents);
568 }
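/*
 * Illustrative sketch of the serialization these two routines provide:
 * callers that choose an address and then map it (mmap-style paths)
 * bracket both steps so that no other thread can claim the same hole
 * in between. xx_choose_addr and crargs are hypothetical placeholders
 * for the caller's address-selection logic (often built on as_gap())
 * and its segment-creation arguments.
 *
 *	as_rangelock(as);
 *	xx_choose_addr(as, &addr, len);
 *	error = as_map(as, addr, len, segvn_create, &crargs);
 *	as_rangeunlock(as);
 */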
569
570 /*
571 * compare segments (or just an address) by segment address range
572 */
573 static int
574 as_segcompar(const void *x, const void *y)
575 {
576 struct seg *a = (struct seg *)x;
577 struct seg *b = (struct seg *)y;
578
579 if (a->s_base < b->s_base)
580 return (-1);
581 if (a->s_base >= b->s_base + b->s_size)
582 return (1);
583 return (0);
584 }
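/*
 * Note: avl_find() is called elsewhere in this file with a pointer to a
 * bare caddr_t as the search key (e.g. "&addr" in as_segat() and
 * as_findseg()). That works because the comparator above only reads
 * s_base from its first argument, and s_base is the first member of
 * struct seg, so the caddr_t aliases it cleanly.
 */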
585
586
587 void
588 as_avlinit(struct as *as)
589 {
590 avl_create(&as->a_segtree, as_segcompar, sizeof (struct seg),
591 offsetof(struct seg, s_tree));
592 avl_create(&as->a_wpage, wp_compare, sizeof (struct watched_page),
593 offsetof(struct watched_page, wp_link));
594 }
595
596 /*ARGSUSED*/
597 static int
598 as_constructor(void *buf, void *cdrarg, int kmflags)
599 {
600 struct as *as = buf;
601
602 mutex_init(&as->a_contents, NULL, MUTEX_DEFAULT, NULL);
603 cv_init(&as->a_cv, NULL, CV_DEFAULT, NULL);
604 rw_init(&as->a_lock, NULL, RW_DEFAULT, NULL);
605 as_avlinit(as);
606 return (0);
607 }
608
609 /*ARGSUSED1*/
610 static void
611 as_destructor(void *buf, void *cdrarg)
612 {
613 struct as *as = buf;
614
615 avl_destroy(&as->a_segtree);
616 mutex_destroy(&as->a_contents);
617 cv_destroy(&as->a_cv);
618 rw_destroy(&as->a_lock);
619 }
620
621 void
622 as_init(void)
623 {
624 as_cache = kmem_cache_create("as_cache", sizeof (struct as), 0,
625 as_constructor, as_destructor, NULL, NULL, NULL, 0);
626 }
627
628 /*
629 * Allocate and initialize an address space data structure.
630 * We call hat_alloc to allow any machine dependent
631 * information in the hat structure to be initialized.
632 */
633 struct as *
634 as_alloc(void)
635 {
636 struct as *as;
637
638 as = kmem_cache_alloc(as_cache, KM_SLEEP);
639
640 as->a_flags = 0;
641 as->a_vbits = 0;
642 as->a_hrm = NULL;
643 as->a_seglast = NULL;
644 as->a_size = 0;
645 as->a_resvsize = 0;
646 as->a_updatedir = 0;
647 gethrestime(&as->a_updatetime);
648 as->a_objectdir = NULL;
649 as->a_sizedir = 0;
650 as->a_userlimit = (caddr_t)USERLIMIT;
651 as->a_lastgap = NULL;
652 as->a_lastgaphl = NULL;
653 as->a_callbacks = NULL;
654 as->a_proc = NULL;
655
656 AS_LOCK_ENTER(as, RW_WRITER);
657 as->a_hat = hat_alloc(as); /* create hat for default system mmu */
658 AS_LOCK_EXIT(as);
659
660 return (as);
661 }
662
663 /*
664 * Free an address space data structure.
665 * We need to free the hat first, then all the segments
666 * on this as, and finally the space for the as struct
667 * itself.
668 */
669 void
670 as_free(struct as *as)
671 {
672 struct hat *hat = as->a_hat;
673 struct seg *seg, *next;
674 boolean_t free_started = B_FALSE;
675
676 top:
677 /*
678 * Invoke ALL callbacks. as_do_callbacks will do one callback
679 * per call, and not return (-1) until the callback has completed.
680 * When as_do_callbacks returns zero, all callbacks have completed.
681 */
682 mutex_enter(&as->a_contents);
683 while (as->a_callbacks && as_do_callbacks(as, AS_ALL_EVENT, 0, 0))
684 ;
685
686 mutex_exit(&as->a_contents);
687 AS_LOCK_ENTER(as, RW_WRITER);
688
689 if (!free_started) {
690 free_started = B_TRUE;
691 hat_free_start(hat);
692 }
693 for (seg = AS_SEGFIRST(as); seg != NULL; seg = next) {
694 int err;
695
696 next = AS_SEGNEXT(as, seg);
697 retry:
698 err = SEGOP_UNMAP(seg, seg->s_base, seg->s_size);
699 if (err == EAGAIN) {
700 mutex_enter(&as->a_contents);
701 if (as->a_callbacks) {
702 AS_LOCK_EXIT(as);
703 } else if (!AS_ISNOUNMAPWAIT(as)) {
704 /*
705 * Memory is currently locked. Wait for a
706 * cv_signal that it has been unlocked, then
707 * try the operation again.
708 */
709 if (AS_ISUNMAPWAIT(as) == 0)
710 cv_broadcast(&as->a_cv);
711 AS_SETUNMAPWAIT(as);
712 AS_LOCK_EXIT(as);
713 while (AS_ISUNMAPWAIT(as))
714 cv_wait(&as->a_cv, &as->a_contents);
715 } else {
716 /*
717 * We may have raced with
718 * segvn_reclaim()/segspt_reclaim(). In this
719 * case clean nounmapwait flag and retry since
720 * softlockcnt in this segment may be already
721 * 0. We don't drop as writer lock so our
722 * number of retries without sleeping should
723 * be very small. See segvn_reclaim() for
724 * more comments.
725 */
726 AS_CLRNOUNMAPWAIT(as);
727 mutex_exit(&as->a_contents);
728 goto retry;
729 }
730 mutex_exit(&as->a_contents);
731 goto top;
732 } else {
733 /*
734 * We do not expect any other error return at this
735 * time. This is similar to an ASSERT in seg_unmap()
736 */
737 ASSERT(err == 0);
738 }
739 }
740 hat_free_end(hat);
741 AS_LOCK_EXIT(as);
742
743 /* /proc stuff */
744 ASSERT(avl_numnodes(&as->a_wpage) == 0);
745 if (as->a_objectdir) {
746 kmem_free(as->a_objectdir, as->a_sizedir * sizeof (vnode_t *));
747 as->a_objectdir = NULL;
748 as->a_sizedir = 0;
749 }
750
751 /*
752 * Free the struct as back to kmem. Assert it has no segments.
753 */
754 ASSERT(avl_numnodes(&as->a_segtree) == 0);
755 kmem_cache_free(as_cache, as);
756 }
757
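/*
 * Duplicate the address space "as" into a new address space for the
 * child process "forkedproc" (the fork path). Segments marked S_PURGE
 * are skipped and their size is excluded from the child's a_resvsize;
 * every other segment is recreated via seg_alloc()/SEGOP_DUP(). The
 * HAT is duplicated in two passes (HAT_DUP_SRD before the segment
 * loop, HAT_DUP_ALL after it). On success the child's p_as is set to
 * the new address space; on failure the partially constructed address
 * space is freed and an error (or -1) is returned.
 */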
758 int
759 as_dup(struct as *as, struct proc *forkedproc)
760 {
761 struct as *newas;
762 struct seg *seg, *newseg;
763 size_t purgesize = 0;
764 int error;
765
766 AS_LOCK_ENTER(as, RW_WRITER);
767 as_clearwatch(as);
768 newas = as_alloc();
769 newas->a_userlimit = as->a_userlimit;
770 newas->a_proc = forkedproc;
771
772 AS_LOCK_ENTER(newas, RW_WRITER);
773
774 (void) hat_dup(as->a_hat, newas->a_hat, NULL, 0, HAT_DUP_SRD);
775
776 for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
777
778 if (seg->s_flags & S_PURGE) {
779 purgesize += seg->s_size;
780 continue;
781 }
782
783 newseg = seg_alloc(newas, seg->s_base, seg->s_size);
784 if (newseg == NULL) {
785 AS_LOCK_EXIT(newas);
786 as_setwatch(as);
787 AS_LOCK_EXIT(as);
788 as_free(newas);
789 return (-1);
790 }
791 if ((error = SEGOP_DUP(seg, newseg)) != 0) {
792 /*
793 * We call seg_free() on the new seg
794 * because the segment is not set up
795 * completely; i.e. it has no ops.
796 */
797 as_setwatch(as);
798 AS_LOCK_EXIT(as);
799 seg_free(newseg);
800 AS_LOCK_EXIT(newas);
801 as_free(newas);
802 return (error);
803 }
804 if ((newseg->s_flags & S_HOLE) == 0) {
805 newas->a_size += seg->s_size;
806 }
807 }
808 newas->a_resvsize = as->a_resvsize - purgesize;
809
810 error = hat_dup(as->a_hat, newas->a_hat, NULL, 0, HAT_DUP_ALL);
811
812 AS_LOCK_EXIT(newas);
813
814 as_setwatch(as);
815 AS_LOCK_EXIT(as);
816 if (error != 0) {
817 as_free(newas);
818 return (error);
819 }
820 forkedproc->p_as = newas;
821 return (0);
822 }
823
824 /*
825 * Handle a ``fault'' at addr for size bytes.
826 */
827 faultcode_t
828 as_fault(struct hat *hat, struct as *as, caddr_t addr, size_t size,
829 enum fault_type type, enum seg_rw rw)
830 {
831 struct seg *seg;
832 caddr_t raddr; /* rounded down addr */
833 size_t rsize; /* rounded up size */
834 size_t ssize;
835 faultcode_t res = 0;
836 caddr_t addrsav;
837 struct seg *segsav;
838 int as_lock_held;
839 klwp_t *lwp = ttolwp(curthread);
840
841
842
843 retry:
844 /*
845 * Indicate that the lwp is not to be stopped while waiting for a
846 * pagefault. This is to avoid deadlock while debugging a process
847 * via /proc over NFS (in particular).
848 */
849 if (lwp != NULL)
850 lwp->lwp_nostop++;
851
852 /*
853 * The same length must be used when we softlock and softunlock. We
854 * don't support softunlocking lengths less than the original length
855 * when there is largepage support. See seg_dev.c for more
856 * comments.
857 */
858 switch (type) {
859
860 case F_SOFTLOCK:
861 CPU_STATS_ADD_K(vm, softlock, 1);
862 break;
863
864 case F_SOFTUNLOCK:
865 break;
866
867 case F_PROT:
868 CPU_STATS_ADD_K(vm, prot_fault, 1);
869 break;
870
871 case F_INVAL:
872 CPU_STATS_ENTER_K();
873 CPU_STATS_ADDQ(CPU, vm, as_fault, 1);
874 if (as == &kas)
875 CPU_STATS_ADDQ(CPU, vm, kernel_asflt, 1);
876 CPU_STATS_EXIT_K();
877 break;
878 }
879
880 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
881 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
882 (size_t)raddr;
883
884 /*
885 * XXX -- Don't grab the as lock for segkmap. We should grab it for
886 * correctness, but then we could be stuck holding this lock for
887 * a LONG time if the fault needs to be resolved on a slow
888 * filesystem, and then no-one will be able to exec new commands,
889 * as exec'ing requires the write lock on the as.
890 */
891 if (as == &kas && segkmap && segkmap->s_base <= raddr &&
892 raddr + size < segkmap->s_base + segkmap->s_size) {
893 seg = segkmap;
894 as_lock_held = 0;
895 } else {
896 AS_LOCK_ENTER(as, RW_READER);
897
898 seg = as_segat(as, raddr);
899 if (seg == NULL) {
900 AS_LOCK_EXIT(as);
901 if (lwp != NULL)
902 lwp->lwp_nostop--;
903 return (FC_NOMAP);
904 }
905
906 as_lock_held = 1;
907 }
908
909 addrsav = raddr;
910 segsav = seg;
911
912 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
913 if (raddr >= seg->s_base + seg->s_size) {
914 seg = AS_SEGNEXT(as, seg);
915 if (seg == NULL || raddr != seg->s_base) {
916 res = FC_NOMAP;
917 break;
918 }
919 }
920 if (raddr + rsize > seg->s_base + seg->s_size)
921 ssize = seg->s_base + seg->s_size - raddr;
922 else
923 ssize = rsize;
924
925 res = SEGOP_FAULT(hat, seg, raddr, ssize, type, rw);
926 if (res != 0)
927 break;
928 }
929
930 /*
931 * If we were SOFTLOCKing and encountered a failure,
932 * we must SOFTUNLOCK the range we already did. (Maybe we
933 * should just panic if we are SOFTLOCKing or even SOFTUNLOCKing
934 * right here...)
935 */
936 if (res != 0 && type == F_SOFTLOCK) {
937 for (seg = segsav; addrsav < raddr; addrsav += ssize) {
938 if (addrsav >= seg->s_base + seg->s_size)
939 seg = AS_SEGNEXT(as, seg);
940 ASSERT(seg != NULL);
941 /*
942 * Now call the fault routine again to perform the
943 * unlock using S_OTHER instead of the rw variable
944 * since we never got a chance to touch the pages.
945 */
946 if (raddr > seg->s_base + seg->s_size)
947 ssize = seg->s_base + seg->s_size - addrsav;
948 else
949 ssize = raddr - addrsav;
950 (void) SEGOP_FAULT(hat, seg, addrsav, ssize,
951 F_SOFTUNLOCK, S_OTHER);
952 }
953 }
954 if (as_lock_held)
955 AS_LOCK_EXIT(as);
956 if (lwp != NULL)
957 lwp->lwp_nostop--;
958
959 /*
960 * If the lower levels returned EDEADLK for a fault,
961 * it means that we should retry the fault. Let's also wait
962 * a bit to let the deadlock-causing condition clear.
963 * This is part of a gross hack to work around a design flaw
964 * in the ufs/sds logging code and should go away when the
965 * logging code is re-designed to fix the problem. See bug
966 * 4125102 for details of the problem.
967 */
968 if (FC_ERRNO(res) == EDEADLK) {
969 delay(deadlk_wait);
970 res = 0;
971 goto retry;
972 }
973 return (res);
974 }
975
976
977
978 /*
979 * Asynchronous ``fault'' at addr for size bytes.
980 */
981 faultcode_t
982 as_faulta(struct as *as, caddr_t addr, size_t size)
983 {
984 struct seg *seg;
985 caddr_t raddr; /* rounded down addr */
986 size_t rsize; /* rounded up size */
987 faultcode_t res = 0;
988 klwp_t *lwp = ttolwp(curthread);
989
990 retry:
991 /*
992 * Indicate that the lwp is not to be stopped while waiting
993 * for a pagefault. This is to avoid deadlock while debugging
994 * a process via /proc over NFS (in particular).
995 */
996 if (lwp != NULL)
997 lwp->lwp_nostop++;
998
999 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1000 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
1001 (size_t)raddr;
1002
1003 AS_LOCK_ENTER(as, RW_READER);
1004 seg = as_segat(as, raddr);
1005 if (seg == NULL) {
1006 AS_LOCK_EXIT(as);
1007 if (lwp != NULL)
1008 lwp->lwp_nostop--;
1009 return (FC_NOMAP);
1010 }
1011
1012 for (; rsize != 0; rsize -= PAGESIZE, raddr += PAGESIZE) {
1013 if (raddr >= seg->s_base + seg->s_size) {
1014 seg = AS_SEGNEXT(as, seg);
1015 if (seg == NULL || raddr != seg->s_base) {
1016 res = FC_NOMAP;
1017 break;
1018 }
1019 }
1020 res = SEGOP_FAULTA(seg, raddr);
1021 if (res != 0)
1022 break;
1023 }
1024 AS_LOCK_EXIT(as);
1025 if (lwp != NULL)
1026 lwp->lwp_nostop--;
1027 /*
1028 * If the lower levels returned EDEADLK for a fault,
1029 * it means that we should retry the fault. Let's also wait
1030 * a bit to let the deadlock-causing condition clear.
1031 * This is part of a gross hack to work around a design flaw
1032 * in the ufs/sds logging code and should go away when the
1033 * logging code is re-designed to fix the problem. See bug
1034 * 4125102 for details of the problem.
1035 */
1036 if (FC_ERRNO(res) == EDEADLK) {
1037 delay(deadlk_wait);
1038 res = 0;
1039 goto retry;
1040 }
1041 return (res);
1042 }
1043
1044 /*
1045 * Set the virtual mapping for the interval from [addr : addr + size)
1046 * in address space `as' to have the specified protection.
1047 * It is ok for the range to cross over several segments,
1048 * as long as they are contiguous.
1049 */
1050 int
1051 as_setprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
1052 {
1053 struct seg *seg;
1054 struct as_callback *cb;
1055 size_t ssize;
1056 caddr_t raddr; /* rounded down addr */
1057 size_t rsize; /* rounded up size */
1058 int error = 0, writer = 0;
1059 caddr_t saveraddr;
1060 size_t saversize;
1061
1062 setprot_top:
1063 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1064 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
1065 (size_t)raddr;
1066
1067 if (raddr + rsize < raddr) /* check for wraparound */
1068 return (ENOMEM);
1069
1070 saveraddr = raddr;
1071 saversize = rsize;
1072
1073 /*
1074 * Normally we only lock the as as a reader. But
1075 * if due to setprot the segment driver needs to split
1076 * a segment it will return IE_RETRY. Therefore we re-acquire
1077 * the as lock as a writer so the segment driver can change
1078 * the seg list. Also the segment driver will return IE_RETRY
1079 * after it has changed the segment list so we therefore keep
1080 * locking as a writer. Since these operations should be rare,
1081 * we want to only lock as a writer when necessary.
1082 */
1083 if (writer || avl_numnodes(&as->a_wpage) != 0) {
1084 AS_LOCK_ENTER(as, RW_WRITER);
1085 } else {
1086 AS_LOCK_ENTER(as, RW_READER);
1087 }
1088
1089 as_clearwatchprot(as, raddr, rsize);
1090 seg = as_segat(as, raddr);
1091 if (seg == NULL) {
1092 as_setwatch(as);
1093 AS_LOCK_EXIT(as);
1094 return (ENOMEM);
1095 }
1096
1097 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
1098 if (raddr >= seg->s_base + seg->s_size) {
1099 seg = AS_SEGNEXT(as, seg);
1100 if (seg == NULL || raddr != seg->s_base) {
1101 error = ENOMEM;
1102 break;
1103 }
1104 }
1105 if ((raddr + rsize) > (seg->s_base + seg->s_size))
1106 ssize = seg->s_base + seg->s_size - raddr;
1107 else
1108 ssize = rsize;
1109 retry:
1110 error = SEGOP_SETPROT(seg, raddr, ssize, prot);
1111
1112 if (error == IE_NOMEM) {
1113 error = EAGAIN;
1114 break;
1115 }
1116
1117 if (error == IE_RETRY) {
1118 AS_LOCK_EXIT(as);
1119 writer = 1;
1120 goto setprot_top;
1121 }
1122
1123 if (error == EAGAIN) {
1124 /*
1125 * Make sure we have a_lock as writer.
1126 */
1127 if (writer == 0) {
1128 AS_LOCK_EXIT(as);
1129 writer = 1;
1130 goto setprot_top;
1131 }
1132
1133 /*
1134 * Memory is currently locked. It must be unlocked
1135 * before this operation can succeed through a retry.
1136 * The possible reasons for locked memory and
1137 * corresponding strategies for unlocking are:
1138 * (1) Normal I/O
1139 * wait for a signal that the I/O operation
1140 * has completed and the memory is unlocked.
1141 * (2) Asynchronous I/O
1142 * The aio subsystem does not unlock pages when
1143 * the I/O is completed. Those pages are unlocked
1144 * when the application calls aiowait/aioerror.
1145 * So, to prevent blocking forever, cv_broadcast()
1146 * is done to wake up aio_cleanup_thread.
1147 * Subsequently, segvn_reclaim will be called, and
1148 * that will do AS_CLRUNMAPWAIT() and wake us up.
1149 * (3) Long term page locking:
1150 * Drivers intending to have pages locked for a
1151 * period considerably longer than for normal I/O
1152 * (essentially forever) may have registered for a
1153 * callback so they may unlock these pages on
1154 * request. This is needed to allow this operation
1155 * to succeed. Each entry on the callback list is
1156 * examined. If the event or address range pertains
1157 * the callback is invoked (unless it already is in
1158 * progress). The a_contents lock must be dropped
1159 * before the callback, so only one callback can
1160 * be done at a time. Go to the top and do more
1161 * until zero is returned. If zero is returned,
1162 * either there were no callbacks for this event
1163 * or they were already in progress.
1164 */
1165 mutex_enter(&as->a_contents);
1166 if (as->a_callbacks &&
1167 (cb = as_find_callback(as, AS_SETPROT_EVENT,
1168 seg->s_base, seg->s_size))) {
1169 AS_LOCK_EXIT(as);
1170 as_execute_callback(as, cb, AS_SETPROT_EVENT);
1171 } else if (!AS_ISNOUNMAPWAIT(as)) {
1172 if (AS_ISUNMAPWAIT(as) == 0)
1173 cv_broadcast(&as->a_cv);
1174 AS_SETUNMAPWAIT(as);
1175 AS_LOCK_EXIT(as);
1176 while (AS_ISUNMAPWAIT(as))
1177 cv_wait(&as->a_cv, &as->a_contents);
1178 } else {
1179 /*
1180 * We may have raced with
1181 * segvn_reclaim()/segspt_reclaim(). In this
1182 * case clean nounmapwait flag and retry since
1183 * softlockcnt in this segment may be already
1184 * 0. We don't drop as writer lock so our
1185 * number of retries without sleeping should
1186 * be very small. See segvn_reclaim() for
1187 * more comments.
1188 */
1189 AS_CLRNOUNMAPWAIT(as);
1190 mutex_exit(&as->a_contents);
1191 goto retry;
1192 }
1193 mutex_exit(&as->a_contents);
1194 goto setprot_top;
1195 } else if (error != 0)
1196 break;
1197 }
1198 if (error != 0) {
1199 as_setwatch(as);
1200 } else {
1201 as_setwatchprot(as, saveraddr, saversize, prot);
1202 }
1203 AS_LOCK_EXIT(as);
1204 return (error);
1205 }
1206
1207 /*
1208 * Check to make sure that the interval [addr, addr + size)
1209 * in address space `as' has at least the specified protection.
1210 * It is ok for the range to cross over several segments, as long
1211 * as they are contiguous.
1212 */
1213 int
1214 as_checkprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
1215 {
1216 struct seg *seg;
1217 size_t ssize;
1218 caddr_t raddr; /* rounded down addr */
1219 size_t rsize; /* rounded up size */
1220 int error = 0;
1221
1222 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1223 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
1224 (size_t)raddr;
1225
1226 if (raddr + rsize < raddr) /* check for wraparound */
1227 return (ENOMEM);
1228
1229 /*
1230 * This is ugly as sin...
1231 * Normally, we only acquire the address space readers lock.
1232 * However, if the address space has watchpoints present,
1233 * we must acquire the writer lock on the address space for
1234 * the benefit of as_clearwatchprot() and as_setwatchprot().
1235 */
1236 if (avl_numnodes(&as->a_wpage) != 0)
1237 AS_LOCK_ENTER(as, RW_WRITER);
1238 else
1239 AS_LOCK_ENTER(as, RW_READER);
1240 as_clearwatchprot(as, raddr, rsize);
1241 seg = as_segat(as, raddr);
1242 if (seg == NULL) {
1243 as_setwatch(as);
1244 AS_LOCK_EXIT(as);
1245 return (ENOMEM);
1246 }
1247
1248 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
1249 if (raddr >= seg->s_base + seg->s_size) {
1250 seg = AS_SEGNEXT(as, seg);
1251 if (seg == NULL || raddr != seg->s_base) {
1252 error = ENOMEM;
1253 break;
1254 }
1255 }
1256 if ((raddr + rsize) > (seg->s_base + seg->s_size))
1257 ssize = seg->s_base + seg->s_size - raddr;
1258 else
1259 ssize = rsize;
1260
1261 error = SEGOP_CHECKPROT(seg, raddr, ssize, prot);
1262 if (error != 0)
1263 break;
1264 }
1265 as_setwatch(as);
1266 AS_LOCK_EXIT(as);
1267 return (error);
1268 }
1269
1270 int
1271 as_unmap(struct as *as, caddr_t addr, size_t size)
1272 {
1273 struct seg *seg, *seg_next;
1274 struct as_callback *cb;
1275 caddr_t raddr, eaddr;
1276 size_t ssize, rsize = 0;
1277 int err;
1278
1279 top:
1280 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1281 eaddr = (caddr_t)(((uintptr_t)(addr + size) + PAGEOFFSET) &
1282 (uintptr_t)PAGEMASK);
1283
1284 AS_LOCK_ENTER(as, RW_WRITER);
1285
1286 as->a_updatedir = 1; /* inform /proc */
1287 gethrestime(&as->a_updatetime);
1288
1289 /*
1290 * Use as_findseg to find the first segment in the range, then
1291 * step through the segments in order, following s_next.
1292 */
1293 as_clearwatchprot(as, raddr, eaddr - raddr);
1294
1295 for (seg = as_findseg(as, raddr, 0); seg != NULL; seg = seg_next) {
1296 const boolean_t is_hole = ((seg->s_flags & S_HOLE) != 0);
1297
1298 if (eaddr <= seg->s_base)
1299 break; /* eaddr was in a gap; all done */
1300
1301 /* this is implied by the test above */
1302 ASSERT(raddr < eaddr);
1303
1304 if (raddr < seg->s_base)
1305 raddr = seg->s_base; /* raddr was in a gap */
1306
1307 if (eaddr > (seg->s_base + seg->s_size))
1308 ssize = seg->s_base + seg->s_size - raddr;
1309 else
1310 ssize = eaddr - raddr;
1311
1312 /*
1313 * Save next segment pointer since seg can be
1314 * destroyed during the segment unmap operation.
1315 */
1316 seg_next = AS_SEGNEXT(as, seg);
1317
1318 /*
1319 * We didn't count /dev/null mappings, so ignore them here.
1320 * We'll handle MAP_NORESERVE cases in segvn_unmap(). (Again,
1321 * we have to do this check here while we have seg.)
1322 */
1323 rsize = 0;
1324 if (!SEG_IS_DEVNULL_MAPPING(seg) &&
1325 !SEG_IS_PARTIAL_RESV(seg))
1326 rsize = ssize;
1327
1328 retry:
1329 err = SEGOP_UNMAP(seg, raddr, ssize);
1330 if (err == EAGAIN) {
1331 /*
1332 * Memory is currently locked. It must be unlocked
1333 * before this operation can succeed through a retry.
1334 * The possible reasons for locked memory and
1335 * corresponding strategies for unlocking are:
1336 * (1) Normal I/O
1337 * wait for a signal that the I/O operation
1338 * has completed and the memory is unlocked.
1339 * (2) Asynchronous I/O
1340 * The aio subsystem does not unlock pages when
1341 * the I/O is completed. Those pages are unlocked
1342 * when the application calls aiowait/aioerror.
1343 * So, to prevent blocking forever, cv_broadcast()
1344 * is done to wake up aio_cleanup_thread.
1345 * Subsequently, segvn_reclaim will be called, and
1346 * that will do AS_CLRUNMAPWAIT() and wake us up.
1347 * (3) Long term page locking:
1348 * Drivers intending to have pages locked for a
1349 * period considerably longer than for normal I/O
1350 * (essentially forever) may have registered for a
1351 * callback so they may unlock these pages on
1352 * request. This is needed to allow this operation
1353 * to succeed. Each entry on the callback list is
1354 * examined. If the event or address range pertains
1355 * the callback is invoked (unless it already is in
1356 * progress). The a_contents lock must be dropped
1357 * before the callback, so only one callback can
1358 * be done at a time. Go to the top and do more
1359 * until zero is returned. If zero is returned,
1360 * either there were no callbacks for this event
1361 * or they were already in progress.
1362 */
1363 mutex_enter(&as->a_contents);
1364 if (as->a_callbacks &&
1365 (cb = as_find_callback(as, AS_UNMAP_EVENT,
1366 seg->s_base, seg->s_size))) {
1367 AS_LOCK_EXIT(as);
1368 as_execute_callback(as, cb, AS_UNMAP_EVENT);
1369 } else if (!AS_ISNOUNMAPWAIT(as)) {
1370 if (AS_ISUNMAPWAIT(as) == 0)
1371 cv_broadcast(&as->a_cv);
1372 AS_SETUNMAPWAIT(as);
1373 AS_LOCK_EXIT(as);
1374 while (AS_ISUNMAPWAIT(as))
1375 cv_wait(&as->a_cv, &as->a_contents);
1376 } else {
1377 /*
1378 * We may have raced with
1379 * segvn_reclaim()/segspt_reclaim(). In this
1380 * case clean nounmapwait flag and retry since
1381 * softlockcnt in this segment may be already
1382 * 0. We don't drop as writer lock so our
1383 * number of retries without sleeping should
1384 * be very small. See segvn_reclaim() for
1385 * more comments.
1386 */
1387 AS_CLRNOUNMAPWAIT(as);
1388 mutex_exit(&as->a_contents);
1389 goto retry;
1390 }
1391 mutex_exit(&as->a_contents);
1392 goto top;
1393 } else if (err == IE_RETRY) {
1394 AS_LOCK_EXIT(as);
1395 goto top;
1396 } else if (err) {
1397 as_setwatch(as);
1398 AS_LOCK_EXIT(as);
1399 return (-1);
1400 }
1401
1402 if (!is_hole) {
1403 as->a_size -= ssize;
1404 if (rsize)
1405 as->a_resvsize -= rsize;
1406 }
1407 raddr += ssize;
1408 }
1409 AS_LOCK_EXIT(as);
1410 return (0);
1411 }
1412
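/*
 * Carve the range [addr, addr + size) into segvn segments whose
 * boundaries line up with the large page sizes encoded in szcvec, a
 * bitmap of page size codes with bit 0 representing PAGESIZE. Leading
 * and trailing portions that cannot be aligned to a larger page size
 * are created as separate segments with progressively smaller szc
 * values. *segcreated is set to B_TRUE once any segment has been
 * created, so that on a subsequent failure the caller knows it must
 * unmap whatever was already established.
 */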
1413 static int
1414 as_map_segvn_segs(struct as *as, caddr_t addr, size_t size, uint_t szcvec,
1415 segcreate_func_t crfp, struct segvn_crargs *vn_a, boolean_t *segcreated)
1416 {
1417 uint_t szc, nszc, save_szcvec;
1418 int error;
1419 caddr_t a, eaddr;
1420 size_t pgsz = 0;
1421 const boolean_t do_off = (vn_a->vp != NULL || vn_a->amp != NULL);
1422
1423 ASSERT(AS_WRITE_HELD(as));
1424 ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
1425 ASSERT(IS_P2ALIGNED(size, PAGESIZE));
1426 ASSERT(vn_a->vp == NULL || vn_a->amp == NULL);
1427
1428 if (!do_off) {
1429 vn_a->offset = 0;
1430 }
1431
1432 if (szcvec <= 1) {
1433 struct seg *seg, *segref;
1434
1435 seg = segref = seg_alloc(as, addr, size);
1436 if (seg == NULL) {
1437 return (ENOMEM);
1438 }
1439 vn_a->szc = 0;
1440 error = (*crfp)(&seg, vn_a);
1441 if (error != 0) {
1442 VERIFY3P(seg, ==, segref);
1443 seg_free(seg);
1444 } else {
1445 as->a_size += size;
1446 as->a_resvsize += size;
1447 }
1448 return (error);
1449 }
1450
1451 eaddr = addr + size;
1452 save_szcvec = szcvec;
1453 szcvec >>= 1;
1454 szc = 0;
1455 nszc = 0;
1456 while (szcvec) {
1457 if ((szcvec & 0x1) == 0) {
1458 nszc++;
1459 szcvec >>= 1;
1460 continue;
1461 }
1462 nszc++;
1463 pgsz = page_get_pagesize(nszc);
1464 a = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
1465 if (a != addr) {
1466 struct seg *seg, *segref;
1467 size_t segsize;
1468
1469 ASSERT(a < eaddr);
1470
1471 segsize = a - addr;
1472 seg = segref = seg_alloc(as, addr, segsize);
1473 if (seg == NULL) {
1474 return (ENOMEM);
1475 }
1476 vn_a->szc = szc;
1477 error = (*crfp)(&seg, vn_a);
1478 if (error != 0) {
1479 VERIFY3P(seg, ==, segref);
1480 seg_free(seg);
1481 return (error);
1482 }
1483 as->a_size += segsize;
1484 as->a_resvsize += segsize;
1485 *segcreated = B_TRUE;
1486 if (do_off) {
1487 vn_a->offset += segsize;
1488 }
1489 addr = a;
1490 }
1491 szc = nszc;
1492 szcvec >>= 1;
1493 }
1494
1495 ASSERT(addr < eaddr);
1496 szcvec = save_szcvec | 1; /* add 8K pages */
1497 while (szcvec) {
1498 a = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
1499 ASSERT(a >= addr);
1500 if (a != addr) {
1501 struct seg *seg, *segref;
1502 size_t segsize;
1503
1504 segsize = a - addr;
1505 seg = segref = seg_alloc(as, addr, segsize);
1506 if (seg == NULL) {
1507 return (ENOMEM);
1508 }
1509 vn_a->szc = szc;
1510 error = (*crfp)(&seg, vn_a);
1511 if (error != 0) {
1512 VERIFY3P(seg, ==, segref);
1513 seg_free(seg);
1514 return (error);
1515 }
1516 as->a_size += segsize;
1517 as->a_resvsize += segsize;
1518 *segcreated = B_TRUE;
1519 if (do_off) {
1520 vn_a->offset += segsize;
1521 }
1522 addr = a;
1523 }
1524 szcvec &= ~(1 << szc);
1525 if (szcvec) {
1526 szc = highbit(szcvec) - 1;
1527 pgsz = page_get_pagesize(szc);
1528 }
1529 }
1530 ASSERT(addr == eaddr);
1531
1532 return (0);
1533 }
1534
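/*
 * Map a vnode-backed range, selecting large page sizes via
 * map_pgszcvec(). The file size is consulted (VOP_GETATTR with
 * AT_SIZE) so that large pages are only used for the portion of the
 * mapping that lies within the file; a tail extending past EOF, or a
 * file whose attributes cannot be obtained, is mapped with base pages
 * instead. Mappings larger than textrepl_size_thresh are additionally
 * flagged for text replication (_MAP_TEXTREPL).
 */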
1535 static int
1536 as_map_vnsegs(struct as *as, caddr_t addr, size_t size,
1537 segcreate_func_t crfp, struct segvn_crargs *vn_a, boolean_t *segcreated)
1538 {
1539 uint_t mapflags = vn_a->flags & (MAP_TEXT | MAP_INITDATA);
1540 int type = (vn_a->type == MAP_SHARED) ? MAPPGSZC_SHM : MAPPGSZC_PRIVM;
1541 uint_t szcvec = map_pgszcvec(addr, size, (uintptr_t)addr, mapflags,
1542 type, 0);
1543 int error;
1544 struct vattr va;
1545 u_offset_t eoff;
1546 size_t save_size = 0;
1547 extern size_t textrepl_size_thresh;
1548
1549 ASSERT(AS_WRITE_HELD(as));
1550 ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
1551 ASSERT(IS_P2ALIGNED(size, PAGESIZE));
1552 ASSERT(vn_a->vp != NULL);
1553 ASSERT(vn_a->amp == NULL);
1554
1555 again:
1556 if (szcvec <= 1) {
1557 struct seg *seg, *segref;
1558
1559 seg = segref = seg_alloc(as, addr, size);
1560 if (seg == NULL) {
1561 return (ENOMEM);
1562 }
1563 vn_a->szc = 0;
1564 error = (*crfp)(&seg, vn_a);
1565 if (error != 0) {
1566 VERIFY3P(seg, ==, segref);
1567 seg_free(seg);
1568 } else {
1569 as->a_size += size;
1570 as->a_resvsize += size;
1571 }
1572 return (error);
1573 }
1574
1575 va.va_mask = AT_SIZE;
1576 if (VOP_GETATTR(vn_a->vp, &va, ATTR_HINT, vn_a->cred, NULL) != 0) {
1577 szcvec = 0;
1578 goto again;
1579 }
1580 eoff = vn_a->offset & PAGEMASK;
1581 if (eoff >= va.va_size) {
1582 szcvec = 0;
1583 goto again;
1584 }
1585 eoff += size;
1586 if (btopr(va.va_size) < btopr(eoff)) {
1587 save_size = size;
1588 size = va.va_size - (vn_a->offset & PAGEMASK);
1589 size = P2ROUNDUP_TYPED(size, PAGESIZE, size_t);
1590 szcvec = map_pgszcvec(addr, size, (uintptr_t)addr, mapflags,
1591 type, 0);
1592 if (szcvec <= 1) {
1593 size = save_size;
1594 goto again;
1595 }
1596 }
1597
1598 if (size > textrepl_size_thresh) {
1599 vn_a->flags |= _MAP_TEXTREPL;
1600 }
1601 error = as_map_segvn_segs(as, addr, size, szcvec, crfp, vn_a,
1602 segcreated);
1603 if (error != 0) {
1604 return (error);
1605 }
1606 if (save_size) {
1607 addr += size;
1608 size = save_size - size;
1609 szcvec = 0;
1610 goto again;
1611 }
1612 return (0);
1613 }
1614
1615 /*
1616 * as_map_ansegs: shared or private anonymous memory. Note that the flags
1617 * passed to map_pgszvec cannot be MAP_INITDATA, for anon.
1618 */
1619 static int
1620 as_map_ansegs(struct as *as, caddr_t addr, size_t size,
1621 segcreate_func_t crfp, struct segvn_crargs *vn_a, boolean_t *segcreated)
1622 {
1623 uint_t szcvec;
1624 uchar_t type = 0;
1625
1626 ASSERT(vn_a->type == MAP_SHARED || vn_a->type == MAP_PRIVATE);
1627 if (vn_a->type == MAP_SHARED) {
1628 type = MAPPGSZC_SHM;
1629 } else if (vn_a->type == MAP_PRIVATE) {
1630 if (vn_a->szc == AS_MAP_HEAP) {
1631 type = MAPPGSZC_HEAP;
1632 } else if (vn_a->szc == AS_MAP_STACK) {
1633 type = MAPPGSZC_STACK;
1634 } else {
1635 type = MAPPGSZC_PRIVM;
1636 }
1637 }
1638 szcvec = map_pgszcvec(addr, size, vn_a->amp == NULL ?
1639 (uintptr_t)addr : (uintptr_t)P2ROUNDUP(vn_a->offset, PAGESIZE),
1640 (vn_a->flags & MAP_TEXT), type, 0);
1641 ASSERT(AS_WRITE_HELD(as));
1642 ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
1643 ASSERT(IS_P2ALIGNED(size, PAGESIZE));
1644 ASSERT(vn_a->vp == NULL);
1645
1646 return (as_map_segvn_segs(as, addr, size, szcvec,
1647 crfp, vn_a, segcreated));
1648 }
1649
1650 int
1651 as_map(struct as *as, caddr_t addr, size_t size, segcreate_func_t crfp,
1652 void *argsp)
1653 {
1654 AS_LOCK_ENTER(as, RW_WRITER);
1655 return (as_map_locked(as, addr, size, crfp, argsp));
1656 }
1657
1658 int
1659 as_map_locked(struct as *as, caddr_t addr, size_t size, segcreate_func_t crfp,
1660 void *argsp)
1661 {
1662 caddr_t raddr; /* rounded down addr */
1663 size_t rsize; /* rounded up size */
1664 int error;
1665 boolean_t is_hole = B_FALSE;
1666 /*
1667 * The use of a_proc is preferred to handle the case where curproc is
1668 * a door_call server and is allocating memory in the client's (a_proc)
1669 * address space.
1670 * When creating a shared memory segment a_proc will be NULL so we
1671 * fall back to curproc in that case.
1672 */
1673 struct proc *p = (as->a_proc == NULL) ? curproc : as->a_proc;
1674 struct segvn_crargs crargs;
1675
1676 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1677 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
1678 (size_t)raddr;
1679
1680 /*
1681 * check for wrap around
1682 */
1683 if ((raddr + rsize < raddr) || (as->a_size > (ULONG_MAX - size))) {
1684 AS_LOCK_EXIT(as);
1685 return (ENOMEM);
1686 }
1687
1688 as->a_updatedir = 1; /* inform /proc */
1689 gethrestime(&as->a_updatetime);
1690
1691 if (as != &kas) {
1692 /*
1693 * Ensure that the virtual size of the process will not exceed
1694 * the configured limit. Since seg_hole segments will later
1695 * set the S_HOLE flag indicating their status as a hole in the
1696 * AS, they are excluded from this check.
1697 */
1698 if (as->a_size + rsize > (size_t)p->p_vmem_ctl &&
1699 !AS_MAP_CHECK_SEGHOLE(crfp)) {
1700 AS_LOCK_EXIT(as);
1701
1702 (void) rctl_action(rctlproc_legacy[RLIMIT_VMEM],
1703 p->p_rctls, p, RCA_UNSAFE_ALL);
1704 return (ENOMEM);
1705 }
1706 }
1707
1708 if (AS_MAP_CHECK_VNODE_LPOOB(crfp, argsp)) {
1709 boolean_t do_unmap = B_FALSE;
1710
1711 crargs = *(struct segvn_crargs *)argsp;
1712 error = as_map_vnsegs(as, raddr, rsize, crfp, &crargs,
1713 &do_unmap);
1714 if (error != 0) {
1715 AS_LOCK_EXIT(as);
1716 if (do_unmap) {
1717 (void) as_unmap(as, addr, size);
1718 }
1719 return (error);
1720 }
1721 } else if (AS_MAP_CHECK_ANON_LPOOB(crfp, argsp)) {
1722 boolean_t do_unmap = B_FALSE;
1723
1724 crargs = *(struct segvn_crargs *)argsp;
1725 error = as_map_ansegs(as, raddr, rsize, crfp, &crargs,
1726 &do_unmap);
1727 if (error != 0) {
1728 AS_LOCK_EXIT(as);
1729 if (do_unmap) {
1730 (void) as_unmap(as, addr, size);
1731 }
1732 return (error);
1733 }
1734 } else {
1735 struct seg *seg, *segref;
1736
1737 seg = segref = seg_alloc(as, addr, size);
1738 if (seg == NULL) {
1739 AS_LOCK_EXIT(as);
1740 return (ENOMEM);
1741 }
1742
1743 /*
1744 * It is possible that the segment creation routine will free
1745 * 'seg' as part of a more advanced operation, such as when
1746 * segvn concatenates adjacent segments together. When this
1747 * occurs, the seg*_create routine must communicate the
1748 * resulting segment out via the 'struct seg **' parameter.
1749 *
1750 * If segment creation fails, it must not free the passed-in
1751 * segment, nor alter the argument pointer.
1752 */
1753 error = (*crfp)(&seg, argsp);
1754 if (error != 0) {
1755 VERIFY3P(seg, ==, segref);
1756 seg_free(seg);
1757 AS_LOCK_EXIT(as);
1758 return (error);
1759 }
1760
1761 /*
1762 * Check if the resulting segment represents a hole in the
1763 * address space, rather than contributing to the AS size.
1764 */
1765 is_hole = ((seg->s_flags & S_HOLE) != 0);
1766
1767 /* Add size now so as_unmap will work if as_ctl fails. */
1768 if (!is_hole) {
1769 as->a_size += rsize;
1770 as->a_resvsize += rsize;
1771 }
1772 }
1773
1774 as_setwatch(as);
1775
1776 /*
1777 * Establish memory locks for the segment if the address space is
1778 * locked, provided it's not an explicit hole in the AS.
1779 */
1780 mutex_enter(&as->a_contents);
1781 if (AS_ISPGLCK(as) && !is_hole) {
1782 mutex_exit(&as->a_contents);
1783 AS_LOCK_EXIT(as);
1784 error = as_ctl(as, addr, size, MC_LOCK, 0, 0, NULL, 0);
1785 if (error != 0)
1786 (void) as_unmap(as, addr, size);
1787 } else {
1788 mutex_exit(&as->a_contents);
1789 AS_LOCK_EXIT(as);
1790 }
1791 return (error);
1792 }
1793
1794
1795 /*
1796 * Delete all segments in the address space marked with S_PURGE.
1797 * This is currently used for Sparc V9 nofault ASI segments (seg_nf.c).
1798 * These segments are deleted as a first step before calls to as_gap(), so
1799 * that they don't affect mmap() or shmat().
1800 */
1801 void
1802 as_purge(struct as *as)
1803 {
1804 struct seg *seg;
1805 struct seg *next_seg;
1806
1807 /*
1808 * The setting of AS_NEEDSPURGE is protected by as_rangelock(), so
1809 * there is no need to grab the a_contents mutex for this check.
1810 */
1811 if ((as->a_flags & AS_NEEDSPURGE) == 0)
1812 return;
1813
1814 AS_LOCK_ENTER(as, RW_WRITER);
1815 next_seg = NULL;
1816 seg = AS_SEGFIRST(as);
1817 while (seg != NULL) {
1818 next_seg = AS_SEGNEXT(as, seg);
1819 if (seg->s_flags & S_PURGE)
1820 SEGOP_UNMAP(seg, seg->s_base, seg->s_size);
1821 seg = next_seg;
1822 }
1823 AS_LOCK_EXIT(as);
1824
1825 mutex_enter(&as->a_contents);
1826 as->a_flags &= ~AS_NEEDSPURGE;
1827 mutex_exit(&as->a_contents);
1828 }
1829
1830 /*
1831 * Find a hole within [*basep, *basep + *lenp), which contains a mappable
1832 * range of addresses at least "minlen" long, where the base of the range is
1833 * at "off" phase from an "align" boundary and there is space for a
1834 * "redzone"-sized redzone on either side of the range. Thus,
1835 * if align was 4M and off was 16k, the user wants a hole which will start
1836 * 16k into a 4M page.
1837 *
1838 * If flags specifies AH_HI, the hole will have the highest possible address
1839 * in the range. We use the as->a_lastgap field to figure out where to
1840 * start looking for a gap.
1841 *
1842 * Otherwise, the gap will have the lowest possible address.
1843 *
1844 * If flags specifies AH_CONTAIN, the hole will contain the address addr.
1845 *
1846 * If an adequate hole is found, *basep and *lenp are set to reflect the part of
1847 * the hole that is within range, and 0 is returned. On failure, -1 is returned.
1848 *
1849 * NOTE: This routine is not correct when base+len overflows caddr_t.
1850 */
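/*
 * For illustration (restating the example above with the redzone
 * included): with align = 4M, off = 16k and redzone = 8k, a
 * successful return describes a hole in which a minlen-sized range
 * starting 16k past some 4M boundary will fit, with room for an 8k
 * redzone on either side of that range.
 */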
1851 int
1852 as_gap_aligned(struct as *as, size_t minlen, caddr_t *basep, size_t *lenp,
1853 uint_t flags, caddr_t addr, size_t align, size_t redzone, size_t off)
1854 {
1855 caddr_t lobound = *basep;
1856 caddr_t hibound = lobound + *lenp;
1857 struct seg *lseg, *hseg;
1858 caddr_t lo, hi;
1859 int forward;
1860 caddr_t save_base;
1861 size_t save_len;
1862 size_t save_minlen;
1863 size_t save_redzone;
1864 int fast_path = 1;
1865
1866 save_base = *basep;
1867 save_len = *lenp;
1868 save_minlen = minlen;
1869 save_redzone = redzone;
1870
1871 /*
1872 * For the first pass/fast_path, just add align and redzone into
1873 * minlen since if we get an allocation, we can guarantee that it
1874 * will fit the alignment and redzone requested.
1875 * This increases the chance that hibound will be adjusted to
1876 * a_lastgap->s_base which will likely allow us to find an
1877 * acceptable hole in the address space quicker.
1878 * If we can't find a hole with this fast_path, then we look for
1879 * smaller holes in which the alignment and offset may allow
1880 * the allocation to fit.
1881 */
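	/*
	 * Worked example of the fast path (hypothetical numbers): with
	 * minlen = 1M, align = 4M and redzone = 8K, this first pass asks
	 * for a plain hole of 1M + 4M + 2 * 8K = 5M + 16K with redzone 0.
	 * Any hole that large can always be trimmed to satisfy the 4M
	 * alignment, the requested offset and both 8K redzones, so no
	 * alignment arithmetic is needed unless we fall back to the
	 * retry pass below.
	 */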
1882 minlen += align;
1883 minlen += 2 * redzone;
1884 redzone = 0;
1885
1886 AS_LOCK_ENTER(as, RW_READER);
1887 if (AS_SEGFIRST(as) == NULL) {
1888 if (valid_va_range_aligned(basep, lenp, minlen, flags & AH_DIR,
1889 align, redzone, off)) {
1890 AS_LOCK_EXIT(as);
1891 return (0);
1892 } else {
1893 AS_LOCK_EXIT(as);
1894 *basep = save_base;
1895 *lenp = save_len;
1896 return (-1);
1897 }
1898 }
1899
1900 retry:
1901 /*
1902 * Set up to iterate over all the inter-segment holes in the given
1903 * direction. lseg is NULL for the lowest-addressed hole and hseg is
1904 * NULL for the highest-addressed hole. If moving backwards, we reset
1905  * hseg to denote the highest-addressed segment.
1906 */
1907 forward = (flags & AH_DIR) == AH_LO;
1908 if (forward) {
1909 hseg = as_findseg(as, lobound, 1);
1910 lseg = AS_SEGPREV(as, hseg);
1911 } else {
1912
1913 /*
1914 * If allocating at least as much as the last allocation,
1915 * use a_lastgap's base as a better estimate of hibound.
1916 */
1917 if (as->a_lastgap &&
1918 minlen >= as->a_lastgap->s_size &&
1919 hibound >= as->a_lastgap->s_base)
1920 hibound = as->a_lastgap->s_base;
1921
1922 hseg = as_findseg(as, hibound, 1);
1923 if (hseg->s_base + hseg->s_size < hibound) {
1924 lseg = hseg;
1925 hseg = NULL;
1926 } else {
1927 lseg = AS_SEGPREV(as, hseg);
1928 }
1929 }
1930
1931 for (;;) {
1932 /*
1933 * Set lo and hi to the hole's boundaries. (We should really
1934 * use MAXADDR in place of hibound in the expression below,
1935 * but can't express it easily; using hibound in its place is
1936 * harmless.)
1937 */
1938 lo = (lseg == NULL) ? 0 : lseg->s_base + lseg->s_size;
1939 hi = (hseg == NULL) ? hibound : hseg->s_base;
1940 /*
1941 * If the iteration has moved past the interval from lobound
1942 * to hibound it's pointless to continue.
1943 */
1944 if ((forward && lo > hibound) || (!forward && hi < lobound))
1945 break;
1946 else if (lo > hibound || hi < lobound)
1947 goto cont;
1948 /*
1949 * Candidate hole lies at least partially within the allowable
1950 * range. Restrict it to fall completely within that range,
1951 * i.e., to [max(lo, lobound), min(hi, hibound)].
1952 */
1953 if (lo < lobound)
1954 lo = lobound;
1955 if (hi > hibound)
1956 hi = hibound;
1957 /*
1958 * Verify that the candidate hole is big enough and meets
1959 * hardware constraints. If the hole is too small, no need
1960 * to do the further checks since they will fail.
1961 */
1962 *basep = lo;
1963 *lenp = hi - lo;
1964 if (*lenp >= minlen && valid_va_range_aligned(basep, lenp,
1965 minlen, forward ? AH_LO : AH_HI, align, redzone, off) &&
1966 ((flags & AH_CONTAIN) == 0 ||
1967 (*basep <= addr && *basep + *lenp > addr))) {
1968 if (!forward)
1969 as->a_lastgap = hseg;
1970 if (hseg != NULL)
1971 as->a_lastgaphl = hseg;
1972 else
1973 as->a_lastgaphl = lseg;
1974 AS_LOCK_EXIT(as);
1975 return (0);
1976 }
1977 cont:
1978 /*
1979 * Move to the next hole.
1980 */
1981 if (forward) {
1982 lseg = hseg;
1983 if (lseg == NULL)
1984 break;
1985 hseg = AS_SEGNEXT(as, hseg);
1986 } else {
1987 hseg = lseg;
1988 if (hseg == NULL)
1989 break;
1990 lseg = AS_SEGPREV(as, lseg);
1991 }
1992 }
1993 if (fast_path && (align != 0 || save_redzone != 0)) {
1994 fast_path = 0;
1995 minlen = save_minlen;
1996 redzone = save_redzone;
1997 goto retry;
1998 }
1999 *basep = save_base;
2000 *lenp = save_len;
2001 AS_LOCK_EXIT(as);
2002 return (-1);
2003 }
2004
2005 /*
2006 * Find a hole of at least size minlen within [*basep, *basep + *lenp).
2007 *
2008 * If flags specifies AH_HI, the hole will have the highest possible address
2009 * in the range. We use the as->a_lastgap field to figure out where to
2010 * start looking for a gap.
2011 *
2012 * Otherwise, the gap will have the lowest possible address.
2013 *
2014 * If flags specifies AH_CONTAIN, the hole will contain the address addr.
2015 *
2016  * If an adequate hole is found, *basep and *lenp are set to reflect the part
2017  * of the hole that is within range, and 0 is returned; otherwise,
2018  * -1 is returned.
2019 *
2020 * NOTE: This routine is not correct when base+len overflows caddr_t.
2021 */
2022 int
2023 as_gap(struct as *as, size_t minlen, caddr_t *basep, size_t *lenp, uint_t flags,
2024 caddr_t addr)
2025 {
2026
2027 return (as_gap_aligned(as, minlen, basep, lenp, flags, addr, 0, 0, 0));
2028 }
2029
2030 /*
2031 * Return the next range within [base, base + len) that is backed
2032 * with "real memory". Skip holes and non-seg_vn segments.
2033 * We're lazy and only return one segment at a time.
2034 */
2035 int
2036 as_memory(struct as *as, caddr_t *basep, size_t *lenp)
2037 {
2038 extern struct seg_ops segspt_shmops; /* needs a header file */
2039 struct seg *seg;
2040 caddr_t addr, eaddr;
2041 caddr_t segend;
2042
2043 AS_LOCK_ENTER(as, RW_READER);
2044
2045 addr = *basep;
2046 eaddr = addr + *lenp;
2047
2048 seg = as_findseg(as, addr, 0);
2049 if (seg != NULL)
2050 addr = MAX(seg->s_base, addr);
2051
2052 for (;;) {
2053 if (seg == NULL || addr >= eaddr || eaddr <= seg->s_base) {
2054 AS_LOCK_EXIT(as);
2055 return (EINVAL);
2056 }
2057
2058 if (seg->s_ops == &segvn_ops) {
2059 segend = seg->s_base + seg->s_size;
2060 break;
2061 }
2062
2063 /*
2064 * We do ISM by looking into the private data
2065 * to determine the real size of the segment.
2066 */
2067 if (seg->s_ops == &segspt_shmops) {
2068 segend = seg->s_base + spt_realsize(seg);
2069 if (addr < segend)
2070 break;
2071 }
2072
2073 seg = AS_SEGNEXT(as, seg);
2074
2075 if (seg != NULL)
2076 addr = seg->s_base;
2077 }
2078
2079 *basep = addr;
2080
2081 if (segend > eaddr)
2082 *lenp = eaddr - addr;
2083 else
2084 *lenp = segend - addr;
2085
2086 AS_LOCK_EXIT(as);
2087 return (0);
2088 }
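
/*
 * Illustrative use (a hedged sketch with hypothetical locals): since
 * as_memory() returns at most one backed range per call, a caller walks
 * an interval [start, end) by re-issuing the call with the remainder:
 *
 *	caddr_t base = start;
 *	size_t len = end - start;
 *
 *	while (as_memory(as, &base, &len) == 0) {
 *		...	(operate on [base, base + len))
 *		base += len;
 *		if (base >= end)
 *			break;
 *		len = end - base;
 *	}
 */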
2089
2090 /*
2091 * Swap the pages associated with the address space as out to
2092 * secondary storage, returning the number of bytes actually
2093 * swapped.
2094 *
2095 * The value returned is intended to correlate well with the process's
2096 * memory requirements. Its usefulness for this purpose depends on
2097 * how well the segment-level routines do at returning accurate
2098 * information.
2099 */
2100 size_t
2101 as_swapout(struct as *as)
2102 {
2103 struct seg *seg;
2104 size_t swpcnt = 0;
2105
2106 /*
2107 * Kernel-only processes have given up their address
2108 * spaces. Of course, we shouldn't be attempting to
2109 * swap out such processes in the first place...
2110 */
2111 if (as == NULL)
2112 return (0);
2113
2114 AS_LOCK_ENTER(as, RW_READER);
2115
2116 /*
2117 * Free all mapping resources associated with the address
2118 * space. The segment-level swapout routines capitalize
2119 	 * on this unmapping by scavenging pages that have become
2120 * unmapped here.
2121 */
2122 hat_swapout(as->a_hat);
2123
2124 /*
2125 * Call the swapout routines of all segments in the address
2126 * space to do the actual work, accumulating the amount of
2127 * space reclaimed.
2128 */
2129 for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
2130 struct seg_ops *ov = seg->s_ops;
2131
2132 /*
2133 * We have to check to see if the seg has
2134 * an ops vector because the seg may have
2135 * been in the middle of being set up when
2136 * the process was picked for swapout.
2137 */
2138 if ((ov != NULL) && (ov->swapout != NULL))
2139 swpcnt += SEGOP_SWAPOUT(seg);
2140 }
2141 AS_LOCK_EXIT(as);
2142 return (swpcnt);
2143 }
2144
2145 /*
2146 * Determine whether data from the mappings in interval [addr, addr + size)
2147 * are in the primary memory (core) cache.
2148 */
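/*
 * "vec" receives one byte per page of the page-rounded range, filled in by
 * each segment's incore routine, and *sizep accumulates the number of
 * bytes actually examined.  A return value of -1 means the range was not
 * fully covered by segments, or a segment reported on less of the range
 * than was asked of it.
 */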
2149 int
2150 as_incore(struct as *as, caddr_t addr,
2151 size_t size, char *vec, size_t *sizep)
2152 {
2153 struct seg *seg;
2154 size_t ssize;
2155 caddr_t raddr; /* rounded down addr */
2156 size_t rsize; /* rounded up size */
2157 size_t isize; /* iteration size */
2158 int error = 0; /* result, assume success */
2159
2160 *sizep = 0;
2161 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2162 rsize = ((((size_t)addr + size) + PAGEOFFSET) & PAGEMASK) -
2163 (size_t)raddr;
2164
2165 if (raddr + rsize < raddr) /* check for wraparound */
2166 return (ENOMEM);
2167
2168 AS_LOCK_ENTER(as, RW_READER);
2169 seg = as_segat(as, raddr);
2170 if (seg == NULL) {
2171 AS_LOCK_EXIT(as);
2172 return (-1);
2173 }
2174
2175 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
2176 if (raddr >= seg->s_base + seg->s_size) {
2177 seg = AS_SEGNEXT(as, seg);
2178 if (seg == NULL || raddr != seg->s_base) {
2179 error = -1;
2180 break;
2181 }
2182 }
2183 if ((raddr + rsize) > (seg->s_base + seg->s_size))
2184 ssize = seg->s_base + seg->s_size - raddr;
2185 else
2186 ssize = rsize;
2187 *sizep += isize = SEGOP_INCORE(seg, raddr, ssize, vec);
2188 if (isize != ssize) {
2189 error = -1;
2190 break;
2191 }
2192 vec += btopr(ssize);
2193 }
2194 AS_LOCK_EXIT(as);
2195 return (error);
2196 }
2197
2198 static void
2199 as_segunlock(struct seg *seg, caddr_t addr, int attr,
2200 ulong_t *bitmap, size_t position, size_t npages)
2201 {
2202 caddr_t range_start;
2203 size_t pos1 = position;
2204 size_t pos2;
2205 size_t size;
2206 size_t end_pos = npages + position;
2207
2208 while (bt_range(bitmap, &pos1, &pos2, end_pos)) {
2209 size = ptob((pos2 - pos1));
2210 range_start = (caddr_t)((uintptr_t)addr +
2211 ptob(pos1 - position));
2212
2213 (void) SEGOP_LOCKOP(seg, range_start, size, attr, MC_UNLOCK,
2214 (ulong_t *)NULL, (size_t)NULL);
2215 pos1 = pos2;
2216 }
2217 }
2218
2219 static void
2220 as_unlockerr(struct as *as, int attr, ulong_t *mlock_map,
2221 caddr_t raddr, size_t rsize)
2222 {
2223 struct seg *seg = as_segat(as, raddr);
2224 size_t ssize;
2225
2226 while (rsize != 0) {
2227 if (raddr >= seg->s_base + seg->s_size)
2228 seg = AS_SEGNEXT(as, seg);
2229
2230 if ((raddr + rsize) > (seg->s_base + seg->s_size))
2231 ssize = seg->s_base + seg->s_size - raddr;
2232 else
2233 ssize = rsize;
2234
2235 as_segunlock(seg, raddr, attr, mlock_map, 0, btopr(ssize));
2236
2237 rsize -= ssize;
2238 raddr += ssize;
2239 }
2240 }
2241
2242 /*
2243 * Cache control operations over the interval [addr, addr + size) in
2244 * address space "as".
2245 */
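/*
 * For example, the mapping code earlier in this file locks a freshly
 * created mapping with:
 *
 *	error = as_ctl(as, addr, size, MC_LOCK, 0, 0, NULL, 0);
 *
 * MC_LOCKAS and MC_UNLOCKAS instead operate on every non-hole segment in
 * the address space rather than on [addr, addr + size).
 */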
2246 /*ARGSUSED*/
2247 int
2248 as_ctl(struct as *as, caddr_t addr, size_t size, int func, int attr,
2249 uintptr_t arg, ulong_t *lock_map, size_t pos)
2250 {
2251 struct seg *seg; /* working segment */
2252 caddr_t raddr; /* rounded down addr */
2253 caddr_t initraddr; /* saved initial rounded down addr */
2254 size_t rsize; /* rounded up size */
2255 size_t initrsize; /* saved initial rounded up size */
2256 size_t ssize; /* size of seg */
2257 int error = 0; /* result */
2258 size_t mlock_size; /* size of bitmap */
2259 ulong_t *mlock_map; /* pointer to bitmap used */
2260 /* to represent the locked */
2261 /* pages. */
2262
2263 mlock_size = 0;
2264 mlock_map = NULL;
2265 retry:
2266 if (error == IE_RETRY)
2267 AS_LOCK_ENTER(as, RW_WRITER);
2268 else
2269 AS_LOCK_ENTER(as, RW_READER);
2270
2271 /*
2272 * If these are address space lock/unlock operations, loop over
2273 * all segments in the address space, as appropriate.
2274 */
2275 if (func == MC_LOCKAS) {
2276 size_t npages, idx;
2277 size_t rlen = 0; /* rounded as length */
2278
2279 idx = pos;
2280
2281 if (arg & MCL_FUTURE) {
2282 mutex_enter(&as->a_contents);
2283 AS_SETPGLCK(as);
2284 mutex_exit(&as->a_contents);
2285 }
2286 if ((arg & MCL_CURRENT) == 0) {
2287 AS_LOCK_EXIT(as);
2288 return (0);
2289 }
2290
2291 seg = AS_SEGFIRST(as);
2292 if (seg == NULL) {
2293 AS_LOCK_EXIT(as);
2294 return (0);
2295 }
2296
2297 do {
2298 raddr = (caddr_t)((uintptr_t)seg->s_base &
2299 (uintptr_t)PAGEMASK);
2300 rlen += (((uintptr_t)(seg->s_base + seg->s_size) +
2301 PAGEOFFSET) & PAGEMASK) - (uintptr_t)raddr;
2302 } while ((seg = AS_SEGNEXT(as, seg)) != NULL);
2303
2304 mlock_size = BT_BITOUL(btopr(rlen));
2305 if ((mlock_map = (ulong_t *)kmem_zalloc(mlock_size *
2306 sizeof (ulong_t), KM_NOSLEEP)) == NULL) {
2307 AS_LOCK_EXIT(as);
2308 return (EAGAIN);
2309 }
2310
2311 for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) {
2312 if ((seg->s_flags & S_HOLE) != 0) {
2313 continue;
2314 }
2315 error = SEGOP_LOCKOP(seg, seg->s_base,
2316 seg->s_size, attr, MC_LOCK, mlock_map, pos);
2317 if (error != 0)
2318 break;
2319 pos += seg_pages(seg);
2320 }
2321
2322 if (error) {
2323 for (seg = AS_SEGFIRST(as); seg != NULL;
2324 seg = AS_SEGNEXT(as, seg)) {
2325
2326 raddr = (caddr_t)((uintptr_t)seg->s_base &
2327 (uintptr_t)PAGEMASK);
2328 npages = seg_pages(seg);
2329 as_segunlock(seg, raddr, attr, mlock_map,
2330 idx, npages);
2331 idx += npages;
2332 }
2333 }
2334
2335 kmem_free(mlock_map, mlock_size * sizeof (ulong_t));
2336 AS_LOCK_EXIT(as);
2337 goto lockerr;
2338 } else if (func == MC_UNLOCKAS) {
2339 mutex_enter(&as->a_contents);
2340 AS_CLRPGLCK(as);
2341 mutex_exit(&as->a_contents);
2342
2343 for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) {
2344 if ((seg->s_flags & S_HOLE) != 0) {
2345 continue;
2346 }
2347 error = SEGOP_LOCKOP(seg, seg->s_base,
2348 seg->s_size, attr, MC_UNLOCK, NULL, 0);
2349 if (error != 0)
2350 break;
2351 }
2352
2353 AS_LOCK_EXIT(as);
2354 goto lockerr;
2355 }
2356
2357 /*
2358 * Normalize addresses and sizes.
2359 */
2360 initraddr = raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2361 initrsize = rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
2362 (size_t)raddr;
2363
2364 if (raddr + rsize < raddr) { /* check for wraparound */
2365 AS_LOCK_EXIT(as);
2366 return (ENOMEM);
2367 }
2368
2369 /*
2370 * Get initial segment.
2371 */
2372 if ((seg = as_segat(as, raddr)) == NULL) {
2373 AS_LOCK_EXIT(as);
2374 return (ENOMEM);
2375 }
2376
2377 if (func == MC_LOCK) {
2378 mlock_size = BT_BITOUL(btopr(rsize));
2379 if ((mlock_map = (ulong_t *)kmem_zalloc(mlock_size *
2380 sizeof (ulong_t), KM_NOSLEEP)) == NULL) {
2381 AS_LOCK_EXIT(as);
2382 return (EAGAIN);
2383 }
2384 }
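
	/*
	 * Sizing example (hypothetical numbers): for a 1M request on a
	 * 64-bit kernel with 4K pages, btopr(rsize) is 256 pages and
	 * BT_BITOUL(256) is 4 ulong_t's -- one bit per page, recording
	 * which pages were locked so that as_unlockerr() can undo the
	 * work if a later segment fails.
	 */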
2385
2386 /*
2387 * Loop over all segments. If a hole in the address range is
2388 * discovered, then fail. For each segment, perform the appropriate
2389 * control operation.
2390 */
2391 while (rsize != 0) {
2392
2393 /*
2394 		 * Make sure there's no hole, and calculate the portion
2395 		 * of the next segment to be operated over.
2396 */
2397 if (raddr >= seg->s_base + seg->s_size) {
2398 seg = AS_SEGNEXT(as, seg);
2399 if (seg == NULL || raddr != seg->s_base) {
2400 if (func == MC_LOCK) {
2401 as_unlockerr(as, attr, mlock_map,
2402 initraddr, initrsize - rsize);
2403 kmem_free(mlock_map,
2404 mlock_size * sizeof (ulong_t));
2405 }
2406 AS_LOCK_EXIT(as);
2407 return (ENOMEM);
2408 }
2409 }
2410 if ((raddr + rsize) > (seg->s_base + seg->s_size))
2411 ssize = seg->s_base + seg->s_size - raddr;
2412 else
2413 ssize = rsize;
2414
2415 /*
2416 * Dispatch on specific function.
2417 */
2418 switch (func) {
2419
2420 /*
2421 * Synchronize cached data from mappings with backing
2422 * objects.
2423 */
2424 case MC_SYNC:
2425 if (error = SEGOP_SYNC(seg, raddr, ssize,
2426 attr, (uint_t)arg)) {
2427 AS_LOCK_EXIT(as);
2428 return (error);
2429 }
2430 break;
2431
2432 /*
2433 * Lock pages in memory.
2434 */
2435 case MC_LOCK:
2436 if (error = SEGOP_LOCKOP(seg, raddr, ssize,
2437 attr, func, mlock_map, pos)) {
2438 as_unlockerr(as, attr, mlock_map, initraddr,
2439 initrsize - rsize + ssize);
2440 kmem_free(mlock_map, mlock_size *
2441 sizeof (ulong_t));
2442 AS_LOCK_EXIT(as);
2443 goto lockerr;
2444 }
2445 break;
2446
2447 /*
2448 * Unlock mapped pages.
2449 */
2450 case MC_UNLOCK:
2451 (void) SEGOP_LOCKOP(seg, raddr, ssize, attr, func,
2452 (ulong_t *)NULL, (size_t)NULL);
2453 break;
2454
2455 /*
2456 * Store VM advise for mapped pages in segment layer.
2457 */
2458 case MC_ADVISE:
2459 error = SEGOP_ADVISE(seg, raddr, ssize, (uint_t)arg);
2460
2461 /*
2462 * Check for regular errors and special retry error
2463 */
2464 if (error) {
2465 if (error == IE_RETRY) {
2466 /*
2467 * Need to acquire writers lock, so
2468 * have to drop readers lock and start
2469 * all over again
2470 */
2471 AS_LOCK_EXIT(as);
2472 goto retry;
2473 } else if (error == IE_REATTACH) {
2474 /*
2475 * Find segment for current address
2476 * because current segment just got
2477 * split or concatenated
2478 */
2479 seg = as_segat(as, raddr);
2480 if (seg == NULL) {
2481 AS_LOCK_EXIT(as);
2482 return (ENOMEM);
2483 }
2484 } else {
2485 /*
2486 * Regular error
2487 */
2488 AS_LOCK_EXIT(as);
2489 return (error);
2490 }
2491 }
2492 break;
2493
2494 case MC_INHERIT_ZERO:
2495 if (seg->s_ops->inherit == NULL) {
2496 error = ENOTSUP;
2497 } else {
2498 error = SEGOP_INHERIT(seg, raddr, ssize,
2499 SEGP_INH_ZERO);
2500 }
2501 if (error != 0) {
2502 AS_LOCK_EXIT(as);
2503 return (error);
2504 }
2505 break;
2506
2507 /*
2508 * Can't happen.
2509 */
2510 default:
2511 panic("as_ctl: bad operation %d", func);
2512 /*NOTREACHED*/
2513 }
2514
2515 rsize -= ssize;
2516 raddr += ssize;
2517 }
2518
2519 if (func == MC_LOCK)
2520 kmem_free(mlock_map, mlock_size * sizeof (ulong_t));
2521 AS_LOCK_EXIT(as);
2522 return (0);
2523 lockerr:
2524
2525 /*
2526 * If the lower levels returned EDEADLK for a segment lockop,
2527 * it means that we should retry the operation. Let's wait
2528 * a bit also to let the deadlock causing condition clear.
2529 * This is part of a gross hack to work around a design flaw
2530 * in the ufs/sds logging code and should go away when the
2531 * logging code is re-designed to fix the problem. See bug
2532 * 4125102 for details of the problem.
2533 */
2534 if (error == EDEADLK) {
2535 delay(deadlk_wait);
2536 error = 0;
2537 goto retry;
2538 }
2539 return (error);
2540 }
2541
2542 int
2543 fc_decode(faultcode_t fault_err)
2544 {
2545 int error = 0;
2546
2547 switch (FC_CODE(fault_err)) {
2548 case FC_OBJERR:
2549 error = FC_ERRNO(fault_err);
2550 break;
2551 case FC_PROT:
2552 error = EACCES;
2553 break;
2554 default:
2555 error = EFAULT;
2556 break;
2557 }
2558 return (error);
2559 }
2560
2561 /*
2562 * Pagelock pages from a range that spans more than 1 segment. Obtain shadow
2563 * lists from each segment and copy them to one contiguous shadow list (plist)
2564 * as expected by the caller. Save pointers to per segment shadow lists at
2565 * the tail of plist so that they can be used during as_pageunlock().
2566 */
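/*
 * Layout of the shadow list built below: plist holds btop(size) page
 * pointers for the caller, followed by one entry per covered segment that
 * saves that segment's own shadow list pointer:
 *
 *	plist[0 .. npages - 1]			flat list of page_t pointers
 *	plist[npages .. npages + segcnt - 1]	per-segment shadow lists,
 *						consumed by as_pageunlock_segs()
 */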
2567 static int
2568 as_pagelock_segs(struct as *as, struct seg *seg, struct page ***ppp,
2569 caddr_t addr, size_t size, enum seg_rw rw)
2570 {
2571 caddr_t sv_addr = addr;
2572 size_t sv_size = size;
2573 struct seg *sv_seg = seg;
2574 ulong_t segcnt = 1;
2575 ulong_t cnt;
2576 size_t ssize;
2577 pgcnt_t npages = btop(size);
2578 page_t **plist;
2579 page_t **pl;
2580 int error;
2581 caddr_t eaddr;
2582 faultcode_t fault_err = 0;
2583 pgcnt_t pl_off;
2584 extern struct seg_ops segspt_shmops;
2585
2586 ASSERT(AS_LOCK_HELD(as));
2587 ASSERT(seg != NULL);
2588 ASSERT(addr >= seg->s_base && addr < seg->s_base + seg->s_size);
2589 ASSERT(addr + size > seg->s_base + seg->s_size);
2590 ASSERT(IS_P2ALIGNED(size, PAGESIZE));
2591 ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
2592
2593 /*
2594 * Count the number of segments covered by the range we are about to
2595 * lock. The segment count is used to size the shadow list we return
2596 * back to the caller.
2597 */
2598 for (; size != 0; size -= ssize, addr += ssize) {
2599 if (addr >= seg->s_base + seg->s_size) {
2600
2601 seg = AS_SEGNEXT(as, seg);
2602 if (seg == NULL || addr != seg->s_base) {
2603 AS_LOCK_EXIT(as);
2604 return (EFAULT);
2605 }
2606 /*
2607 * Do a quick check if subsequent segments
2608 * will most likely support pagelock.
2609 */
2610 if (seg->s_ops == &segvn_ops) {
2611 vnode_t *vp;
2612
2613 if (SEGOP_GETVP(seg, addr, &vp) != 0 ||
2614 vp != NULL) {
2615 AS_LOCK_EXIT(as);
2616 goto slow;
2617 }
2618 } else if (seg->s_ops != &segspt_shmops) {
2619 AS_LOCK_EXIT(as);
2620 goto slow;
2621 }
2622 segcnt++;
2623 }
2624 if (addr + size > seg->s_base + seg->s_size) {
2625 ssize = seg->s_base + seg->s_size - addr;
2626 } else {
2627 ssize = size;
2628 }
2629 }
2630 ASSERT(segcnt > 1);
2631
2632 plist = kmem_zalloc((npages + segcnt) * sizeof (page_t *), KM_SLEEP);
2633
2634 addr = sv_addr;
2635 size = sv_size;
2636 seg = sv_seg;
2637
2638 for (cnt = 0, pl_off = 0; size != 0; size -= ssize, addr += ssize) {
2639 if (addr >= seg->s_base + seg->s_size) {
2640 seg = AS_SEGNEXT(as, seg);
2641 ASSERT(seg != NULL && addr == seg->s_base);
2642 cnt++;
2643 ASSERT(cnt < segcnt);
2644 }
2645 if (addr + size > seg->s_base + seg->s_size) {
2646 ssize = seg->s_base + seg->s_size - addr;
2647 } else {
2648 ssize = size;
2649 }
2650 pl = &plist[npages + cnt];
2651 error = SEGOP_PAGELOCK(seg, addr, ssize, (page_t ***)pl,
2652 L_PAGELOCK, rw);
2653 if (error) {
2654 break;
2655 }
2656 ASSERT(plist[npages + cnt] != NULL);
2657 ASSERT(pl_off + btop(ssize) <= npages);
2658 bcopy(plist[npages + cnt], &plist[pl_off],
2659 btop(ssize) * sizeof (page_t *));
2660 pl_off += btop(ssize);
2661 }
2662
2663 if (size == 0) {
2664 AS_LOCK_EXIT(as);
2665 ASSERT(cnt == segcnt - 1);
2666 *ppp = plist;
2667 return (0);
2668 }
2669
2670 /*
2671 	 * One of the pagelock calls failed. The error type is in the error
2672 	 * variable. Unlock what we've locked so far and retry with F_SOFTLOCK
2673 	 * if the error type is either EFAULT or ENOTSUP. Otherwise just return
2674 	 * the error back to the caller.
2675 */
2676
2677 eaddr = addr;
2678 seg = sv_seg;
2679
2680 for (cnt = 0, addr = sv_addr; addr < eaddr; addr += ssize) {
2681 if (addr >= seg->s_base + seg->s_size) {
2682 seg = AS_SEGNEXT(as, seg);
2683 ASSERT(seg != NULL && addr == seg->s_base);
2684 cnt++;
2685 ASSERT(cnt < segcnt);
2686 }
2687 if (eaddr > seg->s_base + seg->s_size) {
2688 ssize = seg->s_base + seg->s_size - addr;
2689 } else {
2690 ssize = eaddr - addr;
2691 }
2692 pl = &plist[npages + cnt];
2693 ASSERT(*pl != NULL);
2694 (void) SEGOP_PAGELOCK(seg, addr, ssize, (page_t ***)pl,
2695 L_PAGEUNLOCK, rw);
2696 }
2697
2698 AS_LOCK_EXIT(as);
2699
2700 kmem_free(plist, (npages + segcnt) * sizeof (page_t *));
2701
2702 if (error != ENOTSUP && error != EFAULT) {
2703 return (error);
2704 }
2705
2706 slow:
2707 /*
2708 	 * If we are here, it is because pagelock failed due to the need to
2709 	 * cow-fault in the pages we want to lock. F_SOFTLOCK will do this job,
2710 	 * and in the next as_pagelock() call for this address range pagelock
2711 	 * will hopefully succeed.
2712 */
2713 fault_err = as_fault(as->a_hat, as, sv_addr, sv_size, F_SOFTLOCK, rw);
2714 if (fault_err != 0) {
2715 return (fc_decode(fault_err));
2716 }
2717 *ppp = NULL;
2718
2719 return (0);
2720 }
2721
2722 /*
2723 * lock pages in a given address space. Return shadow list. If
2724 * the list is NULL, the MMU mapping is also locked.
2725 */
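/*
 * Typical physio-style usage (a hedged sketch with hypothetical locals):
 *
 *	page_t **pplist;
 *
 *	error = as_pagelock(as, &pplist, addr, size, S_WRITE);
 *	if (error == 0) {
 *		...	(perform the I/O against the locked pages)
 *		as_pageunlock(as, pplist, addr, size, S_WRITE);
 *	}
 *
 * A NULL shadow list is legal and simply means the F_SOFTLOCK fallback was
 * used; as_pageunlock() handles that case itself.
 */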
2726 int
2727 as_pagelock(struct as *as, struct page ***ppp, caddr_t addr,
2728 size_t size, enum seg_rw rw)
2729 {
2730 size_t rsize;
2731 caddr_t raddr;
2732 faultcode_t fault_err;
2733 struct seg *seg;
2734 int err;
2735
2736 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_AS_LOCK_START,
2737 "as_pagelock_start: addr %p size %ld", addr, size);
2738
2739 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2740 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
2741 (size_t)raddr;
2742
2743 /*
2744 	 * If the request crosses more than one segment, let
2745 	 * as_pagelock_segs() handle it.
2746 */
2747 AS_LOCK_ENTER(as, RW_READER);
2748
2749 seg = as_segat(as, raddr);
2750 if (seg == NULL) {
2751 AS_LOCK_EXIT(as);
2752 return (EFAULT);
2753 }
2754 ASSERT(raddr >= seg->s_base && raddr < seg->s_base + seg->s_size);
2755 if (raddr + rsize > seg->s_base + seg->s_size) {
2756 return (as_pagelock_segs(as, seg, ppp, raddr, rsize, rw));
2757 }
2758 if (raddr + rsize <= raddr) {
2759 AS_LOCK_EXIT(as);
2760 return (EFAULT);
2761 }
2762
2763 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEG_LOCK_START,
2764 "seg_lock_1_start: raddr %p rsize %ld", raddr, rsize);
2765
2766 /*
2767 * try to lock pages and pass back shadow list
2768 */
2769 err = SEGOP_PAGELOCK(seg, raddr, rsize, ppp, L_PAGELOCK, rw);
2770
2771 TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_SEG_LOCK_END, "seg_lock_1_end");
2772
2773 AS_LOCK_EXIT(as);
2774
2775 if (err == 0 || (err != ENOTSUP && err != EFAULT)) {
2776 return (err);
2777 }
2778
2779 /*
2780 	 * Use F_SOFTLOCK to lock the pages because pagelock failed either due
2781 	 * to no pagelock support for this segment or because pages need to be
2782 	 * cow-faulted in. If a fault is needed, F_SOFTLOCK will do this job
2783 	 * for this as_pagelock() call, and in the next as_pagelock() call for
2784 	 * the same address range the pagelock call will hopefully succeed.
2785 */
2786 fault_err = as_fault(as->a_hat, as, addr, size, F_SOFTLOCK, rw);
2787 if (fault_err != 0) {
2788 return (fc_decode(fault_err));
2789 }
2790 *ppp = NULL;
2791
2792 TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_AS_LOCK_END, "as_pagelock_end");
2793 return (0);
2794 }
2795
2796 /*
2797 * unlock pages locked by as_pagelock_segs(). Retrieve per segment shadow
2798 * lists from the end of plist and call pageunlock interface for each segment.
2799 * Drop as lock and free plist.
2800 */
2801 static void
2802 as_pageunlock_segs(struct as *as, struct seg *seg, caddr_t addr, size_t size,
2803 struct page **plist, enum seg_rw rw)
2804 {
2805 ulong_t cnt;
2806 caddr_t eaddr = addr + size;
2807 pgcnt_t npages = btop(size);
2808 size_t ssize;
2809 page_t **pl;
2810
2811 ASSERT(AS_LOCK_HELD(as));
2812 ASSERT(seg != NULL);
2813 ASSERT(addr >= seg->s_base && addr < seg->s_base + seg->s_size);
2814 ASSERT(addr + size > seg->s_base + seg->s_size);
2815 ASSERT(IS_P2ALIGNED(size, PAGESIZE));
2816 ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
2817 ASSERT(plist != NULL);
2818
2819 for (cnt = 0; addr < eaddr; addr += ssize) {
2820 if (addr >= seg->s_base + seg->s_size) {
2821 seg = AS_SEGNEXT(as, seg);
2822 ASSERT(seg != NULL && addr == seg->s_base);
2823 cnt++;
2824 }
2825 if (eaddr > seg->s_base + seg->s_size) {
2826 ssize = seg->s_base + seg->s_size - addr;
2827 } else {
2828 ssize = eaddr - addr;
2829 }
2830 pl = &plist[npages + cnt];
2831 ASSERT(*pl != NULL);
2832 (void) SEGOP_PAGELOCK(seg, addr, ssize, (page_t ***)pl,
2833 L_PAGEUNLOCK, rw);
2834 }
2835 ASSERT(cnt > 0);
2836 AS_LOCK_EXIT(as);
2837
2838 cnt++;
2839 kmem_free(plist, (npages + cnt) * sizeof (page_t *));
2840 }
2841
2842 /*
2843 * unlock pages in a given address range
2844 */
2845 void
2846 as_pageunlock(struct as *as, struct page **pp, caddr_t addr, size_t size,
2847 enum seg_rw rw)
2848 {
2849 struct seg *seg;
2850 size_t rsize;
2851 caddr_t raddr;
2852
2853 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_AS_UNLOCK_START,
2854 "as_pageunlock_start: addr %p size %ld", addr, size);
2855
2856 /*
2857 	 * If the shadow list is NULL, as_pagelock()
2858 	 * fell back to as_fault().
2859 */
2860 if (pp == NULL) {
2861 (void) as_fault(as->a_hat, as, addr, size, F_SOFTUNLOCK, rw);
2862 return;
2863 }
2864
2865 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2866 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
2867 (size_t)raddr;
2868
2869 AS_LOCK_ENTER(as, RW_READER);
2870 seg = as_segat(as, raddr);
2871 ASSERT(seg != NULL);
2872
2873 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEG_UNLOCK_START,
2874 "seg_unlock_start: raddr %p rsize %ld", raddr, rsize);
2875
2876 ASSERT(raddr >= seg->s_base && raddr < seg->s_base + seg->s_size);
2877 if (raddr + rsize <= seg->s_base + seg->s_size) {
2878 SEGOP_PAGELOCK(seg, raddr, rsize, &pp, L_PAGEUNLOCK, rw);
2879 } else {
2880 as_pageunlock_segs(as, seg, raddr, rsize, pp, rw);
2881 return;
2882 }
2883 AS_LOCK_EXIT(as);
2884 TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_AS_UNLOCK_END, "as_pageunlock_end");
2885 }
2886
2887 int
2888 as_setpagesize(struct as *as, caddr_t addr, size_t size, uint_t szc,
2889 boolean_t wait)
2890 {
2891 struct seg *seg;
2892 size_t ssize;
2893 caddr_t raddr; /* rounded down addr */
2894 size_t rsize; /* rounded up size */
2895 int error = 0;
2896 size_t pgsz = page_get_pagesize(szc);
2897
2898 setpgsz_top:
2899 if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(size, pgsz)) {
2900 return (EINVAL);
2901 }
2902
2903 raddr = addr;
2904 rsize = size;
2905
2906 if (raddr + rsize < raddr) /* check for wraparound */
2907 return (ENOMEM);
2908
2909 AS_LOCK_ENTER(as, RW_WRITER);
2910 as_clearwatchprot(as, raddr, rsize);
2911 seg = as_segat(as, raddr);
2912 if (seg == NULL) {
2913 as_setwatch(as);
2914 AS_LOCK_EXIT(as);
2915 return (ENOMEM);
2916 }
2917
2918 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
2919 if (raddr >= seg->s_base + seg->s_size) {
2920 seg = AS_SEGNEXT(as, seg);
2921 if (seg == NULL || raddr != seg->s_base) {
2922 error = ENOMEM;
2923 break;
2924 }
2925 }
2926 if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
2927 ssize = seg->s_base + seg->s_size - raddr;
2928 } else {
2929 ssize = rsize;
2930 }
2931
2932 retry:
2933 error = SEGOP_SETPAGESIZE(seg, raddr, ssize, szc);
2934
2935 if (error == IE_NOMEM) {
2936 error = EAGAIN;
2937 break;
2938 }
2939
2940 if (error == IE_RETRY) {
2941 AS_LOCK_EXIT(as);
2942 goto setpgsz_top;
2943 }
2944
2945 if (error == ENOTSUP) {
2946 error = EINVAL;
2947 break;
2948 }
2949
2950 if (wait && (error == EAGAIN)) {
2951 /*
2952 * Memory is currently locked. It must be unlocked
2953 * before this operation can succeed through a retry.
2954 * The possible reasons for locked memory and
2955 * corresponding strategies for unlocking are:
2956 * (1) Normal I/O
2957 * wait for a signal that the I/O operation
2958 * has completed and the memory is unlocked.
2959 * (2) Asynchronous I/O
2960 * The aio subsystem does not unlock pages when
2961 * the I/O is completed. Those pages are unlocked
2962 * when the application calls aiowait/aioerror.
2963 * So, to prevent blocking forever, cv_broadcast()
2964 * is done to wake up aio_cleanup_thread.
2965 * Subsequently, segvn_reclaim will be called, and
2966 * that will do AS_CLRUNMAPWAIT() and wake us up.
2967 * (3) Long term page locking:
2968 * This is not relevant for as_setpagesize()
2969 * because we cannot change the page size for
2970 * driver memory. The attempt to do so will
2971 * fail with a different error than EAGAIN so
2972 * there's no need to trigger as callbacks like
2973 * as_unmap, as_setprot or as_free would do.
2974 */
2975 mutex_enter(&as->a_contents);
2976 if (!AS_ISNOUNMAPWAIT(as)) {
2977 if (AS_ISUNMAPWAIT(as) == 0) {
2978 cv_broadcast(&as->a_cv);
2979 }
2980 AS_SETUNMAPWAIT(as);
2981 AS_LOCK_EXIT(as);
2982 while (AS_ISUNMAPWAIT(as)) {
2983 cv_wait(&as->a_cv, &as->a_contents);
2984 }
2985 } else {
2986 /*
2987 * We may have raced with
2988 * segvn_reclaim()/segspt_reclaim(). In this
2989 * case clean nounmapwait flag and retry since
2990 * softlockcnt in this segment may be already
2991 * 0. We don't drop as writer lock so our
2992 * number of retries without sleeping should
2993 * be very small. See segvn_reclaim() for
2994 * more comments.
2995 */
2996 AS_CLRNOUNMAPWAIT(as);
2997 mutex_exit(&as->a_contents);
2998 goto retry;
2999 }
3000 mutex_exit(&as->a_contents);
3001 goto setpgsz_top;
3002 } else if (error != 0) {
3003 break;
3004 }
3005 }
3006 as_setwatch(as);
3007 AS_LOCK_EXIT(as);
3008 return (error);
3009 }
3010
3011 /*
3012 * as_iset3_default_lpsize() just calls SEGOP_SETPAGESIZE() on all segments
3013 * in its chunk where s_szc is less than the szc we want to set.
3014 */
3015 static int
3016 as_iset3_default_lpsize(struct as *as, caddr_t raddr, size_t rsize, uint_t szc,
3017 int *retry)
3018 {
3019 struct seg *seg;
3020 size_t ssize;
3021 int error;
3022
3023 ASSERT(AS_WRITE_HELD(as));
3024
3025 seg = as_segat(as, raddr);
3026 if (seg == NULL) {
3027 panic("as_iset3_default_lpsize: no seg");
3028 }
3029
3030 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
3031 if (raddr >= seg->s_base + seg->s_size) {
3032 seg = AS_SEGNEXT(as, seg);
3033 if (seg == NULL || raddr != seg->s_base) {
3034 panic("as_iset3_default_lpsize: as changed");
3035 }
3036 }
3037 if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
3038 ssize = seg->s_base + seg->s_size - raddr;
3039 } else {
3040 ssize = rsize;
3041 }
3042
3043 if (szc > seg->s_szc) {
3044 error = SEGOP_SETPAGESIZE(seg, raddr, ssize, szc);
3045 /* Only retry on EINVAL segments that have no vnode. */
3046 if (error == EINVAL) {
3047 vnode_t *vp = NULL;
3048 if ((SEGOP_GETTYPE(seg, raddr) & MAP_SHARED) &&
3049 (SEGOP_GETVP(seg, raddr, &vp) != 0 ||
3050 vp == NULL)) {
3051 *retry = 1;
3052 } else {
3053 *retry = 0;
3054 }
3055 }
3056 if (error) {
3057 return (error);
3058 }
3059 }
3060 }
3061 return (0);
3062 }
3063
3064 /*
3065 * as_iset2_default_lpsize() calls as_iset3_default_lpsize() to set the
3066 * pagesize on each segment in its range, but if any fails with EINVAL,
3067 * then it reduces the pagesizes to the next size in the bitmap and
3068 * retries as_iset3_default_lpsize(). The reason why the code retries
3069 * smaller allowed sizes on EINVAL is because (a) the anon offset may not
3070 * match the bigger sizes, and (b) it's hard to get this offset (to begin
3071 * with) to pass to map_pgszcvec().
3072 */
3073 static int
3074 as_iset2_default_lpsize(struct as *as, caddr_t addr, size_t size, uint_t szc,
3075 uint_t szcvec)
3076 {
3077 int error;
3078 int retry;
3079
3080 ASSERT(AS_WRITE_HELD(as));
3081
3082 for (;;) {
3083 error = as_iset3_default_lpsize(as, addr, size, szc, &retry);
3084 if (error == EINVAL && retry) {
3085 szcvec &= ~(1 << szc);
3086 if (szcvec <= 1) {
3087 return (EINVAL);
3088 }
3089 szc = highbit(szcvec) - 1;
3090 } else {
3091 return (error);
3092 }
3093 }
3094 }
3095
3096 /*
3097 * as_iset1_default_lpsize() breaks its chunk into areas where existing
3098 * segments have a smaller szc than we want to set. For each such area,
3099  * it calls as_iset2_default_lpsize().
3100 */
3101 static int
3102 as_iset1_default_lpsize(struct as *as, caddr_t raddr, size_t rsize, uint_t szc,
3103 uint_t szcvec)
3104 {
3105 struct seg *seg;
3106 size_t ssize;
3107 caddr_t setaddr = raddr;
3108 size_t setsize = 0;
3109 int set;
3110 int error;
3111
3112 ASSERT(AS_WRITE_HELD(as));
3113
3114 seg = as_segat(as, raddr);
3115 if (seg == NULL) {
3116 panic("as_iset1_default_lpsize: no seg");
3117 }
3118 if (seg->s_szc < szc) {
3119 set = 1;
3120 } else {
3121 set = 0;
3122 }
3123
3124 for (; rsize != 0; rsize -= ssize, raddr += ssize, setsize += ssize) {
3125 if (raddr >= seg->s_base + seg->s_size) {
3126 seg = AS_SEGNEXT(as, seg);
3127 if (seg == NULL || raddr != seg->s_base) {
3128 panic("as_iset1_default_lpsize: as changed");
3129 }
3130 if (seg->s_szc >= szc && set) {
3131 ASSERT(setsize != 0);
3132 error = as_iset2_default_lpsize(as,
3133 setaddr, setsize, szc, szcvec);
3134 if (error) {
3135 return (error);
3136 }
3137 set = 0;
3138 } else if (seg->s_szc < szc && !set) {
3139 setaddr = raddr;
3140 setsize = 0;
3141 set = 1;
3142 }
3143 }
3144 if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
3145 ssize = seg->s_base + seg->s_size - raddr;
3146 } else {
3147 ssize = rsize;
3148 }
3149 }
3150 error = 0;
3151 if (set) {
3152 ASSERT(setsize != 0);
3153 error = as_iset2_default_lpsize(as, setaddr, setsize,
3154 szc, szcvec);
3155 }
3156 return (error);
3157 }
3158
3159 /*
3160 * as_iset_default_lpsize() breaks its chunk according to the size code bitmap
3161 * returned by map_pgszcvec() (similar to as_map_segvn_segs()), and passes each
3162 * chunk to as_iset1_default_lpsize().
3163 */
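/*
 * Worked example (the size codes are hypothetical): if szcvec has bits set
 * for the base page size, 64K and 4M, the loop below first trims
 * [addr, eaddr) to 64K alignment, then hands the leading sub-4M piece to
 * as_iset1_default_lpsize() with the 64K size code, the 4M-aligned middle
 * with the 4M size code, and finally the trailing pieces in decreasing
 * size order, each with the largest size code its alignment allows.
 */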
3164 static int
3165 as_iset_default_lpsize(struct as *as, caddr_t addr, size_t size, int flags,
3166 int type)
3167 {
3168 int rtype = (type & MAP_SHARED) ? MAPPGSZC_SHM : MAPPGSZC_PRIVM;
3169 uint_t szcvec = map_pgszcvec(addr, size, (uintptr_t)addr,
3170 flags, rtype, 1);
3171 uint_t szc;
3172 uint_t nszc;
3173 int error;
3174 caddr_t a;
3175 caddr_t eaddr;
3176 size_t segsize;
3177 size_t pgsz;
3178 uint_t save_szcvec;
3179
3180 ASSERT(AS_WRITE_HELD(as));
3181 ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
3182 ASSERT(IS_P2ALIGNED(size, PAGESIZE));
3183
3184 szcvec &= ~1;
3185 if (szcvec <= 1) { /* skip if base page size */
3186 return (0);
3187 }
3188
3189 /* Get the pagesize of the first larger page size. */
3190 szc = lowbit(szcvec) - 1;
3191 pgsz = page_get_pagesize(szc);
3192 eaddr = addr + size;
3193 addr = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
3194 eaddr = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
3195
3196 save_szcvec = szcvec;
3197 szcvec >>= (szc + 1);
3198 nszc = szc;
3199 while (szcvec) {
3200 if ((szcvec & 0x1) == 0) {
3201 nszc++;
3202 szcvec >>= 1;
3203 continue;
3204 }
3205 nszc++;
3206 pgsz = page_get_pagesize(nszc);
3207 a = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
3208 if (a != addr) {
3209 ASSERT(szc > 0);
3210 ASSERT(a < eaddr);
3211 segsize = a - addr;
3212 error = as_iset1_default_lpsize(as, addr, segsize, szc,
3213 save_szcvec);
3214 if (error) {
3215 return (error);
3216 }
3217 addr = a;
3218 }
3219 szc = nszc;
3220 szcvec >>= 1;
3221 }
3222
3223 ASSERT(addr < eaddr);
3224 szcvec = save_szcvec;
3225 while (szcvec) {
3226 a = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
3227 ASSERT(a >= addr);
3228 if (a != addr) {
3229 ASSERT(szc > 0);
3230 segsize = a - addr;
3231 error = as_iset1_default_lpsize(as, addr, segsize, szc,
3232 save_szcvec);
3233 if (error) {
3234 return (error);
3235 }
3236 addr = a;
3237 }
3238 szcvec &= ~(1 << szc);
3239 if (szcvec) {
3240 szc = highbit(szcvec) - 1;
3241 pgsz = page_get_pagesize(szc);
3242 }
3243 }
3244 ASSERT(addr == eaddr);
3245
3246 return (0);
3247 }
3248
3249 /*
3250 * Set the default large page size for the range. Called via memcntl with
3251 * page size set to 0. as_set_default_lpsize breaks the range down into
3252  * chunks with the same type/flags, ignores non-segvn segments, and passes
3253 * each chunk to as_iset_default_lpsize().
3254 */
3255 int
3256 as_set_default_lpsize(struct as *as, caddr_t addr, size_t size)
3257 {
3258 struct seg *seg;
3259 caddr_t raddr;
3260 size_t rsize;
3261 size_t ssize;
3262 int rtype, rflags;
3263 int stype, sflags;
3264 int error;
3265 caddr_t setaddr;
3266 size_t setsize;
3267 int segvn;
3268
3269 if (size == 0)
3270 return (0);
3271
3272 AS_LOCK_ENTER(as, RW_WRITER);
3273 again:
3274 error = 0;
3275
3276 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
3277 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
3278 (size_t)raddr;
3279
3280 if (raddr + rsize < raddr) { /* check for wraparound */
3281 AS_LOCK_EXIT(as);
3282 return (ENOMEM);
3283 }
3284 as_clearwatchprot(as, raddr, rsize);
3285 seg = as_segat(as, raddr);
3286 if (seg == NULL) {
3287 as_setwatch(as);
3288 AS_LOCK_EXIT(as);
3289 return (ENOMEM);
3290 }
3291 if (seg->s_ops == &segvn_ops) {
3292 rtype = SEGOP_GETTYPE(seg, addr);
3293 rflags = rtype & (MAP_TEXT | MAP_INITDATA);
3294 rtype = rtype & (MAP_SHARED | MAP_PRIVATE);
3295 segvn = 1;
3296 } else {
3297 segvn = 0;
3298 }
3299 setaddr = raddr;
3300 setsize = 0;
3301
3302 for (; rsize != 0; rsize -= ssize, raddr += ssize, setsize += ssize) {
3303 if (raddr >= (seg->s_base + seg->s_size)) {
3304 seg = AS_SEGNEXT(as, seg);
3305 if (seg == NULL || raddr != seg->s_base) {
3306 error = ENOMEM;
3307 break;
3308 }
3309 if (seg->s_ops == &segvn_ops) {
3310 stype = SEGOP_GETTYPE(seg, raddr);
3311 sflags = stype & (MAP_TEXT | MAP_INITDATA);
3312 stype &= (MAP_SHARED | MAP_PRIVATE);
3313 if (segvn && (rflags != sflags ||
3314 rtype != stype)) {
3315 /*
3316 * The next segment is also segvn but
3317 * has different flags and/or type.
3318 */
3319 ASSERT(setsize != 0);
3320 error = as_iset_default_lpsize(as,
3321 setaddr, setsize, rflags, rtype);
3322 if (error) {
3323 break;
3324 }
3325 rflags = sflags;
3326 rtype = stype;
3327 setaddr = raddr;
3328 setsize = 0;
3329 } else if (!segvn) {
3330 rflags = sflags;
3331 rtype = stype;
3332 setaddr = raddr;
3333 setsize = 0;
3334 segvn = 1;
3335 }
3336 } else if (segvn) {
3337 /* The next segment is not segvn. */
3338 ASSERT(setsize != 0);
3339 error = as_iset_default_lpsize(as,
3340 setaddr, setsize, rflags, rtype);
3341 if (error) {
3342 break;
3343 }
3344 segvn = 0;
3345 }
3346 }
3347 if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
3348 ssize = seg->s_base + seg->s_size - raddr;
3349 } else {
3350 ssize = rsize;
3351 }
3352 }
3353 if (error == 0 && segvn) {
3354 /* The last chunk when rsize == 0. */
3355 ASSERT(setsize != 0);
3356 error = as_iset_default_lpsize(as, setaddr, setsize,
3357 rflags, rtype);
3358 }
3359
3360 if (error == IE_RETRY) {
3361 goto again;
3362 } else if (error == IE_NOMEM) {
3363 error = EAGAIN;
3364 } else if (error == ENOTSUP) {
3365 error = EINVAL;
3366 } else if (error == EAGAIN) {
3367 mutex_enter(&as->a_contents);
3368 if (!AS_ISNOUNMAPWAIT(as)) {
3369 if (AS_ISUNMAPWAIT(as) == 0) {
3370 cv_broadcast(&as->a_cv);
3371 }
3372 AS_SETUNMAPWAIT(as);
3373 AS_LOCK_EXIT(as);
3374 while (AS_ISUNMAPWAIT(as)) {
3375 cv_wait(&as->a_cv, &as->a_contents);
3376 }
3377 mutex_exit(&as->a_contents);
3378 AS_LOCK_ENTER(as, RW_WRITER);
3379 } else {
3380 /*
3381 * We may have raced with
3382 * segvn_reclaim()/segspt_reclaim(). In this case
3383 * clean nounmapwait flag and retry since softlockcnt
3384 * in this segment may be already 0. We don't drop as
3385 * writer lock so our number of retries without
3386 * sleeping should be very small. See segvn_reclaim()
3387 * for more comments.
3388 */
3389 AS_CLRNOUNMAPWAIT(as);
3390 mutex_exit(&as->a_contents);
3391 }
3392 goto again;
3393 }
3394
3395 as_setwatch(as);
3396 AS_LOCK_EXIT(as);
3397 return (error);
3398 }
3399
3400 /*
3401  * Set up all of the uninitialized watched pages that we can.
3402 */
3403 void
3404 as_setwatch(struct as *as)
3405 {
3406 struct watched_page *pwp;
3407 struct seg *seg;
3408 caddr_t vaddr;
3409 uint_t prot;
3410 int err, retrycnt;
3411
3412 if (avl_numnodes(&as->a_wpage) == 0)
3413 return;
3414
3415 ASSERT(AS_WRITE_HELD(as));
3416
3417 for (pwp = avl_first(&as->a_wpage); pwp != NULL;
3418 pwp = AVL_NEXT(&as->a_wpage, pwp)) {
3419 retrycnt = 0;
3420 retry:
3421 vaddr = pwp->wp_vaddr;
3422 if (pwp->wp_oprot != 0 || /* already set up */
3423 (seg = as_segat(as, vaddr)) == NULL ||
3424 SEGOP_GETPROT(seg, vaddr, 0, &prot) != 0)
3425 continue;
3426
3427 pwp->wp_oprot = prot;
3428 if (pwp->wp_read)
3429 prot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3430 if (pwp->wp_write)
3431 prot &= ~PROT_WRITE;
3432 if (pwp->wp_exec)
3433 prot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3434 if (!(pwp->wp_flags & WP_NOWATCH) && prot != pwp->wp_oprot) {
3435 err = SEGOP_SETPROT(seg, vaddr, PAGESIZE, prot);
3436 if (err == IE_RETRY) {
3437 pwp->wp_oprot = 0;
3438 ASSERT(retrycnt == 0);
3439 retrycnt++;
3440 goto retry;
3441 }
3442 }
3443 pwp->wp_prot = prot;
3444 }
3445 }
3446
3447 /*
3448 * Clear all of the watched pages in the address space.
3449 */
3450 void
3451 as_clearwatch(struct as *as)
3452 {
3453 struct watched_page *pwp;
3454 struct seg *seg;
3455 caddr_t vaddr;
3456 uint_t prot;
3457 int err, retrycnt;
3458
3459 if (avl_numnodes(&as->a_wpage) == 0)
3460 return;
3461
3462 ASSERT(AS_WRITE_HELD(as));
3463
3464 for (pwp = avl_first(&as->a_wpage); pwp != NULL;
3465 pwp = AVL_NEXT(&as->a_wpage, pwp)) {
3466 retrycnt = 0;
3467 retry:
3468 vaddr = pwp->wp_vaddr;
3469 if (pwp->wp_oprot == 0 || /* not set up */
3470 (seg = as_segat(as, vaddr)) == NULL)
3471 continue;
3472
3473 if ((prot = pwp->wp_oprot) != pwp->wp_prot) {
3474 err = SEGOP_SETPROT(seg, vaddr, PAGESIZE, prot);
3475 if (err == IE_RETRY) {
3476 ASSERT(retrycnt == 0);
3477 retrycnt++;
3478 goto retry;
3479 }
3480 }
3481 pwp->wp_oprot = 0;
3482 pwp->wp_prot = 0;
3483 }
3484 }
3485
3486 /*
3487 * Force a new setup for all the watched pages in the range.
3488 */
3489 static void
3490 as_setwatchprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
3491 {
3492 struct watched_page *pwp;
3493 struct watched_page tpw;
3494 caddr_t eaddr = addr + size;
3495 caddr_t vaddr;
3496 struct seg *seg;
3497 int err, retrycnt;
3498 uint_t wprot;
3499 avl_index_t where;
3500
3501 if (avl_numnodes(&as->a_wpage) == 0)
3502 return;
3503
3504 ASSERT(AS_WRITE_HELD(as));
3505
3506 tpw.wp_vaddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
3507 if ((pwp = avl_find(&as->a_wpage, &tpw, &where)) == NULL)
3508 pwp = avl_nearest(&as->a_wpage, where, AVL_AFTER);
3509
3510 while (pwp != NULL && pwp->wp_vaddr < eaddr) {
3511 retrycnt = 0;
3512 vaddr = pwp->wp_vaddr;
3513
3514 wprot = prot;
3515 if (pwp->wp_read)
3516 wprot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3517 if (pwp->wp_write)
3518 wprot &= ~PROT_WRITE;
3519 if (pwp->wp_exec)
3520 wprot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3521 if (!(pwp->wp_flags & WP_NOWATCH) && wprot != pwp->wp_oprot) {
3522 retry:
3523 seg = as_segat(as, vaddr);
3524 if (seg == NULL) {
3525 panic("as_setwatchprot: no seg");
3526 /*NOTREACHED*/
3527 }
3528 err = SEGOP_SETPROT(seg, vaddr, PAGESIZE, wprot);
3529 if (err == IE_RETRY) {
3530 ASSERT(retrycnt == 0);
3531 retrycnt++;
3532 goto retry;
3533 }
3534 }
3535 pwp->wp_oprot = prot;
3536 pwp->wp_prot = wprot;
3537
3538 pwp = AVL_NEXT(&as->a_wpage, pwp);
3539 }
3540 }
3541
3542 /*
3543 * Clear all of the watched pages in the range.
3544 */
3545 static void
3546 as_clearwatchprot(struct as *as, caddr_t addr, size_t size)
3547 {
3548 caddr_t eaddr = addr + size;
3549 struct watched_page *pwp;
3550 struct watched_page tpw;
3551 uint_t prot;
3552 struct seg *seg;
3553 int err, retrycnt;
3554 avl_index_t where;
3555
3556 if (avl_numnodes(&as->a_wpage) == 0)
3557 return;
3558
3559 tpw.wp_vaddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
3560 if ((pwp = avl_find(&as->a_wpage, &tpw, &where)) == NULL)
3561 pwp = avl_nearest(&as->a_wpage, where, AVL_AFTER);
3562
3563 ASSERT(AS_WRITE_HELD(as));
3564
3565 while (pwp != NULL && pwp->wp_vaddr < eaddr) {
3566
3567 if ((prot = pwp->wp_oprot) != 0) {
3568 retrycnt = 0;
3569
3570 if (prot != pwp->wp_prot) {
3571 retry:
3572 seg = as_segat(as, pwp->wp_vaddr);
3573 if (seg == NULL)
3574 continue;
3575 err = SEGOP_SETPROT(seg, pwp->wp_vaddr,
3576 PAGESIZE, prot);
3577 if (err == IE_RETRY) {
3578 ASSERT(retrycnt == 0);
3579 retrycnt++;
3580 goto retry;
3581
3582 }
3583 }
3584 pwp->wp_oprot = 0;
3585 pwp->wp_prot = 0;
3586 }
3587
3588 pwp = AVL_NEXT(&as->a_wpage, pwp);
3589 }
3590 }
3591
3592 void
3593 as_signal_proc(struct as *as, k_siginfo_t *siginfo)
3594 {
3595 struct proc *p;
3596
3597 mutex_enter(&pidlock);
3598 for (p = practive; p; p = p->p_next) {
3599 if (p->p_as == as) {
3600 mutex_enter(&p->p_lock);
3601 if (p->p_as == as)
3602 sigaddq(p, NULL, siginfo, KM_NOSLEEP);
3603 mutex_exit(&p->p_lock);
3604 }
3605 }
3606 mutex_exit(&pidlock);
3607 }
3608
3609 /*
3610 * return memory object ID
3611 */
3612 int
3613 as_getmemid(struct as *as, caddr_t addr, memid_t *memidp)
3614 {
3615 struct seg *seg;
3616 int sts;
3617
3618 AS_LOCK_ENTER(as, RW_READER);
3619 seg = as_segat(as, addr);
3620 if (seg == NULL) {
3621 AS_LOCK_EXIT(as);
3622 return (EFAULT);
3623 }
3624 /*
3625 * catch old drivers which may not support getmemid
3626 */
3627 if (seg->s_ops->getmemid == NULL) {
3628 AS_LOCK_EXIT(as);
3629 return (ENODEV);
3630 }
3631
3632 sts = SEGOP_GETMEMID(seg, addr, memidp);
3633
3634 AS_LOCK_EXIT(as);
3635 return (sts);
3636 }
3637