1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 * Copyright 2015, Joyent, Inc. All rights reserved.
25 */
26
27 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
28 /* All Rights Reserved */
29
30 /*
31 * University Copyright- Copyright (c) 1982, 1986, 1988
32 * The Regents of the University of California
33 * All Rights Reserved
34 *
35 * University Acknowledgment- Portions of this document are derived from
36 * software developed by the University of California, Berkeley, and its
37 * contributors.
38 */
39
40 /*
41 * VM - address spaces.
42 */
43
44 #include <sys/types.h>
45 #include <sys/t_lock.h>
46 #include <sys/param.h>
47 #include <sys/errno.h>
48 #include <sys/systm.h>
49 #include <sys/mman.h>
50 #include <sys/sysmacros.h>
51 #include <sys/cpuvar.h>
52 #include <sys/sysinfo.h>
53 #include <sys/kmem.h>
54 #include <sys/vnode.h>
55 #include <sys/vmsystm.h>
56 #include <sys/cmn_err.h>
57 #include <sys/debug.h>
58 #include <sys/tnf_probe.h>
59 #include <sys/vtrace.h>
60
61 #include <vm/hat.h>
62 #include <vm/as.h>
63 #include <vm/seg.h>
64 #include <vm/seg_vn.h>
65 #include <vm/seg_dev.h>
66 #include <vm/seg_kmem.h>
67 #include <vm/seg_map.h>
68 #include <vm/seg_spt.h>
69 #include <vm/page.h>
70
71 clock_t deadlk_wait = 1; /* number of ticks to wait before retrying */
72
73 static struct kmem_cache *as_cache;
74
75 static void as_setwatchprot(struct as *, caddr_t, size_t, uint_t);
76 static void as_clearwatchprot(struct as *, caddr_t, size_t);
77 int as_map_locked(struct as *, caddr_t, size_t, int ((*)()), void *);
78
79
80 /*
81  * Verifying the segment lists is very time-consuming; it may not always be
82  * desirable to define VERIFY_SEGLIST when DEBUG is set.
83 */
84 #ifdef DEBUG
85 #define VERIFY_SEGLIST
86 int do_as_verify = 0;
87 #endif
88
89 /*
90 * Allocate a new callback data structure entry and fill in the events of
91 * interest, the address range of interest, and the callback argument.
92 * Link the entry on the as->a_callbacks list. A callback entry for the
93 * entire address space may be specified with vaddr = 0 and size = -1.
94 *
95  * CALLER'S RESPONSIBILITY: If not calling from within the process context for
96 * the specified as, the caller must guarantee persistence of the specified as
97 * for the duration of this function (eg. pages being locked within the as
98 * will guarantee persistence).
99 */
100 int
101 as_add_callback(struct as *as, void (*cb_func)(), void *arg, uint_t events,
102 caddr_t vaddr, size_t size, int sleepflag)
103 {
104 struct as_callback *current_head, *cb;
105 caddr_t saddr;
106 size_t rsize;
107
108 /* callback function and an event are mandatory */
109 if ((cb_func == NULL) || ((events & AS_ALL_EVENT) == 0))
110 return (EINVAL);
111
112 /* Adding a callback after as_free has been called is not allowed */
113 if (as == &kas)
114 return (ENOMEM);
115
116 /*
117 * vaddr = 0 and size = -1 is used to indicate that the callback range
118 * is the entire address space so no rounding is done in that case.
119 */
120 if (size != -1) {
121 saddr = (caddr_t)((uintptr_t)vaddr & (uintptr_t)PAGEMASK);
122 rsize = (((size_t)(vaddr + size) + PAGEOFFSET) & PAGEMASK) -
123 (size_t)saddr;
124 /* check for wraparound */
125 if (saddr + rsize < saddr)
126 return (ENOMEM);
127 } else {
128 if (vaddr != 0)
129 return (EINVAL);
130 saddr = vaddr;
131 rsize = size;
132 }
133
134 /* Allocate and initialize a callback entry */
135 cb = kmem_zalloc(sizeof (struct as_callback), sleepflag);
136 if (cb == NULL)
137 return (EAGAIN);
138
139 cb->ascb_func = cb_func;
140 cb->ascb_arg = arg;
141 cb->ascb_events = events;
142 cb->ascb_saddr = saddr;
143 cb->ascb_len = rsize;
144
145 /* Add the entry to the list */
146 mutex_enter(&as->a_contents);
147 current_head = as->a_callbacks;
148 as->a_callbacks = cb;
149 cb->ascb_next = current_head;
150
151 /*
152 * The call to this function may lose in a race with
153 * a pertinent event - eg. a thread does long term memory locking
154 * but before the callback is added another thread executes as_unmap.
155 * A broadcast here resolves that.
156 */
157 if ((cb->ascb_events & AS_UNMAPWAIT_EVENT) && AS_ISUNMAPWAIT(as)) {
158 AS_CLRUNMAPWAIT(as);
159 cv_broadcast(&as->a_cv);
160 }
161
162 mutex_exit(&as->a_contents);
163 return (0);
164 }
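/*
 * Illustrative sketch (not part of this file's interface): how a driver
 * that keeps pages locked for a long time might use the callback
 * interface above.  The consumer names (xx_state_t, xx_unlock_pages,
 * xx_vaddr, xx_len) are hypothetical; as_add_callback(),
 * as_delete_callback() and the event flags are the real interfaces.
 *
 *	static void
 *	xx_as_callback(struct as *as, void *arg, uint_t events)
 *	{
 *		xx_state_t *xsp = arg;
 *
 *		xx_unlock_pages(xsp);
 *		(void) as_delete_callback(as, arg);
 *	}
 *
 *	...
 *	error = as_add_callback(curproc->p_as, xx_as_callback, xsp,
 *	    AS_UNMAP_EVENT | AS_SETPROT_EVENT, xsp->xx_vaddr, xsp->xx_len,
 *	    KM_SLEEP);
 *
 * As described in as_execute_callback() below, the callback is expected
 * to eventually call as_delete_callback() so that the thread waiting on
 * the callback can continue.
 */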
165
166 /*
167 * Search the callback list for an entry which pertains to arg.
168 *
169 * This is called from within the client upon completion of the callback.
170 * RETURN VALUES:
171 * AS_CALLBACK_DELETED (callback entry found and deleted)
172 * AS_CALLBACK_NOTFOUND (no callback entry found - this is ok)
173 * AS_CALLBACK_DELETE_DEFERRED (callback is in process, delete of this
174 * entry will be made in as_do_callbacks)
175 *
176 * If as_delete_callback encounters a matching entry with AS_CALLBACK_CALLED
177 * set, it indicates that as_do_callbacks is processing this entry. The
178 * AS_ALL_EVENT events are cleared in the entry, and a broadcast is made
179 * to unblock as_do_callbacks, in case it is blocked.
180 *
181  * CALLER'S RESPONSIBILITY: If not calling from within the process context for
182 * the specified as, the caller must guarantee persistence of the specified as
183 * for the duration of this function (eg. pages being locked within the as
184 * will guarantee persistence).
185 */
186 uint_t
187 as_delete_callback(struct as *as, void *arg)
188 {
189 struct as_callback **prevcb = &as->a_callbacks;
190 struct as_callback *cb;
191 uint_t rc = AS_CALLBACK_NOTFOUND;
192
193 mutex_enter(&as->a_contents);
194 for (cb = as->a_callbacks; cb; prevcb = &cb->ascb_next, cb = *prevcb) {
195 if (cb->ascb_arg != arg)
196 continue;
197
198 /*
199 * If the events indicate AS_CALLBACK_CALLED, just clear
200 * AS_ALL_EVENT in the events field and wakeup the thread
201 * that may be waiting in as_do_callbacks. as_do_callbacks
202 * will take care of removing this entry from the list. In
203 * that case, return AS_CALLBACK_DELETE_DEFERRED. Otherwise
204 * (AS_CALLBACK_CALLED not set), just remove it from the
205 * list, return the memory and return AS_CALLBACK_DELETED.
206 */
207 if ((cb->ascb_events & AS_CALLBACK_CALLED) != 0) {
208 /* leave AS_CALLBACK_CALLED */
209 cb->ascb_events &= ~AS_ALL_EVENT;
210 rc = AS_CALLBACK_DELETE_DEFERRED;
211 cv_broadcast(&as->a_cv);
212 } else {
213 *prevcb = cb->ascb_next;
214 kmem_free(cb, sizeof (struct as_callback));
215 rc = AS_CALLBACK_DELETED;
216 }
217 break;
218 }
219 mutex_exit(&as->a_contents);
220 return (rc);
221 }
222
223 /*
224 * Searches the as callback list for a matching entry.
225 * Returns a pointer to the first matching callback, or NULL if
226 * nothing is found.
227  * This function never sleeps so it is ok to call it with locks
228  * held in addition to the (required) a_contents mutex.
229 *
230 * See also comment on as_do_callbacks below.
231 */
232 static struct as_callback *
233 as_find_callback(struct as *as, uint_t events, caddr_t event_addr,
234 size_t event_len)
235 {
236 struct as_callback *cb;
237
238 ASSERT(MUTEX_HELD(&as->a_contents));
239 for (cb = as->a_callbacks; cb != NULL; cb = cb->ascb_next) {
240 /*
241 * If the callback has not already been called, then
242 * check if events or address range pertains. An event_len
243 * of zero means do an unconditional callback.
244 */
245 if (((cb->ascb_events & AS_CALLBACK_CALLED) != 0) ||
246 ((event_len != 0) && (((cb->ascb_events & events) == 0) ||
247 (event_addr + event_len < cb->ascb_saddr) ||
248 (event_addr > (cb->ascb_saddr + cb->ascb_len))))) {
249 continue;
250 }
251 break;
252 }
253 return (cb);
254 }
255
256 /*
257 * Executes a given callback and removes it from the callback list for
258 * this address space.
259 * This function may sleep so the caller must drop all locks except
260 * a_contents before calling this func.
261 *
262 * See also comments on as_do_callbacks below.
263 */
264 static void
265 as_execute_callback(struct as *as, struct as_callback *cb,
266 uint_t events)
267 {
268 struct as_callback **prevcb;
269 void *cb_arg;
270
271 ASSERT(MUTEX_HELD(&as->a_contents) && (cb->ascb_events & events));
272 cb->ascb_events |= AS_CALLBACK_CALLED;
273 mutex_exit(&as->a_contents);
274 (*cb->ascb_func)(as, cb->ascb_arg, events);
275 mutex_enter(&as->a_contents);
276 /*
277 * the callback function is required to delete the callback
278 * when the callback function determines it is OK for
279 * this thread to continue. as_delete_callback will clear
280 * the AS_ALL_EVENT in the events field when it is deleted.
281 * If the callback function called as_delete_callback,
282 * events will already be cleared and there will be no blocking.
283 */
284 while ((cb->ascb_events & events) != 0) {
285 cv_wait(&as->a_cv, &as->a_contents);
286 }
287 /*
288 * This entry needs to be taken off the list. Normally, the
289 * callback func itself does that, but unfortunately the list
290 * may have changed while the callback was running because the
291 * a_contents mutex was dropped and someone else other than the
292 * callback func itself could have called as_delete_callback,
293 * so we have to search to find this entry again. The entry
294 * must have AS_CALLBACK_CALLED, and have the same 'arg'.
295 */
296 cb_arg = cb->ascb_arg;
297 prevcb = &as->a_callbacks;
298 for (cb = as->a_callbacks; cb != NULL;
299 prevcb = &cb->ascb_next, cb = *prevcb) {
300 if (((cb->ascb_events & AS_CALLBACK_CALLED) == 0) ||
301 (cb_arg != cb->ascb_arg)) {
302 continue;
303 }
304 *prevcb = cb->ascb_next;
305 kmem_free(cb, sizeof (struct as_callback));
306 break;
307 }
308 }
309
310 /*
311 * Check the callback list for a matching event and intersection of
312 * address range. If there is a match invoke the callback. Skip an entry if:
313 * - a callback is already in progress for this entry (AS_CALLBACK_CALLED)
314  * - not an event of interest
315  * - not an address range of interest
316 *
317 * An event_len of zero indicates a request for an unconditional callback
318  * (regardless of event); only AS_CALLBACK_CALLED is checked. The
319 * a_contents lock must be dropped before a callback, so only one callback
320 * can be done before returning. Return -1 (true) if a callback was
321 * executed and removed from the list, else return 0 (false).
322 *
323 * The logically separate parts, i.e. finding a matching callback and
324 * executing a given callback have been separated into two functions
325 * so that they can be called with different sets of locks held beyond
326 * the always-required a_contents. as_find_callback does not sleep so
327 * it is ok to call it if more locks than a_contents (i.e. the a_lock
328 * rwlock) are held. as_execute_callback on the other hand may sleep
329 * so all locks beyond a_contents must be dropped by the caller if one
330 * does not want to end comatose.
331 */
332 static int
333 as_do_callbacks(struct as *as, uint_t events, caddr_t event_addr,
334 size_t event_len)
335 {
336 struct as_callback *cb;
337
338 if ((cb = as_find_callback(as, events, event_addr, event_len))) {
339 as_execute_callback(as, cb, events);
340 return (-1);
341 }
342 return (0);
343 }
344
345 /*
346 * Search for the segment containing addr. If a segment containing addr
347 * exists, that segment is returned. If no such segment exists, and
348 * the list spans addresses greater than addr, then the first segment
349 * whose base is greater than addr is returned; otherwise, NULL is
350 * returned unless tail is true, in which case the last element of the
351 * list is returned.
352 *
353 * a_seglast is used to cache the last found segment for repeated
354 * searches to the same addr (which happens frequently).
355 */
356 struct seg *
357 as_findseg(struct as *as, caddr_t addr, int tail)
358 {
359 struct seg *seg = as->a_seglast;
360 avl_index_t where;
361
362 ASSERT(AS_LOCK_HELD(as));
363
364 if (seg != NULL &&
365 seg->s_base <= addr &&
366 addr < seg->s_base + seg->s_size)
367 return (seg);
368
369 seg = avl_find(&as->a_segtree, &addr, &where);
370 if (seg != NULL)
371 return (as->a_seglast = seg);
372
373 seg = avl_nearest(&as->a_segtree, where, AVL_AFTER);
374 if (seg == NULL && tail)
375 seg = avl_last(&as->a_segtree);
376 return (as->a_seglast = seg);
377 }
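/*
 * Worked example of the lookup rules above (addresses made up): with
 * segments covering [0x10000, 0x20000) and [0x30000, 0x40000),
 * as_findseg(as, (caddr_t)0x15000, 0) returns the first segment (it
 * contains the address), as_findseg(as, (caddr_t)0x25000, 0) returns the
 * second segment (the first one whose base is greater than the address),
 * and as_findseg(as, (caddr_t)0x50000, 0) returns NULL, while passing
 * tail = 1 for that last address returns the second (last) segment.
 */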
378
379 #ifdef VERIFY_SEGLIST
380 /*
381 * verify that the linked list is coherent
382 */
383 static void
384 as_verify(struct as *as)
385 {
386 struct seg *seg, *seglast, *p, *n;
387 uint_t nsegs = 0;
388
389 if (do_as_verify == 0)
390 return;
391
392 seglast = as->a_seglast;
393
394 for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
395 ASSERT(seg->s_as == as);
396 p = AS_SEGPREV(as, seg);
397 n = AS_SEGNEXT(as, seg);
398 ASSERT(p == NULL || p->s_as == as);
399 ASSERT(p == NULL || p->s_base < seg->s_base);
400 ASSERT(n == NULL || n->s_base > seg->s_base);
401 ASSERT(n != NULL || seg == avl_last(&as->a_segtree));
402 if (seg == seglast)
403 seglast = NULL;
404 nsegs++;
405 }
406 ASSERT(seglast == NULL);
407 ASSERT(avl_numnodes(&as->a_segtree) == nsegs);
408 }
409 #endif /* VERIFY_SEGLIST */
410
411 /*
412 * Add a new segment to the address space. The avl_find()
413 * may be expensive so we attempt to use last segment accessed
414 * in as_gap() as an insertion point.
415 */
416 int
417 as_addseg(struct as *as, struct seg *newseg)
418 {
419 struct seg *seg;
420 caddr_t addr;
421 caddr_t eaddr;
422 avl_index_t where;
423
424 ASSERT(AS_WRITE_HELD(as));
425
426 as->a_updatedir = 1; /* inform /proc */
427 gethrestime(&as->a_updatetime);
428
429 if (as->a_lastgaphl != NULL) {
430 struct seg *hseg = NULL;
431 struct seg *lseg = NULL;
432
433 if (as->a_lastgaphl->s_base > newseg->s_base) {
434 hseg = as->a_lastgaphl;
435 lseg = AVL_PREV(&as->a_segtree, hseg);
436 } else {
437 lseg = as->a_lastgaphl;
438 hseg = AVL_NEXT(&as->a_segtree, lseg);
439 }
440
441 if (hseg && lseg && lseg->s_base < newseg->s_base &&
442 hseg->s_base > newseg->s_base) {
443 avl_insert_here(&as->a_segtree, newseg, lseg,
444 AVL_AFTER);
445 as->a_lastgaphl = NULL;
446 as->a_seglast = newseg;
447 return (0);
448 }
449 as->a_lastgaphl = NULL;
450 }
451
452 addr = newseg->s_base;
453 eaddr = addr + newseg->s_size;
454 again:
455
456 seg = avl_find(&as->a_segtree, &addr, &where);
457
458 if (seg == NULL)
459 seg = avl_nearest(&as->a_segtree, where, AVL_AFTER);
460
461 if (seg == NULL)
462 seg = avl_last(&as->a_segtree);
463
464 if (seg != NULL) {
465 caddr_t base = seg->s_base;
466
467 /*
468 * If top of seg is below the requested address, then
469 * the insertion point is at the end of the linked list,
470 * and seg points to the tail of the list. Otherwise,
471 * the insertion point is immediately before seg.
472 */
473 if (base + seg->s_size > addr) {
474 if (addr >= base || eaddr > base) {
475 #ifdef __sparc
476 extern struct seg_ops segnf_ops;
477
478 /*
479 * no-fault segs must disappear if overlaid.
480 * XXX need new segment type so
481 * we don't have to check s_ops
482 */
483 if (seg->s_ops == &segnf_ops) {
484 seg_unmap(seg);
485 goto again;
486 }
487 #endif
488 return (-1); /* overlapping segment */
489 }
490 }
491 }
492 as->a_seglast = newseg;
493 avl_insert(&as->a_segtree, newseg, where);
494
495 #ifdef VERIFY_SEGLIST
496 as_verify(as);
497 #endif
498 return (0);
499 }
500
501 struct seg *
502 as_removeseg(struct as *as, struct seg *seg)
503 {
504 avl_tree_t *t;
505
506 ASSERT(AS_WRITE_HELD(as));
507
508 as->a_updatedir = 1; /* inform /proc */
509 gethrestime(&as->a_updatetime);
510
511 if (seg == NULL)
512 return (NULL);
513
514 t = &as->a_segtree;
515 if (as->a_seglast == seg)
516 as->a_seglast = NULL;
517 as->a_lastgaphl = NULL;
518
519 /*
520 * if this segment is at an address higher than
521 * a_lastgap, set a_lastgap to the next segment (NULL if last segment)
522 */
523 if (as->a_lastgap &&
524 (seg == as->a_lastgap || seg->s_base > as->a_lastgap->s_base))
525 as->a_lastgap = AVL_NEXT(t, seg);
526
527 /*
528 * remove the segment from the seg tree
529 */
530 avl_remove(t, seg);
531
532 #ifdef VERIFY_SEGLIST
533 as_verify(as);
534 #endif
535 return (seg);
536 }
537
538 /*
539 * Find a segment containing addr.
540 */
541 struct seg *
542 as_segat(struct as *as, caddr_t addr)
543 {
544 struct seg *seg = as->a_seglast;
545
546 ASSERT(AS_LOCK_HELD(as));
547
548 if (seg != NULL && seg->s_base <= addr &&
549 addr < seg->s_base + seg->s_size)
550 return (seg);
551
552 seg = avl_find(&as->a_segtree, &addr, NULL);
553 return (seg);
554 }
555
556 /*
557 * Serialize all searches for holes in an address space to
558 * prevent two or more threads from allocating the same virtual
559 * address range. The address space must not be "read/write"
560 * locked by the caller since we may block.
561 */
562 void
563 as_rangelock(struct as *as)
564 {
565 mutex_enter(&as->a_contents);
566 while (AS_ISCLAIMGAP(as))
567 cv_wait(&as->a_cv, &as->a_contents);
568 AS_SETCLAIMGAP(as);
569 mutex_exit(&as->a_contents);
570 }
571
572 /*
573 * Release hold on a_state & AS_CLAIMGAP and signal any other blocked threads.
574 */
575 void
576 as_rangeunlock(struct as *as)
577 {
578 mutex_enter(&as->a_contents);
579 AS_CLRCLAIMGAP(as);
580 cv_signal(&as->a_cv);
581 mutex_exit(&as->a_contents);
582 }
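/*
 * Illustrative sketch (simplified; the real users are the mmap()/shmat()
 * address-selection paths): the hole search and the subsequent mapping
 * are bracketed by the range lock so that two threads cannot claim the
 * same gap.  "lo", "hi", "size" and "crargs" below are assumed to be
 * set up by the caller.
 *
 *	as_rangelock(as);
 *	base = (caddr_t)lo;
 *	len = (size_t)(hi - lo);
 *	if (as_gap(as, size, &base, &len, AH_LO, NULL) == 0)
 *		error = as_map(as, base, size, segvn_create, &crargs);
 *	else
 *		error = ENOMEM;
 *	as_rangeunlock(as);
 */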
583
584 /*
585  * compare segments (or just an address) by segment address range
586 */
587 static int
588 as_segcompar(const void *x, const void *y)
589 {
590 struct seg *a = (struct seg *)x;
591 struct seg *b = (struct seg *)y;
592
593 if (a->s_base < b->s_base)
594 return (-1);
595 if (a->s_base >= b->s_base + b->s_size)
596 return (1);
597 return (0);
598 }
599
600
601 void
602 as_avlinit(struct as *as)
603 {
604 avl_create(&as->a_segtree, as_segcompar, sizeof (struct seg),
605 offsetof(struct seg, s_tree));
606 avl_create(&as->a_wpage, wp_compare, sizeof (struct watched_page),
607 offsetof(struct watched_page, wp_link));
608 }
609
610 /*ARGSUSED*/
611 static int
612 as_constructor(void *buf, void *cdrarg, int kmflags)
613 {
614 struct as *as = buf;
615
616 mutex_init(&as->a_contents, NULL, MUTEX_DEFAULT, NULL);
617 cv_init(&as->a_cv, NULL, CV_DEFAULT, NULL);
618 rw_init(&as->a_lock, NULL, RW_DEFAULT, NULL);
619 as_avlinit(as);
620 return (0);
621 }
622
623 /*ARGSUSED1*/
624 static void
625 as_destructor(void *buf, void *cdrarg)
626 {
627 struct as *as = buf;
628
629 avl_destroy(&as->a_segtree);
630 mutex_destroy(&as->a_contents);
631 cv_destroy(&as->a_cv);
632 rw_destroy(&as->a_lock);
633 }
634
635 void
636 as_init(void)
637 {
638 as_cache = kmem_cache_create("as_cache", sizeof (struct as), 0,
639 as_constructor, as_destructor, NULL, NULL, NULL, 0);
640 }
641
642 /*
643 * Allocate and initialize an address space data structure.
644 * We call hat_alloc to allow any machine dependent
645 * information in the hat structure to be initialized.
646 */
647 struct as *
648 as_alloc(void)
649 {
650 struct as *as;
651
652 as = kmem_cache_alloc(as_cache, KM_SLEEP);
653
654 as->a_flags = 0;
655 as->a_vbits = 0;
656 as->a_hrm = NULL;
657 as->a_seglast = NULL;
658 as->a_size = 0;
659 as->a_resvsize = 0;
660 as->a_updatedir = 0;
661 gethrestime(&as->a_updatetime);
662 as->a_objectdir = NULL;
663 as->a_sizedir = 0;
664 as->a_userlimit = (caddr_t)USERLIMIT;
665 as->a_lastgap = NULL;
666 as->a_lastgaphl = NULL;
667 as->a_callbacks = NULL;
668
669 AS_LOCK_ENTER(as, RW_WRITER);
670 as->a_hat = hat_alloc(as); /* create hat for default system mmu */
671 AS_LOCK_EXIT(as);
672
673 return (as);
674 }
675
676 /*
677 * Free an address space data structure.
678 * Need to free the hat first and then
679 * all the segments on this as and finally
680 * the space for the as struct itself.
681 */
682 void
683 as_free(struct as *as)
684 {
685 struct hat *hat = as->a_hat;
686 struct seg *seg, *next;
687 boolean_t free_started = B_FALSE;
688
689 top:
690 /*
691 * Invoke ALL callbacks. as_do_callbacks will do one callback
692 * per call, and not return (-1) until the callback has completed.
693 * When as_do_callbacks returns zero, all callbacks have completed.
694 */
695 mutex_enter(&as->a_contents);
696 while (as->a_callbacks && as_do_callbacks(as, AS_ALL_EVENT, 0, 0))
697 ;
698
699 mutex_exit(&as->a_contents);
700 AS_LOCK_ENTER(as, RW_WRITER);
701
702 if (!free_started) {
703 free_started = B_TRUE;
704 hat_free_start(hat);
705 }
706 for (seg = AS_SEGFIRST(as); seg != NULL; seg = next) {
707 int err;
708
709 next = AS_SEGNEXT(as, seg);
710 retry:
711 err = SEGOP_UNMAP(seg, seg->s_base, seg->s_size);
712 if (err == EAGAIN) {
713 mutex_enter(&as->a_contents);
714 if (as->a_callbacks) {
715 AS_LOCK_EXIT(as);
716 } else if (!AS_ISNOUNMAPWAIT(as)) {
717 /*
718 * Memory is currently locked. Wait for a
719 * cv_signal that it has been unlocked, then
720 * try the operation again.
721 */
722 if (AS_ISUNMAPWAIT(as) == 0)
723 cv_broadcast(&as->a_cv);
724 AS_SETUNMAPWAIT(as);
725 AS_LOCK_EXIT(as);
726 while (AS_ISUNMAPWAIT(as))
727 cv_wait(&as->a_cv, &as->a_contents);
728 } else {
729 /*
730 * We may have raced with
731 * segvn_reclaim()/segspt_reclaim(). In this
732 * case clean nounmapwait flag and retry since
733 * softlockcnt in this segment may be already
734 * 0. We don't drop as writer lock so our
735 * number of retries without sleeping should
736 * be very small. See segvn_reclaim() for
737 * more comments.
738 */
739 AS_CLRNOUNMAPWAIT(as);
740 mutex_exit(&as->a_contents);
741 goto retry;
742 }
743 mutex_exit(&as->a_contents);
744 goto top;
745 } else {
746 /*
747 * We do not expect any other error return at this
748 * time. This is similar to an ASSERT in seg_unmap()
749 */
750 ASSERT(err == 0);
751 }
752 }
753 hat_free_end(hat);
754 AS_LOCK_EXIT(as);
755
756 /* /proc stuff */
757 ASSERT(avl_numnodes(&as->a_wpage) == 0);
758 if (as->a_objectdir) {
759 kmem_free(as->a_objectdir, as->a_sizedir * sizeof (vnode_t *));
760 as->a_objectdir = NULL;
761 as->a_sizedir = 0;
762 }
763
764 /*
765 * Free the struct as back to kmem. Assert it has no segments.
766 */
767 ASSERT(avl_numnodes(&as->a_segtree) == 0);
768 kmem_cache_free(as_cache, as);
769 }
770
771 int
772 as_dup(struct as *as, struct proc *forkedproc)
773 {
774 struct as *newas;
775 struct seg *seg, *newseg;
776 size_t purgesize = 0;
777 int error;
778
779 AS_LOCK_ENTER(as, RW_WRITER);
780 as_clearwatch(as);
781 newas = as_alloc();
782 newas->a_userlimit = as->a_userlimit;
783 newas->a_proc = forkedproc;
784
785 AS_LOCK_ENTER(newas, RW_WRITER);
786
787 (void) hat_dup(as->a_hat, newas->a_hat, NULL, 0, HAT_DUP_SRD);
788
789 for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
790
791 if (seg->s_flags & S_PURGE) {
792 purgesize += seg->s_size;
793 continue;
794 }
795
796 newseg = seg_alloc(newas, seg->s_base, seg->s_size);
797 if (newseg == NULL) {
798 AS_LOCK_EXIT(newas);
799 as_setwatch(as);
800 AS_LOCK_EXIT(as);
801 as_free(newas);
802 return (-1);
803 }
804 if ((error = SEGOP_DUP(seg, newseg)) != 0) {
805 /*
806 * We call seg_free() on the new seg
807 * because the segment is not set up
808 * completely; i.e. it has no ops.
809 */
810 as_setwatch(as);
811 AS_LOCK_EXIT(as);
812 seg_free(newseg);
813 AS_LOCK_EXIT(newas);
814 as_free(newas);
815 return (error);
816 }
817 newas->a_size += seg->s_size;
818 }
819 newas->a_resvsize = as->a_resvsize - purgesize;
820
821 error = hat_dup(as->a_hat, newas->a_hat, NULL, 0, HAT_DUP_ALL);
822
823 AS_LOCK_EXIT(newas);
824
825 as_setwatch(as);
826 AS_LOCK_EXIT(as);
827 if (error != 0) {
828 as_free(newas);
829 return (error);
830 }
831 forkedproc->p_as = newas;
832 return (0);
833 }
834
835 /*
836 * Handle a ``fault'' at addr for size bytes.
837 */
838 faultcode_t
839 as_fault(struct hat *hat, struct as *as, caddr_t addr, size_t size,
840 enum fault_type type, enum seg_rw rw)
841 {
842 struct seg *seg;
843 caddr_t raddr; /* rounded down addr */
844 size_t rsize; /* rounded up size */
845 size_t ssize;
846 faultcode_t res = 0;
847 caddr_t addrsav;
848 struct seg *segsav;
849 int as_lock_held;
850 klwp_t *lwp = ttolwp(curthread);
851
852
853
854 retry:
855 /*
856 * Indicate that the lwp is not to be stopped while waiting for a
857 * pagefault. This is to avoid deadlock while debugging a process
858 * via /proc over NFS (in particular).
859 */
860 if (lwp != NULL)
861 lwp->lwp_nostop++;
862
863 /*
864 * same length must be used when we softlock and softunlock. We
865 * don't support softunlocking lengths less than the original length
866 * when there is largepage support. See seg_dev.c for more
867 * comments.
868 */
869 switch (type) {
870
871 case F_SOFTLOCK:
872 CPU_STATS_ADD_K(vm, softlock, 1);
873 break;
874
875 case F_SOFTUNLOCK:
876 break;
877
878 case F_PROT:
879 CPU_STATS_ADD_K(vm, prot_fault, 1);
880 break;
881
882 case F_INVAL:
883 CPU_STATS_ENTER_K();
884 CPU_STATS_ADDQ(CPU, vm, as_fault, 1);
885 if (as == &kas)
886 CPU_STATS_ADDQ(CPU, vm, kernel_asflt, 1);
887 CPU_STATS_EXIT_K();
888 break;
889 }
890
891 /* Kernel probe */
892 TNF_PROBE_3(address_fault, "vm pagefault", /* CSTYLED */,
893 tnf_opaque, address, addr,
894 tnf_fault_type, fault_type, type,
895 tnf_seg_access, access, rw);
896
897 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
898 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
899 (size_t)raddr;
900
901 /*
902 * XXX -- Don't grab the as lock for segkmap. We should grab it for
903 * correctness, but then we could be stuck holding this lock for
904 * a LONG time if the fault needs to be resolved on a slow
905 * filesystem, and then no-one will be able to exec new commands,
906 * as exec'ing requires the write lock on the as.
907 */
908 if (as == &kas && segkmap && segkmap->s_base <= raddr &&
909 raddr + size < segkmap->s_base + segkmap->s_size) {
910 seg = segkmap;
911 as_lock_held = 0;
912 } else {
913 AS_LOCK_ENTER(as, RW_READER);
914
915 seg = as_segat(as, raddr);
916 if (seg == NULL) {
917 AS_LOCK_EXIT(as);
918 if (lwp != NULL)
919 lwp->lwp_nostop--;
920 return (FC_NOMAP);
921 }
922
923 as_lock_held = 1;
924 }
925
926 addrsav = raddr;
927 segsav = seg;
928
929 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
930 if (raddr >= seg->s_base + seg->s_size) {
931 seg = AS_SEGNEXT(as, seg);
932 if (seg == NULL || raddr != seg->s_base) {
933 res = FC_NOMAP;
934 break;
935 }
936 }
937 if (raddr + rsize > seg->s_base + seg->s_size)
938 ssize = seg->s_base + seg->s_size - raddr;
939 else
940 ssize = rsize;
941
942 res = SEGOP_FAULT(hat, seg, raddr, ssize, type, rw);
943 if (res != 0)
944 break;
945 }
946
947 /*
948 * If we were SOFTLOCKing and encountered a failure,
949 * we must SOFTUNLOCK the range we already did. (Maybe we
950 * should just panic if we are SOFTLOCKing or even SOFTUNLOCKing
951 * right here...)
952 */
953 if (res != 0 && type == F_SOFTLOCK) {
954 for (seg = segsav; addrsav < raddr; addrsav += ssize) {
955 if (addrsav >= seg->s_base + seg->s_size)
956 seg = AS_SEGNEXT(as, seg);
957 ASSERT(seg != NULL);
958 /*
959 * Now call the fault routine again to perform the
960 * unlock using S_OTHER instead of the rw variable
961 * since we never got a chance to touch the pages.
962 */
963 if (raddr > seg->s_base + seg->s_size)
964 ssize = seg->s_base + seg->s_size - addrsav;
965 else
966 ssize = raddr - addrsav;
967 (void) SEGOP_FAULT(hat, seg, addrsav, ssize,
968 F_SOFTUNLOCK, S_OTHER);
969 }
970 }
971 if (as_lock_held)
972 AS_LOCK_EXIT(as);
973 if (lwp != NULL)
974 lwp->lwp_nostop--;
975
976 /*
977 * If the lower levels returned EDEADLK for a fault,
978  * it means that we should retry the fault. Let's wait
979 * a bit also to let the deadlock causing condition clear.
980 * This is part of a gross hack to work around a design flaw
981 * in the ufs/sds logging code and should go away when the
982 * logging code is re-designed to fix the problem. See bug
983 * 4125102 for details of the problem.
984 */
985 if (FC_ERRNO(res) == EDEADLK) {
986 delay(deadlk_wait);
987 res = 0;
988 goto retry;
989 }
990 return (res);
991 }
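/*
 * Illustrative sketch (simplified from the machine-dependent trap and
 * pagefault paths, which are the real callers; "write_fault" is a
 * stand-in for the decoded access type): a page-not-present fault on a
 * user address is resolved by faulting in one byte at the faulting
 * address for the appropriate kind of access.
 *
 *	res = as_fault(as->a_hat, as, (caddr_t)addr, 1, F_INVAL,
 *	    write_fault ? S_WRITE : S_READ);
 *	if (res != 0)
 *		... map FC_CODE(res)/FC_ERRNO(res) to a signal ...
 */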
992
993
994
995 /*
996 * Asynchronous ``fault'' at addr for size bytes.
997 */
998 faultcode_t
999 as_faulta(struct as *as, caddr_t addr, size_t size)
1000 {
1001 struct seg *seg;
1002 caddr_t raddr; /* rounded down addr */
1003 size_t rsize; /* rounded up size */
1004 faultcode_t res = 0;
1005 klwp_t *lwp = ttolwp(curthread);
1006
1007 retry:
1008 /*
1009 * Indicate that the lwp is not to be stopped while waiting
1010 * for a pagefault. This is to avoid deadlock while debugging
1011 * a process via /proc over NFS (in particular).
1012 */
1013 if (lwp != NULL)
1014 lwp->lwp_nostop++;
1015
1016 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1017 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
1018 (size_t)raddr;
1019
1020 AS_LOCK_ENTER(as, RW_READER);
1021 seg = as_segat(as, raddr);
1022 if (seg == NULL) {
1023 AS_LOCK_EXIT(as);
1024 if (lwp != NULL)
1025 lwp->lwp_nostop--;
1026 return (FC_NOMAP);
1027 }
1028
1029 for (; rsize != 0; rsize -= PAGESIZE, raddr += PAGESIZE) {
1030 if (raddr >= seg->s_base + seg->s_size) {
1031 seg = AS_SEGNEXT(as, seg);
1032 if (seg == NULL || raddr != seg->s_base) {
1033 res = FC_NOMAP;
1034 break;
1035 }
1036 }
1037 res = SEGOP_FAULTA(seg, raddr);
1038 if (res != 0)
1039 break;
1040 }
1041 AS_LOCK_EXIT(as);
1042 if (lwp != NULL)
1043 lwp->lwp_nostop--;
1044 /*
1045 * If the lower levels returned EDEADLK for a fault,
1046  * it means that we should retry the fault. Let's wait
1047 * a bit also to let the deadlock causing condition clear.
1048 * This is part of a gross hack to work around a design flaw
1049 * in the ufs/sds logging code and should go away when the
1050 * logging code is re-designed to fix the problem. See bug
1051 * 4125102 for details of the problem.
1052 */
1053 if (FC_ERRNO(res) == EDEADLK) {
1054 delay(deadlk_wait);
1055 res = 0;
1056 goto retry;
1057 }
1058 return (res);
1059 }
1060
1061 /*
1062 * Set the virtual mapping for the interval from [addr : addr + size)
1063 * in address space `as' to have the specified protection.
1064 * It is ok for the range to cross over several segments,
1065 * as long as they are contiguous.
1066 */
1067 int
1068 as_setprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
1069 {
1070 struct seg *seg;
1071 struct as_callback *cb;
1072 size_t ssize;
1073 caddr_t raddr; /* rounded down addr */
1074 size_t rsize; /* rounded up size */
1075 int error = 0, writer = 0;
1076 caddr_t saveraddr;
1077 size_t saversize;
1078
1079 setprot_top:
1080 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1081 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
1082 (size_t)raddr;
1083
1084 if (raddr + rsize < raddr) /* check for wraparound */
1085 return (ENOMEM);
1086
1087 saveraddr = raddr;
1088 saversize = rsize;
1089
1090 /*
1091 * Normally we only lock the as as a reader. But
1092 * if due to setprot the segment driver needs to split
1093 * a segment it will return IE_RETRY. Therefore we re-acquire
1094 * the as lock as a writer so the segment driver can change
1095 * the seg list. Also the segment driver will return IE_RETRY
1096 * after it has changed the segment list so we therefore keep
1097  * locking as a writer. Since these operations should be rare, we
1098  * want to only lock as a writer when necessary.
1099 */
1100 if (writer || avl_numnodes(&as->a_wpage) != 0) {
1101 AS_LOCK_ENTER(as, RW_WRITER);
1102 } else {
1103 AS_LOCK_ENTER(as, RW_READER);
1104 }
1105
1106 as_clearwatchprot(as, raddr, rsize);
1107 seg = as_segat(as, raddr);
1108 if (seg == NULL) {
1109 as_setwatch(as);
1110 AS_LOCK_EXIT(as);
1111 return (ENOMEM);
1112 }
1113
1114 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
1115 if (raddr >= seg->s_base + seg->s_size) {
1116 seg = AS_SEGNEXT(as, seg);
1117 if (seg == NULL || raddr != seg->s_base) {
1118 error = ENOMEM;
1119 break;
1120 }
1121 }
1122 if ((raddr + rsize) > (seg->s_base + seg->s_size))
1123 ssize = seg->s_base + seg->s_size - raddr;
1124 else
1125 ssize = rsize;
1126 retry:
1127 error = SEGOP_SETPROT(seg, raddr, ssize, prot);
1128
1129 if (error == IE_NOMEM) {
1130 error = EAGAIN;
1131 break;
1132 }
1133
1134 if (error == IE_RETRY) {
1135 AS_LOCK_EXIT(as);
1136 writer = 1;
1137 goto setprot_top;
1138 }
1139
1140 if (error == EAGAIN) {
1141 /*
1142 * Make sure we have a_lock as writer.
1143 */
1144 if (writer == 0) {
1145 AS_LOCK_EXIT(as);
1146 writer = 1;
1147 goto setprot_top;
1148 }
1149
1150 /*
1151 * Memory is currently locked. It must be unlocked
1152 * before this operation can succeed through a retry.
1153 * The possible reasons for locked memory and
1154 * corresponding strategies for unlocking are:
1155 * (1) Normal I/O
1156 * wait for a signal that the I/O operation
1157 * has completed and the memory is unlocked.
1158 * (2) Asynchronous I/O
1159 * The aio subsystem does not unlock pages when
1160 * the I/O is completed. Those pages are unlocked
1161 * when the application calls aiowait/aioerror.
1162 * So, to prevent blocking forever, cv_broadcast()
1163 * is done to wake up aio_cleanup_thread.
1164 * Subsequently, segvn_reclaim will be called, and
1165 * that will do AS_CLRUNMAPWAIT() and wake us up.
1166 * (3) Long term page locking:
1167 * Drivers intending to have pages locked for a
1168 * period considerably longer than for normal I/O
1169 * (essentially forever) may have registered for a
1170 * callback so they may unlock these pages on
1171 * request. This is needed to allow this operation
1172 * to succeed. Each entry on the callback list is
1173 * examined. If the event or address range pertains
1174 * the callback is invoked (unless it already is in
1175 * progress). The a_contents lock must be dropped
1176 * before the callback, so only one callback can
1177 * be done at a time. Go to the top and do more
1178 * until zero is returned. If zero is returned,
1179 * either there were no callbacks for this event
1180 * or they were already in progress.
1181 */
1182 mutex_enter(&as->a_contents);
1183 if (as->a_callbacks &&
1184 (cb = as_find_callback(as, AS_SETPROT_EVENT,
1185 seg->s_base, seg->s_size))) {
1186 AS_LOCK_EXIT(as);
1187 as_execute_callback(as, cb, AS_SETPROT_EVENT);
1188 } else if (!AS_ISNOUNMAPWAIT(as)) {
1189 if (AS_ISUNMAPWAIT(as) == 0)
1190 cv_broadcast(&as->a_cv);
1191 AS_SETUNMAPWAIT(as);
1192 AS_LOCK_EXIT(as);
1193 while (AS_ISUNMAPWAIT(as))
1194 cv_wait(&as->a_cv, &as->a_contents);
1195 } else {
1196 /*
1197 * We may have raced with
1198 * segvn_reclaim()/segspt_reclaim(). In this
1199 * case clean nounmapwait flag and retry since
1200 * softlockcnt in this segment may be already
1201 * 0. We don't drop as writer lock so our
1202 * number of retries without sleeping should
1203 * be very small. See segvn_reclaim() for
1204 * more comments.
1205 */
1206 AS_CLRNOUNMAPWAIT(as);
1207 mutex_exit(&as->a_contents);
1208 goto retry;
1209 }
1210 mutex_exit(&as->a_contents);
1211 goto setprot_top;
1212 } else if (error != 0)
1213 break;
1214 }
1215 if (error != 0) {
1216 as_setwatch(as);
1217 } else {
1218 as_setwatchprot(as, saveraddr, saversize, prot);
1219 }
1220 AS_LOCK_EXIT(as);
1221 return (error);
1222 }
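/*
 * Illustrative sketch (simplified from an mprotect()-style caller): the
 * user-supplied address and length may be passed straight in, since
 * as_setprot() does its own page rounding and returns ENOMEM for
 * unmapped portions of the range.
 *
 *	error = as_setprot(curproc->p_as, (caddr_t)uaddr, ulen,
 *	    prot | PROT_USER);
 */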
1223
1224 /*
1225 * Check to make sure that the interval [addr, addr + size)
1226 * in address space `as' has at least the specified protection.
1227 * It is ok for the range to cross over several segments, as long
1228 * as they are contiguous.
1229 */
1230 int
1231 as_checkprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
1232 {
1233 struct seg *seg;
1234 size_t ssize;
1235 caddr_t raddr; /* rounded down addr */
1236 size_t rsize; /* rounded up size */
1237 int error = 0;
1238
1239 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1240 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
1241 (size_t)raddr;
1242
1243 if (raddr + rsize < raddr) /* check for wraparound */
1244 return (ENOMEM);
1245
1246 /*
1247 * This is ugly as sin...
1248 * Normally, we only acquire the address space readers lock.
1249 * However, if the address space has watchpoints present,
1250 * we must acquire the writer lock on the address space for
1251 * the benefit of as_clearwatchprot() and as_setwatchprot().
1252 */
1253 if (avl_numnodes(&as->a_wpage) != 0)
1254 AS_LOCK_ENTER(as, RW_WRITER);
1255 else
1256 AS_LOCK_ENTER(as, RW_READER);
1257 as_clearwatchprot(as, raddr, rsize);
1258 seg = as_segat(as, raddr);
1259 if (seg == NULL) {
1260 as_setwatch(as);
1261 AS_LOCK_EXIT(as);
1262 return (ENOMEM);
1263 }
1264
1265 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
1266 if (raddr >= seg->s_base + seg->s_size) {
1267 seg = AS_SEGNEXT(as, seg);
1268 if (seg == NULL || raddr != seg->s_base) {
1269 error = ENOMEM;
1270 break;
1271 }
1272 }
1273 if ((raddr + rsize) > (seg->s_base + seg->s_size))
1274 ssize = seg->s_base + seg->s_size - raddr;
1275 else
1276 ssize = rsize;
1277
1278 error = SEGOP_CHECKPROT(seg, raddr, ssize, prot);
1279 if (error != 0)
1280 break;
1281 }
1282 as_setwatch(as);
1283 AS_LOCK_EXIT(as);
1284 return (error);
1285 }
1286
1287 int
1288 as_unmap(struct as *as, caddr_t addr, size_t size)
1289 {
1290 struct seg *seg, *seg_next;
1291 struct as_callback *cb;
1292 caddr_t raddr, eaddr;
1293 size_t ssize, rsize = 0;
1294 int err;
1295
1296 top:
1297 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1298 eaddr = (caddr_t)(((uintptr_t)(addr + size) + PAGEOFFSET) &
1299 (uintptr_t)PAGEMASK);
1300
1301 AS_LOCK_ENTER(as, RW_WRITER);
1302
1303 as->a_updatedir = 1; /* inform /proc */
1304 gethrestime(&as->a_updatetime);
1305
1306 /*
1307 * Use as_findseg to find the first segment in the range, then
1308 * step through the segments in order, following s_next.
1309 */
1310 as_clearwatchprot(as, raddr, eaddr - raddr);
1311
1312 for (seg = as_findseg(as, raddr, 0); seg != NULL; seg = seg_next) {
1313 if (eaddr <= seg->s_base)
1314 break; /* eaddr was in a gap; all done */
1315
1316 /* this is implied by the test above */
1317 ASSERT(raddr < eaddr);
1318
1319 if (raddr < seg->s_base)
1320 raddr = seg->s_base; /* raddr was in a gap */
1321
1322 if (eaddr > (seg->s_base + seg->s_size))
1323 ssize = seg->s_base + seg->s_size - raddr;
1324 else
1325 ssize = eaddr - raddr;
1326
1327 /*
1328 * Save next segment pointer since seg can be
1329 * destroyed during the segment unmap operation.
1330 */
1331 seg_next = AS_SEGNEXT(as, seg);
1332
1333 /*
1334 * We didn't count /dev/null mappings, so ignore them here.
1335 * We'll handle MAP_NORESERVE cases in segvn_unmap(). (Again,
1336 * we have to do this check here while we have seg.)
1337 */
1338 rsize = 0;
1339 if (!SEG_IS_DEVNULL_MAPPING(seg) &&
1340 !SEG_IS_PARTIAL_RESV(seg))
1341 rsize = ssize;
1342
1343 retry:
1344 err = SEGOP_UNMAP(seg, raddr, ssize);
1345 if (err == EAGAIN) {
1346 /*
1347 * Memory is currently locked. It must be unlocked
1348 * before this operation can succeed through a retry.
1349 * The possible reasons for locked memory and
1350 * corresponding strategies for unlocking are:
1351 * (1) Normal I/O
1352 * wait for a signal that the I/O operation
1353 * has completed and the memory is unlocked.
1354 * (2) Asynchronous I/O
1355 * The aio subsystem does not unlock pages when
1356 * the I/O is completed. Those pages are unlocked
1357 * when the application calls aiowait/aioerror.
1358 * So, to prevent blocking forever, cv_broadcast()
1359 * is done to wake up aio_cleanup_thread.
1360 * Subsequently, segvn_reclaim will be called, and
1361 * that will do AS_CLRUNMAPWAIT() and wake us up.
1362 * (3) Long term page locking:
1363 * Drivers intending to have pages locked for a
1364 * period considerably longer than for normal I/O
1365 * (essentially forever) may have registered for a
1366 * callback so they may unlock these pages on
1367 * request. This is needed to allow this operation
1368 * to succeed. Each entry on the callback list is
1369 * examined. If the event or address range pertains
1370 * the callback is invoked (unless it already is in
1371 * progress). The a_contents lock must be dropped
1372 * before the callback, so only one callback can
1373 * be done at a time. Go to the top and do more
1374 * until zero is returned. If zero is returned,
1375 * either there were no callbacks for this event
1376 * or they were already in progress.
1377 */
1378 mutex_enter(&as->a_contents);
1379 if (as->a_callbacks &&
1380 (cb = as_find_callback(as, AS_UNMAP_EVENT,
1381 seg->s_base, seg->s_size))) {
1382 AS_LOCK_EXIT(as);
1383 as_execute_callback(as, cb, AS_UNMAP_EVENT);
1384 } else if (!AS_ISNOUNMAPWAIT(as)) {
1385 if (AS_ISUNMAPWAIT(as) == 0)
1386 cv_broadcast(&as->a_cv);
1387 AS_SETUNMAPWAIT(as);
1388 AS_LOCK_EXIT(as);
1389 while (AS_ISUNMAPWAIT(as))
1390 cv_wait(&as->a_cv, &as->a_contents);
1391 } else {
1392 /*
1393 * We may have raced with
1394 * segvn_reclaim()/segspt_reclaim(). In this
1395 * case clean nounmapwait flag and retry since
1396 * softlockcnt in this segment may be already
1397 * 0. We don't drop as writer lock so our
1398 * number of retries without sleeping should
1399 * be very small. See segvn_reclaim() for
1400 * more comments.
1401 */
1402 AS_CLRNOUNMAPWAIT(as);
1403 mutex_exit(&as->a_contents);
1404 goto retry;
1405 }
1406 mutex_exit(&as->a_contents);
1407 goto top;
1408 } else if (err == IE_RETRY) {
1409 AS_LOCK_EXIT(as);
1410 goto top;
1411 } else if (err) {
1412 as_setwatch(as);
1413 AS_LOCK_EXIT(as);
1414 return (-1);
1415 }
1416
1417 as->a_size -= ssize;
1418 if (rsize)
1419 as->a_resvsize -= rsize;
1420 raddr += ssize;
1421 }
1422 AS_LOCK_EXIT(as);
1423 return (0);
1424 }
1425
1426 static int
1427 as_map_segvn_segs(struct as *as, caddr_t addr, size_t size, uint_t szcvec,
1428 int (*crfp)(), struct segvn_crargs *vn_a, int *segcreated)
1429 {
1430 uint_t szc;
1431 uint_t nszc;
1432 int error;
1433 caddr_t a;
1434 caddr_t eaddr;
1435 size_t segsize;
1436 struct seg *seg;
1437 size_t pgsz;
1438 int do_off = (vn_a->vp != NULL || vn_a->amp != NULL);
1439 uint_t save_szcvec;
1440
1441 ASSERT(AS_WRITE_HELD(as));
1442 ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
1443 ASSERT(IS_P2ALIGNED(size, PAGESIZE));
1444 ASSERT(vn_a->vp == NULL || vn_a->amp == NULL);
1445 if (!do_off) {
1446 vn_a->offset = 0;
1447 }
1448
1449 if (szcvec <= 1) {
1450 seg = seg_alloc(as, addr, size);
1451 if (seg == NULL) {
1452 return (ENOMEM);
1453 }
1454 vn_a->szc = 0;
1455 error = (*crfp)(seg, vn_a);
1456 if (error != 0) {
1457 seg_free(seg);
1458 } else {
1459 as->a_size += size;
1460 as->a_resvsize += size;
1461 }
1462 return (error);
1463 }
1464
1465 eaddr = addr + size;
1466 save_szcvec = szcvec;
1467 szcvec >>= 1;
1468 szc = 0;
1469 nszc = 0;
1470 while (szcvec) {
1471 if ((szcvec & 0x1) == 0) {
1472 nszc++;
1473 szcvec >>= 1;
1474 continue;
1475 }
1476 nszc++;
1477 pgsz = page_get_pagesize(nszc);
1478 a = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
1479 if (a != addr) {
1480 ASSERT(a < eaddr);
1481 segsize = a - addr;
1482 seg = seg_alloc(as, addr, segsize);
1483 if (seg == NULL) {
1484 return (ENOMEM);
1485 }
1486 vn_a->szc = szc;
1487 error = (*crfp)(seg, vn_a);
1488 if (error != 0) {
1489 seg_free(seg);
1490 return (error);
1491 }
1492 as->a_size += segsize;
1493 as->a_resvsize += segsize;
1494 *segcreated = 1;
1495 if (do_off) {
1496 vn_a->offset += segsize;
1497 }
1498 addr = a;
1499 }
1500 szc = nszc;
1501 szcvec >>= 1;
1502 }
1503
1504 ASSERT(addr < eaddr);
1505 szcvec = save_szcvec | 1; /* add 8K pages */
1506 while (szcvec) {
1507 a = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
1508 ASSERT(a >= addr);
1509 if (a != addr) {
1510 segsize = a - addr;
1511 seg = seg_alloc(as, addr, segsize);
1512 if (seg == NULL) {
1513 return (ENOMEM);
1514 }
1515 vn_a->szc = szc;
1516 error = (*crfp)(seg, vn_a);
1517 if (error != 0) {
1518 seg_free(seg);
1519 return (error);
1520 }
1521 as->a_size += segsize;
1522 as->a_resvsize += segsize;
1523 *segcreated = 1;
1524 if (do_off) {
1525 vn_a->offset += segsize;
1526 }
1527 addr = a;
1528 }
1529 szcvec &= ~(1 << szc);
1530 if (szcvec) {
1531 szc = highbit(szcvec) - 1;
1532 pgsz = page_get_pagesize(szc);
1533 }
1534 }
1535 ASSERT(addr == eaddr);
1536
1537 return (0);
1538 }
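/*
 * Worked example of the carving done above, assuming hypothetical page
 * sizes of 8K for szc 0 and 64K for szc 1 and szcvec = 0x3: the first
 * loop creates an 8K-szc segment for the unaligned head of the range
 * (from addr up to the first 64K boundary), and the second loop then
 * creates a 64K-szc segment for the aligned middle followed by a final
 * 8K-szc segment for whatever tail remains below eaddr.
 */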
1539
1540 static int
1541 as_map_vnsegs(struct as *as, caddr_t addr, size_t size,
1542 int (*crfp)(), struct segvn_crargs *vn_a, int *segcreated)
1543 {
1544 uint_t mapflags = vn_a->flags & (MAP_TEXT | MAP_INITDATA);
1545 int type = (vn_a->type == MAP_SHARED) ? MAPPGSZC_SHM : MAPPGSZC_PRIVM;
1546 uint_t szcvec = map_pgszcvec(addr, size, (uintptr_t)addr, mapflags,
1547 type, 0);
1548 int error;
1549 struct seg *seg;
1550 struct vattr va;
1551 u_offset_t eoff;
1552 size_t save_size = 0;
1553 extern size_t textrepl_size_thresh;
1554
1555 ASSERT(AS_WRITE_HELD(as));
1556 ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
1557 ASSERT(IS_P2ALIGNED(size, PAGESIZE));
1558 ASSERT(vn_a->vp != NULL);
1559 ASSERT(vn_a->amp == NULL);
1560
1561 again:
1562 if (szcvec <= 1) {
1563 seg = seg_alloc(as, addr, size);
1564 if (seg == NULL) {
1565 return (ENOMEM);
1566 }
1567 vn_a->szc = 0;
1568 error = (*crfp)(seg, vn_a);
1569 if (error != 0) {
1570 seg_free(seg);
1571 } else {
1572 as->a_size += size;
1573 as->a_resvsize += size;
1574 }
1575 return (error);
1576 }
1577
1578 va.va_mask = AT_SIZE;
1579 if (VOP_GETATTR(vn_a->vp, &va, ATTR_HINT, vn_a->cred, NULL) != 0) {
1580 szcvec = 0;
1581 goto again;
1582 }
1583 eoff = vn_a->offset & PAGEMASK;
1584 if (eoff >= va.va_size) {
1585 szcvec = 0;
1586 goto again;
1587 }
1588 eoff += size;
1589 if (btopr(va.va_size) < btopr(eoff)) {
1590 save_size = size;
1591 size = va.va_size - (vn_a->offset & PAGEMASK);
1592 size = P2ROUNDUP_TYPED(size, PAGESIZE, size_t);
1593 szcvec = map_pgszcvec(addr, size, (uintptr_t)addr, mapflags,
1594 type, 0);
1595 if (szcvec <= 1) {
1596 size = save_size;
1597 goto again;
1598 }
1599 }
1600
1601 if (size > textrepl_size_thresh) {
1602 vn_a->flags |= _MAP_TEXTREPL;
1603 }
1604 error = as_map_segvn_segs(as, addr, size, szcvec, crfp, vn_a,
1605 segcreated);
1606 if (error != 0) {
1607 return (error);
1608 }
1609 if (save_size) {
1610 addr += size;
1611 size = save_size - size;
1612 szcvec = 0;
1613 goto again;
1614 }
1615 return (0);
1616 }
1617
1618 /*
1619 * as_map_ansegs: shared or private anonymous memory. Note that the flags
1620  * passed to map_pgszcvec cannot be MAP_INITDATA, for anon.
1621 */
1622 static int
1623 as_map_ansegs(struct as *as, caddr_t addr, size_t size,
1624 int (*crfp)(), struct segvn_crargs *vn_a, int *segcreated)
1625 {
1626 uint_t szcvec;
1627 uchar_t type;
1628
1629 ASSERT(vn_a->type == MAP_SHARED || vn_a->type == MAP_PRIVATE);
1630 if (vn_a->type == MAP_SHARED) {
1631 type = MAPPGSZC_SHM;
1632 } else if (vn_a->type == MAP_PRIVATE) {
1633 if (vn_a->szc == AS_MAP_HEAP) {
1634 type = MAPPGSZC_HEAP;
1635 } else if (vn_a->szc == AS_MAP_STACK) {
1636 type = MAPPGSZC_STACK;
1637 } else {
1638 type = MAPPGSZC_PRIVM;
1639 }
1640 }
1641 szcvec = map_pgszcvec(addr, size, vn_a->amp == NULL ?
1642 (uintptr_t)addr : (uintptr_t)P2ROUNDUP(vn_a->offset, PAGESIZE),
1643 (vn_a->flags & MAP_TEXT), type, 0);
1644 ASSERT(AS_WRITE_HELD(as));
1645 ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
1646 ASSERT(IS_P2ALIGNED(size, PAGESIZE));
1647 ASSERT(vn_a->vp == NULL);
1648
1649 return (as_map_segvn_segs(as, addr, size, szcvec,
1650 crfp, vn_a, segcreated));
1651 }
1652
1653 int
1654 as_map(struct as *as, caddr_t addr, size_t size, int (*crfp)(), void *argsp)
1655 {
1656 AS_LOCK_ENTER(as, RW_WRITER);
1657 return (as_map_locked(as, addr, size, crfp, argsp));
1658 }
1659
1660 int
1661 as_map_locked(struct as *as, caddr_t addr, size_t size, int (*crfp)(),
1662 void *argsp)
1663 {
1664 struct seg *seg = NULL;
1665 caddr_t raddr; /* rounded down addr */
1666 size_t rsize; /* rounded up size */
1667 int error;
1668 int unmap = 0;
1669 struct proc *p = curproc;
1670 struct segvn_crargs crargs;
1671
1672 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1673 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
1674 (size_t)raddr;
1675
1676 /*
1677 * check for wrap around
1678 */
1679 if ((raddr + rsize < raddr) || (as->a_size > (ULONG_MAX - size))) {
1680 AS_LOCK_EXIT(as);
1681 return (ENOMEM);
1682 }
1683
1684 as->a_updatedir = 1; /* inform /proc */
1685 gethrestime(&as->a_updatetime);
1686
1687 if (as != &kas && as->a_size + rsize > (size_t)p->p_vmem_ctl) {
1688 AS_LOCK_EXIT(as);
1689
1690 (void) rctl_action(rctlproc_legacy[RLIMIT_VMEM], p->p_rctls, p,
1691 RCA_UNSAFE_ALL);
1692
1693 return (ENOMEM);
1694 }
1695
1696 if (AS_MAP_CHECK_VNODE_LPOOB(crfp, argsp)) {
1697 crargs = *(struct segvn_crargs *)argsp;
1698 error = as_map_vnsegs(as, raddr, rsize, crfp, &crargs, &unmap);
1699 if (error != 0) {
1700 AS_LOCK_EXIT(as);
1701 if (unmap) {
1702 (void) as_unmap(as, addr, size);
1703 }
1704 return (error);
1705 }
1706 } else if (AS_MAP_CHECK_ANON_LPOOB(crfp, argsp)) {
1707 crargs = *(struct segvn_crargs *)argsp;
1708 error = as_map_ansegs(as, raddr, rsize, crfp, &crargs, &unmap);
1709 if (error != 0) {
1710 AS_LOCK_EXIT(as);
1711 if (unmap) {
1712 (void) as_unmap(as, addr, size);
1713 }
1714 return (error);
1715 }
1716 } else {
1717 seg = seg_alloc(as, addr, size);
1718 if (seg == NULL) {
1719 AS_LOCK_EXIT(as);
1720 return (ENOMEM);
1721 }
1722
1723 error = (*crfp)(seg, argsp);
1724 if (error != 0) {
1725 seg_free(seg);
1726 AS_LOCK_EXIT(as);
1727 return (error);
1728 }
1729 /*
1730 * Add size now so as_unmap will work if as_ctl fails.
1731 */
1732 as->a_size += rsize;
1733 as->a_resvsize += rsize;
1734 }
1735
1736 as_setwatch(as);
1737
1738 /*
1739 * If the address space is locked,
1740 * establish memory locks for the new segment.
1741 */
1742 mutex_enter(&as->a_contents);
1743 if (AS_ISPGLCK(as)) {
1744 mutex_exit(&as->a_contents);
1745 AS_LOCK_EXIT(as);
1746 error = as_ctl(as, addr, size, MC_LOCK, 0, 0, NULL, 0);
1747 if (error != 0)
1748 (void) as_unmap(as, addr, size);
1749 } else {
1750 mutex_exit(&as->a_contents);
1751 AS_LOCK_EXIT(as);
1752 }
1753 return (error);
1754 }
1755
1756
1757 /*
1758 * Delete all segments in the address space marked with S_PURGE.
1759 * This is currently used for Sparc V9 nofault ASI segments (seg_nf.c).
1760 * These segments are deleted as a first step before calls to as_gap(), so
1761 * that they don't affect mmap() or shmat().
1762 */
1763 void
1764 as_purge(struct as *as)
1765 {
1766 struct seg *seg;
1767 struct seg *next_seg;
1768
1769 /*
1770  * the setting of AS_NEEDSPURGE is protected by as_rangelock(), so
1771 * no need to grab a_contents mutex for this check
1772 */
1773 if ((as->a_flags & AS_NEEDSPURGE) == 0)
1774 return;
1775
1776 AS_LOCK_ENTER(as, RW_WRITER);
1777 next_seg = NULL;
1778 seg = AS_SEGFIRST(as);
1779 while (seg != NULL) {
1780 next_seg = AS_SEGNEXT(as, seg);
1781 if (seg->s_flags & S_PURGE)
1782 SEGOP_UNMAP(seg, seg->s_base, seg->s_size);
1783 seg = next_seg;
1784 }
1785 AS_LOCK_EXIT(as);
1786
1787 mutex_enter(&as->a_contents);
1788 as->a_flags &= ~AS_NEEDSPURGE;
1789 mutex_exit(&as->a_contents);
1790 }
1791
1792 /*
1793 * Find a hole within [*basep, *basep + *lenp), which contains a mappable
1794 * range of addresses at least "minlen" long, where the base of the range is
1795 * at "off" phase from an "align" boundary and there is space for a
1796 * "redzone"-sized redzone on eithe rside of the range. Thus,
1797 * if align was 4M and off was 16k, the user wants a hole which will start
1798 * 16k into a 4M page.
1799 *
1800 * If flags specifies AH_HI, the hole will have the highest possible address
1801 * in the range. We use the as->a_lastgap field to figure out where to
1802 * start looking for a gap.
1803 *
1804 * Otherwise, the gap will have the lowest possible address.
1805 *
1806 * If flags specifies AH_CONTAIN, the hole will contain the address addr.
1807 *
1808 * If an adequate hole is found, *basep and *lenp are set to reflect the part of
1809 * the hole that is within range, and 0 is returned. On failure, -1 is returned.
1810 *
1811 * NOTE: This routine is not correct when base+len overflows caddr_t.
1812 */
1813 int
1814 as_gap_aligned(struct as *as, size_t minlen, caddr_t *basep, size_t *lenp,
1815 uint_t flags, caddr_t addr, size_t align, size_t redzone, size_t off)
1816 {
1817 caddr_t lobound = *basep;
1818 caddr_t hibound = lobound + *lenp;
1819 struct seg *lseg, *hseg;
1820 caddr_t lo, hi;
1821 int forward;
1822 caddr_t save_base;
1823 size_t save_len;
1824 size_t save_minlen;
1825 size_t save_redzone;
1826 int fast_path = 1;
1827
1828 save_base = *basep;
1829 save_len = *lenp;
1830 save_minlen = minlen;
1831 save_redzone = redzone;
1832
1833 /*
1834 * For the first pass/fast_path, just add align and redzone into
1835 * minlen since if we get an allocation, we can guarantee that it
1836 * will fit the alignment and redzone requested.
1837 * This increases the chance that hibound will be adjusted to
1838 * a_lastgap->s_base which will likely allow us to find an
1839 * acceptable hole in the address space quicker.
1840 * If we can't find a hole with this fast_path, then we look for
1841 * smaller holes in which the alignment and offset may allow
1842 * the allocation to fit.
1843 */
1844 minlen += align;
1845 minlen += 2 * redzone;
1846 redzone = 0;
1847
1848 AS_LOCK_ENTER(as, RW_READER);
1849 if (AS_SEGFIRST(as) == NULL) {
1850 if (valid_va_range_aligned(basep, lenp, minlen, flags & AH_DIR,
1851 align, redzone, off)) {
1852 AS_LOCK_EXIT(as);
1853 return (0);
1854 } else {
1855 AS_LOCK_EXIT(as);
1856 *basep = save_base;
1857 *lenp = save_len;
1858 return (-1);
1859 }
1860 }
1861
1862 retry:
1863 /*
1864 * Set up to iterate over all the inter-segment holes in the given
1865 * direction. lseg is NULL for the lowest-addressed hole and hseg is
1866 * NULL for the highest-addressed hole. If moving backwards, we reset
1867  * hseg to denote the highest-addressed segment.
1868 */
1869 forward = (flags & AH_DIR) == AH_LO;
1870 if (forward) {
1871 hseg = as_findseg(as, lobound, 1);
1872 lseg = AS_SEGPREV(as, hseg);
1873 } else {
1874
1875 /*
1876 * If allocating at least as much as the last allocation,
1877 * use a_lastgap's base as a better estimate of hibound.
1878 */
1879 if (as->a_lastgap &&
1880 minlen >= as->a_lastgap->s_size &&
1881 hibound >= as->a_lastgap->s_base)
1882 hibound = as->a_lastgap->s_base;
1883
1884 hseg = as_findseg(as, hibound, 1);
1885 if (hseg->s_base + hseg->s_size < hibound) {
1886 lseg = hseg;
1887 hseg = NULL;
1888 } else {
1889 lseg = AS_SEGPREV(as, hseg);
1890 }
1891 }
1892
1893 for (;;) {
1894 /*
1895 * Set lo and hi to the hole's boundaries. (We should really
1896 * use MAXADDR in place of hibound in the expression below,
1897 * but can't express it easily; using hibound in its place is
1898 * harmless.)
1899 */
1900 lo = (lseg == NULL) ? 0 : lseg->s_base + lseg->s_size;
1901 hi = (hseg == NULL) ? hibound : hseg->s_base;
1902 /*
1903 * If the iteration has moved past the interval from lobound
1904 * to hibound it's pointless to continue.
1905 */
1906 if ((forward && lo > hibound) || (!forward && hi < lobound))
1907 break;
1908 else if (lo > hibound || hi < lobound)
1909 goto cont;
1910 /*
1911 * Candidate hole lies at least partially within the allowable
1912 * range. Restrict it to fall completely within that range,
1913 * i.e., to [max(lo, lobound), min(hi, hibound)].
1914 */
1915 if (lo < lobound)
1916 lo = lobound;
1917 if (hi > hibound)
1918 hi = hibound;
1919 /*
1920 * Verify that the candidate hole is big enough and meets
1921 * hardware constraints. If the hole is too small, no need
1922 * to do the further checks since they will fail.
1923 */
1924 *basep = lo;
1925 *lenp = hi - lo;
1926 if (*lenp >= minlen && valid_va_range_aligned(basep, lenp,
1927 minlen, forward ? AH_LO : AH_HI, align, redzone, off) &&
1928 ((flags & AH_CONTAIN) == 0 ||
1929 (*basep <= addr && *basep + *lenp > addr))) {
1930 if (!forward)
1931 as->a_lastgap = hseg;
1932 if (hseg != NULL)
1933 as->a_lastgaphl = hseg;
1934 else
1935 as->a_lastgaphl = lseg;
1936 AS_LOCK_EXIT(as);
1937 return (0);
1938 }
1939 cont:
1940 /*
1941 * Move to the next hole.
1942 */
1943 if (forward) {
1944 lseg = hseg;
1945 if (lseg == NULL)
1946 break;
1947 hseg = AS_SEGNEXT(as, hseg);
1948 } else {
1949 hseg = lseg;
1950 if (hseg == NULL)
1951 break;
1952 lseg = AS_SEGPREV(as, lseg);
1953 }
1954 }
1955 if (fast_path && (align != 0 || save_redzone != 0)) {
1956 fast_path = 0;
1957 minlen = save_minlen;
1958 redzone = save_redzone;
1959 goto retry;
1960 }
1961 *basep = save_base;
1962 *lenp = save_len;
1963 AS_LOCK_EXIT(as);
1964 return (-1);
1965 }
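
/*
 * A minimal, hypothetical sketch of how a caller might use as_gap_aligned()
 * to find space for a mapping that must start 16K into a 4M page.  The
 * variable names and search bounds below are illustrative only, not taken
 * from any actual consumer:
 *
 *	caddr_t base = (caddr_t)PAGESIZE;
 *	size_t len = (size_t)(as->a_userlimit - base);
 *
 *	if (as_gap_aligned(as, 64 * 1024, &base, &len, AH_LO, NULL,
 *	    4 * 1024 * 1024, 0, 16 * 1024) == 0) {
 *		the hole [base, base + len) can hold such a mapping
 *	}
 */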
1966
1967 /*
1968 * Find a hole of at least size minlen within [*basep, *basep + *lenp).
1969 *
1970 * If flags specifies AH_HI, the hole will have the highest possible address
1971 * in the range. We use the as->a_lastgap field to figure out where to
1972 * start looking for a gap.
1973 *
1974 * Otherwise, the gap will have the lowest possible address.
1975 *
1976 * If flags specifies AH_CONTAIN, the hole will contain the address addr.
1977 *
1978 * If an adequate hole is found, base and len are set to reflect the part of
1979 * the hole that is within range, and 0 is returned, otherwise,
1980 * -1 is returned.
1981 *
1982 * NOTE: This routine is not correct when base+len overflows caddr_t.
1983 */
1984 int
1985 as_gap(struct as *as, size_t minlen, caddr_t *basep, size_t *lenp, uint_t flags,
1986 caddr_t addr)
1987 {
1988
1989 return (as_gap_aligned(as, minlen, basep, lenp, flags, addr, 0, 0, 0));
1990 }
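
/*
 * For callers with no alignment, offset or redzone requirement, a
 * hypothetical use of as_gap() (names illustrative) looks like:
 *
 *	caddr_t base = (caddr_t)PAGESIZE;
 *	size_t len = (size_t)(as->a_userlimit - base);
 *
 *	if (as_gap(as, size, &base, &len, AH_HI, NULL) == 0) {
 *		the highest hole of at least "size" bytes lies in
 *		[base, base + len)
 *	}
 */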
1991
1992 /*
1993 * Return the next range within [base, base + len) that is backed
1994 * with "real memory". Skip holes and non-seg_vn segments.
1995 * We're lazy and only return one segment at a time.
1996 */
1997 int
1998 as_memory(struct as *as, caddr_t *basep, size_t *lenp)
1999 {
2000 extern struct seg_ops segspt_shmops; /* needs a header file */
2001 struct seg *seg;
2002 caddr_t addr, eaddr;
2003 caddr_t segend;
2004
2005 AS_LOCK_ENTER(as, RW_READER);
2006
2007 addr = *basep;
2008 eaddr = addr + *lenp;
2009
2010 seg = as_findseg(as, addr, 0);
2011 if (seg != NULL)
2012 addr = MAX(seg->s_base, addr);
2013
2014 for (;;) {
2015 if (seg == NULL || addr >= eaddr || eaddr <= seg->s_base) {
2016 AS_LOCK_EXIT(as);
2017 return (EINVAL);
2018 }
2019
2020 if (seg->s_ops == &segvn_ops) {
2021 segend = seg->s_base + seg->s_size;
2022 break;
2023 }
2024
2025 /*
2026 * We do ISM by looking into the private data
2027 * to determine the real size of the segment.
2028 */
2029 if (seg->s_ops == &segspt_shmops) {
2030 segend = seg->s_base + spt_realsize(seg);
2031 if (addr < segend)
2032 break;
2033 }
2034
2035 seg = AS_SEGNEXT(as, seg);
2036
2037 if (seg != NULL)
2038 addr = seg->s_base;
2039 }
2040
2041 *basep = addr;
2042
2043 if (segend > eaddr)
2044 *lenp = eaddr - addr;
2045 else
2046 *lenp = segend - addr;
2047
2048 AS_LOCK_EXIT(as);
2049 return (0);
2050 }
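
/*
 * A sketch of walking every "real memory" range in [start, end) one
 * segment at a time with as_memory(); the variable names are illustrative
 * and error handling beyond the loop test is omitted:
 *
 *	caddr_t base;
 *	size_t len;
 *
 *	for (base = start, len = end - start;
 *	    len != 0 && as_memory(as, &base, &len) == 0;
 *	    base += len, len = end - base) {
 *		process [base, base + len), which is backed by
 *		seg_vn or ISM pages
 *	}
 */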
2051
2052 /*
2053 * Swap the pages associated with the address space as out to
2054 * secondary storage, returning the number of bytes actually
2055 * swapped.
2056 *
2057 * The value returned is intended to correlate well with the process's
2058 * memory requirements. Its usefulness for this purpose depends on
2059 * how well the segment-level routines do at returning accurate
2060 * information.
2061 */
2062 size_t
2063 as_swapout(struct as *as)
2064 {
2065 struct seg *seg;
2066 size_t swpcnt = 0;
2067
2068 /*
2069 * Kernel-only processes have given up their address
2070 * spaces. Of course, we shouldn't be attempting to
2071 * swap out such processes in the first place...
2072 */
2073 if (as == NULL)
2074 return (0);
2075
2076 AS_LOCK_ENTER(as, RW_READER);
2077
2078 /*
2079 * Free all mapping resources associated with the address
2080 * space. The segment-level swapout routines capitalize
2081 * on this unmapping by scavenging pages that have become
2082 * unmapped here.
2083 */
2084 hat_swapout(as->a_hat);
2085
2086 /*
2087 * Call the swapout routines of all segments in the address
2088 * space to do the actual work, accumulating the amount of
2089 * space reclaimed.
2090 */
2091 for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
2092 struct seg_ops *ov = seg->s_ops;
2093
2094 /*
2095 * We have to check to see if the seg has
2096 * an ops vector because the seg may have
2097 * been in the middle of being set up when
2098 * the process was picked for swapout.
2099 */
2100 if ((ov != NULL) && (ov->swapout != NULL))
2101 swpcnt += SEGOP_SWAPOUT(seg);
2102 }
2103 AS_LOCK_EXIT(as);
2104 return (swpcnt);
2105 }
2106
2107 /*
2108 * Determine whether data from the mappings in interval [addr, addr + size)
2109 * are in the primary memory (core) cache.
2110 */
2111 int
2112 as_incore(struct as *as, caddr_t addr,
2113 size_t size, char *vec, size_t *sizep)
2114 {
2115 struct seg *seg;
2116 size_t ssize;
2117 caddr_t raddr; /* rounded down addr */
2118 size_t rsize; /* rounded up size */
2119 size_t isize; /* iteration size */
2120 int error = 0; /* result, assume success */
2121
2122 *sizep = 0;
2123 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2124 rsize = ((((size_t)addr + size) + PAGEOFFSET) & PAGEMASK) -
2125 (size_t)raddr;
2126
2127 if (raddr + rsize < raddr) /* check for wraparound */
2128 return (ENOMEM);
2129
2130 AS_LOCK_ENTER(as, RW_READER);
2131 seg = as_segat(as, raddr);
2132 if (seg == NULL) {
2133 AS_LOCK_EXIT(as);
2134 return (-1);
2135 }
2136
2137 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
2138 if (raddr >= seg->s_base + seg->s_size) {
2139 seg = AS_SEGNEXT(as, seg);
2140 if (seg == NULL || raddr != seg->s_base) {
2141 error = -1;
2142 break;
2143 }
2144 }
2145 if ((raddr + rsize) > (seg->s_base + seg->s_size))
2146 ssize = seg->s_base + seg->s_size - raddr;
2147 else
2148 ssize = rsize;
2149 *sizep += isize = SEGOP_INCORE(seg, raddr, ssize, vec);
2150 if (isize != ssize) {
2151 error = -1;
2152 break;
2153 }
2154 vec += btopr(ssize);
2155 }
2156 AS_LOCK_EXIT(as);
2157 return (error);
2158 }
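
/*
 * A hypothetical mincore(2)-style caller (buffer sizing and names are
 * illustrative) might use as_incore() as follows:
 *
 *	size_t validated;
 *	char *vec = kmem_alloc(btopr(size), KM_SLEEP);
 *
 *	if (as_incore(as, addr, size, vec, &validated) == 0) {
 *		vec[0 .. btopr(size) - 1] now holds one entry per page,
 *		non-zero when the page is resident
 *	}
 *	kmem_free(vec, btopr(size));
 */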
2159
2160 static void
2161 as_segunlock(struct seg *seg, caddr_t addr, int attr,
2162 ulong_t *bitmap, size_t position, size_t npages)
2163 {
2164 caddr_t range_start;
2165 size_t pos1 = position;
2166 size_t pos2;
2167 size_t size;
2168 size_t end_pos = npages + position;
2169
2170 while (bt_range(bitmap, &pos1, &pos2, end_pos)) {
2171 size = ptob((pos2 - pos1));
2172 range_start = (caddr_t)((uintptr_t)addr +
2173 ptob(pos1 - position));
2174
2175 (void) SEGOP_LOCKOP(seg, range_start, size, attr, MC_UNLOCK,
2176 (ulong_t *)NULL, (size_t)NULL);
2177 pos1 = pos2;
2178 }
2179 }
2180
2181 static void
2182 as_unlockerr(struct as *as, int attr, ulong_t *mlock_map,
2183 caddr_t raddr, size_t rsize)
2184 {
2185 struct seg *seg = as_segat(as, raddr);
2186 size_t ssize;
2187
2188 while (rsize != 0) {
2189 if (raddr >= seg->s_base + seg->s_size)
2190 seg = AS_SEGNEXT(as, seg);
2191
2192 if ((raddr + rsize) > (seg->s_base + seg->s_size))
2193 ssize = seg->s_base + seg->s_size - raddr;
2194 else
2195 ssize = rsize;
2196
2197 as_segunlock(seg, raddr, attr, mlock_map, 0, btopr(ssize));
2198
2199 rsize -= ssize;
2200 raddr += ssize;
2201 }
2202 }
2203
2204 /*
2205 * Cache control operations over the interval [addr, addr + size) in
2206 * address space "as".
2207 */
2208 /*ARGSUSED*/
2209 int
2210 as_ctl(struct as *as, caddr_t addr, size_t size, int func, int attr,
2211 uintptr_t arg, ulong_t *lock_map, size_t pos)
2212 {
2213 struct seg *seg; /* working segment */
2214 caddr_t raddr; /* rounded down addr */
2215 caddr_t initraddr; /* saved initial rounded down addr */
2216 size_t rsize; /* rounded up size */
2217 size_t initrsize; /* saved initial rounded up size */
2218 size_t ssize; /* size of seg */
2219 int error = 0; /* result */
2220 size_t mlock_size; /* size of bitmap */
2221 ulong_t *mlock_map; /* pointer to bitmap used */
2222 /* to represent the locked */
2223 /* pages. */
2224 retry:
2225 if (error == IE_RETRY)
2226 AS_LOCK_ENTER(as, RW_WRITER);
2227 else
2228 AS_LOCK_ENTER(as, RW_READER);
2229
2230 /*
2231 * If these are address space lock/unlock operations, loop over
2232 * all segments in the address space, as appropriate.
2233 */
2234 if (func == MC_LOCKAS) {
2235 size_t npages, idx;
2236 size_t rlen = 0; /* rounded as length */
2237
2238 idx = pos;
2239
2240 if (arg & MCL_FUTURE) {
2241 mutex_enter(&as->a_contents);
2242 AS_SETPGLCK(as);
2243 mutex_exit(&as->a_contents);
2244 }
2245 if ((arg & MCL_CURRENT) == 0) {
2246 AS_LOCK_EXIT(as);
2247 return (0);
2248 }
2249
2250 seg = AS_SEGFIRST(as);
2251 if (seg == NULL) {
2252 AS_LOCK_EXIT(as);
2253 return (0);
2254 }
2255
2256 do {
2257 raddr = (caddr_t)((uintptr_t)seg->s_base &
2258 (uintptr_t)PAGEMASK);
2259 rlen += (((uintptr_t)(seg->s_base + seg->s_size) +
2260 PAGEOFFSET) & PAGEMASK) - (uintptr_t)raddr;
2261 } while ((seg = AS_SEGNEXT(as, seg)) != NULL);
2262
2263 mlock_size = BT_BITOUL(btopr(rlen));
2264 if ((mlock_map = (ulong_t *)kmem_zalloc(mlock_size *
2265 sizeof (ulong_t), KM_NOSLEEP)) == NULL) {
2266 AS_LOCK_EXIT(as);
2267 return (EAGAIN);
2268 }
2269
2270 for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) {
2271 error = SEGOP_LOCKOP(seg, seg->s_base,
2272 seg->s_size, attr, MC_LOCK, mlock_map, pos);
2273 if (error != 0)
2274 break;
2275 pos += seg_pages(seg);
2276 }
2277
2278 if (error) {
2279 for (seg = AS_SEGFIRST(as); seg != NULL;
2280 seg = AS_SEGNEXT(as, seg)) {
2281
2282 raddr = (caddr_t)((uintptr_t)seg->s_base &
2283 (uintptr_t)PAGEMASK);
2284 npages = seg_pages(seg);
2285 as_segunlock(seg, raddr, attr, mlock_map,
2286 idx, npages);
2287 idx += npages;
2288 }
2289 }
2290
2291 kmem_free(mlock_map, mlock_size * sizeof (ulong_t));
2292 AS_LOCK_EXIT(as);
2293 goto lockerr;
2294 } else if (func == MC_UNLOCKAS) {
2295 mutex_enter(&as->a_contents);
2296 AS_CLRPGLCK(as);
2297 mutex_exit(&as->a_contents);
2298
2299 for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) {
2300 error = SEGOP_LOCKOP(seg, seg->s_base,
2301 seg->s_size, attr, MC_UNLOCK, NULL, 0);
2302 if (error != 0)
2303 break;
2304 }
2305
2306 AS_LOCK_EXIT(as);
2307 goto lockerr;
2308 }
2309
2310 /*
2311 * Normalize addresses and sizes.
2312 */
2313 initraddr = raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2314 initrsize = rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
2315 (size_t)raddr;
2316
2317 if (raddr + rsize < raddr) { /* check for wraparound */
2318 AS_LOCK_EXIT(as);
2319 return (ENOMEM);
2320 }
2321
2322 /*
2323 * Get initial segment.
2324 */
2325 if ((seg = as_segat(as, raddr)) == NULL) {
2326 AS_LOCK_EXIT(as);
2327 return (ENOMEM);
2328 }
2329
2330 if (func == MC_LOCK) {
2331 mlock_size = BT_BITOUL(btopr(rsize));
2332 if ((mlock_map = (ulong_t *)kmem_zalloc(mlock_size *
2333 sizeof (ulong_t), KM_NOSLEEP)) == NULL) {
2334 AS_LOCK_EXIT(as);
2335 return (EAGAIN);
2336 }
2337 }
2338
2339 /*
2340 * Loop over all segments. If a hole in the address range is
2341 * discovered, then fail. For each segment, perform the appropriate
2342 * control operation.
2343 */
2344 while (rsize != 0) {
2345
2346 /*
2347 * Make sure there's no hole, calculate the portion
2348 * of the next segment to be operated over.
2349 */
2350 if (raddr >= seg->s_base + seg->s_size) {
2351 seg = AS_SEGNEXT(as, seg);
2352 if (seg == NULL || raddr != seg->s_base) {
2353 if (func == MC_LOCK) {
2354 as_unlockerr(as, attr, mlock_map,
2355 initraddr, initrsize - rsize);
2356 kmem_free(mlock_map,
2357 mlock_size * sizeof (ulong_t));
2358 }
2359 AS_LOCK_EXIT(as);
2360 return (ENOMEM);
2361 }
2362 }
2363 if ((raddr + rsize) > (seg->s_base + seg->s_size))
2364 ssize = seg->s_base + seg->s_size - raddr;
2365 else
2366 ssize = rsize;
2367
2368 /*
2369 * Dispatch on specific function.
2370 */
2371 switch (func) {
2372
2373 /*
2374 * Synchronize cached data from mappings with backing
2375 * objects.
2376 */
2377 case MC_SYNC:
2378 if (error = SEGOP_SYNC(seg, raddr, ssize,
2379 attr, (uint_t)arg)) {
2380 AS_LOCK_EXIT(as);
2381 return (error);
2382 }
2383 break;
2384
2385 /*
2386 * Lock pages in memory.
2387 */
2388 case MC_LOCK:
2389 if (error = SEGOP_LOCKOP(seg, raddr, ssize,
2390 attr, func, mlock_map, pos)) {
2391 as_unlockerr(as, attr, mlock_map, initraddr,
2392 initrsize - rsize + ssize);
2393 kmem_free(mlock_map, mlock_size *
2394 sizeof (ulong_t));
2395 AS_LOCK_EXIT(as);
2396 goto lockerr;
2397 }
2398 break;
2399
2400 /*
2401 * Unlock mapped pages.
2402 */
2403 case MC_UNLOCK:
2404 (void) SEGOP_LOCKOP(seg, raddr, ssize, attr, func,
2405 (ulong_t *)NULL, (size_t)NULL);
2406 break;
2407
2408 /*
2409 * Store VM advise for mapped pages in segment layer.
2410 */
2411 case MC_ADVISE:
2412 error = SEGOP_ADVISE(seg, raddr, ssize, (uint_t)arg);
2413
2414 /*
2415 * Check for regular errors and special retry error
2416 */
2417 if (error) {
2418 if (error == IE_RETRY) {
2419 /*
2420 * Need to acquire writers lock, so
2421 * have to drop readers lock and start
2422 * all over again
2423 */
2424 AS_LOCK_EXIT(as);
2425 goto retry;
2426 } else if (error == IE_REATTACH) {
2427 /*
2428 * Find segment for current address
2429 * because current segment just got
2430 * split or concatenated
2431 */
2432 seg = as_segat(as, raddr);
2433 if (seg == NULL) {
2434 AS_LOCK_EXIT(as);
2435 return (ENOMEM);
2436 }
2437 } else {
2438 /*
2439 * Regular error
2440 */
2441 AS_LOCK_EXIT(as);
2442 return (error);
2443 }
2444 }
2445 break;
2446
2447 case MC_INHERIT_ZERO:
2448 if (seg->s_ops->inherit == NULL) {
2449 error = ENOTSUP;
2450 } else {
2451 error = SEGOP_INHERIT(seg, raddr, ssize,
2452 SEGP_INH_ZERO);
2453 }
2454 if (error != 0) {
2455 AS_LOCK_EXIT(as);
2456 return (error);
2457 }
2458 break;
2459
2460 /*
2461 * Can't happen.
2462 */
2463 default:
2464 panic("as_ctl: bad operation %d", func);
2465 /*NOTREACHED*/
2466 }
2467
2468 rsize -= ssize;
2469 raddr += ssize;
2470 }
2471
2472 if (func == MC_LOCK)
2473 kmem_free(mlock_map, mlock_size * sizeof (ulong_t));
2474 AS_LOCK_EXIT(as);
2475 return (0);
2476 lockerr:
2477
2478 /*
2479 * If the lower levels returned EDEADLK for a segment lockop,
2480 * it means that we should retry the operation. Let's wait
2481 * a bit also to let the deadlock causing condition clear.
2482 * This is part of a gross hack to work around a design flaw
2483 * in the ufs/sds logging code and should go away when the
2484 * logging code is re-designed to fix the problem. See bug
2485 * 4125102 for details of the problem.
2486 */
2487 if (error == EDEADLK) {
2488 delay(deadlk_wait);
2489 error = 0;
2490 goto retry;
2491 }
2492 return (error);
2493 }
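
/*
 * A minimal sketch of locking and later unlocking a page-aligned range via
 * as_ctl().  memcntl(2) is the usual caller; the attr value of 0 and the
 * NULL/0 map arguments below are only illustrative:
 *
 *	int error;
 *
 *	error = as_ctl(as, addr, size, MC_LOCK, 0, 0, NULL, 0);
 *	if (error == 0) {
 *		use the locked range, then release it
 *		(void) as_ctl(as, addr, size, MC_UNLOCK, 0, 0, NULL, 0);
 *	}
 */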
2494
2495 int
2496 fc_decode(faultcode_t fault_err)
2497 {
2498 int error = 0;
2499
2500 switch (FC_CODE(fault_err)) {
2501 case FC_OBJERR:
2502 error = FC_ERRNO(fault_err);
2503 break;
2504 case FC_PROT:
2505 error = EACCES;
2506 break;
2507 default:
2508 error = EFAULT;
2509 break;
2510 }
2511 return (error);
2512 }
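
/*
 * fc_decode() is typically used to turn the faultcode_t returned by
 * as_fault() into an errno for the caller; a sketch (names illustrative):
 *
 *	faultcode_t fc;
 *
 *	fc = as_fault(as->a_hat, as, addr, size, F_SOFTLOCK, S_WRITE);
 *	if (fc != 0)
 *		return (fc_decode(fc));
 */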
2513
2514 /*
2515 * Pagelock pages from a range that spans more than 1 segment. Obtain shadow
2516 * lists from each segment and copy them to one contiguous shadow list (plist)
2517 * as expected by the caller. Save pointers to per segment shadow lists at
2518 * the tail of plist so that they can be used during as_pageunlock().
2519 */
2520 static int
2521 as_pagelock_segs(struct as *as, struct seg *seg, struct page ***ppp,
2522 caddr_t addr, size_t size, enum seg_rw rw)
2523 {
2524 caddr_t sv_addr = addr;
2525 size_t sv_size = size;
2526 struct seg *sv_seg = seg;
2527 ulong_t segcnt = 1;
2528 ulong_t cnt;
2529 size_t ssize;
2530 pgcnt_t npages = btop(size);
2531 page_t **plist;
2532 page_t **pl;
2533 int error;
2534 caddr_t eaddr;
2535 faultcode_t fault_err = 0;
2536 pgcnt_t pl_off;
2537 extern struct seg_ops segspt_shmops;
2538
2539 ASSERT(AS_LOCK_HELD(as));
2540 ASSERT(seg != NULL);
2541 ASSERT(addr >= seg->s_base && addr < seg->s_base + seg->s_size);
2542 ASSERT(addr + size > seg->s_base + seg->s_size);
2543 ASSERT(IS_P2ALIGNED(size, PAGESIZE));
2544 ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
2545
2546 /*
2547 * Count the number of segments covered by the range we are about to
2548 * lock. The segment count is used to size the shadow list we return
2549 * back to the caller.
2550 */
2551 for (; size != 0; size -= ssize, addr += ssize) {
2552 if (addr >= seg->s_base + seg->s_size) {
2553
2554 seg = AS_SEGNEXT(as, seg);
2555 if (seg == NULL || addr != seg->s_base) {
2556 AS_LOCK_EXIT(as);
2557 return (EFAULT);
2558 }
2559 /*
2560 * Do a quick check if subsequent segments
2561 * will most likely support pagelock.
2562 */
2563 if (seg->s_ops == &segvn_ops) {
2564 vnode_t *vp;
2565
2566 if (SEGOP_GETVP(seg, addr, &vp) != 0 ||
2567 vp != NULL) {
2568 AS_LOCK_EXIT(as);
2569 goto slow;
2570 }
2571 } else if (seg->s_ops != &segspt_shmops) {
2572 AS_LOCK_EXIT(as);
2573 goto slow;
2574 }
2575 segcnt++;
2576 }
2577 if (addr + size > seg->s_base + seg->s_size) {
2578 ssize = seg->s_base + seg->s_size - addr;
2579 } else {
2580 ssize = size;
2581 }
2582 }
2583 ASSERT(segcnt > 1);
2584
2585 plist = kmem_zalloc((npages + segcnt) * sizeof (page_t *), KM_SLEEP);
2586
2587 addr = sv_addr;
2588 size = sv_size;
2589 seg = sv_seg;
2590
2591 for (cnt = 0, pl_off = 0; size != 0; size -= ssize, addr += ssize) {
2592 if (addr >= seg->s_base + seg->s_size) {
2593 seg = AS_SEGNEXT(as, seg);
2594 ASSERT(seg != NULL && addr == seg->s_base);
2595 cnt++;
2596 ASSERT(cnt < segcnt);
2597 }
2598 if (addr + size > seg->s_base + seg->s_size) {
2599 ssize = seg->s_base + seg->s_size - addr;
2600 } else {
2601 ssize = size;
2602 }
2603 pl = &plist[npages + cnt];
2604 error = SEGOP_PAGELOCK(seg, addr, ssize, (page_t ***)pl,
2605 L_PAGELOCK, rw);
2606 if (error) {
2607 break;
2608 }
2609 ASSERT(plist[npages + cnt] != NULL);
2610 ASSERT(pl_off + btop(ssize) <= npages);
2611 bcopy(plist[npages + cnt], &plist[pl_off],
2612 btop(ssize) * sizeof (page_t *));
2613 pl_off += btop(ssize);
2614 }
2615
2616 if (size == 0) {
2617 AS_LOCK_EXIT(as);
2618 ASSERT(cnt == segcnt - 1);
2619 *ppp = plist;
2620 return (0);
2621 }
2622
2623 /*
2624 * One of the pagelock calls failed.  The error type is in the error
2625 * variable.  Unlock what we've locked so far and retry with F_SOFTLOCK if
2626 * the error type is either EFAULT or ENOTSUP.  Otherwise just return the error
2627 * back to the caller.
2628 */
2629
2630 eaddr = addr;
2631 seg = sv_seg;
2632
2633 for (cnt = 0, addr = sv_addr; addr < eaddr; addr += ssize) {
2634 if (addr >= seg->s_base + seg->s_size) {
2635 seg = AS_SEGNEXT(as, seg);
2636 ASSERT(seg != NULL && addr == seg->s_base);
2637 cnt++;
2638 ASSERT(cnt < segcnt);
2639 }
2640 if (eaddr > seg->s_base + seg->s_size) {
2641 ssize = seg->s_base + seg->s_size - addr;
2642 } else {
2643 ssize = eaddr - addr;
2644 }
2645 pl = &plist[npages + cnt];
2646 ASSERT(*pl != NULL);
2647 (void) SEGOP_PAGELOCK(seg, addr, ssize, (page_t ***)pl,
2648 L_PAGEUNLOCK, rw);
2649 }
2650
2651 AS_LOCK_EXIT(as);
2652
2653 kmem_free(plist, (npages + segcnt) * sizeof (page_t *));
2654
2655 if (error != ENOTSUP && error != EFAULT) {
2656 return (error);
2657 }
2658
2659 slow:
2660 /*
2661 * If we are here because pagelock failed due to the need to cow-fault
2662 * in the pages we want to lock, F_SOFTLOCK will do that job, and the
2663 * next as_pagelock() call for this address range will then
2664 * hopefully succeed.
2665 */
2666 fault_err = as_fault(as->a_hat, as, sv_addr, sv_size, F_SOFTLOCK, rw);
2667 if (fault_err != 0) {
2668 return (fc_decode(fault_err));
2669 }
2670 *ppp = NULL;
2671
2672 return (0);
2673 }
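
/*
 * Layout of the shadow list built by as_pagelock_segs() for a range that
 * covers "segcnt" segments and "npages" pages in total (a descriptive
 * picture of the code above, not a new interface):
 *
 *	plist[0 .. npages - 1]			flattened page_t pointers for
 *						the whole range, in address
 *						order, handed to the caller
 *	plist[npages .. npages + segcnt - 1]	the per-segment shadow lists
 *						returned by SEGOP_PAGELOCK(),
 *						saved so as_pageunlock_segs()
 *						can hand each one back for
 *						L_PAGEUNLOCK
 */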
2674
2675 /*
2676 * lock pages in a given address space. Return shadow list. If
2677 * the list is NULL, the MMU mapping is also locked.
2678 */
2679 int
2680 as_pagelock(struct as *as, struct page ***ppp, caddr_t addr,
2681 size_t size, enum seg_rw rw)
2682 {
2683 size_t rsize;
2684 caddr_t raddr;
2685 faultcode_t fault_err;
2686 struct seg *seg;
2687 int err;
2688
2689 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_AS_LOCK_START,
2690 "as_pagelock_start: addr %p size %ld", addr, size);
2691
2692 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2693 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
2694 (size_t)raddr;
2695
2696 /*
2697 * if the request crosses a segment boundary, let
2698 * as_pagelock_segs() handle it.
2699 */
2700 AS_LOCK_ENTER(as, RW_READER);
2701
2702 seg = as_segat(as, raddr);
2703 if (seg == NULL) {
2704 AS_LOCK_EXIT(as);
2705 return (EFAULT);
2706 }
2707 ASSERT(raddr >= seg->s_base && raddr < seg->s_base + seg->s_size);
2708 if (raddr + rsize > seg->s_base + seg->s_size) {
2709 return (as_pagelock_segs(as, seg, ppp, raddr, rsize, rw));
2710 }
2711 if (raddr + rsize <= raddr) {
2712 AS_LOCK_EXIT(as);
2713 return (EFAULT);
2714 }
2715
2716 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEG_LOCK_START,
2717 "seg_lock_1_start: raddr %p rsize %ld", raddr, rsize);
2718
2719 /*
2720 * try to lock pages and pass back shadow list
2721 */
2722 err = SEGOP_PAGELOCK(seg, raddr, rsize, ppp, L_PAGELOCK, rw);
2723
2724 TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_SEG_LOCK_END, "seg_lock_1_end");
2725
2726 AS_LOCK_EXIT(as);
2727
2728 if (err == 0 || (err != ENOTSUP && err != EFAULT)) {
2729 return (err);
2730 }
2731
2732 /*
2733 * Use F_SOFTLOCK to lock the pages because pagelock failed either because
2734 * this segment has no pagelock support or because the pages need to be
2735 * cow-faulted in.  If a fault is needed, F_SOFTLOCK does that work for
2736 * this as_pagelock() call, and the next as_pagelock() call for the
2737 * same address range will then hopefully succeed.
2738 */
2739 fault_err = as_fault(as->a_hat, as, addr, size, F_SOFTLOCK, rw);
2740 if (fault_err != 0) {
2741 return (fc_decode(fault_err));
2742 }
2743 *ppp = NULL;
2744
2745 TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_AS_LOCK_END, "as_pagelock_end");
2746 return (0);
2747 }
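
/*
 * A hypothetical physio-style caller (names illustrative) pairs
 * as_pagelock() with as_pageunlock() around the transfer:
 *
 *	page_t **pplist;
 *	int error;
 *
 *	error = as_pagelock(as, &pplist, addr, size, S_WRITE);
 *	if (error != 0)
 *		return (error);
 *	do the transfer; pplist may be NULL if the pages were
 *	soft-locked via as_fault() instead of a shadow list
 *	as_pageunlock(as, pplist, addr, size, S_WRITE);
 */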
2748
2749 /*
2750 * unlock pages locked by as_pagelock_segs(). Retrieve per segment shadow
2751 * lists from the end of plist and call pageunlock interface for each segment.
2752 * Drop as lock and free plist.
2753 */
2754 static void
2755 as_pageunlock_segs(struct as *as, struct seg *seg, caddr_t addr, size_t size,
2756 struct page **plist, enum seg_rw rw)
2757 {
2758 ulong_t cnt;
2759 caddr_t eaddr = addr + size;
2760 pgcnt_t npages = btop(size);
2761 size_t ssize;
2762 page_t **pl;
2763
2764 ASSERT(AS_LOCK_HELD(as));
2765 ASSERT(seg != NULL);
2766 ASSERT(addr >= seg->s_base && addr < seg->s_base + seg->s_size);
2767 ASSERT(addr + size > seg->s_base + seg->s_size);
2768 ASSERT(IS_P2ALIGNED(size, PAGESIZE));
2769 ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
2770 ASSERT(plist != NULL);
2771
2772 for (cnt = 0; addr < eaddr; addr += ssize) {
2773 if (addr >= seg->s_base + seg->s_size) {
2774 seg = AS_SEGNEXT(as, seg);
2775 ASSERT(seg != NULL && addr == seg->s_base);
2776 cnt++;
2777 }
2778 if (eaddr > seg->s_base + seg->s_size) {
2779 ssize = seg->s_base + seg->s_size - addr;
2780 } else {
2781 ssize = eaddr - addr;
2782 }
2783 pl = &plist[npages + cnt];
2784 ASSERT(*pl != NULL);
2785 (void) SEGOP_PAGELOCK(seg, addr, ssize, (page_t ***)pl,
2786 L_PAGEUNLOCK, rw);
2787 }
2788 ASSERT(cnt > 0);
2789 AS_LOCK_EXIT(as);
2790
2791 cnt++;
2792 kmem_free(plist, (npages + cnt) * sizeof (page_t *));
2793 }
2794
2795 /*
2796 * unlock pages in a given address range
2797 */
2798 void
2799 as_pageunlock(struct as *as, struct page **pp, caddr_t addr, size_t size,
2800 enum seg_rw rw)
2801 {
2802 struct seg *seg;
2803 size_t rsize;
2804 caddr_t raddr;
2805
2806 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_AS_UNLOCK_START,
2807 "as_pageunlock_start: addr %p size %ld", addr, size);
2808
2809 /*
2810 * if the shadow list is NULL, as_pagelock() fell
2811 * back to as_fault(); undo that with F_SOFTUNLOCK
2812 */
2813 if (pp == NULL) {
2814 (void) as_fault(as->a_hat, as, addr, size, F_SOFTUNLOCK, rw);
2815 return;
2816 }
2817
2818 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2819 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
2820 (size_t)raddr;
2821
2822 AS_LOCK_ENTER(as, RW_READER);
2823 seg = as_segat(as, raddr);
2824 ASSERT(seg != NULL);
2825
2826 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEG_UNLOCK_START,
2827 "seg_unlock_start: raddr %p rsize %ld", raddr, rsize);
2828
2829 ASSERT(raddr >= seg->s_base && raddr < seg->s_base + seg->s_size);
2830 if (raddr + rsize <= seg->s_base + seg->s_size) {
2831 SEGOP_PAGELOCK(seg, raddr, rsize, &pp, L_PAGEUNLOCK, rw);
2832 } else {
2833 as_pageunlock_segs(as, seg, raddr, rsize, pp, rw);
2834 return;
2835 }
2836 AS_LOCK_EXIT(as);
2837 TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_AS_UNLOCK_END, "as_pageunlock_end");
2838 }
2839
2840 int
2841 as_setpagesize(struct as *as, caddr_t addr, size_t size, uint_t szc,
2842 boolean_t wait)
2843 {
2844 struct seg *seg;
2845 size_t ssize;
2846 caddr_t raddr; /* rounded down addr */
2847 size_t rsize; /* rounded up size */
2848 int error = 0;
2849 size_t pgsz = page_get_pagesize(szc);
2850
2851 setpgsz_top:
2852 if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(size, pgsz)) {
2853 return (EINVAL);
2854 }
2855
2856 raddr = addr;
2857 rsize = size;
2858
2859 if (raddr + rsize < raddr) /* check for wraparound */
2860 return (ENOMEM);
2861
2862 AS_LOCK_ENTER(as, RW_WRITER);
2863 as_clearwatchprot(as, raddr, rsize);
2864 seg = as_segat(as, raddr);
2865 if (seg == NULL) {
2866 as_setwatch(as);
2867 AS_LOCK_EXIT(as);
2868 return (ENOMEM);
2869 }
2870
2871 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
2872 if (raddr >= seg->s_base + seg->s_size) {
2873 seg = AS_SEGNEXT(as, seg);
2874 if (seg == NULL || raddr != seg->s_base) {
2875 error = ENOMEM;
2876 break;
2877 }
2878 }
2879 if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
2880 ssize = seg->s_base + seg->s_size - raddr;
2881 } else {
2882 ssize = rsize;
2883 }
2884
2885 retry:
2886 error = SEGOP_SETPAGESIZE(seg, raddr, ssize, szc);
2887
2888 if (error == IE_NOMEM) {
2889 error = EAGAIN;
2890 break;
2891 }
2892
2893 if (error == IE_RETRY) {
2894 AS_LOCK_EXIT(as);
2895 goto setpgsz_top;
2896 }
2897
2898 if (error == ENOTSUP) {
2899 error = EINVAL;
2900 break;
2901 }
2902
2903 if (wait && (error == EAGAIN)) {
2904 /*
2905 * Memory is currently locked. It must be unlocked
2906 * before this operation can succeed through a retry.
2907 * The possible reasons for locked memory and
2908 * corresponding strategies for unlocking are:
2909 * (1) Normal I/O
2910 * wait for a signal that the I/O operation
2911 * has completed and the memory is unlocked.
2912 * (2) Asynchronous I/O
2913 * The aio subsystem does not unlock pages when
2914 * the I/O is completed. Those pages are unlocked
2915 * when the application calls aiowait/aioerror.
2916 * So, to prevent blocking forever, cv_broadcast()
2917 * is done to wake up aio_cleanup_thread.
2918 * Subsequently, segvn_reclaim will be called, and
2919 * that will do AS_CLRUNMAPWAIT() and wake us up.
2920 * (3) Long term page locking:
2921 * This is not relevant for as_setpagesize()
2922 * because we cannot change the page size for
2923 * driver memory. The attempt to do so will
2924 * fail with a different error than EAGAIN so
2925 * there's no need to trigger as callbacks like
2926 * as_unmap, as_setprot or as_free would do.
2927 */
2928 mutex_enter(&as->a_contents);
2929 if (!AS_ISNOUNMAPWAIT(as)) {
2930 if (AS_ISUNMAPWAIT(as) == 0) {
2931 cv_broadcast(&as->a_cv);
2932 }
2933 AS_SETUNMAPWAIT(as);
2934 AS_LOCK_EXIT(as);
2935 while (AS_ISUNMAPWAIT(as)) {
2936 cv_wait(&as->a_cv, &as->a_contents);
2937 }
2938 } else {
2939 /*
2940 * We may have raced with
2941 * segvn_reclaim()/segspt_reclaim(). In this
2942 * case, clear the nounmapwait flag and retry, since
2943 * softlockcnt in this segment may be already
2944 * 0. We don't drop as writer lock so our
2945 * number of retries without sleeping should
2946 * be very small. See segvn_reclaim() for
2947 * more comments.
2948 */
2949 AS_CLRNOUNMAPWAIT(as);
2950 mutex_exit(&as->a_contents);
2951 goto retry;
2952 }
2953 mutex_exit(&as->a_contents);
2954 goto setpgsz_top;
2955 } else if (error != 0) {
2956 break;
2957 }
2958 }
2959 as_setwatch(as);
2960 AS_LOCK_EXIT(as);
2961 return (error);
2962 }
2963
2964 /*
2965 * as_iset3_default_lpsize() just calls SEGOP_SETPAGESIZE() on all segments
2966 * in its chunk where s_szc is less than the szc we want to set.
2967 */
2968 static int
2969 as_iset3_default_lpsize(struct as *as, caddr_t raddr, size_t rsize, uint_t szc,
2970 int *retry)
2971 {
2972 struct seg *seg;
2973 size_t ssize;
2974 int error;
2975
2976 ASSERT(AS_WRITE_HELD(as));
2977
2978 seg = as_segat(as, raddr);
2979 if (seg == NULL) {
2980 panic("as_iset3_default_lpsize: no seg");
2981 }
2982
2983 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
2984 if (raddr >= seg->s_base + seg->s_size) {
2985 seg = AS_SEGNEXT(as, seg);
2986 if (seg == NULL || raddr != seg->s_base) {
2987 panic("as_iset3_default_lpsize: as changed");
2988 }
2989 }
2990 if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
2991 ssize = seg->s_base + seg->s_size - raddr;
2992 } else {
2993 ssize = rsize;
2994 }
2995
2996 if (szc > seg->s_szc) {
2997 error = SEGOP_SETPAGESIZE(seg, raddr, ssize, szc);
2998 /* Only retry on EINVAL segments that have no vnode. */
2999 if (error == EINVAL) {
3000 vnode_t *vp = NULL;
3001 if ((SEGOP_GETTYPE(seg, raddr) & MAP_SHARED) &&
3002 (SEGOP_GETVP(seg, raddr, &vp) != 0 ||
3003 vp == NULL)) {
3004 *retry = 1;
3005 } else {
3006 *retry = 0;
3007 }
3008 }
3009 if (error) {
3010 return (error);
3011 }
3012 }
3013 }
3014 return (0);
3015 }
3016
3017 /*
3018 * as_iset2_default_lpsize() calls as_iset3_default_lpsize() to set the
3019 * pagesize on each segment in its range, but if any fails with EINVAL,
3020 * then it reduces the pagesizes to the next size in the bitmap and
3021 * retries as_iset3_default_lpsize(). The reason why the code retries
3022 * smaller allowed sizes on EINVAL is because (a) the anon offset may not
3023 * match the bigger sizes, and (b) it's hard to get this offset (to begin
3024 * with) to pass to map_pgszcvec().
3025 */
3026 static int
3027 as_iset2_default_lpsize(struct as *as, caddr_t addr, size_t size, uint_t szc,
3028 uint_t szcvec)
3029 {
3030 int error;
3031 int retry;
3032
3033 ASSERT(AS_WRITE_HELD(as));
3034
3035 for (;;) {
3036 error = as_iset3_default_lpsize(as, addr, size, szc, &retry);
3037 if (error == EINVAL && retry) {
3038 szcvec &= ~(1 << szc);
3039 if (szcvec <= 1) {
3040 return (EINVAL);
3041 }
3042 szc = highbit(szcvec) - 1;
3043 } else {
3044 return (error);
3045 }
3046 }
3047 }
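
/*
 * A worked example of the retry logic above, with a hypothetical size-code
 * bitmap: suppose szcvec is 0x0b (size codes 0, 1 and 3 usable) and szc is
 * 3.  If as_iset3_default_lpsize() fails with EINVAL and retry is set, the
 * loop clears bit 3 (szcvec becomes 0x03) and retries with
 * szc = highbit(0x03) - 1 = 1.  If that attempt fails the same way,
 * clearing bit 1 leaves 0x01, which is <= 1, so EINVAL is returned to the
 * caller.
 */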
3048
3049 /*
3050 * as_iset1_default_lpsize() breaks its chunk into areas where existing
3051 * segments have a smaller szc than we want to set. For each such area,
3052 * it calls as_iset2_default_lpsize().
3053 */
3054 static int
3055 as_iset1_default_lpsize(struct as *as, caddr_t raddr, size_t rsize, uint_t szc,
3056 uint_t szcvec)
3057 {
3058 struct seg *seg;
3059 size_t ssize;
3060 caddr_t setaddr = raddr;
3061 size_t setsize = 0;
3062 int set;
3063 int error;
3064
3065 ASSERT(AS_WRITE_HELD(as));
3066
3067 seg = as_segat(as, raddr);
3068 if (seg == NULL) {
3069 panic("as_iset1_default_lpsize: no seg");
3070 }
3071 if (seg->s_szc < szc) {
3072 set = 1;
3073 } else {
3074 set = 0;
3075 }
3076
3077 for (; rsize != 0; rsize -= ssize, raddr += ssize, setsize += ssize) {
3078 if (raddr >= seg->s_base + seg->s_size) {
3079 seg = AS_SEGNEXT(as, seg);
3080 if (seg == NULL || raddr != seg->s_base) {
3081 panic("as_iset1_default_lpsize: as changed");
3082 }
3083 if (seg->s_szc >= szc && set) {
3084 ASSERT(setsize != 0);
3085 error = as_iset2_default_lpsize(as,
3086 setaddr, setsize, szc, szcvec);
3087 if (error) {
3088 return (error);
3089 }
3090 set = 0;
3091 } else if (seg->s_szc < szc && !set) {
3092 setaddr = raddr;
3093 setsize = 0;
3094 set = 1;
3095 }
3096 }
3097 if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
3098 ssize = seg->s_base + seg->s_size - raddr;
3099 } else {
3100 ssize = rsize;
3101 }
3102 }
3103 error = 0;
3104 if (set) {
3105 ASSERT(setsize != 0);
3106 error = as_iset2_default_lpsize(as, setaddr, setsize,
3107 szc, szcvec);
3108 }
3109 return (error);
3110 }
3111
3112 /*
3113 * as_iset_default_lpsize() breaks its chunk according to the size code bitmap
3114 * returned by map_pgszcvec() (similar to as_map_segvn_segs()), and passes each
3115 * chunk to as_iset1_default_lpsize().
3116 */
3117 static int
3118 as_iset_default_lpsize(struct as *as, caddr_t addr, size_t size, int flags,
3119 int type)
3120 {
3121 int rtype = (type & MAP_SHARED) ? MAPPGSZC_SHM : MAPPGSZC_PRIVM;
3122 uint_t szcvec = map_pgszcvec(addr, size, (uintptr_t)addr,
3123 flags, rtype, 1);
3124 uint_t szc;
3125 uint_t nszc;
3126 int error;
3127 caddr_t a;
3128 caddr_t eaddr;
3129 size_t segsize;
3130 size_t pgsz;
3131 uint_t save_szcvec;
3132
3133 ASSERT(AS_WRITE_HELD(as));
3134 ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
3135 ASSERT(IS_P2ALIGNED(size, PAGESIZE));
3136
3137 szcvec &= ~1;
3138 if (szcvec <= 1) { /* skip if base page size */
3139 return (0);
3140 }
3141
3142 /* Get the pagesize of the first larger page size. */
3143 szc = lowbit(szcvec) - 1;
3144 pgsz = page_get_pagesize(szc);
3145 eaddr = addr + size;
3146 addr = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
3147 eaddr = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
3148
3149 save_szcvec = szcvec;
3150 szcvec >>= (szc + 1);
3151 nszc = szc;
3152 while (szcvec) {
3153 if ((szcvec & 0x1) == 0) {
3154 nszc++;
3155 szcvec >>= 1;
3156 continue;
3157 }
3158 nszc++;
3159 pgsz = page_get_pagesize(nszc);
3160 a = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
3161 if (a != addr) {
3162 ASSERT(szc > 0);
3163 ASSERT(a < eaddr);
3164 segsize = a - addr;
3165 error = as_iset1_default_lpsize(as, addr, segsize, szc,
3166 save_szcvec);
3167 if (error) {
3168 return (error);
3169 }
3170 addr = a;
3171 }
3172 szc = nszc;
3173 szcvec >>= 1;
3174 }
3175
3176 ASSERT(addr < eaddr);
3177 szcvec = save_szcvec;
3178 while (szcvec) {
3179 a = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
3180 ASSERT(a >= addr);
3181 if (a != addr) {
3182 ASSERT(szc > 0);
3183 segsize = a - addr;
3184 error = as_iset1_default_lpsize(as, addr, segsize, szc,
3185 save_szcvec);
3186 if (error) {
3187 return (error);
3188 }
3189 addr = a;
3190 }
3191 szcvec &= ~(1 << szc);
3192 if (szcvec) {
3193 szc = highbit(szcvec) - 1;
3194 pgsz = page_get_pagesize(szc);
3195 }
3196 }
3197 ASSERT(addr == eaddr);
3198
3199 return (0);
3200 }
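
/*
 * A worked example of the chunking above, for a hypothetical platform whose
 * usable large page sizes are 64K (szc 1) and 4M (szc 3), i.e. szcvec ends
 * up as 0x0a once the base page size is dropped:
 *
 *	- addr is first rounded up, and eaddr down, to a 64K boundary;
 *	- the leading piece from addr up to the first 4M boundary is passed
 *	  to as_iset1_default_lpsize() with szc 1 (64K);
 *	- the 4M-aligned middle is passed with szc 3 (4M);
 *	- the trailing remainder down to eaddr is passed with szc 1 again.
 *
 * The sizes here are only an illustration; the real bitmap comes from
 * map_pgszcvec() and is platform dependent.
 */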
3201
3202 /*
3203 * Set the default large page size for the range. Called via memcntl with
3204 * page size set to 0. as_set_default_lpsize breaks the range down into
3205 * chunks with the same type/flags, ignores non-segvn segments, and passes
3206 * each chunk to as_iset_default_lpsize().
3207 */
3208 int
3209 as_set_default_lpsize(struct as *as, caddr_t addr, size_t size)
3210 {
3211 struct seg *seg;
3212 caddr_t raddr;
3213 size_t rsize;
3214 size_t ssize;
3215 int rtype, rflags;
3216 int stype, sflags;
3217 int error;
3218 caddr_t setaddr;
3219 size_t setsize;
3220 int segvn;
3221
3222 if (size == 0)
3223 return (0);
3224
3225 AS_LOCK_ENTER(as, RW_WRITER);
3226 again:
3227 error = 0;
3228
3229 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
3230 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
3231 (size_t)raddr;
3232
3233 if (raddr + rsize < raddr) { /* check for wraparound */
3234 AS_LOCK_EXIT(as);
3235 return (ENOMEM);
3236 }
3237 as_clearwatchprot(as, raddr, rsize);
3238 seg = as_segat(as, raddr);
3239 if (seg == NULL) {
3240 as_setwatch(as);
3241 AS_LOCK_EXIT(as);
3242 return (ENOMEM);
3243 }
3244 if (seg->s_ops == &segvn_ops) {
3245 rtype = SEGOP_GETTYPE(seg, addr);
3246 rflags = rtype & (MAP_TEXT | MAP_INITDATA);
3247 rtype = rtype & (MAP_SHARED | MAP_PRIVATE);
3248 segvn = 1;
3249 } else {
3250 segvn = 0;
3251 }
3252 setaddr = raddr;
3253 setsize = 0;
3254
3255 for (; rsize != 0; rsize -= ssize, raddr += ssize, setsize += ssize) {
3256 if (raddr >= (seg->s_base + seg->s_size)) {
3257 seg = AS_SEGNEXT(as, seg);
3258 if (seg == NULL || raddr != seg->s_base) {
3259 error = ENOMEM;
3260 break;
3261 }
3262 if (seg->s_ops == &segvn_ops) {
3263 stype = SEGOP_GETTYPE(seg, raddr);
3264 sflags = stype & (MAP_TEXT | MAP_INITDATA);
3265 stype &= (MAP_SHARED | MAP_PRIVATE);
3266 if (segvn && (rflags != sflags ||
3267 rtype != stype)) {
3268 /*
3269 * The next segment is also segvn but
3270 * has different flags and/or type.
3271 */
3272 ASSERT(setsize != 0);
3273 error = as_iset_default_lpsize(as,
3274 setaddr, setsize, rflags, rtype);
3275 if (error) {
3276 break;
3277 }
3278 rflags = sflags;
3279 rtype = stype;
3280 setaddr = raddr;
3281 setsize = 0;
3282 } else if (!segvn) {
3283 rflags = sflags;
3284 rtype = stype;
3285 setaddr = raddr;
3286 setsize = 0;
3287 segvn = 1;
3288 }
3289 } else if (segvn) {
3290 /* The next segment is not segvn. */
3291 ASSERT(setsize != 0);
3292 error = as_iset_default_lpsize(as,
3293 setaddr, setsize, rflags, rtype);
3294 if (error) {
3295 break;
3296 }
3297 segvn = 0;
3298 }
3299 }
3300 if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
3301 ssize = seg->s_base + seg->s_size - raddr;
3302 } else {
3303 ssize = rsize;
3304 }
3305 }
3306 if (error == 0 && segvn) {
3307 /* The last chunk when rsize == 0. */
3308 ASSERT(setsize != 0);
3309 error = as_iset_default_lpsize(as, setaddr, setsize,
3310 rflags, rtype);
3311 }
3312
3313 if (error == IE_RETRY) {
3314 goto again;
3315 } else if (error == IE_NOMEM) {
3316 error = EAGAIN;
3317 } else if (error == ENOTSUP) {
3318 error = EINVAL;
3319 } else if (error == EAGAIN) {
3320 mutex_enter(&as->a_contents);
3321 if (!AS_ISNOUNMAPWAIT(as)) {
3322 if (AS_ISUNMAPWAIT(as) == 0) {
3323 cv_broadcast(&as->a_cv);
3324 }
3325 AS_SETUNMAPWAIT(as);
3326 AS_LOCK_EXIT(as);
3327 while (AS_ISUNMAPWAIT(as)) {
3328 cv_wait(&as->a_cv, &as->a_contents);
3329 }
3330 mutex_exit(&as->a_contents);
3331 AS_LOCK_ENTER(as, RW_WRITER);
3332 } else {
3333 /*
3334 * We may have raced with
3335 * segvn_reclaim()/segspt_reclaim(). In this case
3336 * clear the nounmapwait flag and retry, since softlockcnt
3337 * in this segment may be already 0. We don't drop as
3338 * writer lock so our number of retries without
3339 * sleeping should be very small. See segvn_reclaim()
3340 * for more comments.
3341 */
3342 AS_CLRNOUNMAPWAIT(as);
3343 mutex_exit(&as->a_contents);
3344 }
3345 goto again;
3346 }
3347
3348 as_setwatch(as);
3349 AS_LOCK_EXIT(as);
3350 return (error);
3351 }
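
/*
 * A minimal sketch of a memcntl(2)-style call path (names illustrative);
 * this is roughly how a requested page size of 0 becomes "pick a default
 * large page size for the range":
 *
 *	int error;
 *
 *	error = as_set_default_lpsize(curproc->p_as, addr, len);
 *	if (error != 0)
 *		return (error);
 */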
3352
3353 /*
3354 * Set up all of the uninitialized watched pages that we can.
3355 */
3356 void
3357 as_setwatch(struct as *as)
3358 {
3359 struct watched_page *pwp;
3360 struct seg *seg;
3361 caddr_t vaddr;
3362 uint_t prot;
3363 int err, retrycnt;
3364
3365 if (avl_numnodes(&as->a_wpage) == 0)
3366 return;
3367
3368 ASSERT(AS_WRITE_HELD(as));
3369
3370 for (pwp = avl_first(&as->a_wpage); pwp != NULL;
3371 pwp = AVL_NEXT(&as->a_wpage, pwp)) {
3372 retrycnt = 0;
3373 retry:
3374 vaddr = pwp->wp_vaddr;
3375 if (pwp->wp_oprot != 0 || /* already set up */
3376 (seg = as_segat(as, vaddr)) == NULL ||
3377 SEGOP_GETPROT(seg, vaddr, 0, &prot) != 0)
3378 continue;
3379
3380 pwp->wp_oprot = prot;
3381 if (pwp->wp_read)
3382 prot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3383 if (pwp->wp_write)
3384 prot &= ~PROT_WRITE;
3385 if (pwp->wp_exec)
3386 prot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3387 if (!(pwp->wp_flags & WP_NOWATCH) && prot != pwp->wp_oprot) {
3388 err = SEGOP_SETPROT(seg, vaddr, PAGESIZE, prot);
3389 if (err == IE_RETRY) {
3390 pwp->wp_oprot = 0;
3391 ASSERT(retrycnt == 0);
3392 retrycnt++;
3393 goto retry;
3394 }
3395 }
3396 pwp->wp_prot = prot;
3397 }
3398 }
3399
3400 /*
3401 * Clear all of the watched pages in the address space.
3402 */
3403 void
3404 as_clearwatch(struct as *as)
3405 {
3406 struct watched_page *pwp;
3407 struct seg *seg;
3408 caddr_t vaddr;
3409 uint_t prot;
3410 int err, retrycnt;
3411
3412 if (avl_numnodes(&as->a_wpage) == 0)
3413 return;
3414
3415 ASSERT(AS_WRITE_HELD(as));
3416
3417 for (pwp = avl_first(&as->a_wpage); pwp != NULL;
3418 pwp = AVL_NEXT(&as->a_wpage, pwp)) {
3419 retrycnt = 0;
3420 retry:
3421 vaddr = pwp->wp_vaddr;
3422 if (pwp->wp_oprot == 0 || /* not set up */
3423 (seg = as_segat(as, vaddr)) == NULL)
3424 continue;
3425
3426 if ((prot = pwp->wp_oprot) != pwp->wp_prot) {
3427 err = SEGOP_SETPROT(seg, vaddr, PAGESIZE, prot);
3428 if (err == IE_RETRY) {
3429 ASSERT(retrycnt == 0);
3430 retrycnt++;
3431 goto retry;
3432 }
3433 }
3434 pwp->wp_oprot = 0;
3435 pwp->wp_prot = 0;
3436 }
3437 }
3438
3439 /*
3440 * Force a new setup for all the watched pages in the range.
3441 */
3442 static void
3443 as_setwatchprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
3444 {
3445 struct watched_page *pwp;
3446 struct watched_page tpw;
3447 caddr_t eaddr = addr + size;
3448 caddr_t vaddr;
3449 struct seg *seg;
3450 int err, retrycnt;
3451 uint_t wprot;
3452 avl_index_t where;
3453
3454 if (avl_numnodes(&as->a_wpage) == 0)
3455 return;
3456
3457 ASSERT(AS_WRITE_HELD(as));
3458
3459 tpw.wp_vaddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
3460 if ((pwp = avl_find(&as->a_wpage, &tpw, &where)) == NULL)
3461 pwp = avl_nearest(&as->a_wpage, where, AVL_AFTER);
3462
3463 while (pwp != NULL && pwp->wp_vaddr < eaddr) {
3464 retrycnt = 0;
3465 vaddr = pwp->wp_vaddr;
3466
3467 wprot = prot;
3468 if (pwp->wp_read)
3469 wprot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3470 if (pwp->wp_write)
3471 wprot &= ~PROT_WRITE;
3472 if (pwp->wp_exec)
3473 wprot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3474 if (!(pwp->wp_flags & WP_NOWATCH) && wprot != pwp->wp_oprot) {
3475 retry:
3476 seg = as_segat(as, vaddr);
3477 if (seg == NULL) {
3478 panic("as_setwatchprot: no seg");
3479 /*NOTREACHED*/
3480 }
3481 err = SEGOP_SETPROT(seg, vaddr, PAGESIZE, wprot);
3482 if (err == IE_RETRY) {
3483 ASSERT(retrycnt == 0);
3484 retrycnt++;
3485 goto retry;
3486 }
3487 }
3488 pwp->wp_oprot = prot;
3489 pwp->wp_prot = wprot;
3490
3491 pwp = AVL_NEXT(&as->a_wpage, pwp);
3492 }
3493 }
3494
3495 /*
3496 * Clear all of the watched pages in the range.
3497 */
3498 static void
3499 as_clearwatchprot(struct as *as, caddr_t addr, size_t size)
3500 {
3501 caddr_t eaddr = addr + size;
3502 struct watched_page *pwp;
3503 struct watched_page tpw;
3504 uint_t prot;
3505 struct seg *seg;
3506 int err, retrycnt;
3507 avl_index_t where;
3508
3509 if (avl_numnodes(&as->a_wpage) == 0)
3510 return;
3511
3512 tpw.wp_vaddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
3513 if ((pwp = avl_find(&as->a_wpage, &tpw, &where)) == NULL)
3514 pwp = avl_nearest(&as->a_wpage, where, AVL_AFTER);
3515
3516 ASSERT(AS_WRITE_HELD(as));
3517
3518 while (pwp != NULL && pwp->wp_vaddr < eaddr) {
3519
3520 if ((prot = pwp->wp_oprot) != 0) {
3521 retrycnt = 0;
3522
3523 if (prot != pwp->wp_prot) {
3524 retry:
3525 seg = as_segat(as, pwp->wp_vaddr);
3526 if (seg == NULL)
3527 continue;
3528 err = SEGOP_SETPROT(seg, pwp->wp_vaddr,
3529 PAGESIZE, prot);
3530 if (err == IE_RETRY) {
3531 ASSERT(retrycnt == 0);
3532 retrycnt++;
3533 goto retry;
3534
3535 }
3536 }
3537 pwp->wp_oprot = 0;
3538 pwp->wp_prot = 0;
3539 }
3540
3541 pwp = AVL_NEXT(&as->a_wpage, pwp);
3542 }
3543 }
3544
3545 void
3546 as_signal_proc(struct as *as, k_siginfo_t *siginfo)
3547 {
3548 struct proc *p;
3549
3550 mutex_enter(&pidlock);
3551 for (p = practive; p; p = p->p_next) {
3552 if (p->p_as == as) {
3553 mutex_enter(&p->p_lock);
3554 if (p->p_as == as)
3555 sigaddq(p, NULL, siginfo, KM_NOSLEEP);
3556 mutex_exit(&p->p_lock);
3557 }
3558 }
3559 mutex_exit(&pidlock);
3560 }
3561
3562 /*
3563 * return memory object ID
3564 */
3565 int
3566 as_getmemid(struct as *as, caddr_t addr, memid_t *memidp)
3567 {
3568 struct seg *seg;
3569 int sts;
3570
3571 AS_LOCK_ENTER(as, RW_READER);
3572 seg = as_segat(as, addr);
3573 if (seg == NULL) {
3574 AS_LOCK_EXIT(as);
3575 return (EFAULT);
3576 }
3577 /*
3578 * catch old drivers which may not support getmemid
3579 */
3580 if (seg->s_ops->getmemid == NULL) {
3581 AS_LOCK_EXIT(as);
3582 return (ENODEV);
3583 }
3584
3585 sts = SEGOP_GETMEMID(seg, addr, memidp);
3586
3587 AS_LOCK_EXIT(as);
3588 return (sts);
3589 }
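
/*
 * A brief, hypothetical use of as_getmemid() (names illustrative):
 *
 *	memid_t memid;
 *	int error;
 *
 *	error = as_getmemid(curproc->p_as, addr, &memid);
 *	if (error == 0) {
 *		memid now identifies the memory object backing addr
 *	}
 */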
3590