xref: /illumos-gate/usr/src/uts/common/syscall/memcntl.c (revision 9164eb65b5c2638abc35517e4302cf4c142c3855)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
28 /*	  All Rights Reserved  	*/
29 
30 
31 #pragma ident	"%Z%%M%	%I%	%E% SMI"
32 
33 #include <sys/types.h>
34 #include <sys/bitmap.h>
35 #include <sys/sysmacros.h>
36 #include <sys/kmem.h>
37 #include <sys/param.h>
38 #include <sys/systm.h>
39 #include <sys/user.h>
40 #include <sys/unistd.h>
41 #include <sys/errno.h>
42 #include <sys/proc.h>
43 #include <sys/mman.h>
44 #include <sys/tuneable.h>
45 #include <sys/cmn_err.h>
46 #include <sys/cred.h>
47 #include <sys/vmsystm.h>
48 #include <sys/debug.h>
49 #include <sys/policy.h>
50 
51 #include <vm/as.h>
52 #include <vm/seg.h>
53 
54 static uint_t mem_getpgszc(size_t);
55 
56 /*
57  * Memory control operations
    *
    * Entry point for the memory control commands handled here: MC_SYNC,
    * MC_LOCKAS, MC_LOCK, MC_UNLOCKAS, MC_UNLOCK, MC_HAT_ADVISE and
    * MC_ADVISE.  Any other cmd fails with EINVAL.
    *
    *	addr, len	Range to operate on.  Both must be zero for
    *			MC_LOCKAS/MC_UNLOCKAS and for the stack/heap forms
    *			of MC_HAT_ADVISE.  For every command other than
    *			MC_HAT_ADVISE and the two above, addr must be page
    *			aligned, len must be non-zero and the range must
    *			pass valid_usr_range().
    *	cmd		Operation selector (see the switch below).
    *	arg		Per-command argument: flag bits for MC_SYNC and
    *			MC_LOCKAS, an advice value for MC_ADVISE, or the
    *			user address of a memcntl_mha structure for
    *			MC_HAT_ADVISE.
    *	attr		Attribute filter.  Must be zero for MC_HAT_ADVISE;
    *			otherwise it must be a subset of VALID_ATTR that
    *			does not name both SHARED and PRIVATE.
    *	mask		Must be zero.
    *
    * Returns 0 on success.  On failure the error is recorded with
    * set_errno() and, depending on the path taken, either set_errno()'s
    * return value or the error code itself is returned.
58  */
59 int
60 memcntl(caddr_t addr, size_t len, int cmd, caddr_t arg, int attr, int mask)
61 {
62 	struct as *as = ttoproc(curthread)->p_as;
63 	struct proc *p = ttoproc(curthread);
64 	size_t pgsz;
65 	uint_t szc, oszc, pgcmd;
66 	int error = 0;
67 	faultcode_t fc;
68 	uintptr_t iarg;
69 	STRUCT_DECL(memcntl_mha, mha);
70
	/* A non-zero mask is never accepted. */
71 	if (mask)
72 		return (set_errno(EINVAL));
	/*
	 * The whole-address-space lock commands take no range.  Every
	 * other command except MC_HAT_ADVISE needs a page aligned,
	 * non-empty range; MC_HAT_ADVISE checks its own range further
	 * down, once the page size is known.
	 */
73 	if ((cmd == MC_LOCKAS) || (cmd == MC_UNLOCKAS)) {
74 		if ((addr != 0) || (len != 0)) {
75 			return (set_errno(EINVAL));
76 		}
77 	} else if (cmd != MC_HAT_ADVISE) {
78 		if (((uintptr_t)addr & PAGEOFFSET) != 0 || len == 0) {
79 			return (set_errno(EINVAL));
80 		}
81 		/*
82 		 * We're only concerned with the address range
83 		 * here, not the protections.  The protections
84 		 * are only used as a "filter" in this code,
85 		 * they aren't set or modified here.
86 		 */
87 		if (valid_usr_range(addr, len, 0, as,
88 		    as->a_userlimit) != RANGE_OKAY) {
89 			return (set_errno(ENOMEM));
90 		}
91 	}
92
	/*
	 * Attribute checks.
	 * NOTE(review): a non-zero mask has already been rejected above,
	 * so the mask test in the MC_HAT_ADVISE branch cannot be true.
	 */
93 	if (cmd == MC_HAT_ADVISE) {
94 		if (attr != 0 || mask != 0) {
95 			return (set_errno(EINVAL));
96 		}
97
98 	} else {
99 		if ((VALID_ATTR & attr) != attr) {
100 			return (set_errno(EINVAL));
101 		}
102 		if ((attr & SHARED) && (attr & PRIVATE)) {
103 			return (set_errno(EINVAL));
104 		}
		/*
		 * Lock and unlock requests must additionally pass
		 * secpolicy_lock_memory(); its error is returned as is.
		 */
105 		if (((cmd == MC_LOCKAS) || (cmd == MC_LOCK) ||
106 		    (cmd == MC_UNLOCKAS) || (cmd == MC_UNLOCK)) &&
107 		    (error = secpolicy_lock_memory(CRED())) != 0)
108 			return (set_errno(error));
109 	}
	/*
	 * Any non-zero attribute filter gets PROT_USER added before it is
	 * handed to as_ctl().
	 */
110 	if (attr) {
111 		attr |= PROT_USER;
112 	}
113
114 	switch (cmd) {
115 	case MC_SYNC:
116 		/*
117 		 * MS_SYNC used to be defined to be zero but is now non-zero.
118 		 * For binary compatibility we still accept zero
119 		 * (the absence of MS_ASYNC) to mean the same thing.
120 		 */
121 		iarg = (uintptr_t)arg;
122 		if ((iarg & ~MS_INVALIDATE) == 0)
123 			iarg |= MS_SYNC;
124
		/*
		 * Reject unknown flag bits, and reject MS_SYNC and MS_ASYNC
		 * being requested together.
		 */
125 		if (((iarg & ~(MS_SYNC|MS_ASYNC|MS_INVALIDATE)) != 0) ||
126 			((iarg & (MS_SYNC|MS_ASYNC)) == (MS_SYNC|MS_ASYNC))) {
127 			error = set_errno(EINVAL);
128 		} else {
129 			error = as_ctl(as, addr, len, cmd, attr, iarg, NULL, 0);
130 			if (error) {
131 				(void) set_errno(error);
132 			}
133 		}
134 		return (error);
135 	case MC_LOCKAS:
		/* arg must be a non-empty subset of MCL_FUTURE|MCL_CURRENT. */
136 		if ((uintptr_t)arg & ~(MCL_FUTURE|MCL_CURRENT) ||
137 		    (uintptr_t)arg == 0) {
138 			return (set_errno(EINVAL));
139 		}
140 		break;
141 	case MC_LOCK:
142 	case MC_UNLOCKAS:
143 	case MC_UNLOCK:
		/*
		 * Nothing more to check; handled by the common as_ctl()
		 * call after the switch.
		 */
144 		break;
145 	case MC_HAT_ADVISE:
146 		/*
147 		 * Set preferred page size.
148 		 */
149 		STRUCT_INIT(mha, get_udatamodel());
150 		if (copyin(arg, STRUCT_BUF(mha), STRUCT_SIZE(mha))) {
151 			return (set_errno(EFAULT));
152 		}
153
154 		pgcmd = STRUCT_FGET(mha, mha_cmd);
155
156 		/*
157 		 * Currently only MHA_MAPSIZE_VA, MHA_MAPSIZE_STACK
158 		 * and MHA_MAPSIZE_BSSBRK are supported. Only one
159 		 * command may be specified at a time.
		 * mha_flags must be zero.
160 		 */
161 		if ((~(MHA_MAPSIZE_VA|MHA_MAPSIZE_STACK|MHA_MAPSIZE_BSSBRK) &
162 		    pgcmd) || pgcmd == 0 || !ISP2(pgcmd) ||
163 		    STRUCT_FGET(mha, mha_flags))
164 			return (set_errno(EINVAL));
165
166 		pgsz = STRUCT_FGET(mha, mha_pagesize);
167
168 		/*
169 		 * call platform specific map_pgsz() routine to get the
170 		 * optimal pgsz if pgsz is 0.
171 		 *
172 		 * For stack and heap operations addr and len must be zero.
173 		 */
174 		if ((pgcmd & (MHA_MAPSIZE_BSSBRK|MHA_MAPSIZE_STACK)) != 0) {
175 			if (addr != NULL || len != 0) {
176 				return (set_errno(EINVAL));
177 			}
178
179 			/*
180 			 * Disable autompss for this process unless pgsz == 0,
181 			 * which means the system should pick.  In the
182 			 * pgsz == 0 case, leave the SAUTOLPG setting alone, as
183 			 * we don't want to enable it when someone has
184 			 * disabled automatic large page selection for the
185 			 * whole system.
			 *
			 * p_flag is only modified with p_lock held.
186 			 */
187 			mutex_enter(&p->p_lock);
188 			if (pgsz != 0) {
189 				p->p_flag &= ~SAUTOLPG;
190 			}
191 			mutex_exit(&p->p_lock);
192
			/*
			 * The range lock is held from here on for the
			 * stack/heap path only; every return below on that
			 * path drops it first.
			 */
193 			as_rangelock(as);
194
195 			if (pgsz == 0) {
196 				int	type;
197
198 				if (pgcmd == MHA_MAPSIZE_BSSBRK)
199 					type = MAPPGSZ_HEAP;
200 				else
201 					type = MAPPGSZ_STK;
202
203 				pgsz = map_pgsz(type, p, 0, 0, NULL);
204 			}
205 		} else {
206 			/*
207 			 * Note that we don't disable automatic large page
208 			 * selection for anon segments based on use of
209 			 * memcntl().
210 			 */
211 			if (pgsz == 0) {
212 				pgsz = map_pgsz(MAPPGSZ_VA, p, addr, len,
213 				    NULL);
214 			}
215
216 			/*
217 			 * addr and len must be preferred page size aligned
218 			 * and valid for range specified.
219 			 */
220 			if (!IS_P2ALIGNED(addr, pgsz) ||
221 			    !IS_P2ALIGNED(len, pgsz)) {
222 				return (set_errno(EINVAL));
223 			}
224 			if (valid_usr_range(addr, len, 0, as,
225 			    as->a_userlimit) != RANGE_OKAY) {
226 				return (set_errno(ENOMEM));
227 			}
228 		}
229
		/*
		 * Translate the page size into a size code.  An unsupported
		 * size comes back as (uint_t)-1.  The range lock was only
		 * taken on the stack/heap path, so only drop it there.
		 */
230 		szc = mem_getpgszc(pgsz);
231 		if (szc == (uint_t)-1) {
232 			if ((pgcmd & (MHA_MAPSIZE_BSSBRK|MHA_MAPSIZE_STACK))
233 			    != 0) {
234 				as_rangeunlock(as);
235 			}
236 			return (set_errno(EINVAL));
237 		}
238
239 		/*
240 		 * For stack and heap operations we first need to pad
241 		 * out existing range (create new mappings) to the new
242 		 * preferred page size boundary. Also the start of the
243 		 * .bss for the heap or user's stack base may not be on
244 		 * the new preferred page size boundary. For these cases
245 		 * we align the base of the request on the new preferred
246 		 * page size.
247 		 */
248 		if (pgcmd & MHA_MAPSIZE_BSSBRK) {
			/* Nothing to do if the heap already uses this size. */
249 			if (szc == p->p_brkpageszc) {
250 				as_rangeunlock(as);
251 				return (0);
252 			}
			/* Only a larger size code goes through brk_internal(). */
253 			if (szc > p->p_brkpageszc) {
254 				error = brk_internal(p->p_brkbase
255 				    + p->p_brksize, szc);
256 				if (error) {
257 					as_rangeunlock(as);
258 					return (set_errno(error));
259 				}
260 			}
			/*
			 * Remember the old size code so it can be restored
			 * if as_setpagesize() fails below.
			 */
261 			oszc = p->p_brkpageszc;
262 			p->p_brkpageszc = szc;
263
264 			ASSERT(IS_P2ALIGNED(p->p_brkbase + p->p_brksize, pgsz));
265 			addr = (caddr_t)P2ROUNDUP((uintptr_t)p->p_bssbase,
266 			    pgsz);
267 			len = (p->p_brkbase + p->p_brksize) - addr;
268 			ASSERT(IS_P2ALIGNED(len, pgsz));
269 			/*
270 			 * Perhaps no existing pages to promote.
271 			 */
272 			if (len == 0) {
273 				as_rangeunlock(as);
274 				return (0);
275 			}
276 		}
277 		/*
278 		 * The code below, as does grow.c, assumes stacks always grow
279 		 * downward.
280 		 */
281 		if (pgcmd & MHA_MAPSIZE_STACK) {
282 			/*
283 			 * Some boxes (x86) have a top of stack that
284 			 * is not large page aligned. Since stacks are
285 			 * usually small we'll just return and do nothing
286 			 * for these cases. Preferred page size is advisory
287 			 * so no need to return an error.
288 			 */
289 			if (szc == p->p_stkpageszc ||
290 			    !IS_P2ALIGNED(p->p_usrstack, pgsz)) {
291 				as_rangeunlock(as);
292 				return (0);
293 			}
294
			/* Only a larger size code goes through grow_internal(). */
295 			if (szc > p->p_stkpageszc) {
296 				error = grow_internal(p->p_usrstack
297 				    - p->p_stksize, szc);
298 				if (error) {
299 					as_rangeunlock(as);
300 					return (set_errno(error));
301 				}
302 			}
			/*
			 * Remember the old size code so it can be restored
			 * if as_setpagesize() fails below.
			 */
303 			oszc = p->p_stkpageszc;
304 			p->p_stkpageszc = szc;
305
306 			ASSERT(IS_P2ALIGNED(p->p_usrstack, pgsz));
307 			addr = p->p_usrstack - p->p_stksize;
308 			len = p->p_stksize;
309
310 			/*
311 			 * Perhaps nothing to promote, we wrapped around
312 			 * or grow did not grow the stack to a large
313 			 * page boundary.
314 			 */
315 			if (!IS_P2ALIGNED(len, pgsz) || len == 0 ||
316 			    addr >= p->p_usrstack || (addr + len) < addr) {
317 				as_rangeunlock(as);
318 				return (0);
319 			}
320 		}
321 		ASSERT(IS_P2ALIGNED(addr, pgsz));
322 		ASSERT(IS_P2ALIGNED(len, pgsz));
323 		error = as_setpagesize(as, addr, len, szc, B_TRUE);
324
325 		/*
326 		 * On stack or heap failures restore original
327 		 * pg size code.
		 * oszc is only assigned on the stack/heap paths, which
		 * are the only paths that read it here.
328 		 */
329 		if (error) {
330 			if ((pgcmd & MHA_MAPSIZE_BSSBRK) != 0) {
331 				p->p_brkpageszc = oszc;
332 			}
333 			if ((pgcmd & MHA_MAPSIZE_STACK) != 0) {
334 				p->p_stkpageszc = oszc;
335 			}
336 			(void) set_errno(error);
337 		}
338 		if ((pgcmd & (MHA_MAPSIZE_BSSBRK|MHA_MAPSIZE_STACK)) != 0) {
339 			as_rangeunlock(as);
340 		}
341 		return (error);
342 	case MC_ADVISE:
343 		switch ((uintptr_t)arg) {
344 		case MADV_WILLNEED:
			/*
			 * Map a fault code from as_faulta() onto an errno.
			 * On success, fall out of both switches to the
			 * common as_ctl() call at the bottom.
			 */
345 			fc = as_faulta(as, addr, len);
346 			if (fc) {
347 				if (FC_CODE(fc) == FC_OBJERR)
348 					error = set_errno(FC_ERRNO(fc));
349 				else if (FC_CODE(fc) == FC_NOMAP)
350 					error = set_errno(ENOMEM);
351 				else
352 					error = set_errno(EINVAL);
353 				return (error);
354 			}
355 			break;
356
357 		case MADV_DONTNEED:
358 			/*
359 			 * For now, don't need is turned into an as_ctl(MC_SYNC)
360 			 * operation flagged for async invalidate.
361 			 */
362 			error = as_ctl(as, addr, len, MC_SYNC, attr,
363 			    MS_ASYNC | MS_INVALIDATE, NULL, 0);
364 			if (error)
365 				(void) set_errno(error);
366 			return (error);
367
368 		default:
			/* All other advice values go to as_ctl() unchanged. */
369 			error = as_ctl(as, addr, len, cmd, attr,
370 			    (uintptr_t)arg, NULL, 0);
371 			if (error)
372 				(void) set_errno(error);
373 			return (error);
374 		}
375 		break;
376 	default:
377 		return (set_errno(EINVAL));
378 	}
379
	/*
	 * Common tail for the commands that break out of the switch:
	 * MC_LOCKAS, MC_LOCK, MC_UNLOCKAS, MC_UNLOCK and MC_ADVISE with
	 * MADV_WILLNEED.
	 */
380 	error = as_ctl(as, addr, len, cmd, attr, (uintptr_t)arg, NULL, 0);
381
382 	if (error)
383 		(void) set_errno(error);
384 	return (error);
385 }
386 
387 /*
388  * Return page size code for page size passed in. If
389  * matching page size not found or supported, return -1.
     *
     * The lookup itself is delegated to page_szc_user_filtered(); its
     * result is cast to uint_t, which is why the caller in memcntl()
     * tests for failure by comparing against (uint_t)-1.
390  */
391 static uint_t
392 mem_getpgszc(size_t pgsz) {
393 	return ((uint_t)page_szc_user_filtered(pgsz));
394 }
395