xref: /illumos-gate/usr/src/uts/common/syscall/memcntl.c (revision 35a5a3587fd94b666239c157d3722745250ccbd7)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
27 /*	  All Rights Reserved  	*/
28 
29 
30 #pragma ident	"%Z%%M%	%I%	%E% SMI"
31 
32 #include <sys/types.h>
33 #include <sys/bitmap.h>
34 #include <sys/sysmacros.h>
35 #include <sys/kmem.h>
36 #include <sys/param.h>
37 #include <sys/systm.h>
38 #include <sys/user.h>
39 #include <sys/unistd.h>
40 #include <sys/errno.h>
41 #include <sys/proc.h>
42 #include <sys/mman.h>
43 #include <sys/tuneable.h>
44 #include <sys/cmn_err.h>
45 #include <sys/cred.h>
46 #include <sys/vmsystm.h>
47 #include <sys/debug.h>
48 #include <sys/policy.h>
49 
50 #include <vm/as.h>
51 #include <vm/seg.h>
52 
53 static uint_t mem_getpgszc(size_t);
54 
55 /*
56  * Memory control operations
    *
    * Handler for the memcntl() call.  The request is selected by cmd:
    *
    *	MC_SYNC		arg carries MS_* flags; once validated the range is
    *			handed to as_ctl().
    *	MC_LOCKAS	addr and len must be 0; arg must be a non-empty
    *			combination of MCL_CURRENT and MCL_FUTURE.
    *	MC_UNLOCKAS	addr and len must be 0.
    *	MC_LOCK,
    *	MC_UNLOCK	passed to as_ctl() together with addr and len.
    *	MC_ADVISE	arg is an MADV_* value.
    *	MC_HAT_ADVISE	arg is copied in as a memcntl_mha structure that
    *			describes a preferred page size request for a
    *			virtual address range, the heap or the stack.
    *
    * mask must be zero.  For every command other than MC_LOCKAS, MC_UNLOCKAS
    * and MC_HAT_ADVISE, addr must have no PAGEOFFSET bits set, len must be
    * non-zero and the range must pass valid_usr_range().  For every command
    * except MC_HAT_ADVISE, attr may only contain bits from VALID_ATTR and may
    * not ask for both SHARED and PRIVATE; for MC_HAT_ADVISE attr must be
    * zero.  The four lock/unlock commands also require
    * secpolicy_lock_memory() to succeed.
    *
    * Returns 0 on success.  Failures are recorded with set_errno(); depending
    * on the path taken, the value returned is either what set_errno()
    * returned or the error code itself.
57  */
58 int
59 memcntl(caddr_t addr, size_t len, int cmd, caddr_t arg, int attr, int mask)
60 {
61 	struct as *as = ttoproc(curthread)->p_as;
62 	struct proc *p = ttoproc(curthread);
63 	size_t pgsz;
64 	uint_t szc, oszc, pgcmd;
65 	int error = 0;
66 	faultcode_t fc;
67 	uintptr_t iarg;
68 	STRUCT_DECL(memcntl_mha, mha);
69 
	/* mask must be zero. */
70 	if (mask)
71 		return (set_errno(EINVAL));
	/*
	 * The address-space-wide lock commands take no range.  Every other
	 * command except MC_HAT_ADVISE (which checks its own range further
	 * down) needs an addr with no PAGEOFFSET bits set and a non-zero len.
	 */
72 	if ((cmd == MC_LOCKAS) || (cmd == MC_UNLOCKAS)) {
73 		if ((addr != 0) || (len != 0)) {
74 			return (set_errno(EINVAL));
75 		}
76 	} else if (cmd != MC_HAT_ADVISE) {
77 		if (((uintptr_t)addr & PAGEOFFSET) != 0 || len == 0) {
78 			return (set_errno(EINVAL));
79 		}
80 		/*
81 		 * We're only concerned with the address range
82 		 * here, not the protections.  The protections
83 		 * are only used as a "filter" in this code,
84 		 * they aren't set or modified here.
85 		 */
86 		if (valid_usr_range(addr, len, 0, as,
87 		    as->a_userlimit) != RANGE_OKAY) {
88 			return (set_errno(ENOMEM));
89 		}
90 	}
91 
92 	if (cmd == MC_HAT_ADVISE) {
		/* mask was already rejected above, so only attr matters. */
93 		if (attr != 0 || mask != 0) {
94 			return (set_errno(EINVAL));
95 		}
96 
97 	} else {
		/* attr may hold VALID_ATTR bits only ... */
98 		if ((VALID_ATTR & attr) != attr) {
99 			return (set_errno(EINVAL));
100 		}
		/* ... and SHARED and PRIVATE exclude each other. */
101 		if ((attr & SHARED) && (attr & PRIVATE)) {
102 			return (set_errno(EINVAL));
103 		}
		/*
		 * Locking and unlocking are subject to the
		 * secpolicy_lock_memory() check on the caller's credentials.
		 */
104 		if (((cmd == MC_LOCKAS) || (cmd == MC_LOCK) ||
105 		    (cmd == MC_UNLOCKAS) || (cmd == MC_UNLOCK)) &&
106 		    (error = secpolicy_lock_memory(CRED())) != 0)
107 			return (set_errno(error));
108 	}
	/* A non-zero attr filter always has PROT_USER added to it. */
109 	if (attr) {
110 		attr |= PROT_USER;
111 	}
112 
113 	switch (cmd) {
114 	case MC_SYNC:
115 		/*
116 		 * MS_SYNC used to be defined to be zero but is now non-zero.
117 		 * For binary compatibility we still accept zero
118 		 * (the absence of MS_ASYNC) to mean the same thing.
119 		 */
120 		iarg = (uintptr_t)arg;
121 		if ((iarg & ~MS_INVALIDATE) == 0)
122 			iarg |= MS_SYNC;
123 
		/*
		 * Reject unknown flag bits, and reject a request that asks
		 * for both MS_SYNC and MS_ASYNC.
		 */
124 		if (((iarg & ~(MS_SYNC|MS_ASYNC|MS_INVALIDATE)) != 0) ||
125 			((iarg & (MS_SYNC|MS_ASYNC)) == (MS_SYNC|MS_ASYNC))) {
126 			error = set_errno(EINVAL);
127 		} else {
128 			error = as_ctl(as, addr, len, cmd, attr, iarg, NULL, 0);
129 			if (error) {
130 				(void) set_errno(error);
131 			}
132 		}
133 		return (error);
134 	case MC_LOCKAS:
		/*
		 * arg must name at least one of MCL_FUTURE and MCL_CURRENT
		 * and nothing else.
		 */
135 		if ((uintptr_t)arg & ~(MCL_FUTURE|MCL_CURRENT) ||
136 		    (uintptr_t)arg == 0) {
137 			return (set_errno(EINVAL));
138 		}
139 		break;
140 	case MC_LOCK:
141 	case MC_UNLOCKAS:
142 	case MC_UNLOCK:
		/*
		 * No argument checks here; the as_ctl() call after the
		 * switch does the work.
		 */
143 		break;
144 	case MC_HAT_ADVISE:
145 		/*
146 		 * Set preferred page size.
147 		 */
		/*
		 * NOTE(review): mha is presumably laid out to match the
		 * caller's data model as reported by get_udatamodel() --
		 * confirm against the STRUCT_INIT() definition.
		 */
148 		STRUCT_INIT(mha, get_udatamodel());
149 		if (copyin(arg, STRUCT_BUF(mha), STRUCT_SIZE(mha))) {
150 			return (set_errno(EFAULT));
151 		}
152 
153 		pgcmd = STRUCT_FGET(mha, mha_cmd);
154 
155 		/*
156 		 * Currently only MHA_MAPSIZE_VA, MHA_MAPSIZE_STACK
157 		 * and MHA_MAPSIZE_BSSBRK are supported. Only one
158 		 * command may be specified at a time.
		 * mha_flags must be zero.
159 		 */
160 		if ((~(MHA_MAPSIZE_VA|MHA_MAPSIZE_STACK|MHA_MAPSIZE_BSSBRK) &
161 		    pgcmd) || pgcmd == 0 || !ISP2(pgcmd) ||
162 		    STRUCT_FGET(mha, mha_flags))
163 			return (set_errno(EINVAL));
164 
165 		pgsz = STRUCT_FGET(mha, mha_pagesize);
166 
167 		/*
168 		 * call platform specific map_pgsz() routine to get the
169 		 * optimal pgsz if pgsz is 0.
170 		 *
171 		 * For stack and heap operations addr and len must be zero.
172 		 */
173 		if ((pgcmd & (MHA_MAPSIZE_BSSBRK|MHA_MAPSIZE_STACK)) != 0) {
174 			if (addr != NULL || len != 0) {
175 				return (set_errno(EINVAL));
176 			}
177 
178 			/*
179 			 * Disable autompss for this process unless pgsz == 0,
180 			 * which means the system should pick.  In the
181 			 * pgsz == 0 case, leave the SAUTOLPG setting alone, as
182 			 * we don't want to enable it when someone has
183 			 * disabled automatic large page selection for the
184 			 * whole system.
185 			 */
186 			mutex_enter(&p->p_lock);
187 			if (pgsz != 0) {
188 				p->p_flag &= ~SAUTOLPG;
189 			}
190 			mutex_exit(&p->p_lock);
191 
			/*
			 * Taken on the stack/heap path only.  Every return
			 * below that can be reached on this path calls
			 * as_rangeunlock() first.
			 */
192 			as_rangelock(as);
193 
194 			if (pgsz == 0) {
195 				int	type;
196 
197 				if (pgcmd == MHA_MAPSIZE_BSSBRK)
198 					type = MAPPGSZ_HEAP;
199 				else
200 					type = MAPPGSZ_STK;
201 
202 				pgsz = map_pgsz(type, p, 0, 0, 1);
203 			}
204 		} else {
205 			/*
206 			 * addr and len must be valid for range specified.
207 			 */
208 			if (valid_usr_range(addr, len, 0, as,
209 			    as->a_userlimit) != RANGE_OKAY) {
210 				return (set_errno(ENOMEM));
211 			}
212 			/*
213 			 * Note that we don't disable automatic large page
214 			 * selection for anon segments based on use of
215 			 * memcntl().
216 			 */
217 			if (pgsz == 0) {
218 				error = as_set_default_lpsize(as, addr, len);
219 				if (error) {
220 					(void) set_errno(error);
221 				}
222 				return (error);
223 			}
224 
225 			/*
226 			 * addr and len must be preferred page size aligned
227 			 */
228 			if (!IS_P2ALIGNED(addr, pgsz) ||
229 			    !IS_P2ALIGNED(len, pgsz)) {
230 				return (set_errno(EINVAL));
231 			}
232 		}
233 
		/*
		 * mem_getpgszc() yields (uint_t)-1 for a page size it has no
		 * code for.  The range lock is only held (and so only
		 * dropped) on the stack/heap path.
		 */
234 		szc = mem_getpgszc(pgsz);
235 		if (szc == (uint_t)-1) {
236 			if ((pgcmd & (MHA_MAPSIZE_BSSBRK|MHA_MAPSIZE_STACK))
237 			    != 0) {
238 				as_rangeunlock(as);
239 			}
240 			return (set_errno(EINVAL));
241 		}
242 
243 		/*
244 		 * For stack and heap operations we first need to pad
245 		 * out existing range (create new mappings) to the new
246 		 * preferred page size boundary. Also the start of the
247 		 * .bss for the heap or user's stack base may not be on
248 		 * the new preferred page size boundary. For these cases
249 		 * we align the base of the request on the new preferred
250 		 * page size.
251 		 */
252 		if (pgcmd & MHA_MAPSIZE_BSSBRK) {
			/* Heap already uses this page size code. */
253 			if (szc == p->p_brkpageszc) {
254 				as_rangeunlock(as);
255 				return (0);
256 			}
			/*
			 * brk_internal() is only called when moving to a
			 * larger page size code.
			 */
257 			if (szc > p->p_brkpageszc) {
258 				error = brk_internal(p->p_brkbase
259 				    + p->p_brksize, szc);
260 				if (error) {
261 					as_rangeunlock(as);
262 					return (set_errno(error));
263 				}
264 			}
265 			/*
266 			 * It is possible for brk_internal to silently fail to
267 			 * promote the heap size, so don't panic or ASSERT.
268 			 */
269 			if (!IS_P2ALIGNED(p->p_brkbase + p->p_brksize, pgsz)) {
270 				as_rangeunlock(as);
271 				return (set_errno(ENOMEM));
272 			}
			/*
			 * Remember the old code so it can be put back if
			 * as_setpagesize() fails below.
			 */
273 			oszc = p->p_brkpageszc;
274 			p->p_brkpageszc = szc;
275 
			/*
			 * The range to promote starts at p_bssbase rounded
			 * up to pgsz and runs to the current end of the heap.
			 */
276 			addr = (caddr_t)P2ROUNDUP((uintptr_t)p->p_bssbase,
277 			    pgsz);
278 			len = (p->p_brkbase + p->p_brksize) - addr;
279 			ASSERT(IS_P2ALIGNED(len, pgsz));
280 			/*
281 			 * Perhaps no existing pages to promote.
282 			 */
283 			if (len == 0) {
284 				as_rangeunlock(as);
285 				return (0);
286 			}
287 		}
288 		/*
289 		 * The code below, as does grow.c, assumes stacks always grow
290 		 * downward.
291 		 */
292 		if (pgcmd & MHA_MAPSIZE_STACK) {
			/* Stack already uses this page size code. */
293 			if (szc == p->p_stkpageszc) {
294 				as_rangeunlock(as);
295 				return (0);
296 			}
297 
			/*
			 * grow_internal() is only called when moving to a
			 * larger page size code.
			 */
298 			if (szc > p->p_stkpageszc) {
299 				error = grow_internal(p->p_usrstack -
300 				    p->p_stksize, szc);
301 				if (error) {
302 					as_rangeunlock(as);
303 					return (set_errno(error));
304 				}
305 			}
306 			/*
307 			 * It is possible for grow_internal to silently fail to
308 			 * promote the stack size, so don't panic or ASSERT.
309 			 */
310 			if (!IS_P2ALIGNED(p->p_usrstack - p->p_stksize, pgsz)) {
311 				as_rangeunlock(as);
312 				return (set_errno(ENOMEM));
313 			}
			/*
			 * Remember the old code so it can be put back if
			 * as_setpagesize() fails below.
			 */
314 			oszc = p->p_stkpageszc;
315 			p->p_stkpageszc = szc;
316 
317 			addr = p->p_usrstack - p->p_stksize;
318 			len = P2ALIGN(p->p_stksize, pgsz);
319 
320 			/*
321 			 * Perhaps nothing to promote.
			 * The last test also catches addr + len wrapping
			 * around.
322 			 */
323 			if (len == 0 || addr >= p->p_usrstack ||
324 			    (addr + len) < addr) {
325 				as_rangeunlock(as);
326 				return (0);
327 			}
328 		}
329 		ASSERT(IS_P2ALIGNED(addr, pgsz));
330 		ASSERT(IS_P2ALIGNED(len, pgsz));
331 		error = as_setpagesize(as, addr, len, szc, B_TRUE);
332 
333 		/*
334 		 * On stack or heap failures restore original
335 		 * pg size code.
336 		 */
337 		if (error) {
338 			if ((pgcmd & MHA_MAPSIZE_BSSBRK) != 0) {
339 				p->p_brkpageszc = oszc;
340 			}
341 			if ((pgcmd & MHA_MAPSIZE_STACK) != 0) {
342 				p->p_stkpageszc = oszc;
343 			}
344 			(void) set_errno(error);
345 		}
		/* The range lock was only taken on the stack/heap path. */
346 		if ((pgcmd & (MHA_MAPSIZE_BSSBRK|MHA_MAPSIZE_STACK)) != 0) {
347 			as_rangeunlock(as);
348 		}
349 		return (error);
350 	case MC_ADVISE:
		/*
		 * NOTE(review): for MADV_FREE len is masked with PAGEMASK,
		 * presumably trimming it to whole pages -- confirm against
		 * the PAGEMASK definition.
		 */
351 		if ((uintptr_t)arg == MADV_FREE) {
352 			len &= PAGEMASK;
353 		}
354 		switch ((uintptr_t)arg) {
355 		case MADV_WILLNEED:
			/*
			 * A failed as_faulta() is turned into an errno:
			 * FC_OBJERR carries its own, FC_NOMAP becomes ENOMEM
			 * and anything else becomes EINVAL.  On success
			 * control leaves both switches and reaches the
			 * as_ctl() call at the bottom.
			 */
356 			fc = as_faulta(as, addr, len);
357 			if (fc) {
358 				if (FC_CODE(fc) == FC_OBJERR)
359 					error = set_errno(FC_ERRNO(fc));
360 				else if (FC_CODE(fc) == FC_NOMAP)
361 					error = set_errno(ENOMEM);
362 				else
363 					error = set_errno(EINVAL);
364 				return (error);
365 			}
366 			break;
367 
368 		case MADV_DONTNEED:
369 			/*
370 			 * For now, don't need is turned into an as_ctl(MC_SYNC)
371 			 * operation flagged for async invalidate.
372 			 */
373 			error = as_ctl(as, addr, len, MC_SYNC, attr,
374 			    MS_ASYNC | MS_INVALIDATE, NULL, 0);
375 			if (error)
376 				(void) set_errno(error);
377 			return (error);
378 
379 		default:
			/* Every other advice value goes straight to as_ctl(). */
380 			error = as_ctl(as, addr, len, cmd, attr,
381 			    (uintptr_t)arg, NULL, 0);
382 			if (error)
383 				(void) set_errno(error);
384 			return (error);
385 		}
386 		break;
387 	default:
388 		return (set_errno(EINVAL));
389 	}
390 
	/*
	 * Reached by MC_LOCKAS, MC_LOCK, MC_UNLOCKAS, MC_UNLOCK and by a
	 * successful MC_ADVISE/MADV_WILLNEED; every other case has already
	 * returned.
	 */
391 	error = as_ctl(as, addr, len, cmd, attr, (uintptr_t)arg, NULL, 0);
392 
393 	if (error)
394 		(void) set_errno(error);
395 	return (error);
396 }
397 
398 /*
399  * Return page size code for page size passed in. If
400  * matching page size not found or supported, return -1.
     *
     * The lookup itself is done by page_szc_user_filtered(); its result is
     * converted to uint_t here, so a caller has to compare against
     * (uint_t)-1 to detect the failure case.
401  */
402 static uint_t
403 mem_getpgszc(size_t pgsz) {
404 	return ((uint_t)page_szc_user_filtered(pgsz));
405 }
406