xref: /illumos-gate/usr/src/uts/common/syscall/memcntl.c (revision a38ee58261c5aa81028a4329e73da4016006aa99)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  * Copyright (c) 2015 Joyent, Inc.
25  */
26 
27 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
28 /*	  All Rights Reserved  	*/
29 
30 
31 #include <sys/types.h>
32 #include <sys/bitmap.h>
33 #include <sys/sysmacros.h>
34 #include <sys/kmem.h>
35 #include <sys/param.h>
36 #include <sys/systm.h>
37 #include <sys/user.h>
38 #include <sys/unistd.h>
39 #include <sys/errno.h>
40 #include <sys/proc.h>
41 #include <sys/mman.h>
42 #include <sys/tuneable.h>
43 #include <sys/cmn_err.h>
44 #include <sys/cred.h>
45 #include <sys/vmsystm.h>
46 #include <sys/debug.h>
47 #include <sys/policy.h>
48 
49 #include <vm/as.h>
50 #include <vm/seg.h>
51 
52 static uint_t mem_getpgszc(size_t);
53 
54 /*
55  * Memory control operations
56  */
57 int
58 memcntl(caddr_t addr, size_t len, int cmd, caddr_t arg, int attr, int mask)
59 {
60 	struct as *as = ttoproc(curthread)->p_as;
61 	struct proc *p = ttoproc(curthread);
62 	size_t pgsz;
63 	uint_t szc, oszc, pgcmd;
64 	int error = 0;
65 	faultcode_t fc;
66 	uintptr_t iarg;
67 	STRUCT_DECL(memcntl_mha, mha);
68 
69 	if (mask)
70 		return (set_errno(EINVAL));
71 	if ((cmd == MC_LOCKAS) || (cmd == MC_UNLOCKAS)) {
72 		if ((addr != 0) || (len != 0)) {
73 			return (set_errno(EINVAL));
74 		}
75 	} else if (cmd != MC_HAT_ADVISE) {
76 		if (((uintptr_t)addr & PAGEOFFSET) != 0 || len == 0) {
77 			return (set_errno(EINVAL));
78 		}
79 		/*
80 		 * We're only concerned with the address range
81 		 * here, not the protections.  The protections
82 		 * are only used as a "filter" in this code,
83 		 * they aren't set or modified here.
84 		 */
85 		if (valid_usr_range(addr, len, 0, as,
86 		    as->a_userlimit) != RANGE_OKAY) {
87 			return (set_errno(ENOMEM));
88 		}
89 	}
90 
91 	if (cmd == MC_HAT_ADVISE) {
92 		if (attr != 0 || mask != 0) {
93 			return (set_errno(EINVAL));
94 		}
95 
96 	} else {
97 		if ((VALID_ATTR & attr) != attr) {
98 			return (set_errno(EINVAL));
99 		}
100 		if ((attr & SHARED) && (attr & PRIVATE)) {
101 			return (set_errno(EINVAL));
102 		}
103 		if (((cmd == MC_LOCKAS) || (cmd == MC_LOCK) ||
104 		    (cmd == MC_UNLOCKAS) || (cmd == MC_UNLOCK)) &&
105 		    (error = secpolicy_lock_memory(CRED())) != 0)
106 			return (set_errno(error));
107 	}
108 	if (attr) {
109 		attr |= PROT_USER;
110 	}
111 
112 	switch (cmd) {
113 	case MC_SYNC:
114 		/*
115 		 * MS_SYNC used to be defined to be zero but is now non-zero.
116 		 * For binary compatibility we still accept zero
117 		 * (the absence of MS_ASYNC) to mean the same thing.
118 		 */
119 		iarg = (uintptr_t)arg;
120 		if ((iarg & ~MS_INVALIDATE) == 0)
121 			iarg |= MS_SYNC;
122 
123 		if (((iarg & ~(MS_SYNC|MS_ASYNC|MS_INVALIDATE)) != 0) ||
124 		    ((iarg & (MS_SYNC|MS_ASYNC)) == (MS_SYNC|MS_ASYNC))) {
125 			error = set_errno(EINVAL);
126 		} else {
127 			error = as_ctl(as, addr, len, cmd, attr, iarg, NULL, 0);
128 			if (error) {
129 				(void) set_errno(error);
130 			}
131 		}
132 		return (error);
133 	case MC_LOCKAS:
134 		if ((uintptr_t)arg & ~(MCL_FUTURE|MCL_CURRENT) ||
135 		    (uintptr_t)arg == 0) {
136 			return (set_errno(EINVAL));
137 		}
138 		break;
139 	case MC_LOCK:
140 	case MC_UNLOCKAS:
141 	case MC_UNLOCK:
142 		break;
143 	case MC_HAT_ADVISE:
144 		/*
145 		 * Set prefered page size.
146 		 */
147 		STRUCT_INIT(mha, get_udatamodel());
148 		if (copyin(arg, STRUCT_BUF(mha), STRUCT_SIZE(mha))) {
149 			return (set_errno(EFAULT));
150 		}
151 
152 		pgcmd = STRUCT_FGET(mha, mha_cmd);
153 
154 		/*
155 		 * Currently only MHA_MAPSIZE_VA, MHA_MAPSIZE_STACK
156 		 * and MHA_MAPSIZE_BSSBRK are supported. Only one
157 		 * command may be specified at a time.
158 		 */
159 		if ((~(MHA_MAPSIZE_VA|MHA_MAPSIZE_STACK|MHA_MAPSIZE_BSSBRK) &
160 		    pgcmd) || pgcmd == 0 || !ISP2(pgcmd) ||
161 		    STRUCT_FGET(mha, mha_flags))
162 			return (set_errno(EINVAL));
163 
164 		pgsz = STRUCT_FGET(mha, mha_pagesize);
165 
166 		/*
167 		 * call platform specific map_pgsz() routine to get the
168 		 * optimal pgsz if pgsz is 0.
169 		 *
170 		 * For stack and heap operations addr and len must be zero.
171 		 */
172 		if ((pgcmd & (MHA_MAPSIZE_BSSBRK|MHA_MAPSIZE_STACK)) != 0) {
173 			if (addr != NULL || len != 0) {
174 				return (set_errno(EINVAL));
175 			}
176 
177 			/*
178 			 * Disable autompss for this process unless pgsz == 0,
179 			 * which means the system should pick.  In the
180 			 * pgsz == 0 case, leave the SAUTOLPG setting alone, as
181 			 * we don't want to enable it when someone has
182 			 * disabled automatic large page selection for the
183 			 * whole system.
184 			 */
185 			mutex_enter(&p->p_lock);
186 			if (pgsz != 0) {
187 				p->p_flag &= ~SAUTOLPG;
188 			}
189 			mutex_exit(&p->p_lock);
190 
191 			as_rangelock(as);
192 
193 			if (pgsz == 0) {
194 				int	type;
195 
196 				if (pgcmd == MHA_MAPSIZE_BSSBRK)
197 					type = MAPPGSZ_HEAP;
198 				else
199 					type = MAPPGSZ_STK;
200 
201 				pgsz = map_pgsz(type, p, 0, 0, 1);
202 			}
203 		} else {
204 			/*
205 			 * addr and len must be valid for range specified.
206 			 */
207 			if (valid_usr_range(addr, len, 0, as,
208 			    as->a_userlimit) != RANGE_OKAY) {
209 				return (set_errno(ENOMEM));
210 			}
211 			/*
212 			 * Note that we don't disable automatic large page
213 			 * selection for anon segments based on use of
214 			 * memcntl().
215 			 */
216 			if (pgsz == 0) {
217 				error = as_set_default_lpsize(as, addr, len);
218 				if (error) {
219 					(void) set_errno(error);
220 				}
221 				return (error);
222 			}
223 
224 			/*
225 			 * addr and len must be prefered page size aligned
226 			 */
227 			if (!IS_P2ALIGNED(addr, pgsz) ||
228 			    !IS_P2ALIGNED(len, pgsz)) {
229 				return (set_errno(EINVAL));
230 			}
231 		}
232 
233 		szc = mem_getpgszc(pgsz);
234 		if (szc == (uint_t)-1) {
235 			if ((pgcmd & (MHA_MAPSIZE_BSSBRK|MHA_MAPSIZE_STACK))
236 			    != 0) {
237 				as_rangeunlock(as);
238 			}
239 			return (set_errno(EINVAL));
240 		}
241 
242 		/*
243 		 * For stack and heap operations we first need to pad
244 		 * out existing range (create new mappings) to the new
245 		 * prefered page size boundary. Also the start of the
246 		 * .bss for the heap or user's stack base may not be on
247 		 * the new prefered page size boundary. For these cases
248 		 * we align the base of the request on the new prefered
249 		 * page size.
250 		 */
251 		if (pgcmd & MHA_MAPSIZE_BSSBRK) {
252 			if (szc == p->p_brkpageszc) {
253 				as_rangeunlock(as);
254 				return (0);
255 			}
256 			if (szc > p->p_brkpageszc) {
257 				error = brk_internal(p->p_brkbase
258 				    + p->p_brksize, szc);
259 				if (error) {
260 					as_rangeunlock(as);
261 					return (set_errno(error));
262 				}
263 			}
264 			/*
265 			 * It is possible for brk_internal to silently fail to
266 			 * promote the heap size, so don't panic or ASSERT.
267 			 */
268 			if (!IS_P2ALIGNED(p->p_brkbase + p->p_brksize, pgsz)) {
269 				as_rangeunlock(as);
270 				return (set_errno(ENOMEM));
271 			}
272 			oszc = p->p_brkpageszc;
273 			p->p_brkpageszc = szc;
274 
275 			addr = (caddr_t)P2ROUNDUP((uintptr_t)p->p_bssbase,
276 			    pgsz);
277 			len = (p->p_brkbase + p->p_brksize) - addr;
278 			ASSERT(IS_P2ALIGNED(len, pgsz));
279 			/*
280 			 * Perhaps no existing pages to promote.
281 			 */
282 			if (len == 0) {
283 				as_rangeunlock(as);
284 				return (0);
285 			}
286 		}
287 		/*
288 		 * The code below, as does grow.c, assumes stacks always grow
289 		 * downward.
290 		 */
291 		if (pgcmd & MHA_MAPSIZE_STACK) {
292 			if (szc == p->p_stkpageszc) {
293 				as_rangeunlock(as);
294 				return (0);
295 			}
296 
297 			if (szc > p->p_stkpageszc) {
298 				error = grow_internal(p->p_usrstack -
299 				    p->p_stksize, szc);
300 				if (error) {
301 					as_rangeunlock(as);
302 					return (set_errno(error));
303 				}
304 			}
305 			/*
306 			 * It is possible for grow_internal to silently fail to
307 			 * promote the stack size, so don't panic or ASSERT.
308 			 */
309 			if (!IS_P2ALIGNED(p->p_usrstack - p->p_stksize, pgsz)) {
310 				as_rangeunlock(as);
311 				return (set_errno(ENOMEM));
312 			}
313 			oszc = p->p_stkpageszc;
314 			p->p_stkpageszc = szc;
315 
316 			addr = p->p_usrstack - p->p_stksize;
317 			len = P2ALIGN(p->p_stksize, pgsz);
318 
319 			/*
320 			 * Perhaps nothing to promote.
321 			 */
322 			if (len == 0 || addr >= p->p_usrstack ||
323 			    (addr + len) < addr) {
324 				as_rangeunlock(as);
325 				return (0);
326 			}
327 		}
328 		ASSERT(IS_P2ALIGNED(addr, pgsz));
329 		ASSERT(IS_P2ALIGNED(len, pgsz));
330 		error = as_setpagesize(as, addr, len, szc, B_TRUE);
331 
332 		/*
333 		 * On stack or heap failures restore original
334 		 * pg size code.
335 		 */
336 		if (error) {
337 			if ((pgcmd & MHA_MAPSIZE_BSSBRK) != 0) {
338 				p->p_brkpageszc = oszc;
339 			}
340 			if ((pgcmd & MHA_MAPSIZE_STACK) != 0) {
341 				p->p_stkpageszc = oszc;
342 			}
343 			(void) set_errno(error);
344 		}
345 		if ((pgcmd & (MHA_MAPSIZE_BSSBRK|MHA_MAPSIZE_STACK)) != 0) {
346 			as_rangeunlock(as);
347 		}
348 		return (error);
349 	case MC_ADVISE:
350 		if ((uintptr_t)arg == MADV_FREE) {
351 			len &= PAGEMASK;
352 		}
353 		switch ((uintptr_t)arg) {
354 		case MADV_WILLNEED:
355 			fc = as_faulta(as, addr, len);
356 			if (fc) {
357 				if (FC_CODE(fc) == FC_OBJERR)
358 					error = set_errno(FC_ERRNO(fc));
359 				else if (FC_CODE(fc) == FC_NOMAP)
360 					error = set_errno(ENOMEM);
361 				else
362 					error = set_errno(EINVAL);
363 				return (error);
364 			}
365 			break;
366 
367 		case MADV_DONTNEED:
368 			/*
369 			 * For now, don't need is turned into an as_ctl(MC_SYNC)
370 			 * operation flagged for async invalidate.
371 			 */
372 			error = as_ctl(as, addr, len, MC_SYNC, attr,
373 			    MS_ASYNC | MS_INVALIDATE, NULL, 0);
374 			if (error)
375 				(void) set_errno(error);
376 			return (error);
377 
378 		default:
379 			error = as_ctl(as, addr, len, cmd, attr,
380 			    (uintptr_t)arg, NULL, 0);
381 			if (error)
382 				(void) set_errno(error);
383 			return (error);
384 		}
385 		break;
386 	case MC_INHERIT_ZERO:
387 		if (arg != 0 || attr != 0 || mask != 0)
388 			return (set_errno(EINVAL));
389 		break;
390 	default:
391 		return (set_errno(EINVAL));
392 	}
393 
394 	error = as_ctl(as, addr, len, cmd, attr, (uintptr_t)arg, NULL, 0);
395 
396 	if (error)
397 		(void) set_errno(error);
398 	return (error);
399 }
400 
401 /*
402  * Return page size code for page size passed in. If
403  * matching page size not found or supported, return -1.
404  */
405 static uint_t
406 mem_getpgszc(size_t pgsz) {
407 	return ((uint_t)page_szc_user_filtered(pgsz));
408 }
409