xref: /illumos-gate/usr/src/uts/common/os/shm.c (revision 4de2612967d06c4fdbf524a62556a1e8118a006f)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
28 /*	  All Rights Reserved	*/
29 
30 /*
31  * University Copyright- Copyright (c) 1982, 1986, 1988
32  * The Regents of the University of California
33  * All Rights Reserved
34  *
35  * University Acknowledgment- Portions of this document are derived from
36  * software developed by the University of California, Berkeley, and its
37  * contributors.
38  */
39 
40 #pragma ident	"%Z%%M%	%I%	%E% SMI"
41 
42 /*
43  * Inter-Process Communication Shared Memory Facility.
44  *
45  * See os/ipc.c for a description of common IPC functionality.
46  *
47  * Resource controls
48  * -----------------
49  *
50  * Control:      project.max-shm-ids (rc_project_shmmni)
51  * Description:  Maximum number of shared memory ids allowed a project.
52  *
53  *   When shmget() is used to allocate a shared memory segment, one id
54  *   is allocated.  If the id allocation doesn't succeed, shmget()
55  *   fails and errno is set to ENOSPC.  Upon successful shmctl(,
56  *   IPC_RMID) the id is deallocated.
57  *
58  * Control:      project.max-shm-memory (rc_project_shmmax)
59  * Description:  Total amount of shared memory allowed a project.
60  *
61  *   When shmget() is used to allocate a shared memory segment, the
62  *   segment's size is allocated against this limit.  If the space
63  *   allocation doesn't succeed, shmget() fails and errno is set to
64  *   EINVAL.  The size will be deallocated once the last process has
65  *   detached the segment and the segment has been successfully
66  *   shmctl(, IPC_RMID)ed.
67  */
68 
69 #include <sys/types.h>
70 #include <sys/param.h>
71 #include <sys/cred.h>
72 #include <sys/errno.h>
73 #include <sys/time.h>
74 #include <sys/kmem.h>
75 #include <sys/user.h>
76 #include <sys/proc.h>
77 #include <sys/systm.h>
78 #include <sys/prsystm.h>
79 #include <sys/sysmacros.h>
80 #include <sys/tuneable.h>
81 #include <sys/vm.h>
82 #include <sys/mman.h>
83 #include <sys/swap.h>
84 #include <sys/cmn_err.h>
85 #include <sys/debug.h>
86 #include <sys/lwpchan_impl.h>
87 #include <sys/avl.h>
88 #include <sys/modctl.h>
89 #include <sys/syscall.h>
90 #include <sys/task.h>
91 #include <sys/project.h>
92 #include <sys/policy.h>
93 #include <sys/zone.h>
94 
95 #include <sys/ipc.h>
96 #include <sys/ipc_impl.h>
97 #include <sys/shm.h>
98 #include <sys/shm_impl.h>
99 
100 #include <vm/hat.h>
101 #include <vm/seg.h>
102 #include <vm/as.h>
103 #include <vm/seg_vn.h>
104 #include <vm/anon.h>
105 #include <vm/page.h>
106 #include <vm/vpage.h>
107 #include <vm/seg_spt.h>
108 
109 #include <c2/audit.h>
110 
111 static int shmem_lock(struct anon_map *amp);
112 static void shmem_unlock(struct anon_map *amp, uint_t lck);
113 static void sa_add(struct proc *pp, caddr_t addr, size_t len, ulong_t flags,
114 	kshmid_t *id);
115 static void shm_rm_amp(struct anon_map *amp, uint_t lckflag);
116 static void shm_dtor(kipc_perm_t *);
117 static void shm_rmid(kipc_perm_t *);
118 static void shm_remove_zone(zoneid_t, void *);
119 
/*
 * share_page_table and ism_off
 *
 * Both are /etc/system hooks intended only for internal testing.  They
 * rewrite the flags passed to shmat(2) before the kind of attach is
 * decided:
 *
 *   share_page_table	Turns the SHM_SHARE_MMU (ISM) flag on, so that an
 *			attach is always ISM even if, say, DISM was asked
 *			for.  It should really have been called "ism_on".
 *
 *   ism_off		Turns the SHM_SHARE_MMU flag off.
 *
 * If both are set, share_page_table prevails.
 *
 * These tunables should probably be removed, but they do have some
 * external exposure; as long as they exist they should at least work
 * sensibly.
 */
int share_page_table = 0;	/* non-zero: force ISM on every shmat() */
int ism_off = 0;		/* non-zero: strip SHM_SHARE_MMU in shmat() */
140 
141 /*
142  * The following tunables are obsolete.  Though for compatibility we
143  * still read and interpret shminfo_shmmax and shminfo_shmmni (see
144  * os/project.c), the preferred mechanism for administrating the IPC
145  * Shared Memory facility is through the resource controls described at
146  * the top of this file.
147  */
148 size_t	shminfo_shmmax = 0x800000;	/* (obsolete) */
149 int	shminfo_shmmni = 100;		/* (obsolete) */
150 size_t	shminfo_shmmin = 1;		/* (obsolete) */
151 int	shminfo_shmseg = 6;		/* (obsolete) */
152 
153 extern rctl_hndl_t rc_project_shmmax;
154 extern rctl_hndl_t rc_project_shmmni;
155 static ipc_service_t *shm_svc;
156 static zone_key_t shm_zone_key;
157 
158 /*
159  * Module linkage information for the kernel.
160  */
161 static uintptr_t shmsys(int, uintptr_t, uintptr_t, uintptr_t);
162 
163 static struct sysent ipcshm_sysent = {
164 	4,
165 #ifdef	_SYSCALL32_IMPL
166 	SE_ARGC | SE_NOUNLOAD | SE_64RVAL,
167 #else	/* _SYSCALL32_IMPL */
168 	SE_ARGC | SE_NOUNLOAD | SE_32RVAL1,
169 #endif	/* _SYSCALL32_IMPL */
170 	(int (*)())shmsys
171 };
172 
173 #ifdef	_SYSCALL32_IMPL
174 static struct sysent ipcshm_sysent32 = {
175 	4,
176 	SE_ARGC | SE_NOUNLOAD | SE_32RVAL1,
177 	(int (*)())shmsys
178 };
179 #endif	/* _SYSCALL32_IMPL */
180 
181 static struct modlsys modlsys = {
182 	&mod_syscallops, "System V shared memory", &ipcshm_sysent
183 };
184 
185 #ifdef	_SYSCALL32_IMPL
186 static struct modlsys modlsys32 = {
187 	&mod_syscallops32, "32-bit System V shared memory", &ipcshm_sysent32
188 };
189 #endif	/* _SYSCALL32_IMPL */
190 
191 static struct modlinkage modlinkage = {
192 	MODREV_1,
193 	&modlsys,
194 #ifdef	_SYSCALL32_IMPL
195 	&modlsys32,
196 #endif
197 	NULL
198 };
199 
200 
201 int
202 _init(void)
203 {
204 	int result;
205 
206 	shm_svc = ipcs_create("shmids", rc_project_shmmni, sizeof (kshmid_t),
207 	    shm_dtor, shm_rmid, AT_IPC_SHM,
208 	    offsetof(kproject_data_t, kpd_shmmni));
209 	zone_key_create(&shm_zone_key, NULL, shm_remove_zone, NULL);
210 
211 	if ((result = mod_install(&modlinkage)) == 0)
212 		return (0);
213 
214 	(void) zone_key_delete(shm_zone_key);
215 	ipcs_destroy(shm_svc);
216 
217 	return (result);
218 }
219 
220 int
221 _fini(void)
222 {
223 	return (EBUSY);
224 }
225 
226 int
227 _info(struct modinfo *modinfop)
228 {
229 	return (mod_info(&modlinkage, modinfop));
230 }
231 
232 /*
233  * Shmat (attach shared segment) system call.
234  */
235 static int
236 shmat(int shmid, caddr_t uaddr, int uflags, uintptr_t *rvp)
237 {
238 	kshmid_t *sp;	/* shared memory header ptr */
239 	size_t	size;
240 	int	error = 0;
241 	proc_t *pp = curproc;
242 	struct as *as = pp->p_as;
243 	struct segvn_crargs	crargs;	/* segvn create arguments */
244 	kmutex_t	*lock;
245 	struct seg 	*segspt = NULL;
246 	caddr_t		addr = uaddr;
247 	int		flags = (uflags & SHMAT_VALID_FLAGS_MASK);
248 	int		useISM;
249 	uchar_t		prot = PROT_ALL;
250 	int result;
251 
252 	if ((lock = ipc_lookup(shm_svc, shmid, (kipc_perm_t **)&sp)) == NULL)
253 		return (EINVAL);
254 	if (error = ipcperm_access(&sp->shm_perm, SHM_R, CRED()))
255 		goto errret;
256 	if ((flags & SHM_RDONLY) == 0 &&
257 	    (error = ipcperm_access(&sp->shm_perm, SHM_W, CRED())))
258 		goto errret;
259 	if (spt_invalid(flags)) {
260 		error = EINVAL;
261 		goto errret;
262 	}
263 	if (ism_off)
264 		flags = flags & ~SHM_SHARE_MMU;
265 	if (share_page_table) {
266 		flags = flags & ~SHM_PAGEABLE;
267 		flags = flags | SHM_SHARE_MMU;
268 	}
269 	useISM = (spt_locked(flags) || spt_pageable(flags));
270 	if (useISM && (error = ipcperm_access(&sp->shm_perm, SHM_W, CRED())))
271 		goto errret;
272 	if (useISM && isspt(sp)) {
273 		uint_t newsptflags = flags | spt_flags(sp->shm_sptseg);
274 		/*
275 		 * If trying to change an existing {D}ISM segment from ISM
276 		 * to DISM or vice versa, return error. Note that this
277 		 * validation of flags needs to be done after the effect of
278 		 * tunables such as ism_off and share_page_table, for
279 		 * semantics that are consistent with the tunables' settings.
280 		 */
281 		if (spt_invalid(newsptflags)) {
282 			error = EINVAL;
283 			goto errret;
284 		}
285 	}
286 	ANON_LOCK_ENTER(&sp->shm_amp->a_rwlock, RW_WRITER);
287 	size = sp->shm_amp->size;
288 	ANON_LOCK_EXIT(&sp->shm_amp->a_rwlock);
289 
290 	/* somewhere to record spt info for final detach */
291 	if (sp->shm_sptinfo == NULL)
292 		sp->shm_sptinfo = kmem_zalloc(sizeof (sptinfo_t), KM_SLEEP);
293 
294 	as_rangelock(as);
295 
296 	if (useISM) {
297 		/*
298 		 * Handle ISM
299 		 */
300 		uint_t	n, share_szc;
301 		size_t	share_size;
302 		struct	shm_data ssd;
303 		uintptr_t align_hint;
304 
305 		n = page_num_pagesizes();
306 		if (n < 2) { /* large pages aren't supported */
307 			as_rangeunlock(as);
308 			error = EINVAL;
309 			goto errret;
310 		}
311 
312 		/*
313 		 * Pick a share pagesize to use, if (!isspt(sp)).
314 		 * Otherwise use the already chosen page size.
315 		 *
316 		 * For the initial shmat (!isspt(sp)), where sptcreate is
317 		 * called, map_pgsz is called to recommend a [D]ISM pagesize,
318 		 * important for systems which offer more than one potential
319 		 * [D]ISM pagesize.
320 		 * If the shmat is just to attach to an already created
321 		 * [D]ISM segment, then use the previously selected page size.
322 		 */
323 		if (!isspt(sp)) {
324 			share_size = map_pgsz(MAPPGSZ_ISM,
325 			    pp, addr, size, NULL);
326 			if (share_size == 0) {
327 				as_rangeunlock(as);
328 				error = EINVAL;
329 				goto errret;
330 			}
331 			share_szc = page_szc(share_size);
332 		} else {
333 			share_szc = sp->shm_sptseg->s_szc;
334 			share_size = page_get_pagesize(share_szc);
335 		}
336 		size = P2ROUNDUP(size, share_size);
337 
338 		align_hint = share_size;
339 #if defined(__i386) || defined(__amd64)
340 		/*
341 		 * For 64 bit amd64, we want to share an entire page table
342 		 * if possible. We know (ugh) that there are 512 entries in
343 		 * in a page table. The number for 32 bit non-PAE should be
344 		 * 1024, but I'm not going to special case that. Note using 512
345 		 * won't cause a failure below. It retries with align_hint set
346 		 * to share_size
347 		 */
348 		while (size >= 512 * (uint64_t)align_hint)
349 			align_hint *= 512;
350 #endif /* __i386 || __amd64 */
351 
352 #if defined(__sparcv9)
353 		if (addr == 0 && curproc->p_model == DATAMODEL_LP64) {
354 			/*
355 			 * If no address has been passed in, and this is a
356 			 * 64-bit process, we'll try to find an address
357 			 * in the predict-ISM zone.
358 			 */
359 			caddr_t predbase = (caddr_t)PREDISM_1T_BASE;
360 			size_t len = PREDISM_BOUND - PREDISM_1T_BASE;
361 
362 			as_purge(as);
363 			if (as_gap(as, size + share_size, &predbase, &len,
364 			    AH_LO, (caddr_t)NULL) != -1) {
365 				/*
366 				 * We found an address which looks like a
367 				 * candidate.  We want to round it up, and
368 				 * then check that it's a valid user range.
369 				 * This assures that we won't fail below.
370 				 */
371 				addr = (caddr_t)P2ROUNDUP((uintptr_t)predbase,
372 				    share_size);
373 
374 				if (valid_usr_range(addr, size, prot,
375 				    as, as->a_userlimit) != RANGE_OKAY) {
376 					addr = 0;
377 				}
378 			}
379 		}
380 #endif /* __sparcv9 */
381 
382 		if (addr == 0) {
383 			for (;;) {
384 				addr = (caddr_t)align_hint;
385 				map_addr(&addr, size, 0ll, 1, MAP_ALIGN);
386 				if (addr != NULL || align_hint == share_size)
387 					break;
388 				align_hint = share_size;
389 			}
390 			if (addr == NULL) {
391 				as_rangeunlock(as);
392 				error = ENOMEM;
393 				goto errret;
394 			}
395 			ASSERT(((uintptr_t)addr & (align_hint - 1)) == 0);
396 		} else {
397 			/* Use the user-supplied attach address */
398 			caddr_t base;
399 			size_t len;
400 
401 			/*
402 			 * Check that the address range
403 			 *  1) is properly aligned
404 			 *  2) is correct in unix terms
405 			 *  3) is within an unmapped address segment
406 			 */
407 			base = addr;
408 			len = size;		/* use spt aligned size */
409 			/* XXX - in SunOS, is sp->shm_segsz */
410 			if ((uintptr_t)base & (share_size - 1)) {
411 				error = EINVAL;
412 				as_rangeunlock(as);
413 				goto errret;
414 			}
415 			result = valid_usr_range(base, len, prot, as,
416 			    as->a_userlimit);
417 			if (result == RANGE_BADPROT) {
418 				/*
419 				 * We try to accomodate processors which
420 				 * may not support execute permissions on
421 				 * all ISM segments by trying the check
422 				 * again but without PROT_EXEC.
423 				 */
424 				prot &= ~PROT_EXEC;
425 				result = valid_usr_range(base, len, prot, as,
426 				    as->a_userlimit);
427 			}
428 			as_purge(as);
429 			if (result != RANGE_OKAY ||
430 			    as_gap(as, len, &base, &len, AH_LO,
431 			    (caddr_t)NULL) != 0) {
432 				error = EINVAL;
433 				as_rangeunlock(as);
434 				goto errret;
435 			}
436 		}
437 
438 		if (!isspt(sp)) {
439 			error = sptcreate(size, &segspt, sp->shm_amp, prot,
440 			    flags, share_szc);
441 			if (error) {
442 				as_rangeunlock(as);
443 				goto errret;
444 			}
445 			sp->shm_sptinfo->sptas = segspt->s_as;
446 			sp->shm_sptseg = segspt;
447 			sp->shm_sptprot = prot;
448 			sp->shm_lkcnt = 0;
449 		} else if ((prot & sp->shm_sptprot) != sp->shm_sptprot) {
450 			/*
451 			 * Ensure we're attaching to an ISM segment with
452 			 * fewer or equal permissions than what we're
453 			 * allowed.  Fail if the segment has more
454 			 * permissions than what we're allowed.
455 			 */
456 			error = EACCES;
457 			as_rangeunlock(as);
458 			goto errret;
459 		}
460 
461 		ssd.shm_sptseg = sp->shm_sptseg;
462 		ssd.shm_sptas = sp->shm_sptinfo->sptas;
463 		ssd.shm_amp = sp->shm_amp;
464 		error = as_map(as, addr, size, segspt_shmattach, &ssd);
465 		if (error == 0)
466 			sp->shm_ismattch++; /* keep count of ISM attaches */
467 	} else {
468 
469 		/*
470 		 * Normal case.
471 		 */
472 		if (flags & SHM_RDONLY)
473 			prot &= ~PROT_WRITE;
474 
475 		if (addr == 0) {
476 			/* Let the system pick the attach address */
477 			map_addr(&addr, size, 0ll, 1, 0);
478 			if (addr == NULL) {
479 				as_rangeunlock(as);
480 				error = ENOMEM;
481 				goto errret;
482 			}
483 		} else {
484 			/* Use the user-supplied attach address */
485 			caddr_t base;
486 			size_t len;
487 
488 			if (flags & SHM_RND)
489 				addr = (caddr_t)((uintptr_t)addr &
490 				    ~(SHMLBA - 1));
491 			/*
492 			 * Check that the address range
493 			 *  1) is properly aligned
494 			 *  2) is correct in unix terms
495 			 *  3) is within an unmapped address segment
496 			 */
497 			base = addr;
498 			len = size;		/* use aligned size */
499 			/* XXX - in SunOS, is sp->shm_segsz */
500 			if ((uintptr_t)base & PAGEOFFSET) {
501 				error = EINVAL;
502 				as_rangeunlock(as);
503 				goto errret;
504 			}
505 			result = valid_usr_range(base, len, prot, as,
506 			    as->a_userlimit);
507 			if (result == RANGE_BADPROT) {
508 				prot &= ~PROT_EXEC;
509 				result = valid_usr_range(base, len, prot, as,
510 				    as->a_userlimit);
511 			}
512 			as_purge(as);
513 			if (result != RANGE_OKAY ||
514 			    as_gap(as, len, &base, &len,
515 			    AH_LO, (caddr_t)NULL) != 0) {
516 				error = EINVAL;
517 				as_rangeunlock(as);
518 				goto errret;
519 			}
520 		}
521 
522 		/* Initialize the create arguments and map the segment */
523 		crargs = *(struct segvn_crargs *)zfod_argsp;
524 		crargs.offset = 0;
525 		crargs.type = MAP_SHARED;
526 		crargs.amp = sp->shm_amp;
527 		crargs.prot = prot;
528 		crargs.maxprot = crargs.prot;
529 		crargs.flags = 0;
530 
531 		error = as_map(as, addr, size, segvn_create, &crargs);
532 	}
533 
534 	as_rangeunlock(as);
535 	if (error)
536 		goto errret;
537 
538 	/* record shmem range for the detach */
539 	sa_add(pp, addr, (size_t)size, useISM ? SHMSA_ISM : 0, sp);
540 	*rvp = (uintptr_t)addr;
541 
542 	sp->shm_atime = gethrestime_sec();
543 	sp->shm_lpid = pp->p_pid;
544 	ipc_hold(shm_svc, (kipc_perm_t *)sp);
545 errret:
546 	mutex_exit(lock);
547 	return (error);
548 }
549 
550 static void
551 shm_dtor(kipc_perm_t *perm)
552 {
553 	kshmid_t *sp = (kshmid_t *)perm;
554 	uint_t cnt;
555 
556 	if (sp->shm_sptinfo) {
557 		if (isspt(sp))
558 			sptdestroy(sp->shm_sptinfo->sptas, sp->shm_amp);
559 		kmem_free(sp->shm_sptinfo, sizeof (sptinfo_t));
560 	}
561 
562 	ANON_LOCK_ENTER(&sp->shm_amp->a_rwlock, RW_WRITER);
563 	cnt = --sp->shm_amp->refcnt;
564 	ANON_LOCK_EXIT(&sp->shm_amp->a_rwlock);
565 	ASSERT(cnt == 0);
566 	shm_rm_amp(sp->shm_amp, sp->shm_lkcnt);
567 
568 	if (sp->shm_perm.ipc_id != IPC_ID_INVAL) {
569 		ipcs_lock(shm_svc);
570 		sp->shm_perm.ipc_proj->kpj_data.kpd_shmmax -=
571 		    ptob(btopr(sp->shm_segsz));
572 		ipcs_unlock(shm_svc);
573 	}
574 }
575 
576 /* ARGSUSED */
577 static void
578 shm_rmid(kipc_perm_t *perm)
579 {
580 	/* nothing to do */
581 }
582 
583 /*
584  * Shmctl system call.
585  */
586 /* ARGSUSED */
587 static int
588 shmctl(int shmid, int cmd, void *arg)
589 {
590 	kshmid_t		*sp;	/* shared memory header ptr */
591 	STRUCT_DECL(shmid_ds, ds);	/* for SVR4 IPC_SET */
592 	int			error = 0;
593 	struct cred 		*cr = CRED();
594 	kmutex_t		*lock;
595 	model_t			mdl = get_udatamodel();
596 	struct shmid_ds64	ds64;
597 	shmatt_t		nattch;
598 
599 	STRUCT_INIT(ds, mdl);
600 
601 	/*
602 	 * Perform pre- or non-lookup actions (e.g. copyins, RMID).
603 	 */
604 	switch (cmd) {
605 	case IPC_SET:
606 		if (copyin(arg, STRUCT_BUF(ds), STRUCT_SIZE(ds)))
607 			return (EFAULT);
608 		break;
609 
610 	case IPC_SET64:
611 		if (copyin(arg, &ds64, sizeof (struct shmid_ds64)))
612 			return (EFAULT);
613 		break;
614 
615 	case IPC_RMID:
616 		return (ipc_rmid(shm_svc, shmid, cr));
617 	}
618 
619 	if ((lock = ipc_lookup(shm_svc, shmid, (kipc_perm_t **)&sp)) == NULL)
620 		return (EINVAL);
621 
622 	switch (cmd) {
623 	/* Set ownership and permissions. */
624 	case IPC_SET:
625 		if (error = ipcperm_set(shm_svc, cr, &sp->shm_perm,
626 		    &STRUCT_BUF(ds)->shm_perm, mdl))
627 				break;
628 		sp->shm_ctime = gethrestime_sec();
629 		break;
630 
631 	case IPC_STAT:
632 		if (error = ipcperm_access(&sp->shm_perm, SHM_R, cr))
633 			break;
634 
635 		nattch = sp->shm_perm.ipc_ref - 1;
636 
637 		ipcperm_stat(&STRUCT_BUF(ds)->shm_perm, &sp->shm_perm, mdl);
638 		STRUCT_FSET(ds, shm_segsz, sp->shm_segsz);
639 		STRUCT_FSETP(ds, shm_amp, NULL);	/* kernel addr */
640 		STRUCT_FSET(ds, shm_lkcnt, sp->shm_lkcnt);
641 		STRUCT_FSET(ds, shm_lpid, sp->shm_lpid);
642 		STRUCT_FSET(ds, shm_cpid, sp->shm_cpid);
643 		STRUCT_FSET(ds, shm_nattch, nattch);
644 		STRUCT_FSET(ds, shm_cnattch, sp->shm_ismattch);
645 		STRUCT_FSET(ds, shm_atime, sp->shm_atime);
646 		STRUCT_FSET(ds, shm_dtime, sp->shm_dtime);
647 		STRUCT_FSET(ds, shm_ctime, sp->shm_ctime);
648 
649 		mutex_exit(lock);
650 		if (copyout(STRUCT_BUF(ds), arg, STRUCT_SIZE(ds)))
651 			return (EFAULT);
652 
653 		return (0);
654 
655 	case IPC_SET64:
656 		if (error = ipcperm_set64(shm_svc, cr,
657 		    &sp->shm_perm, &ds64.shmx_perm))
658 			break;
659 		sp->shm_ctime = gethrestime_sec();
660 		break;
661 
662 	case IPC_STAT64:
663 		nattch = sp->shm_perm.ipc_ref - 1;
664 
665 		ipcperm_stat64(&ds64.shmx_perm, &sp->shm_perm);
666 		ds64.shmx_segsz = sp->shm_segsz;
667 		ds64.shmx_lkcnt = sp->shm_lkcnt;
668 		ds64.shmx_lpid = sp->shm_lpid;
669 		ds64.shmx_cpid = sp->shm_cpid;
670 		ds64.shmx_nattch = nattch;
671 		ds64.shmx_cnattch = sp->shm_ismattch;
672 		ds64.shmx_atime = sp->shm_atime;
673 		ds64.shmx_dtime = sp->shm_dtime;
674 		ds64.shmx_ctime = sp->shm_ctime;
675 
676 		mutex_exit(lock);
677 		if (copyout(&ds64, arg, sizeof (struct shmid_ds64)))
678 			return (EFAULT);
679 
680 		return (0);
681 
682 	/* Lock segment in memory */
683 	case SHM_LOCK:
684 		if ((error = secpolicy_lock_memory(cr)) != 0)
685 			break;
686 
687 		if (!isspt(sp) && (sp->shm_lkcnt++ == 0)) {
688 			if (error = shmem_lock(sp->shm_amp)) {
689 			    ANON_LOCK_ENTER(&sp->shm_amp->a_rwlock, RW_WRITER);
690 			    cmn_err(CE_NOTE,
691 				"shmctl - couldn't lock %ld pages into memory",
692 				sp->shm_amp->size);
693 			    ANON_LOCK_EXIT(&sp->shm_amp->a_rwlock);
694 			    error = ENOMEM;
695 			    sp->shm_lkcnt--;
696 			    shmem_unlock(sp->shm_amp, 0);
697 			}
698 		}
699 		break;
700 
701 	/* Unlock segment */
702 	case SHM_UNLOCK:
703 		if ((error = secpolicy_lock_memory(cr)) != 0)
704 			break;
705 
706 		if (!isspt(sp)) {
707 			if (sp->shm_lkcnt && (--sp->shm_lkcnt == 0)) {
708 				shmem_unlock(sp->shm_amp, 1);
709 			}
710 		}
711 		break;
712 
713 	default:
714 		error = EINVAL;
715 		break;
716 	}
717 	mutex_exit(lock);
718 	return (error);
719 }
720 
721 static void
722 shm_detach(proc_t *pp, segacct_t *sap)
723 {
724 	kshmid_t	*sp = sap->sa_id;
725 	size_t		len = sap->sa_len;
726 	caddr_t		addr = sap->sa_addr;
727 
728 	/*
729 	 * Discard lwpchan mappings.
730 	 */
731 	if (pp->p_lcp != NULL)
732 		lwpchan_delete_mapping(pp, addr, addr + len);
733 	(void) as_unmap(pp->p_as, addr, len);
734 
735 	/*
736 	 * Perform some detach-time accounting.
737 	 */
738 	(void) ipc_lock(shm_svc, sp->shm_perm.ipc_id);
739 	if (sap->sa_flags & SHMSA_ISM)
740 		sp->shm_ismattch--;
741 	sp->shm_dtime = gethrestime_sec();
742 	sp->shm_lpid = pp->p_pid;
743 	ipc_rele(shm_svc, (kipc_perm_t *)sp);	/* Drops lock */
744 
745 	kmem_free(sap, sizeof (segacct_t));
746 }
747 
748 static int
749 shmdt(caddr_t addr)
750 {
751 	proc_t *pp = curproc;
752 	segacct_t *sap, template;
753 
754 	mutex_enter(&pp->p_lock);
755 	prbarrier(pp);			/* block /proc.  See shmgetid(). */
756 
757 	template.sa_addr = addr;
758 	template.sa_len = 0;
759 	if ((pp->p_segacct == NULL) ||
760 	    ((sap = avl_find(pp->p_segacct, &template, NULL)) == NULL)) {
761 		mutex_exit(&pp->p_lock);
762 		return (EINVAL);
763 	}
764 	avl_remove(pp->p_segacct, sap);
765 	mutex_exit(&pp->p_lock);
766 
767 	shm_detach(pp, sap);
768 
769 	return (0);
770 }
771 
772 /*
773  * Remove all shared memory segments associated with a given zone.
774  * Called by zone_shutdown when the zone is halted.
775  */
776 /*ARGSUSED1*/
777 static void
778 shm_remove_zone(zoneid_t zoneid, void *arg)
779 {
780 	ipc_remove_zone(shm_svc, zoneid);
781 }
782 
783 /*
784  * Shmget (create new shmem) system call.
785  */
786 static int
787 shmget(key_t key, size_t size, int shmflg, uintptr_t *rvp)
788 {
789 	proc_t		*pp = curproc;
790 	kshmid_t	*sp;
791 	kmutex_t	*lock;
792 	int		error;
793 
794 top:
795 	if (error = ipc_get(shm_svc, key, shmflg, (kipc_perm_t **)&sp, &lock))
796 		return (error);
797 
798 	if (!IPC_FREE(&sp->shm_perm)) {
799 		/*
800 		 * A segment with the requested key exists.
801 		 */
802 		if (size > sp->shm_segsz) {
803 			mutex_exit(lock);
804 			return (EINVAL);
805 		}
806 	} else {
807 		/*
808 		 * A new segment should be created.
809 		 */
810 		size_t npages = btopr(size);
811 		size_t rsize = ptob(npages);
812 
813 		/*
814 		 * Check rsize and the per-project limit on shared
815 		 * memory.  Checking rsize handles both the size == 0
816 		 * case and the size < ULONG_MAX & PAGEMASK case (i.e.
817 		 * rounding up wraps a size_t).
818 		 */
819 		if (rsize == 0 || (rctl_test(rc_project_shmmax,
820 		    pp->p_task->tk_proj->kpj_rctls, pp, rsize,
821 		    RCA_SAFE) & RCT_DENY)) {
822 
823 			mutex_exit(&pp->p_lock);
824 			mutex_exit(lock);
825 			ipc_cleanup(shm_svc, (kipc_perm_t *)sp);
826 			return (EINVAL);
827 		}
828 		mutex_exit(&pp->p_lock);
829 		mutex_exit(lock);
830 
831 		if (anon_resv(rsize) == 0) {
832 			ipc_cleanup(shm_svc, (kipc_perm_t *)sp);
833 			return (ENOMEM);
834 		}
835 
836 		sp->shm_amp = anonmap_alloc(rsize, rsize);
837 
838 		/*
839 		 * Store the original user's requested size, in bytes,
840 		 * rather than the page-aligned size.  The former is
841 		 * used for IPC_STAT and shmget() lookups.  The latter
842 		 * is saved in the anon_map structure and is used for
843 		 * calls to the vm layer.
844 		 */
845 		sp->shm_segsz = size;
846 		sp->shm_atime = sp->shm_dtime = 0;
847 		sp->shm_ctime = gethrestime_sec();
848 		sp->shm_lpid = (pid_t)0;
849 		sp->shm_cpid = curproc->p_pid;
850 		sp->shm_ismattch = 0;
851 		sp->shm_sptinfo = NULL;
852 
853 		/*
854 		 * Check limits one last time, push id into global
855 		 * visibility, and update resource usage counts.
856 		 */
857 		if (error = ipc_commit_begin(shm_svc, key, shmflg,
858 		    (kipc_perm_t *)sp)) {
859 			if (error == EAGAIN)
860 				goto top;
861 			return (error);
862 		}
863 
864 		if (rctl_test(rc_project_shmmax,
865 		    sp->shm_perm.ipc_proj->kpj_rctls, pp, rsize,
866 		    RCA_SAFE) & RCT_DENY) {
867 			ipc_cleanup(shm_svc, (kipc_perm_t *)sp);
868 			return (EINVAL);
869 		}
870 		sp->shm_perm.ipc_proj->kpj_data.kpd_shmmax += rsize;
871 
872 		lock = ipc_commit_end(shm_svc, &sp->shm_perm);
873 	}
874 
875 #ifdef C2_AUDIT
876 	if (audit_active)
877 		audit_ipcget(AT_IPC_SHM, (void *)sp);
878 #endif
879 
880 	*rvp = (uintptr_t)(sp->shm_perm.ipc_id);
881 
882 	mutex_exit(lock);
883 	return (0);
884 }
885 
886 /*
887  * shmids system call.
888  */
889 static int
890 shmids(int *buf, uint_t nids, uint_t *pnids)
891 {
892 	return (ipc_ids(shm_svc, buf, nids, pnids));
893 }
894 
895 /*
896  * System entry point for shmat, shmctl, shmdt, and shmget system calls.
897  */
898 static uintptr_t
899 shmsys(int opcode, uintptr_t a0, uintptr_t a1, uintptr_t a2)
900 {
901 	int	error;
902 	uintptr_t r_val = 0;
903 
904 	switch (opcode) {
905 	case SHMAT:
906 		error = shmat((int)a0, (caddr_t)a1, (int)a2, &r_val);
907 		break;
908 	case SHMCTL:
909 		error = shmctl((int)a0, (int)a1, (void *)a2);
910 		break;
911 	case SHMDT:
912 		error = shmdt((caddr_t)a0);
913 		break;
914 	case SHMGET:
915 		error = shmget((key_t)a0, (size_t)a1, (int)a2, &r_val);
916 		break;
917 	case SHMIDS:
918 		error = shmids((int *)a0, (uint_t)a1, (uint_t *)a2);
919 		break;
920 	default:
921 		error = EINVAL;
922 		break;
923 	}
924 
925 	if (error)
926 		return ((uintptr_t)set_errno(error));
927 
928 	return (r_val);
929 }
930 
931 /*
932  * segacct_t comparator
933  * This works as expected, with one minor change: the first of two real
934  * segments with equal addresses is considered to be 'greater than' the
935  * second.  We only return equal when searching using a template, in
936  * which case we explicitly set the template segment's length to 0
937  * (which is invalid for a real segment).
938  */
939 static int
940 shm_sacompar(const void *x, const void *y)
941 {
942 	segacct_t *sa1 = (segacct_t *)x;
943 	segacct_t *sa2 = (segacct_t *)y;
944 
945 	if (sa1->sa_addr < sa2->sa_addr)
946 		return (-1);
947 	if (sa1->sa_addr > sa2->sa_addr)
948 		return (1);
949 	if ((sa1->sa_len == 0) || (sa2->sa_len == 0))
950 		return (0);
951 	return (1);
952 }
953 
954 /*
955  * add this record to the segacct list.
956  */
957 static void
958 sa_add(struct proc *pp, caddr_t addr, size_t len, ulong_t flags, kshmid_t *id)
959 {
960 	segacct_t *nsap;
961 	avl_tree_t *tree = NULL;
962 	avl_index_t where;
963 
964 	nsap = kmem_alloc(sizeof (segacct_t), KM_SLEEP);
965 	nsap->sa_addr = addr;
966 	nsap->sa_len  = len;
967 	nsap->sa_flags = flags;
968 	nsap->sa_id = id;
969 
970 	if (pp->p_segacct == NULL)
971 		tree = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
972 
973 	mutex_enter(&pp->p_lock);
974 	prbarrier(pp);			/* block /proc.  See shmgetid(). */
975 
976 	if (pp->p_segacct == NULL) {
977 		avl_create(tree, shm_sacompar, sizeof (segacct_t),
978 		    offsetof(segacct_t, sa_tree));
979 		pp->p_segacct = tree;
980 	} else if (tree) {
981 		kmem_free(tree, sizeof (avl_tree_t));
982 	}
983 
984 	/*
985 	 * We can ignore the result of avl_find, as the comparator will
986 	 * never return equal for segments with non-zero length.  This
987 	 * is a necessary hack to get around the fact that we do, in
988 	 * fact, have duplicate keys.
989 	 */
990 	(void) avl_find(pp->p_segacct, nsap, &where);
991 	avl_insert(pp->p_segacct, nsap, where);
992 
993 	mutex_exit(&pp->p_lock);
994 }
995 
996 /*
997  * Duplicate parent's segacct records in child.
998  */
999 void
1000 shmfork(struct proc *ppp, struct proc *cpp)
1001 {
1002 	segacct_t *sap;
1003 	kshmid_t *sp;
1004 	kmutex_t *mp;
1005 
1006 	ASSERT(ppp->p_segacct != NULL);
1007 
1008 	/*
1009 	 * We are the only lwp running in the parent so nobody can
1010 	 * mess with our p_segacct list.  Thus it is safe to traverse
1011 	 * the list without holding p_lock.  This is essential because
1012 	 * we can't hold p_lock during a KM_SLEEP allocation.
1013 	 */
1014 	for (sap = (segacct_t *)avl_first(ppp->p_segacct); sap != NULL;
1015 	    sap = (segacct_t *)AVL_NEXT(ppp->p_segacct, sap)) {
1016 		sa_add(cpp, sap->sa_addr, sap->sa_len, sap->sa_flags,
1017 		    sap->sa_id);
1018 		sp = sap->sa_id;
1019 		mp = ipc_lock(shm_svc, sp->shm_perm.ipc_id);
1020 		if (sap->sa_flags & SHMSA_ISM)
1021 			sp->shm_ismattch++;
1022 		ipc_hold(shm_svc, (kipc_perm_t *)sp);
1023 		mutex_exit(mp);
1024 	}
1025 }
1026 
1027 /*
1028  * Detach shared memory segments from exiting process.
1029  */
1030 void
1031 shmexit(struct proc *pp)
1032 {
1033 	segacct_t *sap;
1034 	avl_tree_t *tree;
1035 	void *cookie = NULL;
1036 
1037 	ASSERT(pp->p_segacct != NULL);
1038 
1039 	mutex_enter(&pp->p_lock);
1040 	prbarrier(pp);
1041 	tree = pp->p_segacct;
1042 	pp->p_segacct = NULL;
1043 	mutex_exit(&pp->p_lock);
1044 
1045 	while ((sap = avl_destroy_nodes(tree, &cookie)) != NULL)
1046 		(void) shm_detach(pp, sap);
1047 
1048 	avl_destroy(tree);
1049 	kmem_free(tree, sizeof (avl_tree_t));
1050 }
1051 
1052 /*
1053  * At this time pages should be in memory, so just lock them.
1054  */
1055 static void
1056 lock_again(size_t npages, struct anon_map *amp)
1057 {
1058 	struct anon *ap;
1059 	struct page *pp;
1060 	struct vnode *vp;
1061 	anoff_t off;
1062 	ulong_t anon_idx;
1063 	anon_sync_obj_t cookie;
1064 
1065 	ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
1066 
1067 	for (anon_idx = 0; npages != 0; anon_idx++, npages--) {
1068 
1069 		anon_array_enter(amp, anon_idx, &cookie);
1070 		ap = anon_get_ptr(amp->ahp, anon_idx);
1071 		swap_xlate(ap, &vp, &off);
1072 		anon_array_exit(&cookie);
1073 
1074 		pp = page_lookup(vp, (u_offset_t)off, SE_SHARED);
1075 		if (pp == NULL) {
1076 			panic("lock_again: page not in the system");
1077 			/*NOTREACHED*/
1078 		}
1079 		(void) page_pp_lock(pp, 0, 0);
1080 		page_unlock(pp);
1081 	}
1082 	ANON_LOCK_EXIT(&amp->a_rwlock);
1083 }
1084 
1085 /* check if this segment is already locked. */
1086 /*ARGSUSED*/
1087 static int
1088 check_locked(struct as *as, struct segvn_data *svd, size_t npages)
1089 {
1090 	struct vpage *vpp = svd->vpage;
1091 	size_t i;
1092 	if (svd->vpage == NULL)
1093 		return (0);		/* unlocked */
1094 
1095 	SEGVN_LOCK_ENTER(as, &svd->lock, RW_READER);
1096 	for (i = 0; i < npages; i++, vpp++) {
1097 		if (VPP_ISPPLOCK(vpp) == 0) {
1098 			SEGVN_LOCK_EXIT(as, &svd->lock);
1099 			return (1);	/* partially locked */
1100 		}
1101 	}
1102 	SEGVN_LOCK_EXIT(as, &svd->lock);
1103 	return (2);			/* locked */
1104 }
1105 
1106 
1107 /*
1108  * Attach the shared memory segment to the process
1109  * address space and lock the pages.
1110  */
1111 static int
1112 shmem_lock(struct anon_map *amp)
1113 {
1114 	size_t npages = btopr(amp->size);
1115 	struct seg *seg;
1116 	struct as *as;
1117 	struct segvn_crargs crargs;
1118 	struct segvn_data *svd;
1119 	proc_t *p = curproc;
1120 	caddr_t addr;
1121 	uint_t error, ret;
1122 	caddr_t seg_base;
1123 	size_t  seg_sz;
1124 
1125 	as = p->p_as;
1126 	AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
1127 	/* check if shared memory is already attached */
1128 	for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
1129 		svd = (struct segvn_data *)seg->s_data;
1130 		if ((seg->s_ops == &segvn_ops) && (svd->amp == amp) &&
1131 		    (amp->size == seg->s_size)) {
1132 			switch (ret = check_locked(as, svd, npages)) {
1133 			case 0:			/* unlocked */
1134 			case 1:			/* partially locked */
1135 				seg_base = seg->s_base;
1136 				seg_sz = seg->s_size;
1137 
1138 				AS_LOCK_EXIT(as, &as->a_lock);
1139 				if ((error = as_ctl(as, seg_base, seg_sz,
1140 					MC_LOCK, 0, 0, NULL, 0)) == 0)
1141 					lock_again(npages, amp);
1142 				(void) as_ctl(as, seg_base, seg_sz, MC_UNLOCK,
1143 					0, 0, NULL, NULL);
1144 				return (error);
1145 			case 2:			/* locked */
1146 				AS_LOCK_EXIT(as, &as->a_lock);
1147 				lock_again(npages, amp);
1148 				return (0);
1149 			default:
1150 				cmn_err(CE_WARN, "shmem_lock: deflt %d", ret);
1151 				break;
1152 			}
1153 		}
1154 	}
1155 	AS_LOCK_EXIT(as, &as->a_lock);
1156 
1157 	/* attach shm segment to our address space */
1158 	as_rangelock(as);
1159 	map_addr(&addr, amp->size, 0ll, 1, 0);
1160 	if (addr == NULL) {
1161 		as_rangeunlock(as);
1162 		return (ENOMEM);
1163 	}
1164 
1165 	/* Initialize the create arguments and map the segment */
1166 	crargs = *(struct segvn_crargs *)zfod_argsp;	/* structure copy */
1167 	crargs.offset = (u_offset_t)0;
1168 	crargs.type = MAP_SHARED;
1169 	crargs.amp = amp;
1170 	crargs.prot = PROT_ALL;
1171 	crargs.maxprot = crargs.prot;
1172 	crargs.flags = 0;
1173 
1174 	error = as_map(as, addr, amp->size, segvn_create, &crargs);
1175 	as_rangeunlock(as);
1176 	if (!error) {
1177 		if ((error = as_ctl(as, addr, amp->size, MC_LOCK, 0, 0,
1178 			NULL, 0)) == 0) {
1179 			lock_again(npages, amp);
1180 		}
1181 		(void) as_unmap(as, addr, amp->size);
1182 	}
1183 	return (error);
1184 }
1185 
1186 
1187 /*
1188  * Unlock shared memory
1189  */
1190 static void
1191 shmem_unlock(struct anon_map *amp, uint_t lck)
1192 {
1193 	struct anon *ap;
1194 	pgcnt_t npages = btopr(amp->size);
1195 	struct vnode *vp;
1196 	struct page *pp;
1197 	anoff_t off;
1198 	ulong_t anon_idx;
1199 
1200 	for (anon_idx = 0; anon_idx < npages; anon_idx++) {
1201 
1202 		if ((ap = anon_get_ptr(amp->ahp, anon_idx)) == NULL) {
1203 			if (lck) {
1204 				panic("shmem_unlock: null app");
1205 				/*NOTREACHED*/
1206 			}
1207 			continue;
1208 		}
1209 		swap_xlate(ap, &vp, &off);
1210 		pp = page_lookup(vp, off, SE_SHARED);
1211 		if (pp == NULL) {
1212 			if (lck) {
1213 				panic("shmem_unlock: page not in the system");
1214 				/*NOTREACHED*/
1215 			}
1216 			continue;
1217 		}
1218 		if (pp->p_lckcnt) {
1219 			page_pp_unlock(pp, 0, 0);
1220 		}
1221 		page_unlock(pp);
1222 	}
1223 }
1224 
1225 /*
1226  * We call this routine when we have removed all references to this
1227  * amp.  This means all shmdt()s and the IPC_RMID have been done.
1228  */
1229 static void
1230 shm_rm_amp(struct anon_map *amp, uint_t lckflag)
1231 {
1232 	/*
1233 	 * If we are finally deleting the
1234 	 * shared memory, and if no one did
1235 	 * the SHM_UNLOCK, we must do it now.
1236 	 */
1237 	shmem_unlock(amp, lckflag);
1238 
1239 	/*
1240 	 * Free up the anon_map.
1241 	 */
1242 	lgrp_shm_policy_fini(amp, NULL);
1243 	anon_free(amp->ahp, 0, amp->size);
1244 	anon_unresv(amp->swresv);
1245 	anonmap_free(amp);
1246 }
1247 
1248 /*
1249  * Return the shared memory id for the process's virtual address.
1250  * Return SHMID_NONE if addr is not within a SysV shared memory segment.
1251  * Return SHMID_FREE if addr's SysV shared memory segment's id has been freed.
1252  *
1253  * shmgetid() is called from code in /proc with the process locked but
1254  * with pp->p_lock not held.  The address space lock is held, so we
1255  * cannot grab pp->p_lock here due to lock-ordering constraints.
1256  * Because of all this, modifications to the p_segacct list must only
1257  * be made after calling prbarrier() to ensure the process is not locked.
1258  * See shmdt() and sa_add(), above. shmgetid() may also be called on a
1259  * thread's own process without the process locked.
1260  */
1261 int
1262 shmgetid(proc_t *pp, caddr_t addr)
1263 {
1264 	segacct_t *sap, template;
1265 
1266 	ASSERT(MUTEX_NOT_HELD(&pp->p_lock));
1267 	ASSERT((pp->p_proc_flag & P_PR_LOCK) || pp == curproc);
1268 
1269 	if (pp->p_segacct == NULL)
1270 		return (SHMID_NONE);
1271 
1272 	template.sa_addr = addr;
1273 	template.sa_len = 0;
1274 	if ((sap = avl_find(pp->p_segacct, &template, NULL)) == NULL)
1275 		return (SHMID_NONE);
1276 
1277 	if (IPC_FREE(&sap->sa_id->shm_perm))
1278 		return (SHMID_FREE);
1279 
1280 	return (sap->sa_id->shm_perm.ipc_id);
1281 }
1282