xref: /titanic_51/usr/src/uts/common/os/shm.c (revision c793af95640863cd29868fc7c419c5d2496b207b)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
27 /*	  All Rights Reserved	*/
28 
29 /*
30  * University Copyright- Copyright (c) 1982, 1986, 1988
31  * The Regents of the University of California
32  * All Rights Reserved
33  *
34  * University Acknowledgment- Portions of this document are derived from
35  * software developed by the University of California, Berkeley, and its
36  * contributors.
37  */
38 
39 #pragma ident	"%Z%%M%	%I%	%E% SMI"
40 
41 /*
42  * Inter-Process Communication Shared Memory Facility.
43  *
44  * See os/ipc.c for a description of common IPC functionality.
45  *
46  * Resource controls
47  * -----------------
48  *
49  * Control:      project.max-shm-ids (rc_project_shmmni)
50  * Description:  Maximum number of shared memory ids allowed a project.
51  *
52  *   When shmget() is used to allocate a shared memory segment, one id
53  *   is allocated.  If the id allocation doesn't succeed, shmget()
54  *   fails and errno is set to ENOSPC.  Upon successful shmctl(,
55  *   IPC_RMID) the id is deallocated.
56  *
57  * Control:      project.max-shm-memory (rc_project_shmmax)
58  * Description:  Total amount of shared memory allowed a project.
59  *
60  *   When shmget() is used to allocate a shared memory segment, the
61  *   segment's size is allocated against this limit.  If the space
62  *   allocation doesn't succeed, shmget() fails and errno is set to
63  *   EINVAL.  The size will be deallocated once the last process has
64  *   detached the segment and the segment has been successfully
65  *   shmctl(, IPC_RMID)ed.
66  */
67 
68 #include <sys/types.h>
69 #include <sys/param.h>
70 #include <sys/cred.h>
71 #include <sys/errno.h>
72 #include <sys/time.h>
73 #include <sys/kmem.h>
74 #include <sys/user.h>
75 #include <sys/proc.h>
76 #include <sys/systm.h>
77 #include <sys/prsystm.h>
78 #include <sys/sysmacros.h>
79 #include <sys/tuneable.h>
80 #include <sys/vm.h>
81 #include <sys/mman.h>
82 #include <sys/swap.h>
83 #include <sys/cmn_err.h>
84 #include <sys/debug.h>
85 #include <sys/lwpchan_impl.h>
86 #include <sys/avl.h>
87 #include <sys/modctl.h>
88 #include <sys/syscall.h>
89 #include <sys/task.h>
90 #include <sys/project.h>
91 #include <sys/policy.h>
92 #include <sys/zone.h>
93 
94 #include <sys/ipc.h>
95 #include <sys/ipc_impl.h>
96 #include <sys/shm.h>
97 #include <sys/shm_impl.h>
98 
99 #include <vm/hat.h>
100 #include <vm/seg.h>
101 #include <vm/as.h>
102 #include <vm/seg_vn.h>
103 #include <vm/anon.h>
104 #include <vm/page.h>
105 #include <vm/vpage.h>
106 #include <vm/seg_spt.h>
107 
108 #include <c2/audit.h>
109 
/*
 * Forward declarations for file-local helpers that are referenced before
 * their definitions.
 */
/* Lock the pages backing amp; used by shmctl(SHM_LOCK). */
static int shmem_lock(struct anon_map *amp);
/* Counterpart of shmem_lock(); used by shmctl(SHM_LOCK/SHM_UNLOCK). */
static void shmem_unlock(struct anon_map *amp, uint_t lck);
/* Record an attached range in the process's p_segacct tree. */
static void sa_add(struct proc *pp, caddr_t addr, size_t len, ulong_t flags,
	kshmid_t *id);
/* Release a segment's anon map; called from shm_dtor(). */
static void shm_rm_amp(struct anon_map *amp, uint_t lckflag);
/* Callbacks handed to ipcs_create() and zone_key_create() in _init(). */
static void shm_dtor(kipc_perm_t *);
static void shm_rmid(kipc_perm_t *);
static void shm_remove_zone(zoneid_t, void *);
118 
/*
 * Semantics for share_page_table and ism_off:
 *
 * These are hooks in /etc/system, intended for internal testing purposes
 * only.
 *
 * Setting share_page_table automatically turns on the SHM_SHARE_MMU (ISM) flag
 * in a call to shmat(2). In other words, with share_page_table set, you always
 * get ISM, even if, say, DISM is specified. It should really be called
 * "ism_on".
 *
 * Setting ism_off turns off the SHM_SHARE_MMU flag from the flags passed to
 * shmat(2).
 *
 * If both share_page_table and ism_off are set, share_page_table prevails
 * (shmat() applies ism_off first and share_page_table second).
 *
 * Although these tunables should probably be removed, they do have some
 * external exposure; as long as they exist, they should at least work sensibly.
 */

/* nonzero: shmat() clears SHM_PAGEABLE and forces SHM_SHARE_MMU on */
int share_page_table;
/* nonzero: shmat() strips SHM_SHARE_MMU from the caller's flags */
int ism_off;
139 
/*
 * The following tunables are obsolete.  For compatibility, shminfo_shmmax
 * and shminfo_shmmni are still read and interpreted (see os/project.c), but
 * the preferred mechanism for administering the IPC Shared Memory facility
 * is through the resource controls described at the top of this file.
 * Nothing in this file reads any of the four variables.
 */
size_t	shminfo_shmmax = 0x800000;	/* (obsolete) */
int	shminfo_shmmni = 100;		/* (obsolete) */
size_t	shminfo_shmmin = 1;		/* (obsolete) */
int	shminfo_shmseg = 6;		/* (obsolete) */
151 
/* Resource control handles; see "Resource controls" at the top of the file. */
extern rctl_hndl_t rc_project_shmmax;	/* project.max-shm-memory */
extern rctl_hndl_t rc_project_shmmni;	/* project.max-shm-ids */
/* IPC service handle for shared memory ids; set by ipcs_create() in _init(). */
static ipc_service_t *shm_svc;
/* Zone key registered in _init() with shm_remove_zone as a callback. */
static zone_key_t shm_zone_key;
156 
/*
 * Module linkage information for the kernel.
 */
static uintptr_t shmsys(int, uintptr_t, uintptr_t, uintptr_t);

/*
 * System call entry for the native data model.  The handler is shmsys(),
 * which takes four arguments; the leading 4 is presumably that argument
 * count (NOTE(review): confirm against struct sysent).  The return-value
 * flag differs depending on whether a 32-bit syscall implementation is
 * also compiled in.
 */
static struct sysent ipcshm_sysent = {
	4,
#ifdef	_SYSCALL32_IMPL
	SE_ARGC | SE_NOUNLOAD | SE_64RVAL,
#else	/* _SYSCALL32_IMPL */
	SE_ARGC | SE_NOUNLOAD | SE_32RVAL1,
#endif	/* _SYSCALL32_IMPL */
	(int (*)())shmsys
};
171 
#ifdef	_SYSCALL32_IMPL
/*
 * System call entry used for 32-bit callers when _SYSCALL32_IMPL is defined.
 * It dispatches to the same shmsys() handler as the native entry but always
 * uses the SE_32RVAL1 return-value flag.
 */
static struct sysent ipcshm_sysent32 = {
	4,
	SE_ARGC | SE_NOUNLOAD | SE_32RVAL1,
	(int (*)())shmsys
};
#endif	/* _SYSCALL32_IMPL */
179 
/* Module description binding the native sysent entry to mod_syscallops. */
static struct modlsys modlsys = {
	&mod_syscallops, "System V shared memory", &ipcshm_sysent
};

#ifdef	_SYSCALL32_IMPL
/* Same, for the 32-bit sysent entry and mod_syscallops32. */
static struct modlsys modlsys32 = {
	&mod_syscallops32, "32-bit System V shared memory", &ipcshm_sysent32
};
#endif	/* _SYSCALL32_IMPL */
189 
/*
 * NULL-terminated list of the module descriptions above; passed to
 * mod_install() in _init() and to mod_info() in _info().
 */
static struct modlinkage modlinkage = {
	MODREV_1,
	&modlsys,
#ifdef	_SYSCALL32_IMPL
	&modlsys32,
#endif
	NULL
};
198 
199 
200 int
201 _init(void)
202 {
203 	int result;
204 
205 	shm_svc = ipcs_create("shmids", rc_project_shmmni, sizeof (kshmid_t),
206 	    shm_dtor, shm_rmid, AT_IPC_SHM,
207 	    offsetof(kproject_data_t, kpd_shmmni));
208 	zone_key_create(&shm_zone_key, NULL, shm_remove_zone, NULL);
209 
210 	if ((result = mod_install(&modlinkage)) == 0)
211 		return (0);
212 
213 	(void) zone_key_delete(shm_zone_key);
214 	ipcs_destroy(shm_svc);
215 
216 	return (result);
217 }
218 
/*
 * Module unload entry point.  Unloading is never permitted: the request is
 * always refused with EBUSY.
 */
int
_fini(void)
{
	return (EBUSY);
}
224 
225 int
226 _info(struct modinfo *modinfop)
227 {
228 	return (mod_info(&modlinkage, modinfop));
229 }
230 
/*
 * Shmat (attach shared segment) system call.
 *
 * Maps the segment identified by shmid into the calling process's address
 * space, either as an ordinary shared segvn mapping or, when the (possibly
 * tunable-adjusted) flags ask for it, as an ISM/DISM mapping.
 *
 *	shmid	id of the segment to attach
 *	uaddr	requested attach address, or 0 to let the system choose
 *	uflags	caller's SHM_* flags; bits outside SHMAT_VALID_FLAGS_MASK
 *		are ignored
 *	rvp	on success, receives the address at which the segment was
 *		mapped
 *
 * Returns 0 on success or an errno value: EINVAL, EACCES or ENOMEM as set
 * below, or whatever ipcperm_access(), sptcreate() or as_map() return.
 *
 * The id lock returned by ipc_lookup() is held for the whole call and is
 * dropped at errret on every path.  A successful attach takes a hold on the
 * id (ipc_hold()) and records the mapped range with sa_add(); shm_detach()
 * later undoes both.
 */
static int
shmat(int shmid, caddr_t uaddr, int uflags, uintptr_t *rvp)
{
	kshmid_t *sp;	/* shared memory header ptr */
	size_t	size;
	int	error = 0;
	proc_t *pp = curproc;
	struct as *as = pp->p_as;
	struct segvn_crargs	crargs;	/* segvn create arguments */
	kmutex_t	*lock;
	struct seg 	*segspt = NULL;
	caddr_t		addr = uaddr;
	int		flags = (uflags & SHMAT_VALID_FLAGS_MASK);
	int		useISM;
	uchar_t		prot = PROT_ALL;
	int result;

	if ((lock = ipc_lookup(shm_svc, shmid, (kipc_perm_t **)&sp)) == NULL)
		return (EINVAL);
	/* Read access is always required; write access unless SHM_RDONLY. */
	if (error = ipcperm_access(&sp->shm_perm, SHM_R, CRED()))
		goto errret;
	if ((flags & SHM_RDONLY) == 0 &&
	    (error = ipcperm_access(&sp->shm_perm, SHM_W, CRED())))
		goto errret;
	if (spt_invalid(flags)) {
		error = EINVAL;
		goto errret;
	}
	/*
	 * Apply the /etc/system tunables.  share_page_table is applied last,
	 * so it wins when both are set.
	 */
	if (ism_off)
		flags = flags & ~SHM_SHARE_MMU;
	if (share_page_table) {
		flags = flags & ~SHM_PAGEABLE;
		flags = flags | SHM_SHARE_MMU;
	}
	useISM = (spt_locked(flags) || spt_pageable(flags));
	/* [D]ISM attaches need write access even if SHM_RDONLY was given. */
	if (useISM && (error = ipcperm_access(&sp->shm_perm, SHM_W, CRED())))
		goto errret;
	if (useISM && isspt(sp)) {
		uint_t newsptflags = flags | spt_flags(sp->shm_sptseg);
		/*
		 * If trying to change an existing {D}ISM segment from ISM
		 * to DISM or vice versa, return error. Note that this
		 * validation of flags needs to be done after the effect of
		 * tunables such as ism_off and share_page_table, for
		 * semantics that are consistent with the tunables' settings.
		 */
		if (spt_invalid(newsptflags)) {
			error = EINVAL;
			goto errret;
		}
	}
	/* The mapping length starts out as the anon map's size. */
	ANON_LOCK_ENTER(&sp->shm_amp->a_rwlock, RW_WRITER);
	size = sp->shm_amp->size;
	ANON_LOCK_EXIT(&sp->shm_amp->a_rwlock);

	/* somewhere to record spt info for final detach */
	if (sp->shm_sptinfo == NULL)
		sp->shm_sptinfo = kmem_zalloc(sizeof (sptinfo_t), KM_SLEEP);

	/*
	 * From here until as_rangeunlock(), every error exit must drop the
	 * range lock before jumping to errret.
	 */
	as_rangelock(as);

	if (useISM) {
		/*
		 * Handle ISM
		 */
		uint_t	n, share_szc;
		size_t	share_size;
		struct	shm_data ssd;
		uintptr_t align_hint;

		n = page_num_pagesizes();
		if (n < 2) { /* large pages aren't supported */
			as_rangeunlock(as);
			error = EINVAL;
			goto errret;
		}

		/*
		 * Pick a share pagesize to use, if (!isspt(sp)).
		 * Otherwise use the already chosen page size.
		 *
		 * For the initial shmat (!isspt(sp)), where sptcreate is
		 * called, map_pgsz is called to recommend a [D]ISM pagesize,
		 * important for systems which offer more than one potential
		 * [D]ISM pagesize.
		 * If the shmat is just to attach to an already created
		 * [D]ISM segment, then use the previously selected page size.
		 */
		if (!isspt(sp)) {
			share_size = map_pgsz(MAPPGSZ_ISM,
			    pp, addr, size, NULL);
			if (share_size == 0) {
				as_rangeunlock(as);
				error = EINVAL;
				goto errret;
			}
			share_szc = page_szc(share_size);
		} else {
			share_szc = sp->shm_sptseg->s_szc;
			share_size = page_get_pagesize(share_szc);
		}
		/* The mapping covers a whole number of share-sized pages. */
		size = P2ROUNDUP(size, share_size);

		align_hint = share_size;
#if defined(__i386) || defined(__amd64)
		/*
		 * For 64 bit amd64, we want to share an entire page table
		 * if possible. We know (ugh) that there are 512 entries
		 * in a page table. The number for 32 bit non-PAE should be
		 * 1024, but I'm not going to special case that. Note using 512
		 * won't cause a failure below. It retries with align_hint set
		 * to share_size
		 */
		while (size >= 512 * (uint64_t)align_hint)
			align_hint *= 512;
#endif /* __i386 || __amd64 */

#if defined(__sparcv9)
		if (addr == 0 && curproc->p_model == DATAMODEL_LP64) {
			/*
			 * If no address has been passed in, and this is a
			 * 64-bit process, we'll try to find an address
			 * in the predict-ISM zone.
			 */
			caddr_t predbase = (caddr_t)PREDISM_1T_BASE;
			size_t len = PREDISM_BOUND - PREDISM_1T_BASE;

			as_purge(as);
			if (as_gap(as, size + share_size, &predbase, &len,
			    AH_LO, (caddr_t)NULL) != -1) {
				/*
				 * We found an address which looks like a
				 * candidate.  We want to round it up, and
				 * then check that it's a valid user range.
				 * This assures that we won't fail below.
				 */
				addr = (caddr_t)P2ROUNDUP((uintptr_t)predbase,
				    share_size);

				if (valid_usr_range(addr, size, prot,
				    as, as->a_userlimit) != RANGE_OKAY) {
					addr = 0;
				}
			}
		}
#endif /* __sparcv9 */

		if (addr == 0) {
			/*
			 * Let the system pick an address.  Try the larger
			 * alignment first; if that fails, retry once with
			 * plain share_size alignment before giving up.
			 */
			for (;;) {
				addr = (caddr_t)align_hint;
				map_addr(&addr, size, 0ll, 1, MAP_ALIGN);
				if (addr != NULL || align_hint == share_size)
					break;
				align_hint = share_size;
			}
			if (addr == NULL) {
				as_rangeunlock(as);
				error = ENOMEM;
				goto errret;
			}
			ASSERT(((uintptr_t)addr & (align_hint - 1)) == 0);
		} else {
			/* Use the user-supplied attach address */
			caddr_t base;
			size_t len;

			/*
			 * Check that the address range
			 *  1) is properly aligned
			 *  2) is correct in unix terms
			 *  3) is within an unmapped address segment
			 */
			base = addr;
			len = size;		/* use spt aligned size */
			/* XXX - in SunOS, is sp->shm_segsz */
			if ((uintptr_t)base & (share_size - 1)) {
				error = EINVAL;
				as_rangeunlock(as);
				goto errret;
			}
			result = valid_usr_range(base, len, prot, as,
			    as->a_userlimit);
			if (result == RANGE_BADPROT) {
				/*
				 * We try to accommodate processors which
				 * may not support execute permissions on
				 * all ISM segments by trying the check
				 * again but without PROT_EXEC.
				 */
				prot &= ~PROT_EXEC;
				result = valid_usr_range(base, len, prot, as,
				    as->a_userlimit);
			}
			as_purge(as);
			if (result != RANGE_OKAY ||
			    as_gap(as, len, &base, &len, AH_LO,
			    (caddr_t)NULL) != 0) {
				error = EINVAL;
				as_rangeunlock(as);
				goto errret;
			}
		}

		if (!isspt(sp)) {
			/*
			 * First [D]ISM attach of this segment: create the
			 * spt segment and remember it, its address space and
			 * its protections in the id.
			 */
			error = sptcreate(size, &segspt, sp->shm_amp, prot,
			    flags, share_szc);
			if (error) {
				as_rangeunlock(as);
				goto errret;
			}
			sp->shm_sptinfo->sptas = segspt->s_as;
			sp->shm_sptseg = segspt;
			sp->shm_sptprot = prot;
			sp->shm_lkcnt = 0;
		} else if ((prot & sp->shm_sptprot) != sp->shm_sptprot) {
			/*
			 * Ensure we're attaching to an ISM segment with
			 * fewer or equal permissions than what we're
			 * allowed.  Fail if the segment has more
			 * permissions than what we're allowed.
			 */
			error = EACCES;
			as_rangeunlock(as);
			goto errret;
		}

		/* Map the existing spt segment into this address space. */
		ssd.shm_sptseg = sp->shm_sptseg;
		ssd.shm_sptas = sp->shm_sptinfo->sptas;
		ssd.shm_amp = sp->shm_amp;
		error = as_map(as, addr, size, segspt_shmattach, &ssd);
		if (error == 0)
			sp->shm_ismattch++; /* keep count of ISM attaches */
	} else {

		/*
		 * Normal case.
		 */
		if (flags & SHM_RDONLY)
			prot &= ~PROT_WRITE;

		if (addr == 0) {
			/* Let the system pick the attach address */
			map_addr(&addr, size, 0ll, 1, 0);
			if (addr == NULL) {
				as_rangeunlock(as);
				error = ENOMEM;
				goto errret;
			}
		} else {
			/* Use the user-supplied attach address */
			caddr_t base;
			size_t len;

			/* SHM_RND: round the address down to SHMLBA. */
			if (flags & SHM_RND)
				addr = (caddr_t)((uintptr_t)addr &
				    ~(SHMLBA - 1));
			/*
			 * Check that the address range
			 *  1) is properly aligned
			 *  2) is correct in unix terms
			 *  3) is within an unmapped address segment
			 */
			base = addr;
			len = size;		/* use aligned size */
			/* XXX - in SunOS, is sp->shm_segsz */
			if ((uintptr_t)base & PAGEOFFSET) {
				error = EINVAL;
				as_rangeunlock(as);
				goto errret;
			}
			result = valid_usr_range(base, len, prot, as,
			    as->a_userlimit);
			if (result == RANGE_BADPROT) {
				/* Retry without PROT_EXEC, as for ISM. */
				prot &= ~PROT_EXEC;
				result = valid_usr_range(base, len, prot, as,
				    as->a_userlimit);
			}
			as_purge(as);
			if (result != RANGE_OKAY ||
			    as_gap(as, len, &base, &len,
			    AH_LO, (caddr_t)NULL) != 0) {
				error = EINVAL;
				as_rangeunlock(as);
				goto errret;
			}
		}

		/* Initialize the create arguments and map the segment */
		crargs = *(struct segvn_crargs *)zfod_argsp;
		crargs.offset = 0;
		crargs.type = MAP_SHARED;
		crargs.amp = sp->shm_amp;
		crargs.prot = prot;
		crargs.maxprot = crargs.prot;
		crargs.flags = 0;

		error = as_map(as, addr, size, segvn_create, &crargs);
	}

	as_rangeunlock(as);
	if (error)
		goto errret;

	/* record shmem range for the detach */
	sa_add(pp, addr, (size_t)size, useISM ? SHMSA_ISM : 0, sp);
	*rvp = (uintptr_t)addr;

	sp->shm_atime = gethrestime_sec();
	sp->shm_lpid = pp->p_pid;
	/* One hold per attach; shm_detach() drops it with ipc_rele(). */
	ipc_hold(shm_svc, (kipc_perm_t *)sp);
errret:
	mutex_exit(lock);
	return (error);
}
548 
549 static void
550 shm_dtor(kipc_perm_t *perm)
551 {
552 	kshmid_t *sp = (kshmid_t *)perm;
553 	uint_t cnt;
554 
555 	if (sp->shm_sptinfo) {
556 		if (isspt(sp))
557 			sptdestroy(sp->shm_sptinfo->sptas, sp->shm_amp);
558 		kmem_free(sp->shm_sptinfo, sizeof (sptinfo_t));
559 	}
560 
561 	ANON_LOCK_ENTER(&sp->shm_amp->a_rwlock, RW_WRITER);
562 	cnt = --sp->shm_amp->refcnt;
563 	ANON_LOCK_EXIT(&sp->shm_amp->a_rwlock);
564 	ASSERT(cnt == 0);
565 	shm_rm_amp(sp->shm_amp, sp->shm_lkcnt);
566 
567 	if (sp->shm_perm.ipc_id != IPC_ID_INVAL) {
568 		ipcs_lock(shm_svc);
569 		sp->shm_perm.ipc_proj->kpj_data.kpd_shmmax -=
570 		    ptob(btopr(sp->shm_segsz));
571 		ipcs_unlock(shm_svc);
572 	}
573 }
574 
575 /* ARGSUSED */
576 static void
577 shm_rmid(kipc_perm_t *perm)
578 {
579 	/* nothing to do */
580 }
581 
582 /*
583  * Shmctl system call.
584  */
585 /* ARGSUSED */
586 static int
587 shmctl(int shmid, int cmd, void *arg)
588 {
589 	kshmid_t		*sp;	/* shared memory header ptr */
590 	STRUCT_DECL(shmid_ds, ds);	/* for SVR4 IPC_SET */
591 	int			error = 0;
592 	struct cred 		*cr = CRED();
593 	kmutex_t		*lock;
594 	model_t			mdl = get_udatamodel();
595 	struct shmid_ds64	ds64;
596 	shmatt_t		nattch;
597 
598 	STRUCT_INIT(ds, mdl);
599 
600 	/*
601 	 * Perform pre- or non-lookup actions (e.g. copyins, RMID).
602 	 */
603 	switch (cmd) {
604 	case IPC_SET:
605 		if (copyin(arg, STRUCT_BUF(ds), STRUCT_SIZE(ds)))
606 			return (EFAULT);
607 		break;
608 
609 	case IPC_SET64:
610 		if (copyin(arg, &ds64, sizeof (struct shmid_ds64)))
611 			return (EFAULT);
612 		break;
613 
614 	case IPC_RMID:
615 		return (ipc_rmid(shm_svc, shmid, cr));
616 	}
617 
618 	if ((lock = ipc_lookup(shm_svc, shmid, (kipc_perm_t **)&sp)) == NULL)
619 		return (EINVAL);
620 
621 	switch (cmd) {
622 	/* Set ownership and permissions. */
623 	case IPC_SET:
624 		if (error = ipcperm_set(shm_svc, cr, &sp->shm_perm,
625 		    &STRUCT_BUF(ds)->shm_perm, mdl))
626 				break;
627 		sp->shm_ctime = gethrestime_sec();
628 		break;
629 
630 	case IPC_STAT:
631 		if (error = ipcperm_access(&sp->shm_perm, SHM_R, cr))
632 			break;
633 
634 		nattch = sp->shm_perm.ipc_ref - 1;
635 
636 		ipcperm_stat(&STRUCT_BUF(ds)->shm_perm, &sp->shm_perm, mdl);
637 		STRUCT_FSET(ds, shm_segsz, sp->shm_segsz);
638 		STRUCT_FSETP(ds, shm_amp, NULL);	/* kernel addr */
639 		STRUCT_FSET(ds, shm_lkcnt, sp->shm_lkcnt);
640 		STRUCT_FSET(ds, shm_lpid, sp->shm_lpid);
641 		STRUCT_FSET(ds, shm_cpid, sp->shm_cpid);
642 		STRUCT_FSET(ds, shm_nattch, nattch);
643 		STRUCT_FSET(ds, shm_cnattch, sp->shm_ismattch);
644 		STRUCT_FSET(ds, shm_atime, sp->shm_atime);
645 		STRUCT_FSET(ds, shm_dtime, sp->shm_dtime);
646 		STRUCT_FSET(ds, shm_ctime, sp->shm_ctime);
647 
648 		mutex_exit(lock);
649 		if (copyout(STRUCT_BUF(ds), arg, STRUCT_SIZE(ds)))
650 			return (EFAULT);
651 
652 		return (0);
653 
654 	case IPC_SET64:
655 		if (error = ipcperm_set64(shm_svc, cr,
656 		    &sp->shm_perm, &ds64.shmx_perm))
657 			break;
658 		sp->shm_ctime = gethrestime_sec();
659 		break;
660 
661 	case IPC_STAT64:
662 		nattch = sp->shm_perm.ipc_ref - 1;
663 
664 		ipcperm_stat64(&ds64.shmx_perm, &sp->shm_perm);
665 		ds64.shmx_segsz = sp->shm_segsz;
666 		ds64.shmx_lkcnt = sp->shm_lkcnt;
667 		ds64.shmx_lpid = sp->shm_lpid;
668 		ds64.shmx_cpid = sp->shm_cpid;
669 		ds64.shmx_nattch = nattch;
670 		ds64.shmx_cnattch = sp->shm_ismattch;
671 		ds64.shmx_atime = sp->shm_atime;
672 		ds64.shmx_dtime = sp->shm_dtime;
673 		ds64.shmx_ctime = sp->shm_ctime;
674 
675 		mutex_exit(lock);
676 		if (copyout(&ds64, arg, sizeof (struct shmid_ds64)))
677 			return (EFAULT);
678 
679 		return (0);
680 
681 	/* Lock segment in memory */
682 	case SHM_LOCK:
683 		if ((error = secpolicy_lock_memory(cr)) != 0)
684 			break;
685 
686 		if (!isspt(sp) && (sp->shm_lkcnt++ == 0)) {
687 			if (error = shmem_lock(sp->shm_amp)) {
688 			    ANON_LOCK_ENTER(&sp->shm_amp->a_rwlock, RW_WRITER);
689 			    cmn_err(CE_NOTE,
690 				"shmctl - couldn't lock %ld pages into memory",
691 				sp->shm_amp->size);
692 			    ANON_LOCK_EXIT(&sp->shm_amp->a_rwlock);
693 			    error = ENOMEM;
694 			    sp->shm_lkcnt--;
695 			    shmem_unlock(sp->shm_amp, 0);
696 			}
697 		}
698 		break;
699 
700 	/* Unlock segment */
701 	case SHM_UNLOCK:
702 		if ((error = secpolicy_lock_memory(cr)) != 0)
703 			break;
704 
705 		if (!isspt(sp)) {
706 			if (sp->shm_lkcnt && (--sp->shm_lkcnt == 0)) {
707 				shmem_unlock(sp->shm_amp, 1);
708 			}
709 		}
710 		break;
711 
712 	default:
713 		error = EINVAL;
714 		break;
715 	}
716 	mutex_exit(lock);
717 	return (error);
718 }
719 
720 static void
721 shm_detach(proc_t *pp, segacct_t *sap)
722 {
723 	kshmid_t	*sp = sap->sa_id;
724 	size_t		len = sap->sa_len;
725 	caddr_t		addr = sap->sa_addr;
726 
727 	/*
728 	 * Discard lwpchan mappings.
729 	 */
730 	if (pp->p_lcp != NULL)
731 		lwpchan_delete_mapping(pp, addr, addr + len);
732 	(void) as_unmap(pp->p_as, addr, len);
733 
734 	/*
735 	 * Perform some detach-time accounting.
736 	 */
737 	(void) ipc_lock(shm_svc, sp->shm_perm.ipc_id);
738 	if (sap->sa_flags & SHMSA_ISM)
739 		sp->shm_ismattch--;
740 	sp->shm_dtime = gethrestime_sec();
741 	sp->shm_lpid = pp->p_pid;
742 	ipc_rele(shm_svc, (kipc_perm_t *)sp);	/* Drops lock */
743 
744 	kmem_free(sap, sizeof (segacct_t));
745 }
746 
747 static int
748 shmdt(caddr_t addr)
749 {
750 	proc_t *pp = curproc;
751 	segacct_t *sap, template;
752 
753 	mutex_enter(&pp->p_lock);
754 	prbarrier(pp);			/* block /proc.  See shmgetid(). */
755 
756 	template.sa_addr = addr;
757 	template.sa_len = 0;
758 	if ((pp->p_segacct == NULL) ||
759 	    ((sap = avl_find(pp->p_segacct, &template, NULL)) == NULL)) {
760 		mutex_exit(&pp->p_lock);
761 		return (EINVAL);
762 	}
763 	if (sap->sa_addr != addr) {
764 		mutex_exit(&pp->p_lock);
765 		return (EINVAL);
766 	}
767 	avl_remove(pp->p_segacct, sap);
768 	mutex_exit(&pp->p_lock);
769 
770 	shm_detach(pp, sap);
771 
772 	return (0);
773 }
774 
775 /*
776  * Remove all shared memory segments associated with a given zone.
777  * Called by zone_shutdown when the zone is halted.
778  */
779 /*ARGSUSED1*/
780 static void
781 shm_remove_zone(zoneid_t zoneid, void *arg)
782 {
783 	ipc_remove_zone(shm_svc, zoneid);
784 }
785 
/*
 * Shmget (create new shmem) system call.
 *
 * Looks up the segment for key; when ipc_get() hands back a free (not yet
 * committed) id, a new segment of the requested size is created and
 * committed.
 *
 *	key	IPC key to look up or create
 *	size	requested segment size in bytes
 *	shmflg	flags passed through to ipc_get() and ipc_commit_begin()
 *	rvp	on success, receives the segment's id
 *
 * Returns 0 on success or an errno value: EINVAL if an existing segment is
 * smaller than size, if size is 0 or wraps when rounded up to a page, or if
 * the project's shared memory limit would be exceeded; ENOMEM if anon space
 * cannot be reserved; or whatever ipc_get() or ipc_commit_begin() return.
 */
static int
shmget(key_t key, size_t size, int shmflg, uintptr_t *rvp)
{
	proc_t		*pp = curproc;
	kshmid_t	*sp;
	kmutex_t	*lock;
	int		error;

top:
	if (error = ipc_get(shm_svc, key, shmflg, (kipc_perm_t **)&sp, &lock))
		return (error);

	if (!IPC_FREE(&sp->shm_perm)) {
		/*
		 * A segment with the requested key exists.
		 */
		if (size > sp->shm_segsz) {
			mutex_exit(lock);
			return (EINVAL);
		}
	} else {
		/*
		 * A new segment should be created.
		 */
		size_t npages = btopr(size);
		size_t rsize = ptob(npages);

		/*
		 * Check rsize and the per-project limit on shared
		 * memory.  Checking rsize handles both the size == 0
		 * case and the size > ULONG_MAX & PAGEMASK case (i.e.
		 * rounding up wraps a size_t).
		 *
		 * NOTE(review): pp->p_lock is released below without being
		 * acquired in this function, so ipc_get() presumably returns
		 * with it held in this case - confirm against os/ipc.c.
		 */
		if (rsize == 0 || (rctl_test(rc_project_shmmax,
		    pp->p_task->tk_proj->kpj_rctls, pp, rsize,
		    RCA_SAFE) & RCT_DENY)) {

			mutex_exit(&pp->p_lock);
			mutex_exit(lock);
			ipc_cleanup(shm_svc, (kipc_perm_t *)sp);
			return (EINVAL);
		}
		mutex_exit(&pp->p_lock);
		mutex_exit(lock);

		/* Both locks are dropped for the reservation and allocation. */
		if (anon_resv(rsize) == 0) {
			ipc_cleanup(shm_svc, (kipc_perm_t *)sp);
			return (ENOMEM);
		}

		sp->shm_amp = anonmap_alloc(rsize, rsize);

		/*
		 * Store the original user's requested size, in bytes,
		 * rather than the page-aligned size.  The former is
		 * used for IPC_STAT and shmget() lookups.  The latter
		 * is saved in the anon_map structure and is used for
		 * calls to the vm layer.
		 */
		sp->shm_segsz = size;
		sp->shm_atime = sp->shm_dtime = 0;
		sp->shm_ctime = gethrestime_sec();
		sp->shm_lpid = (pid_t)0;
		sp->shm_cpid = curproc->p_pid;
		sp->shm_ismattch = 0;
		sp->shm_sptinfo = NULL;

		/*
		 * Check limits one last time, push id into global
		 * visibility, and update resource usage counts.
		 *
		 * EAGAIN from ipc_commit_begin() restarts the whole lookup.
		 * NOTE(review): no cleanup is done here on failure, so
		 * ipc_commit_begin() presumably cleans up itself - confirm.
		 */
		if (error = ipc_commit_begin(shm_svc, key, shmflg,
		    (kipc_perm_t *)sp)) {
			if (error == EAGAIN)
				goto top;
			return (error);
		}

		/*
		 * Re-test against the project now recorded in the id, since
		 * the locks were dropped after the first test.
		 */
		if (rctl_test(rc_project_shmmax,
		    sp->shm_perm.ipc_proj->kpj_rctls, pp, rsize,
		    RCA_SAFE) & RCT_DENY) {
			ipc_cleanup(shm_svc, (kipc_perm_t *)sp);
			return (EINVAL);
		}
		/* Charge the page-rounded size; shm_dtor() subtracts it. */
		sp->shm_perm.ipc_proj->kpj_data.kpd_shmmax += rsize;

		lock = ipc_commit_end(shm_svc, &sp->shm_perm);
	}

#ifdef C2_AUDIT
	if (audit_active)
		audit_ipcget(AT_IPC_SHM, (void *)sp);
#endif

	*rvp = (uintptr_t)(sp->shm_perm.ipc_id);

	mutex_exit(lock);
	return (0);
}
888 
889 /*
890  * shmids system call.
891  */
892 static int
893 shmids(int *buf, uint_t nids, uint_t *pnids)
894 {
895 	return (ipc_ids(shm_svc, buf, nids, pnids));
896 }
897 
898 /*
899  * System entry point for shmat, shmctl, shmdt, and shmget system calls.
900  */
901 static uintptr_t
902 shmsys(int opcode, uintptr_t a0, uintptr_t a1, uintptr_t a2)
903 {
904 	int	error;
905 	uintptr_t r_val = 0;
906 
907 	switch (opcode) {
908 	case SHMAT:
909 		error = shmat((int)a0, (caddr_t)a1, (int)a2, &r_val);
910 		break;
911 	case SHMCTL:
912 		error = shmctl((int)a0, (int)a1, (void *)a2);
913 		break;
914 	case SHMDT:
915 		error = shmdt((caddr_t)a0);
916 		break;
917 	case SHMGET:
918 		error = shmget((key_t)a0, (size_t)a1, (int)a2, &r_val);
919 		break;
920 	case SHMIDS:
921 		error = shmids((int *)a0, (uint_t)a1, (uint_t *)a2);
922 		break;
923 	default:
924 		error = EINVAL;
925 		break;
926 	}
927 
928 	if (error)
929 		return ((uintptr_t)set_errno(error));
930 
931 	return (r_val);
932 }
933 
934 /*
935  * segacct_t comparator
936  * This works as expected, with one minor change: the first of two real
937  * segments with equal addresses is considered to be 'greater than' the
938  * second.  We only return equal when searching using a template, in
939  * which case we explicitly set the template segment's length to 0
940  * (which is invalid for a real segment).
941  */
942 static int
943 shm_sacompar(const void *x, const void *y)
944 {
945 	segacct_t *sa1 = (segacct_t *)x;
946 	segacct_t *sa2 = (segacct_t *)y;
947 
948 	if (sa1->sa_addr < sa2->sa_addr) {
949 		return (-1);
950 	} else if (sa2->sa_len != 0) {
951 		if (sa1->sa_addr >= sa2->sa_addr + sa2->sa_len) {
952 			return (1);
953 		} else if (sa1->sa_len != 0) {
954 			return (1);
955 		} else {
956 			return (0);
957 		}
958 	} else if (sa1->sa_addr > sa2->sa_addr) {
959 		return (1);
960 	} else {
961 		return (0);
962 	}
963 }
964 
965 /*
966  * add this record to the segacct list.
967  */
968 static void
969 sa_add(struct proc *pp, caddr_t addr, size_t len, ulong_t flags, kshmid_t *id)
970 {
971 	segacct_t *nsap;
972 	avl_tree_t *tree = NULL;
973 	avl_index_t where;
974 
975 	nsap = kmem_alloc(sizeof (segacct_t), KM_SLEEP);
976 	nsap->sa_addr = addr;
977 	nsap->sa_len  = len;
978 	nsap->sa_flags = flags;
979 	nsap->sa_id = id;
980 
981 	if (pp->p_segacct == NULL)
982 		tree = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
983 
984 	mutex_enter(&pp->p_lock);
985 	prbarrier(pp);			/* block /proc.  See shmgetid(). */
986 
987 	if (pp->p_segacct == NULL) {
988 		avl_create(tree, shm_sacompar, sizeof (segacct_t),
989 		    offsetof(segacct_t, sa_tree));
990 		pp->p_segacct = tree;
991 	} else if (tree) {
992 		kmem_free(tree, sizeof (avl_tree_t));
993 	}
994 
995 	/*
996 	 * We can ignore the result of avl_find, as the comparator will
997 	 * never return equal for segments with non-zero length.  This
998 	 * is a necessary hack to get around the fact that we do, in
999 	 * fact, have duplicate keys.
1000 	 */
1001 	(void) avl_find(pp->p_segacct, nsap, &where);
1002 	avl_insert(pp->p_segacct, nsap, where);
1003 
1004 	mutex_exit(&pp->p_lock);
1005 }
1006 
1007 /*
1008  * Duplicate parent's segacct records in child.
1009  */
1010 void
1011 shmfork(struct proc *ppp, struct proc *cpp)
1012 {
1013 	segacct_t *sap;
1014 	kshmid_t *sp;
1015 	kmutex_t *mp;
1016 
1017 	ASSERT(ppp->p_segacct != NULL);
1018 
1019 	/*
1020 	 * We are the only lwp running in the parent so nobody can
1021 	 * mess with our p_segacct list.  Thus it is safe to traverse
1022 	 * the list without holding p_lock.  This is essential because
1023 	 * we can't hold p_lock during a KM_SLEEP allocation.
1024 	 */
1025 	for (sap = (segacct_t *)avl_first(ppp->p_segacct); sap != NULL;
1026 	    sap = (segacct_t *)AVL_NEXT(ppp->p_segacct, sap)) {
1027 		sa_add(cpp, sap->sa_addr, sap->sa_len, sap->sa_flags,
1028 		    sap->sa_id);
1029 		sp = sap->sa_id;
1030 		mp = ipc_lock(shm_svc, sp->shm_perm.ipc_id);
1031 		if (sap->sa_flags & SHMSA_ISM)
1032 			sp->shm_ismattch++;
1033 		ipc_hold(shm_svc, (kipc_perm_t *)sp);
1034 		mutex_exit(mp);
1035 	}
1036 }
1037 
1038 /*
1039  * Detach shared memory segments from exiting process.
1040  */
1041 void
1042 shmexit(struct proc *pp)
1043 {
1044 	segacct_t *sap;
1045 	avl_tree_t *tree;
1046 	void *cookie = NULL;
1047 
1048 	ASSERT(pp->p_segacct != NULL);
1049 
1050 	mutex_enter(&pp->p_lock);
1051 	prbarrier(pp);
1052 	tree = pp->p_segacct;
1053 	pp->p_segacct = NULL;
1054 	mutex_exit(&pp->p_lock);
1055 
1056 	while ((sap = avl_destroy_nodes(tree, &cookie)) != NULL)
1057 		(void) shm_detach(pp, sap);
1058 
1059 	avl_destroy(tree);
1060 	kmem_free(tree, sizeof (avl_tree_t));
1061 }
1062 
1063 /*
1064  * At this time pages should be in memory, so just lock them.
1065  */
1066 static void
1067 lock_again(size_t npages, struct anon_map *amp)
1068 {
1069 	struct anon *ap;
1070 	struct page *pp;
1071 	struct vnode *vp;
1072 	anoff_t off;
1073 	ulong_t anon_idx;
1074 	anon_sync_obj_t cookie;
1075 
1076 	ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
1077 
1078 	for (anon_idx = 0; npages != 0; anon_idx++, npages--) {
1079 
1080 		anon_array_enter(amp, anon_idx, &cookie);
1081 		ap = anon_get_ptr(amp->ahp, anon_idx);
1082 		swap_xlate(ap, &vp, &off);
1083 		anon_array_exit(&cookie);
1084 
1085 		pp = page_lookup(vp, (u_offset_t)off, SE_SHARED);
1086 		if (pp == NULL) {
1087 			panic("lock_again: page not in the system");
1088 			/*NOTREACHED*/
1089 		}
1090 		(void) page_pp_lock(pp, 0, 0);
1091 		page_unlock(pp);
1092 	}
1093 	ANON_LOCK_EXIT(&amp->a_rwlock);
1094 }
1095 
1096 /* check if this segment is already locked. */
1097 /*ARGSUSED*/
1098 static int
1099 check_locked(struct as *as, struct segvn_data *svd, size_t npages)
1100 {
1101 	struct vpage *vpp = svd->vpage;
1102 	size_t i;
1103 	if (svd->vpage == NULL)
1104 		return (0);		/* unlocked */
1105 
1106 	SEGVN_LOCK_ENTER(as, &svd->lock, RW_READER);
1107 	for (i = 0; i < npages; i++, vpp++) {
1108 		if (VPP_ISPPLOCK(vpp) == 0) {
1109 			SEGVN_LOCK_EXIT(as, &svd->lock);
1110 			return (1);	/* partially locked */
1111 		}
1112 	}
1113 	SEGVN_LOCK_EXIT(as, &svd->lock);
1114 	return (2);			/* locked */
1115 }
1116 
1117 
1118 /*
1119  * Attach the shared memory segment to the process
1120  * address space and lock the pages.
1121  */
1122 static int
1123 shmem_lock(struct anon_map *amp)
1124 {
1125 	size_t npages = btopr(amp->size);
1126 	struct seg *seg;
1127 	struct as *as;
1128 	struct segvn_crargs crargs;
1129 	struct segvn_data *svd;
1130 	proc_t *p = curproc;
1131 	caddr_t addr;
1132 	uint_t error, ret;
1133 	caddr_t seg_base;
1134 	size_t  seg_sz;
1135 
1136 	as = p->p_as;
1137 	AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
1138 	/* check if shared memory is already attached */
1139 	for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
1140 		svd = (struct segvn_data *)seg->s_data;
1141 		if ((seg->s_ops == &segvn_ops) && (svd->amp == amp) &&
1142 		    (amp->size == seg->s_size)) {
1143 			switch (ret = check_locked(as, svd, npages)) {
1144 			case 0:			/* unlocked */
1145 			case 1:			/* partially locked */
1146 				seg_base = seg->s_base;
1147 				seg_sz = seg->s_size;
1148 
1149 				AS_LOCK_EXIT(as, &as->a_lock);
1150 				if ((error = as_ctl(as, seg_base, seg_sz,
1151 					MC_LOCK, 0, 0, NULL, 0)) == 0)
1152 					lock_again(npages, amp);
1153 				(void) as_ctl(as, seg_base, seg_sz, MC_UNLOCK,
1154 					0, 0, NULL, NULL);
1155 				return (error);
1156 			case 2:			/* locked */
1157 				AS_LOCK_EXIT(as, &as->a_lock);
1158 				lock_again(npages, amp);
1159 				return (0);
1160 			default:
1161 				cmn_err(CE_WARN, "shmem_lock: deflt %d", ret);
1162 				break;
1163 			}
1164 		}
1165 	}
1166 	AS_LOCK_EXIT(as, &as->a_lock);
1167 
1168 	/* attach shm segment to our address space */
1169 	as_rangelock(as);
1170 	map_addr(&addr, amp->size, 0ll, 1, 0);
1171 	if (addr == NULL) {
1172 		as_rangeunlock(as);
1173 		return (ENOMEM);
1174 	}
1175 
1176 	/* Initialize the create arguments and map the segment */
1177 	crargs = *(struct segvn_crargs *)zfod_argsp;	/* structure copy */
1178 	crargs.offset = (u_offset_t)0;
1179 	crargs.type = MAP_SHARED;
1180 	crargs.amp = amp;
1181 	crargs.prot = PROT_ALL;
1182 	crargs.maxprot = crargs.prot;
1183 	crargs.flags = 0;
1184 
1185 	error = as_map(as, addr, amp->size, segvn_create, &crargs);
1186 	as_rangeunlock(as);
1187 	if (!error) {
1188 		if ((error = as_ctl(as, addr, amp->size, MC_LOCK, 0, 0,
1189 			NULL, 0)) == 0) {
1190 			lock_again(npages, amp);
1191 		}
1192 		(void) as_unmap(as, addr, amp->size);
1193 	}
1194 	return (error);
1195 }
1196 
1197 
1198 /*
1199  * Unlock shared memory
1200  */
1201 static void
1202 shmem_unlock(struct anon_map *amp, uint_t lck)
1203 {
1204 	struct anon *ap;
1205 	pgcnt_t npages = btopr(amp->size);
1206 	struct vnode *vp;
1207 	struct page *pp;
1208 	anoff_t off;
1209 	ulong_t anon_idx;
1210 
1211 	for (anon_idx = 0; anon_idx < npages; anon_idx++) {
1212 
1213 		if ((ap = anon_get_ptr(amp->ahp, anon_idx)) == NULL) {
1214 			if (lck) {
1215 				panic("shmem_unlock: null app");
1216 				/*NOTREACHED*/
1217 			}
1218 			continue;
1219 		}
1220 		swap_xlate(ap, &vp, &off);
1221 		pp = page_lookup(vp, off, SE_SHARED);
1222 		if (pp == NULL) {
1223 			if (lck) {
1224 				panic("shmem_unlock: page not in the system");
1225 				/*NOTREACHED*/
1226 			}
1227 			continue;
1228 		}
1229 		if (pp->p_lckcnt) {
1230 			page_pp_unlock(pp, 0, 0);
1231 		}
1232 		page_unlock(pp);
1233 	}
1234 }
1235 
1236 /*
1237  * We call this routine when we have removed all references to this
1238  * amp.  This means all shmdt()s and the IPC_RMID have been done.
1239  */
1240 static void
1241 shm_rm_amp(struct anon_map *amp, uint_t lckflag)
1242 {
1243 	/*
1244 	 * If we are finally deleting the
1245 	 * shared memory, and if no one did
1246 	 * the SHM_UNLOCK, we must do it now.
1247 	 */
1248 	shmem_unlock(amp, lckflag);
1249 
1250 	/*
1251 	 * Free up the anon_map.
1252 	 */
1253 	lgrp_shm_policy_fini(amp, NULL);
1254 	if (amp->a_szc != 0) {
1255 		ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
1256 		anon_shmap_free_pages(amp, 0, amp->size);
1257 		ANON_LOCK_EXIT(&amp->a_rwlock);
1258 	} else {
1259 		anon_free(amp->ahp, 0, amp->size);
1260 	}
1261 	anon_unresv(amp->swresv);
1262 	anonmap_free(amp);
1263 }
1264 
1265 /*
1266  * Return the shared memory id for the process's virtual address.
1267  * Return SHMID_NONE if addr is not within a SysV shared memory segment.
1268  * Return SHMID_FREE if addr's SysV shared memory segment's id has been freed.
1269  *
1270  * shmgetid() is called from code in /proc with the process locked but
1271  * with pp->p_lock not held.  The address space lock is held, so we
1272  * cannot grab pp->p_lock here due to lock-ordering constraints.
1273  * Because of all this, modifications to the p_segacct list must only
1274  * be made after calling prbarrier() to ensure the process is not locked.
1275  * See shmdt() and sa_add(), above. shmgetid() may also be called on a
1276  * thread's own process without the process locked.
1277  */
1278 int
1279 shmgetid(proc_t *pp, caddr_t addr)
1280 {
1281 	segacct_t *sap, template;
1282 
1283 	ASSERT(MUTEX_NOT_HELD(&pp->p_lock));
1284 	ASSERT((pp->p_proc_flag & P_PR_LOCK) || pp == curproc);
1285 
1286 	if (pp->p_segacct == NULL)
1287 		return (SHMID_NONE);
1288 
1289 	template.sa_addr = addr;
1290 	template.sa_len = 0;
1291 	if ((sap = avl_find(pp->p_segacct, &template, NULL)) == NULL)
1292 		return (SHMID_NONE);
1293 
1294 	if (IPC_FREE(&sap->sa_id->shm_perm))
1295 		return (SHMID_FREE);
1296 
1297 	return (sap->sa_id->shm_perm.ipc_id);
1298 }
1299