/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * Inter-Process Communication Shared Memory Facility.
 *
 * See os/ipc.c for a description of common IPC functionality.
 *
 * Resource controls
 * -----------------
 *
 * Control:	project.max-shm-ids (rc_project_shmmni)
 * Description:	Maximum number of shared memory ids allowed a project.
 *
 *   When shmget() is used to allocate a shared memory segment, one id
 *   is allocated.  If the id allocation doesn't succeed, shmget()
 *   fails and errno is set to ENOSPC.  Upon successful shmctl(,
 *   IPC_RMID) the id is deallocated.
 *
 * Control:	project.max-shm-memory (rc_project_shmmax)
 * Description:	Total amount of shared memory allowed a project.
 *
 *   When shmget() is used to allocate a shared memory segment, the
 *   segment's size is allocated against this limit.  If the space
 *   allocation doesn't succeed, shmget() fails and errno is set to
 *   EINVAL.  The size will be deallocated once the last process has
 *   detached the segment and the segment has been successfully
 *   shmctl(, IPC_RMID)ed.
 */
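/*
 * Illustrative user-level view of the controls above (a sketch only;
 * the descriptions above are authoritative and the error cases shown
 * are not exhaustive):
 *
 *	id = shmget(key, size, IPC_CREAT | 0600);
 *		fails with errno == ENOSPC when the project has
 *		exhausted project.max-shm-ids, or with EINVAL when
 *		size would put it over project.max-shm-memory
 *	(void) shmctl(id, IPC_RMID, NULL);
 *		deallocates the id; the size is credited back once
 *		the last attached process has also detached
 */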
#include <sys/types.h>
#include <sys/param.h>
#include <sys/cred.h>
#include <sys/errno.h>
#include <sys/time.h>
#include <sys/kmem.h>
#include <sys/user.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/prsystm.h>
#include <sys/sysmacros.h>
#include <sys/tuneable.h>
#include <sys/vm.h>
#include <sys/mman.h>
#include <sys/swap.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/lwpchan_impl.h>
#include <sys/avl.h>
#include <sys/modctl.h>
#include <sys/syscall.h>
#include <sys/task.h>
#include <sys/project.h>
#include <sys/policy.h>
#include <sys/zone.h>

#include <sys/ipc.h>
#include <sys/ipc_impl.h>
#include <sys/shm.h>
#include <sys/shm_impl.h>

#include <vm/hat.h>
#include <vm/seg.h>
#include <vm/as.h>
#include <vm/seg_vn.h>
#include <vm/anon.h>
#include <vm/page.h>
#include <vm/vpage.h>
#include <vm/seg_spt.h>

#include <c2/audit.h>

static int shmem_lock(struct anon_map *amp);
static void shmem_unlock(struct anon_map *amp, uint_t lck);
static void sa_add(struct proc *pp, caddr_t addr, size_t len, ulong_t flags,
	kshmid_t *id);
static void shm_rm_amp(struct anon_map *amp, uint_t lckflag);
static void shm_dtor(kipc_perm_t *);
static void shm_rmid(kipc_perm_t *);
static void shm_remove_zone(zoneid_t, void *);

/*
 * Semantics for share_page_table and ism_off:
 *
 * These are hooks in /etc/system - only for internal testing purposes.
 *
 * Setting share_page_table automatically turns on the SHM_SHARE_MMU (ISM)
 * flag in a call to shmat(2).  In other words, with share_page_table set,
 * you always get ISM, even if, say, DISM is specified.  It should really
 * be called "ism_on".
 *
 * Setting ism_off turns off the SHM_SHARE_MMU flag from the flags passed
 * to shmat(2).
 *
 * If both share_page_table and ism_off are set, share_page_table prevails.
 *
 * Although these tunables should probably be removed, they do have some
 * external exposure; as long as they exist, they should at least work
 * sensibly.
 */

int share_page_table;
int ism_off;

/*
 * The following tunables are obsolete.  Though for compatibility we
 * still read and interpret shminfo_shmmax and shminfo_shmmni (see
 * os/project.c), the preferred mechanism for administering the IPC
 * Shared Memory facility is through the resource controls described at
 * the top of this file.
 */
size_t	shminfo_shmmax = 0x800000;	/* (obsolete) */
int	shminfo_shmmni = 100;		/* (obsolete) */
size_t	shminfo_shmmin = 1;		/* (obsolete) */
int	shminfo_shmseg = 6;		/* (obsolete) */

extern rctl_hndl_t rc_project_shmmax;
extern rctl_hndl_t rc_project_shmmni;
static ipc_service_t *shm_svc;
static zone_key_t shm_zone_key;
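/*
 * For reference, a minimal /etc/system sketch exercising the hooks
 * above (assuming the standard "set <module>:<variable>" syntax and
 * that this file builds the shmsys module; unsupported, for internal
 * testing only):
 *
 *	set shmsys:share_page_table = 1		(always attach with ISM)
 *	set shmsys:ism_off = 1			(strip SHM_SHARE_MMU)
 */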
/*
 * Module linkage information for the kernel.
 */
static uintptr_t shmsys(int, uintptr_t, uintptr_t, uintptr_t);

static struct sysent ipcshm_sysent = {
	4,
#ifdef	_SYSCALL32_IMPL
	SE_ARGC | SE_NOUNLOAD | SE_64RVAL,
#else	/* _SYSCALL32_IMPL */
	SE_ARGC | SE_NOUNLOAD | SE_32RVAL1,
#endif	/* _SYSCALL32_IMPL */
	(int (*)())shmsys
};

#ifdef	_SYSCALL32_IMPL
static struct sysent ipcshm_sysent32 = {
	4,
	SE_ARGC | SE_NOUNLOAD | SE_32RVAL1,
	(int (*)())shmsys
};
#endif	/* _SYSCALL32_IMPL */

static struct modlsys modlsys = {
	&mod_syscallops, "System V shared memory", &ipcshm_sysent
};

#ifdef	_SYSCALL32_IMPL
static struct modlsys modlsys32 = {
	&mod_syscallops32, "32-bit System V shared memory", &ipcshm_sysent32
};
#endif	/* _SYSCALL32_IMPL */

static struct modlinkage modlinkage = {
	MODREV_1,
	&modlsys,
#ifdef	_SYSCALL32_IMPL
	&modlsys32,
#endif
	NULL
};


int
_init(void)
{
	int result;

	shm_svc = ipcs_create("shmids", rc_project_shmmni, sizeof (kshmid_t),
	    shm_dtor, shm_rmid, AT_IPC_SHM,
	    offsetof(kproject_data_t, kpd_shmmni));
	zone_key_create(&shm_zone_key, NULL, shm_remove_zone, NULL);

	if ((result = mod_install(&modlinkage)) == 0)
		return (0);

	(void) zone_key_delete(shm_zone_key);
	ipcs_destroy(shm_svc);

	return (result);
}

int
_fini(void)
{
	return (EBUSY);
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}

/*
 * Shmat (attach shared segment) system call.
 */
static int
shmat(int shmid, caddr_t uaddr, int uflags, uintptr_t *rvp)
{
	kshmid_t *sp;	/* shared memory header ptr */
	size_t	size;
	int	error = 0;
	proc_t *pp = curproc;
	struct as *as = pp->p_as;
	struct segvn_crargs crargs;	/* segvn create arguments */
	kmutex_t *lock;
	struct seg *segspt = NULL;
	caddr_t addr = uaddr;
	int flags = (uflags & SHMAT_VALID_FLAGS_MASK);
	int useISM;
	uchar_t prot = PROT_ALL;
	int result;

	if ((lock = ipc_lookup(shm_svc, shmid, (kipc_perm_t **)&sp)) == NULL)
		return (EINVAL);
	if (error = ipcperm_access(&sp->shm_perm, SHM_R, CRED()))
		goto errret;
	if ((flags & SHM_RDONLY) == 0 &&
	    (error = ipcperm_access(&sp->shm_perm, SHM_W, CRED())))
		goto errret;
	if (spt_invalid(flags)) {
		error = EINVAL;
		goto errret;
	}
	if (ism_off)
		flags = flags & ~SHM_SHARE_MMU;
	if (share_page_table) {
		flags = flags & ~SHM_PAGEABLE;
		flags = flags | SHM_SHARE_MMU;
	}
	useISM = (spt_locked(flags) || spt_pageable(flags));
	if (useISM && (error = ipcperm_access(&sp->shm_perm, SHM_W, CRED())))
		goto errret;
	if (useISM && isspt(sp)) {
		uint_t newsptflags = flags | spt_flags(sp->shm_sptseg);
		/*
		 * If trying to change an existing {D}ISM segment from ISM
		 * to DISM or vice versa, return error.  Note that this
		 * validation of flags needs to be done after the effect of
		 * tunables such as ism_off and share_page_table, for
		 * semantics that are consistent with the tunables'
		 * settings.
		 */
		if (spt_invalid(newsptflags)) {
			error = EINVAL;
			goto errret;
		}
	}
	ANON_LOCK_ENTER(&sp->shm_amp->a_rwlock, RW_WRITER);
	size = sp->shm_amp->size;
	ANON_LOCK_EXIT(&sp->shm_amp->a_rwlock);

	/* somewhere to record spt info for final detach */
	if (sp->shm_sptinfo == NULL)
		sp->shm_sptinfo = kmem_zalloc(sizeof (sptinfo_t), KM_SLEEP);

	as_rangelock(as);

	if (useISM) {
		/*
		 * Handle ISM
		 */
		uint_t	n, share_szc;
		size_t	share_size;
		struct	shm_data ssd;
		uintptr_t align_hint;

		n = page_num_pagesizes();
		if (n < 2) { /* large pages aren't supported */
			as_rangeunlock(as);
			error = EINVAL;
			goto errret;
		}

		/*
		 * Pick a share pagesize to use, if (!isspt(sp)).
		 * Otherwise use the already chosen page size.
		 *
		 * For the initial shmat (!isspt(sp)), where sptcreate is
		 * called, map_pgsz is called to recommend a [D]ISM
		 * pagesize, important for systems which offer more than
		 * one potential [D]ISM pagesize.
		 * If the shmat is just to attach to an already created
		 * [D]ISM segment, then use the previously selected page
		 * size.
		 */
		if (!isspt(sp)) {
			share_size = map_pgsz(MAPPGSZ_ISM,
			    pp, addr, size, NULL);
			if (share_size == 0) {
				as_rangeunlock(as);
				error = EINVAL;
				goto errret;
			}
			share_szc = page_szc(share_size);
		} else {
			share_szc = sp->shm_sptseg->s_szc;
			share_size = page_get_pagesize(share_szc);
		}
		size = P2ROUNDUP(size, share_size);

		align_hint = share_size;
#if defined(__i386) || defined(__amd64)
		/*
		 * For 64 bit amd64, we want to share an entire page table
		 * if possible.  We know (ugh) that there are 512 entries
		 * in a page table.  The number for 32 bit non-PAE should
		 * be 1024, but I'm not going to special case that.  Note
		 * using 512 won't cause a failure below.  It retries with
		 * align_hint set to share_size.
		 */
		while (size >= 512 * (uint64_t)align_hint)
			align_hint *= 512;
#endif /* __i386 || __amd64 */

#if defined(__sparcv9)
		if (addr == 0 && curproc->p_model == DATAMODEL_LP64) {
			/*
			 * If no address has been passed in, and this is a
			 * 64-bit process, we'll try to find an address
			 * in the predict-ISM zone.
			 */
			caddr_t predbase = (caddr_t)PREDISM_1T_BASE;
			size_t len = PREDISM_BOUND - PREDISM_1T_BASE;

			as_purge(as);
			if (as_gap(as, size + share_size, &predbase, &len,
			    AH_LO, (caddr_t)NULL) != -1) {
				/*
				 * We found an address which looks like a
				 * candidate.  We want to round it up, and
				 * then check that it's a valid user range.
				 * This assures that we won't fail below.
				 */
				addr = (caddr_t)P2ROUNDUP((uintptr_t)predbase,
				    share_size);

				if (valid_usr_range(addr, size, prot,
				    as, as->a_userlimit) != RANGE_OKAY) {
					addr = 0;
				}
			}
		}
#endif /* __sparcv9 */

		if (addr == 0) {
			for (;;) {
				addr = (caddr_t)align_hint;
				map_addr(&addr, size, 0ll, 1, MAP_ALIGN);
				if (addr != NULL || align_hint == share_size)
					break;
				align_hint = share_size;
			}
			if (addr == NULL) {
				as_rangeunlock(as);
				error = ENOMEM;
				goto errret;
			}
			ASSERT(((uintptr_t)addr & (align_hint - 1)) == 0);
		} else {
			/* Use the user-supplied attach address */
			caddr_t base;
			size_t len;

			/*
			 * Check that the address range
			 *  1) is properly aligned
			 *  2) is correct in unix terms
			 *  3) is within an unmapped address segment
			 */
			base = addr;
			len = size;		/* use spt aligned size */
			/* XXX - in SunOS, is sp->shm_segsz */
			if ((uintptr_t)base & (share_size - 1)) {
				error = EINVAL;
				as_rangeunlock(as);
				goto errret;
			}
			result = valid_usr_range(base, len, prot, as,
			    as->a_userlimit);
			if (result == RANGE_BADPROT) {
				/*
				 * We try to accommodate processors which
				 * may not support execute permissions on
				 * all ISM segments by trying the check
				 * again but without PROT_EXEC.
				 */
				prot &= ~PROT_EXEC;
				result = valid_usr_range(base, len, prot, as,
				    as->a_userlimit);
			}
			as_purge(as);
			if (result != RANGE_OKAY ||
			    as_gap(as, len, &base, &len, AH_LO,
			    (caddr_t)NULL) != 0) {
				error = EINVAL;
				as_rangeunlock(as);
				goto errret;
			}
		}

		if (!isspt(sp)) {
			error = sptcreate(size, &segspt, sp->shm_amp, prot,
			    flags, share_szc);
			if (error) {
				as_rangeunlock(as);
				goto errret;
			}
			sp->shm_sptinfo->sptas = segspt->s_as;
			sp->shm_sptseg = segspt;
			sp->shm_sptprot = prot;
			sp->shm_lkcnt = 0;
		} else if ((prot & sp->shm_sptprot) != sp->shm_sptprot) {
			/*
			 * Ensure we're attaching to an ISM segment with
			 * fewer or equal permissions than what we're
			 * allowed.  Fail if the segment has more
			 * permissions than what we're allowed.
			 */
			error = EACCES;
			as_rangeunlock(as);
			goto errret;
		}

		ssd.shm_sptseg = sp->shm_sptseg;
		ssd.shm_sptas = sp->shm_sptinfo->sptas;
		ssd.shm_amp = sp->shm_amp;
		error = as_map(as, addr, size, segspt_shmattach, &ssd);
		if (error == 0)
			sp->shm_ismattch++; /* keep count of ISM attaches */
	} else {

		/*
		 * Normal case.
		 */
		if (flags & SHM_RDONLY)
			prot &= ~PROT_WRITE;

		if (addr == 0) {
			/* Let the system pick the attach address */
			map_addr(&addr, size, 0ll, 1, 0);
			if (addr == NULL) {
				as_rangeunlock(as);
				error = ENOMEM;
				goto errret;
			}
		} else {
			/* Use the user-supplied attach address */
			caddr_t base;
			size_t len;

			if (flags & SHM_RND)
				addr = (caddr_t)((uintptr_t)addr &
				    ~(SHMLBA - 1));
			/*
			 * Check that the address range
			 *  1) is properly aligned
			 *  2) is correct in unix terms
			 *  3) is within an unmapped address segment
			 */
			base = addr;
			len = size;		/* use aligned size */
			/* XXX - in SunOS, is sp->shm_segsz */
			if ((uintptr_t)base & PAGEOFFSET) {
				error = EINVAL;
				as_rangeunlock(as);
				goto errret;
			}
			result = valid_usr_range(base, len, prot, as,
			    as->a_userlimit);
			if (result == RANGE_BADPROT) {
				prot &= ~PROT_EXEC;
				result = valid_usr_range(base, len, prot, as,
				    as->a_userlimit);
			}
			as_purge(as);
			if (result != RANGE_OKAY ||
			    as_gap(as, len, &base, &len,
			    AH_LO, (caddr_t)NULL) != 0) {
				error = EINVAL;
				as_rangeunlock(as);
				goto errret;
			}
		}

		/* Initialize the create arguments and map the segment */
		crargs = *(struct segvn_crargs *)zfod_argsp;
		crargs.offset = 0;
		crargs.type = MAP_SHARED;
		crargs.amp = sp->shm_amp;
		crargs.prot = prot;
		crargs.maxprot = crargs.prot;
		crargs.flags = 0;

		error = as_map(as, addr, size, segvn_create, &crargs);
	}

	as_rangeunlock(as);
	if (error)
		goto errret;

	/* record shmem range for the detach */
	sa_add(pp, addr, (size_t)size, useISM ? SHMSA_ISM : 0, sp);
	*rvp = (uintptr_t)addr;

	sp->shm_atime = gethrestime_sec();
	sp->shm_lpid = pp->p_pid;
	ipc_hold(shm_svc, (kipc_perm_t *)sp);
errret:
	mutex_exit(lock);
	return (error);
}
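/*
 * Destructor for a shared memory id, called once its last reference is
 * released: tears down any SPT segment, frees the anon_map, and
 * credits the segment's size back to the owning project.
 */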
static void
shm_dtor(kipc_perm_t *perm)
{
	kshmid_t *sp = (kshmid_t *)perm;
	uint_t cnt;

	if (sp->shm_sptinfo) {
		if (isspt(sp))
			sptdestroy(sp->shm_sptinfo->sptas, sp->shm_amp);
		kmem_free(sp->shm_sptinfo, sizeof (sptinfo_t));
	}

	ANON_LOCK_ENTER(&sp->shm_amp->a_rwlock, RW_WRITER);
	cnt = --sp->shm_amp->refcnt;
	ANON_LOCK_EXIT(&sp->shm_amp->a_rwlock);
	ASSERT(cnt == 0);
	shm_rm_amp(sp->shm_amp, sp->shm_lkcnt);

	if (sp->shm_perm.ipc_id != IPC_ID_INVAL) {
		ipcs_lock(shm_svc);
		sp->shm_perm.ipc_proj->kpj_data.kpd_shmmax -=
		    ptob(btopr(sp->shm_segsz));
		ipcs_unlock(shm_svc);
	}
}

/* ARGSUSED */
static void
shm_rmid(kipc_perm_t *perm)
{
	/* nothing to do */
}

/*
 * Shmctl system call.
 */
/* ARGSUSED */
static int
shmctl(int shmid, int cmd, void *arg)
{
	kshmid_t	*sp;	/* shared memory header ptr */
	STRUCT_DECL(shmid_ds, ds);	/* for SVR4 IPC_SET */
	int		error = 0;
	struct cred	*cr = CRED();
	kmutex_t	*lock;
	model_t		mdl = get_udatamodel();
	struct shmid_ds64	ds64;
	shmatt_t	nattch;

	STRUCT_INIT(ds, mdl);

	/*
	 * Perform pre- or non-lookup actions (e.g. copyins, RMID).
	 */
	switch (cmd) {
	case IPC_SET:
		if (copyin(arg, STRUCT_BUF(ds), STRUCT_SIZE(ds)))
			return (EFAULT);
		break;

	case IPC_SET64:
		if (copyin(arg, &ds64, sizeof (struct shmid_ds64)))
			return (EFAULT);
		break;

	case IPC_RMID:
		return (ipc_rmid(shm_svc, shmid, cr));
	}

	if ((lock = ipc_lookup(shm_svc, shmid, (kipc_perm_t **)&sp)) == NULL)
		return (EINVAL);

	switch (cmd) {
	/* Set ownership and permissions. */
	case IPC_SET:
		if (error = ipcperm_set(shm_svc, cr, &sp->shm_perm,
		    &STRUCT_BUF(ds)->shm_perm, mdl))
			break;
		sp->shm_ctime = gethrestime_sec();
		break;

	case IPC_STAT:
		if (error = ipcperm_access(&sp->shm_perm, SHM_R, cr))
			break;

		nattch = sp->shm_perm.ipc_ref - 1;

		ipcperm_stat(&STRUCT_BUF(ds)->shm_perm, &sp->shm_perm, mdl);
		STRUCT_FSET(ds, shm_segsz, sp->shm_segsz);
		STRUCT_FSETP(ds, shm_amp, NULL);	/* kernel addr */
		STRUCT_FSET(ds, shm_lkcnt, sp->shm_lkcnt);
		STRUCT_FSET(ds, shm_lpid, sp->shm_lpid);
		STRUCT_FSET(ds, shm_cpid, sp->shm_cpid);
		STRUCT_FSET(ds, shm_nattch, nattch);
		STRUCT_FSET(ds, shm_cnattch, sp->shm_ismattch);
		STRUCT_FSET(ds, shm_atime, sp->shm_atime);
		STRUCT_FSET(ds, shm_dtime, sp->shm_dtime);
		STRUCT_FSET(ds, shm_ctime, sp->shm_ctime);

		mutex_exit(lock);
		if (copyout(STRUCT_BUF(ds), arg, STRUCT_SIZE(ds)))
			return (EFAULT);

		return (0);

	case IPC_SET64:
		if (error = ipcperm_set64(shm_svc, cr,
		    &sp->shm_perm, &ds64.shmx_perm))
			break;
		sp->shm_ctime = gethrestime_sec();
		break;

	case IPC_STAT64:
		nattch = sp->shm_perm.ipc_ref - 1;

		ipcperm_stat64(&ds64.shmx_perm, &sp->shm_perm);
		ds64.shmx_segsz = sp->shm_segsz;
		ds64.shmx_lkcnt = sp->shm_lkcnt;
		ds64.shmx_lpid = sp->shm_lpid;
		ds64.shmx_cpid = sp->shm_cpid;
		ds64.shmx_nattch = nattch;
		ds64.shmx_cnattch = sp->shm_ismattch;
		ds64.shmx_atime = sp->shm_atime;
		ds64.shmx_dtime = sp->shm_dtime;
		ds64.shmx_ctime = sp->shm_ctime;

		mutex_exit(lock);
		if (copyout(&ds64, arg, sizeof (struct shmid_ds64)))
			return (EFAULT);

		return (0);

	/* Lock segment in memory */
	case SHM_LOCK:
		if ((error = secpolicy_lock_memory(cr)) != 0)
			break;

		if (!isspt(sp) && (sp->shm_lkcnt++ == 0)) {
			if (error = shmem_lock(sp->shm_amp)) {
				ANON_LOCK_ENTER(&sp->shm_amp->a_rwlock,
				    RW_WRITER);
				cmn_err(CE_NOTE,
				    "shmctl - couldn't lock %ld pages into memory",
				    sp->shm_amp->size);
				ANON_LOCK_EXIT(&sp->shm_amp->a_rwlock);
				error = ENOMEM;
				sp->shm_lkcnt--;
				shmem_unlock(sp->shm_amp, 0);
			}
		}
		break;

	/* Unlock segment */
	case SHM_UNLOCK:
		if ((error = secpolicy_lock_memory(cr)) != 0)
			break;

		if (!isspt(sp)) {
			if (sp->shm_lkcnt && (--sp->shm_lkcnt == 0)) {
				shmem_unlock(sp->shm_amp, 1);
			}
		}
		break;

	default:
		error = EINVAL;
		break;
	}
	mutex_exit(lock);
	return (error);
}

static void
shm_detach(proc_t *pp, segacct_t *sap)
{
	kshmid_t	*sp = sap->sa_id;
	size_t		len = sap->sa_len;
	caddr_t		addr = sap->sa_addr;

	/*
	 * Discard lwpchan mappings.
	 */
	if (pp->p_lcp != NULL)
		lwpchan_delete_mapping(pp, addr, addr + len);
	(void) as_unmap(pp->p_as, addr, len);

	/*
	 * Perform some detach-time accounting.
	 */
	(void) ipc_lock(shm_svc, sp->shm_perm.ipc_id);
	if (sap->sa_flags & SHMSA_ISM)
		sp->shm_ismattch--;
	sp->shm_dtime = gethrestime_sec();
	sp->shm_lpid = pp->p_pid;
	ipc_rele(shm_svc, (kipc_perm_t *)sp);	/* Drops lock */

	kmem_free(sap, sizeof (segacct_t));
}
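/*
 * Shmdt (detach shared segment) system call.
 */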
static int
shmdt(caddr_t addr)
{
	proc_t *pp = curproc;
	segacct_t *sap, template;

	mutex_enter(&pp->p_lock);
	prbarrier(pp);			/* block /proc.  See shmgetid(). */

	template.sa_addr = addr;
	template.sa_len = 0;
	if ((pp->p_segacct == NULL) ||
	    ((sap = avl_find(pp->p_segacct, &template, NULL)) == NULL)) {
		mutex_exit(&pp->p_lock);
		return (EINVAL);
	}
	if (sap->sa_addr != addr) {
		mutex_exit(&pp->p_lock);
		return (EINVAL);
	}
	avl_remove(pp->p_segacct, sap);
	mutex_exit(&pp->p_lock);

	shm_detach(pp, sap);

	return (0);
}

/*
 * Remove all shared memory segments associated with a given zone.
 * Called by zone_shutdown when the zone is halted.
 */
/*ARGSUSED1*/
static void
shm_remove_zone(zoneid_t zoneid, void *arg)
{
	ipc_remove_zone(shm_svc, zoneid);
}

/*
 * Shmget (create new shmem) system call.
 */
static int
shmget(key_t key, size_t size, int shmflg, uintptr_t *rvp)
{
	proc_t		*pp = curproc;
	kshmid_t	*sp;
	kmutex_t	*lock;
	int		error;

top:
	if (error = ipc_get(shm_svc, key, shmflg, (kipc_perm_t **)&sp, &lock))
		return (error);

	if (!IPC_FREE(&sp->shm_perm)) {
		/*
		 * A segment with the requested key exists.
		 */
		if (size > sp->shm_segsz) {
			mutex_exit(lock);
			return (EINVAL);
		}
	} else {
		/*
		 * A new segment should be created.
		 */
		size_t npages = btopr(size);
		size_t rsize = ptob(npages);

		/*
		 * Check rsize and the per-project limit on shared
		 * memory.  Checking rsize handles both the size == 0
		 * case and the size < ULONG_MAX & PAGEMASK case (i.e.
		 * rounding up wraps a size_t).
		 */
		if (rsize == 0 || (rctl_test(rc_project_shmmax,
		    pp->p_task->tk_proj->kpj_rctls, pp, rsize,
		    RCA_SAFE) & RCT_DENY)) {

			mutex_exit(&pp->p_lock);
			mutex_exit(lock);
			ipc_cleanup(shm_svc, (kipc_perm_t *)sp);
			return (EINVAL);
		}
		mutex_exit(&pp->p_lock);
		mutex_exit(lock);

		if (anon_resv(rsize) == 0) {
			ipc_cleanup(shm_svc, (kipc_perm_t *)sp);
			return (ENOMEM);
		}

		sp->shm_amp = anonmap_alloc(rsize, rsize);

		/*
		 * Store the original user's requested size, in bytes,
		 * rather than the page-aligned size.  The former is
		 * used for IPC_STAT and shmget() lookups.  The latter
		 * is saved in the anon_map structure and is used for
		 * calls to the vm layer.
		 */
		sp->shm_segsz = size;
		sp->shm_atime = sp->shm_dtime = 0;
		sp->shm_ctime = gethrestime_sec();
		sp->shm_lpid = (pid_t)0;
		sp->shm_cpid = curproc->p_pid;
		sp->shm_ismattch = 0;
		sp->shm_sptinfo = NULL;

		/*
		 * Check limits one last time, push id into global
		 * visibility, and update resource usage counts.
		 */
		if (error = ipc_commit_begin(shm_svc, key, shmflg,
		    (kipc_perm_t *)sp)) {
			if (error == EAGAIN)
				goto top;
			return (error);
		}

		if (rctl_test(rc_project_shmmax,
		    sp->shm_perm.ipc_proj->kpj_rctls, pp, rsize,
		    RCA_SAFE) & RCT_DENY) {
			ipc_cleanup(shm_svc, (kipc_perm_t *)sp);
			return (EINVAL);
		}
		sp->shm_perm.ipc_proj->kpj_data.kpd_shmmax += rsize;

		lock = ipc_commit_end(shm_svc, &sp->shm_perm);
	}

#ifdef C2_AUDIT
	if (audit_active)
		audit_ipcget(AT_IPC_SHM, (void *)sp);
#endif

	*rvp = (uintptr_t)(sp->shm_perm.ipc_id);

	mutex_exit(lock);
	return (0);
}

/*
 * shmids system call.
 */
static int
shmids(int *buf, uint_t nids, uint_t *pnids)
{
	return (ipc_ids(shm_svc, buf, nids, pnids));
}

/*
 * System entry point for shmat, shmctl, shmdt, and shmget system calls.
 */
static uintptr_t
shmsys(int opcode, uintptr_t a0, uintptr_t a1, uintptr_t a2)
{
	int	error;
	uintptr_t r_val = 0;

	switch (opcode) {
	case SHMAT:
		error = shmat((int)a0, (caddr_t)a1, (int)a2, &r_val);
		break;
	case SHMCTL:
		error = shmctl((int)a0, (int)a1, (void *)a2);
		break;
	case SHMDT:
		error = shmdt((caddr_t)a0);
		break;
	case SHMGET:
		error = shmget((key_t)a0, (size_t)a1, (int)a2, &r_val);
		break;
	case SHMIDS:
		error = shmids((int *)a0, (uint_t)a1, (uint_t *)a2);
		break;
	default:
		error = EINVAL;
		break;
	}

	if (error)
		return ((uintptr_t)set_errno(error));

	return (r_val);
}

/*
 * segacct_t comparator
 * This works as expected, with one minor change: the first of two real
 * segments with equal addresses is considered to be 'greater than' the
 * second.  We only return equal when searching using a template, in
 * which case we explicitly set the template segment's length to 0
 * (which is invalid for a real segment).
 */
static int
shm_sacompar(const void *x, const void *y)
{
	segacct_t *sa1 = (segacct_t *)x;
	segacct_t *sa2 = (segacct_t *)y;

	if (sa1->sa_addr < sa2->sa_addr) {
		return (-1);
	} else if (sa2->sa_len != 0) {
		if (sa1->sa_addr >= sa2->sa_addr + sa2->sa_len) {
			return (1);
		} else if (sa1->sa_len != 0) {
			return (1);
		} else {
			return (0);
		}
	} else if (sa1->sa_addr > sa2->sa_addr) {
		return (1);
	} else {
		return (0);
	}
}
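/*
 * Illustrative comparator cases (hypothetical addresses; sa2 is the
 * in-tree node):
 *
 *	sa1 = template {sa_addr = 0x1800, sa_len = 0}
 *	sa2 = segment  {sa_addr = 0x1000, sa_len = 0x2000}
 *		0x1800 falls inside [0x1000, 0x3000) and sa1->sa_len is
 *		zero, so the template matches (0 is returned).
 *
 *	sa1 = segment  {sa_addr = 0x1000, sa_len = 0x1000}
 *	sa2 = segment  {sa_addr = 0x1000, sa_len = 0x1000}
 *		Both lengths are non-zero, so sa1 is deemed "greater
 *		than" sa2; real segments never compare equal.
 */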
/*
 * add this record to the segacct list.
 */
static void
sa_add(struct proc *pp, caddr_t addr, size_t len, ulong_t flags, kshmid_t *id)
{
	segacct_t *nsap;
	avl_tree_t *tree = NULL;
	avl_index_t where;

	nsap = kmem_alloc(sizeof (segacct_t), KM_SLEEP);
	nsap->sa_addr = addr;
	nsap->sa_len = len;
	nsap->sa_flags = flags;
	nsap->sa_id = id;

	if (pp->p_segacct == NULL)
		tree = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);

	mutex_enter(&pp->p_lock);
	prbarrier(pp);			/* block /proc.  See shmgetid(). */

	if (pp->p_segacct == NULL) {
		avl_create(tree, shm_sacompar, sizeof (segacct_t),
		    offsetof(segacct_t, sa_tree));
		pp->p_segacct = tree;
	} else if (tree) {
		kmem_free(tree, sizeof (avl_tree_t));
	}

	/*
	 * We can ignore the result of avl_find, as the comparator will
	 * never return equal for segments with non-zero length.  This
	 * is a necessary hack to get around the fact that we do, in
	 * fact, have duplicate keys.
	 */
	(void) avl_find(pp->p_segacct, nsap, &where);
	avl_insert(pp->p_segacct, nsap, where);

	mutex_exit(&pp->p_lock);
}

/*
 * Duplicate parent's segacct records in child.
 */
void
shmfork(struct proc *ppp, struct proc *cpp)
{
	segacct_t *sap;
	kshmid_t *sp;
	kmutex_t *mp;

	ASSERT(ppp->p_segacct != NULL);

	/*
	 * We are the only lwp running in the parent so nobody can
	 * mess with our p_segacct list.  Thus it is safe to traverse
	 * the list without holding p_lock.  This is essential because
	 * we can't hold p_lock during a KM_SLEEP allocation.
	 */
	for (sap = (segacct_t *)avl_first(ppp->p_segacct); sap != NULL;
	    sap = (segacct_t *)AVL_NEXT(ppp->p_segacct, sap)) {
		sa_add(cpp, sap->sa_addr, sap->sa_len, sap->sa_flags,
		    sap->sa_id);
		sp = sap->sa_id;
		mp = ipc_lock(shm_svc, sp->shm_perm.ipc_id);
		if (sap->sa_flags & SHMSA_ISM)
			sp->shm_ismattch++;
		ipc_hold(shm_svc, (kipc_perm_t *)sp);
		mutex_exit(mp);
	}
}

/*
 * Detach shared memory segments from exiting process.
 */
void
shmexit(struct proc *pp)
{
	segacct_t	*sap;
	avl_tree_t	*tree;
	void		*cookie = NULL;

	ASSERT(pp->p_segacct != NULL);

	mutex_enter(&pp->p_lock);
	prbarrier(pp);
	tree = pp->p_segacct;
	pp->p_segacct = NULL;
	mutex_exit(&pp->p_lock);

	while ((sap = avl_destroy_nodes(tree, &cookie)) != NULL)
		(void) shm_detach(pp, sap);

	avl_destroy(tree);
	kmem_free(tree, sizeof (avl_tree_t));
}

/*
 * At this time pages should be in memory, so just lock them.
 */
static void
lock_again(size_t npages, struct anon_map *amp)
{
	struct anon *ap;
	struct page *pp;
	struct vnode *vp;
	anoff_t off;
	ulong_t anon_idx;
	anon_sync_obj_t cookie;

	ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);

	for (anon_idx = 0; npages != 0; anon_idx++, npages--) {

		anon_array_enter(amp, anon_idx, &cookie);
		ap = anon_get_ptr(amp->ahp, anon_idx);
		swap_xlate(ap, &vp, &off);
		anon_array_exit(&cookie);

		pp = page_lookup(vp, (u_offset_t)off, SE_SHARED);
		if (pp == NULL) {
			panic("lock_again: page not in the system");
			/*NOTREACHED*/
		}
		(void) page_pp_lock(pp, 0, 0);
		page_unlock(pp);
	}
	ANON_LOCK_EXIT(&amp->a_rwlock);
}

/* check if this segment is already locked. */
/*ARGSUSED*/
static int
check_locked(struct as *as, struct segvn_data *svd, size_t npages)
{
	struct vpage *vpp = svd->vpage;
	size_t i;

	if (svd->vpage == NULL)
		return (0);		/* unlocked */

	SEGVN_LOCK_ENTER(as, &svd->lock, RW_READER);
	for (i = 0; i < npages; i++, vpp++) {
		if (VPP_ISPPLOCK(vpp) == 0) {
			SEGVN_LOCK_EXIT(as, &svd->lock);
			return (1);	/* partially locked */
		}
	}
	SEGVN_LOCK_EXIT(as, &svd->lock);
	return (2);			/* locked */
}
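/*
 * Note on the SHM_LOCK path below: if the caller already has the whole
 * segment attached, shmem_lock() locks the pages through that existing
 * mapping (see check_locked() and lock_again() above); otherwise it
 * attaches the segment temporarily, locks the pages, and unmaps it
 * again.
 */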
/*
 * Attach the shared memory segment to the process
 * address space and lock the pages.
 */
static int
shmem_lock(struct anon_map *amp)
{
	size_t npages = btopr(amp->size);
	struct seg *seg;
	struct as *as;
	struct segvn_crargs crargs;
	struct segvn_data *svd;
	proc_t *p = curproc;
	caddr_t addr;
	uint_t error, ret;
	caddr_t seg_base;
	size_t seg_sz;

	as = p->p_as;
	AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
	/* check if shared memory is already attached */
	for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
		svd = (struct segvn_data *)seg->s_data;
		if ((seg->s_ops == &segvn_ops) && (svd->amp == amp) &&
		    (amp->size == seg->s_size)) {
			switch (ret = check_locked(as, svd, npages)) {
			case 0:			/* unlocked */
			case 1:			/* partially locked */
				seg_base = seg->s_base;
				seg_sz = seg->s_size;

				AS_LOCK_EXIT(as, &as->a_lock);
				if ((error = as_ctl(as, seg_base, seg_sz,
				    MC_LOCK, 0, 0, NULL, 0)) == 0)
					lock_again(npages, amp);
				(void) as_ctl(as, seg_base, seg_sz, MC_UNLOCK,
				    0, 0, NULL, NULL);
				return (error);
			case 2:			/* locked */
				AS_LOCK_EXIT(as, &as->a_lock);
				lock_again(npages, amp);
				return (0);
			default:
				cmn_err(CE_WARN, "shmem_lock: deflt %d", ret);
				break;
			}
		}
	}
	AS_LOCK_EXIT(as, &as->a_lock);

	/* attach shm segment to our address space */
	as_rangelock(as);
	map_addr(&addr, amp->size, 0ll, 1, 0);
	if (addr == NULL) {
		as_rangeunlock(as);
		return (ENOMEM);
	}

	/* Initialize the create arguments and map the segment */
	crargs = *(struct segvn_crargs *)zfod_argsp;	/* structure copy */
	crargs.offset = (u_offset_t)0;
	crargs.type = MAP_SHARED;
	crargs.amp = amp;
	crargs.prot = PROT_ALL;
	crargs.maxprot = crargs.prot;
	crargs.flags = 0;

	error = as_map(as, addr, amp->size, segvn_create, &crargs);
	as_rangeunlock(as);
	if (!error) {
		if ((error = as_ctl(as, addr, amp->size, MC_LOCK, 0, 0,
		    NULL, 0)) == 0) {
			lock_again(npages, amp);
		}
		(void) as_unmap(as, addr, amp->size);
	}
	return (error);
}


/*
 * Unlock shared memory
 */
static void
shmem_unlock(struct anon_map *amp, uint_t lck)
{
	struct anon *ap;
	pgcnt_t npages = btopr(amp->size);
	struct vnode *vp;
	struct page *pp;
	anoff_t off;
	ulong_t anon_idx;

	for (anon_idx = 0; anon_idx < npages; anon_idx++) {

		if ((ap = anon_get_ptr(amp->ahp, anon_idx)) == NULL) {
			if (lck) {
				panic("shmem_unlock: null app");
				/*NOTREACHED*/
			}
			continue;
		}
		swap_xlate(ap, &vp, &off);
		pp = page_lookup(vp, off, SE_SHARED);
		if (pp == NULL) {
			if (lck) {
				panic("shmem_unlock: page not in the system");
				/*NOTREACHED*/
			}
			continue;
		}
		if (pp->p_lckcnt) {
			page_pp_unlock(pp, 0, 0);
		}
		page_unlock(pp);
	}
}
/*
 * We call this routine when we have removed all references to this
 * amp.  This means all shmdt()s and the IPC_RMID have been done.
 */
static void
shm_rm_amp(struct anon_map *amp, uint_t lckflag)
{
	/*
	 * If we are finally deleting the
	 * shared memory, and if no one did
	 * the SHM_UNLOCK, we must do it now.
	 */
	shmem_unlock(amp, lckflag);

	/*
	 * Free up the anon_map.
	 */
	lgrp_shm_policy_fini(amp, NULL);
	if (amp->a_szc != 0) {
		ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
		anon_shmap_free_pages(amp, 0, amp->size);
		ANON_LOCK_EXIT(&amp->a_rwlock);
	} else {
		anon_free(amp->ahp, 0, amp->size);
	}
	anon_unresv(amp->swresv);
	anonmap_free(amp);
}

/*
 * Return the shared memory id for the process's virtual address.
 * Return SHMID_NONE if addr is not within a SysV shared memory segment.
 * Return SHMID_FREE if addr's SysV shared memory segment's id has been freed.
 *
 * shmgetid() is called from code in /proc with the process locked but
 * with pp->p_lock not held.  The address space lock is held, so we
 * cannot grab pp->p_lock here due to lock-ordering constraints.
 * Because of all this, modifications to the p_segacct list must only
 * be made after calling prbarrier() to ensure the process is not locked.
 * See shmdt() and sa_add(), above.  shmgetid() may also be called on a
 * thread's own process without the process locked.
 */
int
shmgetid(proc_t *pp, caddr_t addr)
{
	segacct_t *sap, template;

	ASSERT(MUTEX_NOT_HELD(&pp->p_lock));
	ASSERT((pp->p_proc_flag & P_PR_LOCK) || pp == curproc);

	if (pp->p_segacct == NULL)
		return (SHMID_NONE);

	template.sa_addr = addr;
	template.sa_len = 0;
	if ((sap = avl_find(pp->p_segacct, &template, NULL)) == NULL)
		return (SHMID_NONE);

	if (IPC_FREE(&sap->sa_id->shm_perm))
		return (SHMID_FREE);

	return (sap->sa_id->shm_perm.ipc_id);
}