/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */

/*
 * Inter-Process Communication Shared Memory Facility.
 *
 * See os/ipc.c for a description of common IPC functionality.
 *
 * Resource controls
 * -----------------
 *
 * Control:	zone.max-shm-ids (rc_zone_shmmni)
 * Description:	Maximum number of shared memory ids allowed a zone.
 *
 *   When shmget() is used to allocate a shared memory segment, one id
 *   is allocated.  If the id allocation doesn't succeed, shmget()
 *   fails and errno is set to ENOSPC.  Upon successful shmctl(,
 *   IPC_RMID) the id is deallocated.
 *
 * Control:	project.max-shm-ids (rc_project_shmmni)
 * Description:	Maximum number of shared memory ids allowed a project.
 *
 *   When shmget() is used to allocate a shared memory segment, one id
 *   is allocated.  If the id allocation doesn't succeed, shmget()
 *   fails and errno is set to ENOSPC.  Upon successful shmctl(,
 *   IPC_RMID) the id is deallocated.
 *
 * Control:	zone.max-shm-memory (rc_zone_shmmax)
 * Description:	Total amount of shared memory allowed a zone.
 *
 *   When shmget() is used to allocate a shared memory segment, the
 *   segment's size is allocated against this limit.  If the space
 *   allocation doesn't succeed, shmget() fails and errno is set to
 *   EINVAL.  The size will be deallocated once the last process has
 *   detached the segment and the segment has been successfully
 *   shmctl(, IPC_RMID)ed.
 *
 * Control:	project.max-shm-memory (rc_project_shmmax)
 * Description:	Total amount of shared memory allowed a project.
 *
 *   When shmget() is used to allocate a shared memory segment, the
 *   segment's size is allocated against this limit.  If the space
 *   allocation doesn't succeed, shmget() fails and errno is set to
 *   EINVAL.  The size will be deallocated once the last process has
 *   detached the segment and the segment has been successfully
 *   shmctl(, IPC_RMID)ed.
 */
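
/*
 * A minimal userland sketch of the lifecycle described above; this is
 * illustrative only and is not part of this module.  shmget() charges
 * one id against {zone,project}.max-shm-ids and the segment size
 * against {zone,project}.max-shm-memory; shmctl(, IPC_RMID) gives
 * both back once the last detach completes:
 *
 *	#include <sys/ipc.h>
 *	#include <sys/shm.h>
 *	#include <errno.h>
 *	#include <stdio.h>
 *
 *	int
 *	main(void)
 *	{
 *		int id = shmget(IPC_PRIVATE, 8192, IPC_CREAT | 0600);
 *
 *		if (id == -1) {
 *			if (errno == ENOSPC)
 *				(void) fprintf(stderr, "id limit\n");
 *			else if (errno == EINVAL)
 *				(void) fprintf(stderr, "memory limit\n");
 *			return (1);
 *		}
 *		(void) shmctl(id, IPC_RMID, NULL);
 *		return (0);
 *	}
 */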

#include <sys/types.h>
#include <sys/param.h>
#include <sys/cred.h>
#include <sys/errno.h>
#include <sys/time.h>
#include <sys/kmem.h>
#include <sys/user.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/prsystm.h>
#include <sys/sysmacros.h>
#include <sys/tuneable.h>
#include <sys/vm.h>
#include <sys/mman.h>
#include <sys/swap.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/lwpchan_impl.h>
#include <sys/avl.h>
#include <sys/modctl.h>
#include <sys/syscall.h>
#include <sys/task.h>
#include <sys/project.h>
#include <sys/policy.h>
#include <sys/zone.h>
#include <sys/rctl.h>

#include <sys/ipc.h>
#include <sys/ipc_impl.h>
#include <sys/shm.h>
#include <sys/shm_impl.h>

#include <vm/hat.h>
#include <vm/seg.h>
#include <vm/as.h>
#include <vm/seg_vn.h>
#include <vm/anon.h>
#include <vm/page.h>
#include <vm/vpage.h>
#include <vm/seg_spt.h>

#include <c2/audit.h>

static int shmem_lock(kshmid_t *sp, struct anon_map *amp);
static void shmem_unlock(kshmid_t *sp, struct anon_map *amp);
static void sa_add(struct proc *pp, caddr_t addr, size_t len, ulong_t flags,
	kshmid_t *id);
static void shm_rm_amp(kshmid_t *sp);
static void shm_dtor(kipc_perm_t *);
static void shm_rmid(kipc_perm_t *);
static void shm_remove_zone(zoneid_t, void *);

/*
 * Semantics for share_page_table and ism_off:
 *
 * These are hooks in /etc/system - only for internal testing purposes.
 *
 * Setting share_page_table automatically turns on the SHM_SHARE_MMU (ISM) flag
 * in a call to shmat(2). In other words, with share_page_table set, you always
 * get ISM, even if, say, DISM is specified. It should really be called
 * "ism_on".
 *
 * Setting ism_off turns off the SHM_SHARE_MMU flag from the flags passed to
 * shmat(2).
 *
 * If both share_page_table and ism_off are set, share_page_table prevails.
 *
 * Although these tunables should probably be removed, they do have some
 * external exposure; as long as they exist, they should at least work sensibly.
 */

int share_page_table;
int ism_off;
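
/*
 * For example (illustrative only), ISM could be forced off for a test
 * run with an /etc/system entry naming this module:
 *
 *	set shmsys:ism_off = 1
 */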

/*
 * The following tunables are obsolete.  Though for compatibility we
 * still read and interpret shminfo_shmmax and shminfo_shmmni (see
 * os/project.c), the preferred mechanism for administering the IPC
 * Shared Memory facility is through the resource controls described at
 * the top of this file.
 */
size_t	shminfo_shmmax = 0x800000;	/* (obsolete) */
int	shminfo_shmmni = 100;		/* (obsolete) */
size_t	shminfo_shmmin = 1;		/* (obsolete) */
int	shminfo_shmseg = 6;		/* (obsolete) */
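
/*
 * The resource controls above are administered with the standard rctl
 * tooling rather than these variables; for instance (the project name
 * is hypothetical):
 *
 *	prctl -n project.max-shm-memory -v 4gb -r -i project user.dbadmin
 */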

extern rctl_hndl_t rc_zone_shmmax;
extern rctl_hndl_t rc_zone_shmmni;
extern rctl_hndl_t rc_project_shmmax;
extern rctl_hndl_t rc_project_shmmni;
static ipc_service_t *shm_svc;
static zone_key_t shm_zone_key;

/*
 * Module linkage information for the kernel.
 */
static uintptr_t shmsys(int, uintptr_t, uintptr_t, uintptr_t);

static struct sysent ipcshm_sysent = {
	4,
#ifdef	_SYSCALL32_IMPL
	SE_ARGC | SE_NOUNLOAD | SE_64RVAL,
#else	/* _SYSCALL32_IMPL */
	SE_ARGC | SE_NOUNLOAD | SE_32RVAL1,
#endif	/* _SYSCALL32_IMPL */
	(int (*)())shmsys
};

#ifdef	_SYSCALL32_IMPL
static struct sysent ipcshm_sysent32 = {
	4,
	SE_ARGC | SE_NOUNLOAD | SE_32RVAL1,
	(int (*)())shmsys
};
#endif	/* _SYSCALL32_IMPL */

static struct modlsys modlsys = {
	&mod_syscallops, "System V shared memory", &ipcshm_sysent
};

#ifdef	_SYSCALL32_IMPL
static struct modlsys modlsys32 = {
	&mod_syscallops32, "32-bit System V shared memory", &ipcshm_sysent32
};
#endif	/* _SYSCALL32_IMPL */

static struct modlinkage modlinkage = {
	MODREV_1,
	&modlsys,
#ifdef	_SYSCALL32_IMPL
	&modlsys32,
#endif
	NULL
};


int
_init(void)
{
	int result;

	shm_svc = ipcs_create("shmids", rc_project_shmmni, rc_zone_shmmni,
	    sizeof (kshmid_t), shm_dtor, shm_rmid, AT_IPC_SHM,
	    offsetof(ipc_rqty_t, ipcq_shmmni));
	zone_key_create(&shm_zone_key, NULL, shm_remove_zone, NULL);

	if ((result = mod_install(&modlinkage)) == 0)
		return (0);

	(void) zone_key_delete(shm_zone_key);
	ipcs_destroy(shm_svc);

	return (result);
}

int
_fini(void)
{
	return (EBUSY);
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}

/*
 * Shmat (attach shared segment) system call.
 */
static int
shmat(int shmid, caddr_t uaddr, int uflags, uintptr_t *rvp)
{
	kshmid_t *sp;	/* shared memory header ptr */
	size_t	size;
	int	error = 0;
	proc_t *pp = curproc;
	struct as *as = pp->p_as;
	struct segvn_crargs crargs;	/* segvn create arguments */
	kmutex_t *lock;
	struct seg *segspt = NULL;
	caddr_t addr = uaddr;
	int flags = (uflags & SHMAT_VALID_FLAGS_MASK);
	int useISM;
	uchar_t prot = PROT_ALL;
	int result;

	if ((lock = ipc_lookup(shm_svc, shmid, (kipc_perm_t **)&sp)) == NULL)
		return (EINVAL);
	if (error = ipcperm_access(&sp->shm_perm, SHM_R, CRED()))
		goto errret;
	if ((flags & SHM_RDONLY) == 0 &&
	    (error = ipcperm_access(&sp->shm_perm, SHM_W, CRED())))
		goto errret;
	if (spt_invalid(flags)) {
		error = EINVAL;
		goto errret;
	}
	if (ism_off)
		flags = flags & ~SHM_SHARE_MMU;
	if (share_page_table) {
		flags = flags & ~SHM_PAGEABLE;
		flags = flags | SHM_SHARE_MMU;
	}
	useISM = (spt_locked(flags) || spt_pageable(flags));
	if (useISM && (error = ipcperm_access(&sp->shm_perm, SHM_W, CRED())))
		goto errret;
	if (useISM && isspt(sp)) {
		uint_t newsptflags = flags | spt_flags(sp->shm_sptseg);
		/*
		 * If trying to change an existing {D}ISM segment from ISM
		 * to DISM or vice versa, return error.  Note that this
		 * validation of flags needs to be done after the effect of
		 * tunables such as ism_off and share_page_table, for
		 * semantics that are consistent with the tunables' settings.
		 */
		if (spt_invalid(newsptflags)) {
			error = EINVAL;
			goto errret;
		}
	}
	ANON_LOCK_ENTER(&sp->shm_amp->a_rwlock, RW_WRITER);
	size = sp->shm_amp->size;
	ANON_LOCK_EXIT(&sp->shm_amp->a_rwlock);

	/* somewhere to record spt info for final detach */
	if (sp->shm_sptinfo == NULL)
		sp->shm_sptinfo = kmem_zalloc(sizeof (sptinfo_t), KM_SLEEP);

	as_rangelock(as);

	if (useISM) {
		/*
		 * Handle ISM
		 */
		uint_t	share_szc;
		size_t	share_size;
		struct	shm_data ssd;
		uintptr_t align_hint;

		/*
		 * Pick a share pagesize to use, if (!isspt(sp)).
		 * Otherwise use the already chosen page size.
		 *
		 * For the initial shmat (!isspt(sp)), where sptcreate is
		 * called, map_pgsz is called to recommend a [D]ISM pagesize,
		 * which is important for systems offering more than one
		 * potential [D]ISM pagesize.
		 * If the shmat is just to attach to an already created
		 * [D]ISM segment, then use the previously selected page size.
		 */
		if (!isspt(sp)) {
			share_size = map_pgsz(MAPPGSZ_ISM, pp, addr, size, 0);
			if (share_size == 0) {
				as_rangeunlock(as);
				error = EINVAL;
				goto errret;
			}
			share_szc = page_szc(share_size);
		} else {
			share_szc = sp->shm_sptseg->s_szc;
			share_size = page_get_pagesize(share_szc);
		}
		size = P2ROUNDUP(size, share_size);

		align_hint = share_size;
#if defined(__i386) || defined(__amd64)
		/*
		 * For x86, we want to share as much of the page table tree
		 * as possible.  We use a large align_hint at first, but
		 * if that fails, then the code below retries with align_hint
		 * set to share_size.
		 *
		 * The explicit extern here is due to the difficulties
		 * of getting to platform dependent includes.  When/if the
		 * platform dependent bits of this function are cleaned up,
		 * another way of doing this should be found.
		 */
		{
			extern uint_t ptes_per_table;

			while (size >= ptes_per_table * (uint64_t)align_hint)
				align_hint *= ptes_per_table;
		}
#endif	/* __i386 || __amd64 */
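
		/*
		 * Worked example (values are illustrative): with a 2MB
		 * share_size and ptes_per_table == 512, a 4GB segment
		 * passes 4GB >= 512 * 2MB (1GB), raising align_hint to
		 * 1GB; 4GB >= 512 * 1GB (512GB) fails, so the loop
		 * stops and the attach below is first attempted at a
		 * 1GB alignment, letting the segment share whole upper
		 * levels of the page table tree.
		 */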

#if defined(__sparcv9)
		if (addr == 0 &&
		    pp->p_model == DATAMODEL_LP64 && AS_TYPE_64BIT(as)) {
			/*
			 * If no address has been passed in, and this is a
			 * 64-bit process, we'll try to find an address
			 * in the predict-ISM zone.
			 */
			caddr_t predbase = (caddr_t)PREDISM_1T_BASE;
			size_t len = PREDISM_BOUND - PREDISM_1T_BASE;

			as_purge(as);
			if (as_gap(as, size + share_size, &predbase, &len,
			    AH_LO, (caddr_t)NULL) != -1) {
				/*
				 * We found an address which looks like a
				 * candidate.  We want to round it up, and
				 * then check that it's a valid user range.
				 * This assures that we won't fail below.
				 */
				addr = (caddr_t)P2ROUNDUP((uintptr_t)predbase,
				    share_size);

				if (valid_usr_range(addr, size, prot,
				    as, as->a_userlimit) != RANGE_OKAY) {
					addr = 0;
				}
			}
		}
#endif	/* __sparcv9 */

		if (addr == 0) {
			for (;;) {
				addr = (caddr_t)align_hint;
				map_addr(&addr, size, 0ll, 1, MAP_ALIGN);
				if (addr != NULL || align_hint == share_size)
					break;
				align_hint = share_size;
			}
			if (addr == NULL) {
				as_rangeunlock(as);
				error = ENOMEM;
				goto errret;
			}
			ASSERT(((uintptr_t)addr & (align_hint - 1)) == 0);
		} else {
			/* Use the user-supplied attach address */
			caddr_t base;
			size_t len;

			/*
			 * Check that the address range
			 *  1) is properly aligned
			 *  2) is correct in unix terms
			 *  3) is within an unmapped address segment
			 */
			base = addr;
			len = size;		/* use spt aligned size */
			/* XXX - in SunOS, is sp->shm_segsz */
			if ((uintptr_t)base & (share_size - 1)) {
				error = EINVAL;
				as_rangeunlock(as);
				goto errret;
			}
			result = valid_usr_range(base, len, prot, as,
			    as->a_userlimit);
			if (result == RANGE_BADPROT) {
				/*
				 * We try to accommodate processors which
				 * may not support execute permissions on
				 * all ISM segments by trying the check
				 * again but without PROT_EXEC.
				 */
				prot &= ~PROT_EXEC;
				result = valid_usr_range(base, len, prot, as,
				    as->a_userlimit);
			}
			as_purge(as);
			if (result != RANGE_OKAY ||
			    as_gap(as, len, &base, &len, AH_LO,
			    (caddr_t)NULL) != 0) {
				error = EINVAL;
				as_rangeunlock(as);
				goto errret;
			}
		}

		if (!isspt(sp)) {
			error = sptcreate(size, &segspt, sp->shm_amp, prot,
			    flags, share_szc);
			if (error) {
				as_rangeunlock(as);
				goto errret;
			}
			sp->shm_sptinfo->sptas = segspt->s_as;
			sp->shm_sptseg = segspt;
			sp->shm_sptprot = prot;
		} else if ((prot & sp->shm_sptprot) != sp->shm_sptprot) {
			/*
			 * Ensure we're attaching to an ISM segment with
			 * fewer or equal permissions than what we're
			 * allowed.  Fail if the segment has more
			 * permissions than what we're allowed.
			 */
			error = EACCES;
			as_rangeunlock(as);
			goto errret;
		}

		ssd.shm_sptseg = sp->shm_sptseg;
		ssd.shm_sptas = sp->shm_sptinfo->sptas;
		ssd.shm_amp = sp->shm_amp;
		error = as_map(as, addr, size, segspt_shmattach, &ssd);
		if (error == 0)
			sp->shm_ismattch++;	/* keep count of ISM attaches */
	} else {

		/*
		 * Normal case.
		 */
		if (flags & SHM_RDONLY)
			prot &= ~PROT_WRITE;

		if (addr == 0) {
			/* Let the system pick the attach address */
			map_addr(&addr, size, 0ll, 1, 0);
			if (addr == NULL) {
				as_rangeunlock(as);
				error = ENOMEM;
				goto errret;
			}
		} else {
			/* Use the user-supplied attach address */
			caddr_t base;
			size_t len;

			if (flags & SHM_RND)
				addr = (caddr_t)((uintptr_t)addr &
				    ~(SHMLBA - 1));
			/*
			 * Check that the address range
			 *  1) is properly aligned
			 *  2) is correct in unix terms
			 *  3) is within an unmapped address segment
			 */
			base = addr;
			len = size;		/* use aligned size */
			/* XXX - in SunOS, is sp->shm_segsz */
			if ((uintptr_t)base & PAGEOFFSET) {
				error = EINVAL;
				as_rangeunlock(as);
				goto errret;
			}
			result = valid_usr_range(base, len, prot, as,
			    as->a_userlimit);
			if (result == RANGE_BADPROT) {
				prot &= ~PROT_EXEC;
				result = valid_usr_range(base, len, prot, as,
				    as->a_userlimit);
			}
			as_purge(as);
			if (result != RANGE_OKAY ||
			    as_gap(as, len, &base, &len,
			    AH_LO, (caddr_t)NULL) != 0) {
				error = EINVAL;
				as_rangeunlock(as);
				goto errret;
			}
		}

		/* Initialize the create arguments and map the segment */
		crargs = *(struct segvn_crargs *)zfod_argsp;
		crargs.offset = 0;
		crargs.type = MAP_SHARED;
		crargs.amp = sp->shm_amp;
		crargs.prot = prot;
		crargs.maxprot = crargs.prot;
		crargs.flags = 0;

		error = as_map(as, addr, size, segvn_create, &crargs);
	}

	as_rangeunlock(as);
	if (error)
		goto errret;

	/* record shmem range for the detach */
	sa_add(pp, addr, (size_t)size, useISM ? SHMSA_ISM : 0, sp);
	*rvp = (uintptr_t)addr;

	sp->shm_atime = gethrestime_sec();
	sp->shm_lpid = pp->p_pid;
	ipc_hold(shm_svc, (kipc_perm_t *)sp);

	/*
	 * Tell machine specific code that lwp has mapped shared memory
	 */
	LWP_MMODEL_SHARED_AS(addr, size);

errret:
	mutex_exit(lock);
	return (error);
}
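
/*
 * From userland, the ISM and normal paths above are selected by the
 * shmat(2) flags; a minimal sketch (illustrative only; the size is an
 * arbitrary assumption):
 *
 *	#include <sys/ipc.h>
 *	#include <sys/shm.h>
 *
 *	int id = shmget(IPC_PRIVATE, 64 * 1024 * 1024, IPC_CREAT | 0600);
 *	void *p = shmat(id, NULL, SHM_PAGEABLE);	(DISM)
 *
 * Passing SHM_SHARE_MMU instead requests ISM.  Once a segment has been
 * created one way, a later attach requesting the other flavor fails
 * with EINVAL, per the spt_invalid(newsptflags) check above.
 */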

static void
shm_dtor(kipc_perm_t *perm)
{
	kshmid_t *sp = (kshmid_t *)perm;
	uint_t cnt;
	size_t rsize;

	ANON_LOCK_ENTER(&sp->shm_amp->a_rwlock, RW_WRITER);
	anonmap_purge(sp->shm_amp);
	ANON_LOCK_EXIT(&sp->shm_amp->a_rwlock);

	if (sp->shm_sptinfo) {
		if (isspt(sp)) {
			sptdestroy(sp->shm_sptinfo->sptas, sp->shm_amp);
			sp->shm_lkcnt = 0;
		}
		kmem_free(sp->shm_sptinfo, sizeof (sptinfo_t));
	}

	if (sp->shm_lkcnt > 0) {
		shmem_unlock(sp, sp->shm_amp);
		sp->shm_lkcnt = 0;
	}

	ANON_LOCK_ENTER(&sp->shm_amp->a_rwlock, RW_WRITER);
	cnt = --sp->shm_amp->refcnt;
	ANON_LOCK_EXIT(&sp->shm_amp->a_rwlock);
	ASSERT(cnt == 0);
	shm_rm_amp(sp);

	if (sp->shm_perm.ipc_id != IPC_ID_INVAL) {
		rsize = ptob(btopr(sp->shm_segsz));
		ipcs_lock(shm_svc);
		sp->shm_perm.ipc_proj->kpj_data.kpd_shmmax -= rsize;
		sp->shm_perm.ipc_zone_ref.zref_zone->zone_shmmax -= rsize;
		ipcs_unlock(shm_svc);
	}
}

/* ARGSUSED */
static void
shm_rmid(kipc_perm_t *perm)
{
	/* nothing to do */
}

/*
 * Shmctl system call.
 */
/* ARGSUSED */
static int
shmctl(int shmid, int cmd, void *arg)
{
	kshmid_t		*sp;	/* shared memory header ptr */
	STRUCT_DECL(shmid_ds, ds);	/* for SVR4 IPC_SET */
	int			error = 0;
	struct cred		*cr = CRED();
	kmutex_t		*lock;
	model_t			mdl = get_udatamodel();
	struct shmid_ds64	ds64;
	shmatt_t		nattch;

	STRUCT_INIT(ds, mdl);

	/*
	 * Perform pre- or non-lookup actions (e.g. copyins, RMID).
	 */
	switch (cmd) {
	case IPC_SET:
		if (copyin(arg, STRUCT_BUF(ds), STRUCT_SIZE(ds)))
			return (EFAULT);
		break;

	case IPC_SET64:
		if (copyin(arg, &ds64, sizeof (struct shmid_ds64)))
			return (EFAULT);
		break;

	case IPC_RMID:
		return (ipc_rmid(shm_svc, shmid, cr));
	}

	if ((lock = ipc_lookup(shm_svc, shmid, (kipc_perm_t **)&sp)) == NULL)
		return (EINVAL);

	switch (cmd) {
	/* Set ownership and permissions. */
	case IPC_SET:
		if (error = ipcperm_set(shm_svc, cr, &sp->shm_perm,
		    &STRUCT_BUF(ds)->shm_perm, mdl))
			break;
		sp->shm_ctime = gethrestime_sec();
		break;

	case IPC_STAT:
		if (error = ipcperm_access(&sp->shm_perm, SHM_R, cr))
			break;

		nattch = sp->shm_perm.ipc_ref - 1;

		ipcperm_stat(&STRUCT_BUF(ds)->shm_perm, &sp->shm_perm, mdl);
		STRUCT_FSET(ds, shm_segsz, sp->shm_segsz);
		STRUCT_FSETP(ds, shm_amp, NULL);	/* kernel addr */
		STRUCT_FSET(ds, shm_lkcnt, sp->shm_lkcnt);
		STRUCT_FSET(ds, shm_lpid, sp->shm_lpid);
		STRUCT_FSET(ds, shm_cpid, sp->shm_cpid);
		STRUCT_FSET(ds, shm_nattch, nattch);
		STRUCT_FSET(ds, shm_cnattch, sp->shm_ismattch);
		STRUCT_FSET(ds, shm_atime, sp->shm_atime);
		STRUCT_FSET(ds, shm_dtime, sp->shm_dtime);
		STRUCT_FSET(ds, shm_ctime, sp->shm_ctime);

		mutex_exit(lock);
		if (copyout(STRUCT_BUF(ds), arg, STRUCT_SIZE(ds)))
			return (EFAULT);

		return (0);

	case IPC_SET64:
		if (error = ipcperm_set64(shm_svc, cr,
		    &sp->shm_perm, &ds64.shmx_perm))
			break;
		sp->shm_ctime = gethrestime_sec();
		break;

	case IPC_STAT64:
		nattch = sp->shm_perm.ipc_ref - 1;

		ipcperm_stat64(&ds64.shmx_perm, &sp->shm_perm);
		ds64.shmx_segsz = sp->shm_segsz;
		ds64.shmx_lkcnt = sp->shm_lkcnt;
		ds64.shmx_lpid = sp->shm_lpid;
		ds64.shmx_cpid = sp->shm_cpid;
		ds64.shmx_nattch = nattch;
		ds64.shmx_cnattch = sp->shm_ismattch;
		ds64.shmx_atime = sp->shm_atime;
		ds64.shmx_dtime = sp->shm_dtime;
		ds64.shmx_ctime = sp->shm_ctime;

		mutex_exit(lock);
		if (copyout(&ds64, arg, sizeof (struct shmid_ds64)))
			return (EFAULT);

		return (0);

	/* Lock segment in memory */
	case SHM_LOCK:
		if ((error = secpolicy_lock_memory(cr)) != 0)
			break;

		/* protect against overflow */
		if (sp->shm_lkcnt >= USHRT_MAX) {
			error = ENOMEM;
			break;
		}
		if (!isspt(sp) && (sp->shm_lkcnt++ == 0)) {
			if (error = shmem_lock(sp, sp->shm_amp)) {
				ANON_LOCK_ENTER(&sp->shm_amp->a_rwlock,
				    RW_WRITER);
				cmn_err(CE_NOTE, "shmctl - couldn't lock %ld"
				    " bytes into memory", sp->shm_amp->size);
				ANON_LOCK_EXIT(&sp->shm_amp->a_rwlock);
				error = ENOMEM;
				sp->shm_lkcnt--;
			}
		}
		break;

	/* Unlock segment */
	case SHM_UNLOCK:
		if ((error = secpolicy_lock_memory(cr)) != 0)
			break;

		if (sp->shm_lkcnt && (--sp->shm_lkcnt == 0)) {
			shmem_unlock(sp, sp->shm_amp);
		}
		break;

	default:
		error = EINVAL;
		break;
	}
	mutex_exit(lock);
	return (error);
}
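
/*
 * A userland sketch of the stat and lock paths above (illustrative
 * only; SHM_LOCK needs the privilege checked by
 * secpolicy_lock_memory()):
 *
 *	#include <sys/ipc.h>
 *	#include <sys/shm.h>
 *	#include <stdio.h>
 *
 *	struct shmid_ds ds;
 *
 *	if (shmctl(id, IPC_STAT, &ds) == 0)
 *		(void) printf("%lu attaches\n", (ulong_t)ds.shm_nattch);
 *	if (shmctl(id, SHM_LOCK, NULL) != 0)
 *		perror("SHM_LOCK");
 *
 * Each successful SHM_LOCK increments shm_lkcnt; pages stay locked
 * until a matching number of SHM_UNLOCK calls drops it back to zero.
 */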

static void
shm_detach(proc_t *pp, segacct_t *sap)
{
	kshmid_t	*sp = sap->sa_id;
	size_t		len = sap->sa_len;
	caddr_t		addr = sap->sa_addr;

	/*
	 * Discard lwpchan mappings.
	 */
	if (pp->p_lcp != NULL)
		lwpchan_delete_mapping(pp, addr, addr + len);
	(void) as_unmap(pp->p_as, addr, len);

	/*
	 * Perform some detach-time accounting.
	 */
	(void) ipc_lock(shm_svc, sp->shm_perm.ipc_id);
	if (sap->sa_flags & SHMSA_ISM)
		sp->shm_ismattch--;
	sp->shm_dtime = gethrestime_sec();
	sp->shm_lpid = pp->p_pid;
	ipc_rele(shm_svc, (kipc_perm_t *)sp);	/* Drops lock */

	kmem_free(sap, sizeof (segacct_t));
}

static int
shmdt(caddr_t addr)
{
	proc_t *pp = curproc;
	segacct_t *sap, template;

	mutex_enter(&pp->p_lock);
	prbarrier(pp);			/* block /proc.  See shmgetid(). */

	template.sa_addr = addr;
	template.sa_len = 0;
	if ((pp->p_segacct == NULL) ||
	    ((sap = avl_find(pp->p_segacct, &template, NULL)) == NULL)) {
		mutex_exit(&pp->p_lock);
		return (EINVAL);
	}
	if (sap->sa_addr != addr) {
		mutex_exit(&pp->p_lock);
		return (EINVAL);
	}
	avl_remove(pp->p_segacct, sap);
	mutex_exit(&pp->p_lock);

	shm_detach(pp, sap);

	return (0);
}

/*
 * Remove all shared memory segments associated with a given zone.
 * Called by zone_shutdown when the zone is halted.
 */
/*ARGSUSED1*/
static void
shm_remove_zone(zoneid_t zoneid, void *arg)
{
	ipc_remove_zone(shm_svc, zoneid);
}

/*
 * Shmget (create new shmem) system call.
 */
static int
shmget(key_t key, size_t size, int shmflg, uintptr_t *rvp)
{
	proc_t		*pp = curproc;
	kshmid_t	*sp;
	kmutex_t	*lock;
	int		error;

top:
	if (error = ipc_get(shm_svc, key, shmflg, (kipc_perm_t **)&sp, &lock))
		return (error);

	if (!IPC_FREE(&sp->shm_perm)) {
		/*
		 * A segment with the requested key exists.
		 */
		if (size > sp->shm_segsz) {
			mutex_exit(lock);
			return (EINVAL);
		}
	} else {
		/*
		 * A new segment should be created.
		 */
		size_t npages = btopr(size);
		size_t rsize = ptob(npages);

		/*
		 * Check rsize and the per-project and per-zone limit on
		 * shared memory.  Checking rsize handles both the size == 0
		 * case and the size > (ULONG_MAX & PAGEMASK) case (i.e.
		 * rounding up wraps a size_t).
		 */
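		/*
		 * For example (assuming 4K pages on a 64-bit kernel), a
		 * request of size == SIZE_MAX makes btopr(size) wrap to
		 * 0, so rsize == 0 and the request is refused below
		 * rather than reserving a zero-length segment.
		 */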
		if (rsize == 0 ||
		    (rctl_test(rc_project_shmmax,
		    pp->p_task->tk_proj->kpj_rctls, pp, rsize,
		    RCA_SAFE) & RCT_DENY) ||
		    (rctl_test(rc_zone_shmmax,
		    pp->p_zone->zone_rctls, pp, rsize,
		    RCA_SAFE) & RCT_DENY)) {

			mutex_exit(&pp->p_lock);
			mutex_exit(lock);
			ipc_cleanup(shm_svc, (kipc_perm_t *)sp);
			return (EINVAL);
		}
		mutex_exit(&pp->p_lock);
		mutex_exit(lock);

		if (anon_resv(rsize) == 0) {
			ipc_cleanup(shm_svc, (kipc_perm_t *)sp);
			return (ENOMEM);
		}

		/*
		 * If any new failure points are introduced between the
		 * above anon_resv() and the below ipc_commit_begin(), these
		 * failure points will need to unreserve the anon reserved
		 * using anon_unresv().
		 *
		 * Once ipc_commit_begin() is called, the anon reserved
		 * above will be automatically unreserved by future calls to
		 * ipc_cleanup() -> shm_dtor() -> shm_rm_amp().  If
		 * ipc_commit_begin() fails, it internally calls shm_dtor(),
		 * unreserving the above anon, and freeing the below amp.
		 */

		sp->shm_amp = anonmap_alloc(rsize, rsize, ANON_SLEEP);
		sp->shm_amp->a_sp = sp;
		/*
		 * Store the original user's requested size, in bytes,
		 * rather than the page-aligned size.  The former is
		 * used for IPC_STAT and shmget() lookups.  The latter
		 * is saved in the anon_map structure and is used for
		 * calls to the vm layer.
		 */
		sp->shm_segsz = size;
		sp->shm_atime = sp->shm_dtime = 0;
		sp->shm_ctime = gethrestime_sec();
		sp->shm_lpid = (pid_t)0;
		sp->shm_cpid = curproc->p_pid;
		sp->shm_ismattch = 0;
		sp->shm_sptinfo = NULL;
		/*
		 * Check limits one last time, push id into global
		 * visibility, and update resource usage counts.
		 */
		if (error = ipc_commit_begin(shm_svc, key, shmflg,
		    (kipc_perm_t *)sp)) {
			if (error == EAGAIN)
				goto top;
			return (error);
		}

		if ((rctl_test(rc_project_shmmax,
		    sp->shm_perm.ipc_proj->kpj_rctls, pp, rsize,
		    RCA_SAFE) & RCT_DENY) ||
		    (rctl_test(rc_zone_shmmax,
		    sp->shm_perm.ipc_zone_ref.zref_zone->zone_rctls, pp, rsize,
		    RCA_SAFE) & RCT_DENY)) {
			ipc_cleanup(shm_svc, (kipc_perm_t *)sp);
			return (EINVAL);
		}
		sp->shm_perm.ipc_proj->kpj_data.kpd_shmmax += rsize;
		sp->shm_perm.ipc_zone_ref.zref_zone->zone_shmmax += rsize;

		lock = ipc_commit_end(shm_svc, &sp->shm_perm);
	}

	if (AU_AUDITING())
		audit_ipcget(AT_IPC_SHM, (void *)sp);

	*rvp = (uintptr_t)(sp->shm_perm.ipc_id);

	mutex_exit(lock);
	return (0);
}

/*
 * shmids system call.
 */
static int
shmids(int *buf, uint_t nids, uint_t *pnids)
{
	return (ipc_ids(shm_svc, buf, nids, pnids));
}

/*
 * System entry point for shmat, shmctl, shmdt, and shmget system calls.
 */
static uintptr_t
shmsys(int opcode, uintptr_t a0, uintptr_t a1, uintptr_t a2)
{
	int	error;
	uintptr_t r_val = 0;

	switch (opcode) {
	case SHMAT:
		error = shmat((int)a0, (caddr_t)a1, (int)a2, &r_val);
		break;
	case SHMCTL:
		error = shmctl((int)a0, (int)a1, (void *)a2);
		break;
	case SHMDT:
		error = shmdt((caddr_t)a0);
		break;
	case SHMGET:
		error = shmget((key_t)a0, (size_t)a1, (int)a2, &r_val);
		break;
	case SHMIDS:
		error = shmids((int *)a0, (uint_t)a1, (uint_t *)a2);
		break;
	default:
		error = EINVAL;
		break;
	}

	if (error)
		return ((uintptr_t)set_errno(error));

	return (r_val);
}

/*
 * segacct_t comparator
 * This works as expected, with one minor change: the first of two real
 * segments with equal addresses is considered to be 'greater than' the
 * second.  We only return equal when searching using a template, in
 * which case we explicitly set the template segment's length to 0
 * (which is invalid for a real segment).
 */
static int
shm_sacompar(const void *x, const void *y)
{
	segacct_t *sa1 = (segacct_t *)x;
	segacct_t *sa2 = (segacct_t *)y;

	if (sa1->sa_addr < sa2->sa_addr) {
		return (-1);
	} else if (sa2->sa_len != 0) {
		if (sa1->sa_addr >= sa2->sa_addr + sa2->sa_len) {
			return (1);
		} else if (sa1->sa_len != 0) {
			return (1);
		} else {
			return (0);
		}
	} else if (sa1->sa_addr > sa2->sa_addr) {
		return (1);
	} else {
		return (0);
	}
}
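
/*
 * Worked example (addresses are illustrative): with a real segment
 * spanning [0x10000, 0x30000) in the tree, a template of
 * { sa_addr = 0x20000, sa_len = 0 } compares equal to it, so the
 * avl_find() calls in shmdt() and shmgetid() return the segment
 * containing the lookup address; a template at 0x30000 or beyond
 * compares greater and misses.
 */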

/*
 * add this record to the segacct list.
 */
static void
sa_add(struct proc *pp, caddr_t addr, size_t len, ulong_t flags, kshmid_t *id)
{
	segacct_t *nsap;
	avl_tree_t *tree = NULL;
	avl_index_t where;

	nsap = kmem_alloc(sizeof (segacct_t), KM_SLEEP);
	nsap->sa_addr = addr;
	nsap->sa_len = len;
	nsap->sa_flags = flags;
	nsap->sa_id = id;

	if (pp->p_segacct == NULL)
		tree = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);

	mutex_enter(&pp->p_lock);
	prbarrier(pp);			/* block /proc.  See shmgetid(). */

	if (pp->p_segacct == NULL) {
		avl_create(tree, shm_sacompar, sizeof (segacct_t),
		    offsetof(segacct_t, sa_tree));
		pp->p_segacct = tree;
	} else if (tree) {
		kmem_free(tree, sizeof (avl_tree_t));
	}

	/*
	 * We can ignore the result of avl_find, as the comparator will
	 * never return equal for segments with non-zero length.  This
	 * is a necessary hack to get around the fact that we do, in
	 * fact, have duplicate keys.
	 */
	(void) avl_find(pp->p_segacct, nsap, &where);
	avl_insert(pp->p_segacct, nsap, where);

	mutex_exit(&pp->p_lock);
}

/*
 * Duplicate parent's segacct records in child.
 */
void
shmfork(struct proc *ppp, struct proc *cpp)
{
	segacct_t *sap;
	kshmid_t *sp;
	kmutex_t *mp;

	ASSERT(ppp->p_segacct != NULL);

	/*
	 * We are the only lwp running in the parent so nobody can
	 * mess with our p_segacct list.  Thus it is safe to traverse
	 * the list without holding p_lock.  This is essential because
	 * we can't hold p_lock during a KM_SLEEP allocation.
	 */
	for (sap = (segacct_t *)avl_first(ppp->p_segacct); sap != NULL;
	    sap = (segacct_t *)AVL_NEXT(ppp->p_segacct, sap)) {
		sa_add(cpp, sap->sa_addr, sap->sa_len, sap->sa_flags,
		    sap->sa_id);
		sp = sap->sa_id;
		mp = ipc_lock(shm_svc, sp->shm_perm.ipc_id);
		if (sap->sa_flags & SHMSA_ISM)
			sp->shm_ismattch++;
		ipc_hold(shm_svc, (kipc_perm_t *)sp);
		mutex_exit(mp);
	}
}

/*
 * Detach shared memory segments from exiting process.
 */
void
shmexit(struct proc *pp)
{
	segacct_t *sap;
	avl_tree_t *tree;
	void *cookie = NULL;

	ASSERT(pp->p_segacct != NULL);

	mutex_enter(&pp->p_lock);
	prbarrier(pp);
	tree = pp->p_segacct;
	pp->p_segacct = NULL;
	mutex_exit(&pp->p_lock);

	while ((sap = avl_destroy_nodes(tree, &cookie)) != NULL)
		shm_detach(pp, sap);

	avl_destroy(tree);
	kmem_free(tree, sizeof (avl_tree_t));
}

/*
 * At this time pages should be in memory, so just lock them.
 */
static void
lock_again(size_t npages, kshmid_t *sp, struct anon_map *amp)
{
	struct anon *ap;
	struct page *pp;
	struct vnode *vp;
	u_offset_t off;
	ulong_t anon_idx;
	anon_sync_obj_t cookie;

	mutex_enter(&sp->shm_mlock);
	ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
	for (anon_idx = 0; npages != 0; anon_idx++, npages--) {

		anon_array_enter(amp, anon_idx, &cookie);
		ap = anon_get_ptr(amp->ahp, anon_idx);
		ASSERT(ap != NULL);
		swap_xlate(ap, &vp, &off);
		anon_array_exit(&cookie);

		pp = page_lookup(vp, off, SE_SHARED);
		if (pp == NULL) {
			panic("lock_again: page not in the system");
			/*NOTREACHED*/
		}
		/* page should already be locked by caller */
		ASSERT(pp->p_lckcnt > 0);
		(void) page_pp_lock(pp, 0, 0);
		page_unlock(pp);
	}
	ANON_LOCK_EXIT(&amp->a_rwlock);
	mutex_exit(&sp->shm_mlock);
}

/*
 * Attach the shared memory segment to the process
 * address space and lock the pages.
 */
static int
shmem_lock(kshmid_t *sp, struct anon_map *amp)
{
	size_t npages = btopr(amp->size);
	struct as *as;
	struct segvn_crargs crargs;
	uint_t error;

	/*
	 * A later ISM/DISM attach may increase the size of the amp, so
	 * cache the number of pages locked for the future shmem_unlock()
	 */
	sp->shm_lkpages = npages;

	as = as_alloc();
	/* Initialize the create arguments and map the segment */
	crargs = *(struct segvn_crargs *)zfod_argsp;	/* structure copy */
	crargs.offset = (u_offset_t)0;
	crargs.type = MAP_SHARED;
	crargs.amp = amp;
	crargs.prot = PROT_ALL;
	crargs.maxprot = crargs.prot;
	crargs.flags = 0;
	error = as_map(as, 0x0, amp->size, segvn_create, &crargs);
	if (!error) {
		if ((error = as_ctl(as, 0x0, amp->size, MC_LOCK, 0, 0,
		    NULL, 0)) == 0) {
			lock_again(npages, sp, amp);
		}
		(void) as_unmap(as, 0x0, amp->size);
	}
	as_free(as);
	return (error);
}


/*
 * Unlock shared memory
 */
static void
shmem_unlock(kshmid_t *sp, struct anon_map *amp)
{
	struct anon *ap;
	pgcnt_t npages = sp->shm_lkpages;
	struct vnode *vp;
	struct page *pp;
	u_offset_t off;
	ulong_t anon_idx;
	size_t unlocked_bytes = 0;
	kproject_t *proj;
	anon_sync_obj_t cookie;

	proj = sp->shm_perm.ipc_proj;
	mutex_enter(&sp->shm_mlock);
	ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
	for (anon_idx = 0; anon_idx < npages; anon_idx++) {

		anon_array_enter(amp, anon_idx, &cookie);
		if ((ap = anon_get_ptr(amp->ahp, anon_idx)) == NULL) {
			panic("shmem_unlock: null ap");
			/*NOTREACHED*/
		}
		swap_xlate(ap, &vp, &off);
		anon_array_exit(&cookie);
		pp = page_lookup(vp, off, SE_SHARED);
		if (pp == NULL) {
			panic("shmem_unlock: page not in the system");
			/*NOTREACHED*/
		}
		/*
		 * Page should have at least one lock from the previous
		 * shmem_lock.
		 */
		ASSERT(pp->p_lckcnt > 0);
		page_pp_unlock(pp, 0, 0);
		if (pp->p_lckcnt == 0)
			unlocked_bytes += PAGESIZE;

		page_unlock(pp);
	}

	if (unlocked_bytes > 0) {
		rctl_decr_locked_mem(NULL, proj, unlocked_bytes, 0);
	}

	ANON_LOCK_EXIT(&amp->a_rwlock);
	mutex_exit(&sp->shm_mlock);
}

/*
 * We call this routine when we have removed all references to this
 * amp.  This means all shmdt()s and the IPC_RMID have been done.
 */
static void
shm_rm_amp(kshmid_t *sp)
{
	struct anon_map *amp = sp->shm_amp;
	zone_t *zone;

	zone = sp->shm_perm.ipc_zone_ref.zref_zone;
	ASSERT(zone != NULL);
	/*
	 * Free up the anon_map.
	 */
	lgrp_shm_policy_fini(amp, NULL);
	ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
	if (amp->a_szc != 0) {
		anon_shmap_free_pages(amp, 0, amp->size);
	} else {
		anon_free(amp->ahp, 0, amp->size);
	}
	ANON_LOCK_EXIT(&amp->a_rwlock);
	anon_unresv_zone(amp->swresv, zone);
	anonmap_free(amp);
}

/*
 * Return the shared memory id for the process's virtual address.
 * Return SHMID_NONE if addr is not within a SysV shared memory segment.
 * Return SHMID_FREE if addr's SysV shared memory segment's id has been freed.
 *
 * shmgetid() is called from code in /proc with the process locked but
 * with pp->p_lock not held.  The address space lock is held, so we
 * cannot grab pp->p_lock here due to lock-ordering constraints.
 * Because of all this, modifications to the p_segacct list must only
 * be made after calling prbarrier() to ensure the process is not locked.
 * See shmdt() and sa_add(), above.  shmgetid() may also be called on a
 * thread's own process without the process locked.
 */
int
shmgetid(proc_t *pp, caddr_t addr)
{
	segacct_t *sap, template;

	ASSERT(MUTEX_NOT_HELD(&pp->p_lock));
	ASSERT((pp->p_proc_flag & P_PR_LOCK) || pp == curproc);

	if (pp->p_segacct == NULL)
		return (SHMID_NONE);

	template.sa_addr = addr;
	template.sa_len = 0;
	if ((sap = avl_find(pp->p_segacct, &template, NULL)) == NULL)
		return (SHMID_NONE);

	if (IPC_FREE(&sap->sa_id->shm_perm))
		return (SHMID_FREE);

	return (sap->sa_id->shm_perm.ipc_id);
}