1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ 27 /* All Rights Reserved */ 28 29 /* 30 * University Copyright- Copyright (c) 1982, 1986, 1988 31 * The Regents of the University of California 32 * All Rights Reserved 33 * 34 * University Acknowledgment- Portions of this document are derived from 35 * software developed by the University of California, Berkeley, and its 36 * contributors. 37 */ 38 39 #pragma ident "%Z%%M% %I% %E% SMI" 40 41 /* 42 * Inter-Process Communication Shared Memory Facility. 43 * 44 * See os/ipc.c for a description of common IPC functionality. 45 * 46 * Resource controls 47 * ----------------- 48 * 49 * Control: zone.max-shm-ids (rc_zone_shmmni) 50 * Description: Maximum number of shared memory ids allowed a zone. 51 * 52 * When shmget() is used to allocate a shared memory segment, one id 53 * is allocated. If the id allocation doesn't succeed, shmget() 54 * fails and errno is set to ENOSPC. Upon successful shmctl(, 55 * IPC_RMID) the id is deallocated. 
56 * 57 * Control: project.max-shm-ids (rc_project_shmmni) 58 * Description: Maximum number of shared memory ids allowed a project. 59 * 60 * When shmget() is used to allocate a shared memory segment, one id 61 * is allocated. If the id allocation doesn't succeed, shmget() 62 * fails and errno is set to ENOSPC. Upon successful shmctl(, 63 * IPC_RMID) the id is deallocated. 64 * 65 * Control: zone.max-shm-memory (rc_zone_shmmax) 66 * Description: Total amount of shared memory allowed a zone. 67 * 68 * When shmget() is used to allocate a shared memory segment, the 69 * segment's size is allocated against this limit. If the space 70 * allocation doesn't succeed, shmget() fails and errno is set to 71 * EINVAL. The size will be deallocated once the last process has 72 * detached the segment and the segment has been successfully 73 * shmctl(, IPC_RMID)ed. 74 * 75 * Control: project.max-shm-memory (rc_project_shmmax) 76 * Description: Total amount of shared memory allowed a project. 77 * 78 * When shmget() is used to allocate a shared memory segment, the 79 * segment's size is allocated against this limit. If the space 80 * allocation doesn't succeed, shmget() fails and errno is set to 81 * EINVAL. The size will be deallocated once the last process has 82 * detached the segment and the segment has been successfully 83 * shmctl(, IPC_RMID)ed. 
84 */ 85 86 #include <sys/types.h> 87 #include <sys/param.h> 88 #include <sys/cred.h> 89 #include <sys/errno.h> 90 #include <sys/time.h> 91 #include <sys/kmem.h> 92 #include <sys/user.h> 93 #include <sys/proc.h> 94 #include <sys/systm.h> 95 #include <sys/prsystm.h> 96 #include <sys/sysmacros.h> 97 #include <sys/tuneable.h> 98 #include <sys/vm.h> 99 #include <sys/mman.h> 100 #include <sys/swap.h> 101 #include <sys/cmn_err.h> 102 #include <sys/debug.h> 103 #include <sys/lwpchan_impl.h> 104 #include <sys/avl.h> 105 #include <sys/modctl.h> 106 #include <sys/syscall.h> 107 #include <sys/task.h> 108 #include <sys/project.h> 109 #include <sys/policy.h> 110 #include <sys/zone.h> 111 #include <sys/rctl.h> 112 113 #include <sys/ipc.h> 114 #include <sys/ipc_impl.h> 115 #include <sys/shm.h> 116 #include <sys/shm_impl.h> 117 118 #include <vm/hat.h> 119 #include <vm/seg.h> 120 #include <vm/as.h> 121 #include <vm/seg_vn.h> 122 #include <vm/anon.h> 123 #include <vm/page.h> 124 #include <vm/vpage.h> 125 #include <vm/seg_spt.h> 126 127 #include <c2/audit.h> 128 129 static int shmem_lock(kshmid_t *sp, struct anon_map *amp); 130 static void shmem_unlock(kshmid_t *sp, struct anon_map *amp); 131 static void sa_add(struct proc *pp, caddr_t addr, size_t len, ulong_t flags, 132 kshmid_t *id); 133 static void shm_rm_amp(kshmid_t *sp); 134 static void shm_dtor(kipc_perm_t *); 135 static void shm_rmid(kipc_perm_t *); 136 static void shm_remove_zone(zoneid_t, void *); 137 138 /* 139 * Semantics for share_page_table and ism_off: 140 * 141 * These are hooks in /etc/system - only for internal testing purpose. 142 * 143 * Setting share_page_table automatically turns on the SHM_SHARE_MMU (ISM) flag 144 * in a call to shmat(2). In other words, with share_page_table set, you always 145 * get ISM, even if say, DISM is specified. It should really be called "ism_on". 146 * 147 * Setting ism_off turns off the SHM_SHARE_MMU flag from the flags passed to 148 * shmat(2). 
 *
 * If both share_page_table and ism_off are set, share_page_table prevails.
 *
 * Although these tunables should probably be removed, they do have some
 * external exposure; as long as they exist, they should at least work sensibly.
 */

int share_page_table;	/* /etc/system hook: force ISM on every shmat(2) */
int ism_off;		/* /etc/system hook: strip SHM_SHARE_MMU from shmat(2) */

/*
 * The following tunables are obsolete.  Though for compatibility we
 * still read and interpret shminfo_shmmax and shminfo_shmmni (see
 * os/project.c), the preferred mechanism for administrating the IPC
 * Shared Memory facility is through the resource controls described at
 * the top of this file.
 */
size_t	shminfo_shmmax = 0x800000;	/* (obsolete) */
int	shminfo_shmmni = 100;		/* (obsolete) */
size_t	shminfo_shmmin = 1;		/* (obsolete) */
int	shminfo_shmseg = 6;		/* (obsolete) */

/* Resource control handles, defined by the rctl framework (see os/rctl.c). */
extern rctl_hndl_t rc_zone_shmmax;
extern rctl_hndl_t rc_zone_shmmni;
extern rctl_hndl_t rc_project_shmmax;
extern rctl_hndl_t rc_project_shmmni;

/* Common-IPC service state for shared memory; created in _init(). */
static ipc_service_t *shm_svc;
/* Zone shutdown callback key; registered in _init(). */
static zone_key_t shm_zone_key;

/*
 * Module linkage information for the kernel.
 */
static uintptr_t shmsys(int, uintptr_t, uintptr_t, uintptr_t);

/*
 * Syscall entry table: 4 arguments, dispatched through shmsys().
 * The 64-bit return-value flag differs depending on whether the kernel
 * also supports 32-bit syscall entry.
 */
static struct sysent ipcshm_sysent = {
	4,
#ifdef	_SYSCALL32_IMPL
	SE_ARGC | SE_NOUNLOAD | SE_64RVAL,
#else	/* _SYSCALL32_IMPL */
	SE_ARGC | SE_NOUNLOAD | SE_32RVAL1,
#endif	/* _SYSCALL32_IMPL */
	(int (*)())shmsys
};

#ifdef	_SYSCALL32_IMPL
/* Entry table used for 32-bit processes on a 64-bit kernel. */
static struct sysent ipcshm_sysent32 = {
	4,
	SE_ARGC | SE_NOUNLOAD | SE_32RVAL1,
	(int (*)())shmsys
};
#endif	/* _SYSCALL32_IMPL */

static struct modlsys modlsys = {
	&mod_syscallops, "System V shared memory", &ipcshm_sysent
};

#ifdef	_SYSCALL32_IMPL
static struct modlsys modlsys32 = {
	&mod_syscallops32, "32-bit System V shared memory", &ipcshm_sysent32
};
#endif	/* _SYSCALL32_IMPL */

static struct modlinkage modlinkage = {
	MODREV_1,
	&modlsys,
#ifdef	_SYSCALL32_IMPL
	&modlsys32,
#endif
	NULL
};


/*
 * Module load entry point.  Creates the common-IPC "shmids" service and
 * registers the zone-shutdown callback before installing the syscall;
 * both are torn down again if mod_install() fails.
 */
int
_init(void)
{
	int result;

	shm_svc = ipcs_create("shmids", rc_project_shmmni, rc_zone_shmmni,
	    sizeof (kshmid_t), shm_dtor, shm_rmid, AT_IPC_SHM,
	    offsetof(ipc_rqty_t, ipcq_shmmni));
	zone_key_create(&shm_zone_key, NULL, shm_remove_zone, NULL);

	if ((result = mod_install(&modlinkage)) == 0)
		return (0);

	/* Installation failed: undo the zone key and IPC service. */
	(void) zone_key_delete(shm_zone_key);
	ipcs_destroy(shm_svc);

	return (result);
}

/*
 * This module can never be unloaded (SE_NOUNLOAD); always refuse.
 */
int
_fini(void)
{
	return (EBUSY);
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}

/*
 * Shmat (attach shared segment) system call.
 */
static int
shmat(int shmid, caddr_t uaddr, int uflags, uintptr_t *rvp)
{
	kshmid_t *sp;			/* shared memory header ptr */
	size_t	size;
	int	error = 0;
	proc_t *pp = curproc;
	struct as *as = pp->p_as;
	struct segvn_crargs	crargs;	/* segvn create arguments */
	kmutex_t *lock;
	struct seg *segspt = NULL;
	caddr_t addr = uaddr;
	int flags = (uflags & SHMAT_VALID_FLAGS_MASK);
	int useISM;
	uchar_t prot = PROT_ALL;
	int result;

	/* Look up the id; on success the id's lock is held until errret. */
	if ((lock = ipc_lookup(shm_svc, shmid, (kipc_perm_t **)&sp)) == NULL)
		return (EINVAL);
	if (error = ipcperm_access(&sp->shm_perm, SHM_R, CRED()))
		goto errret;
	/* A writable attach additionally requires SHM_W permission. */
	if ((flags & SHM_RDONLY) == 0 &&
	    (error = ipcperm_access(&sp->shm_perm, SHM_W, CRED())))
		goto errret;
	if (spt_invalid(flags)) {
		error = EINVAL;
		goto errret;
	}
	/* Apply the /etc/system test hooks (see block comment above). */
	if (ism_off)
		flags = flags & ~SHM_SHARE_MMU;
	if (share_page_table) {
		flags = flags & ~SHM_PAGEABLE;
		flags = flags | SHM_SHARE_MMU;
	}
	useISM = (spt_locked(flags) || spt_pageable(flags));
	/* ISM and DISM attaches always require write permission. */
	if (useISM && (error = ipcperm_access(&sp->shm_perm, SHM_W, CRED())))
		goto errret;
	if (useISM && isspt(sp)) {
		uint_t newsptflags = flags | spt_flags(sp->shm_sptseg);
		/*
		 * If trying to change an existing {D}ISM segment from ISM
		 * to DISM or vice versa, return error.  Note that this
		 * validation of flags needs to be done after the effect of
		 * tunables such as ism_off and share_page_table, for
		 * semantics that are consistent with the tunables' settings.
		 */
		if (spt_invalid(newsptflags)) {
			error = EINVAL;
			goto errret;
		}
	}
	ANON_LOCK_ENTER(&sp->shm_amp->a_rwlock, RW_WRITER);
	size = sp->shm_amp->size;
	ANON_LOCK_EXIT(&sp->shm_amp->a_rwlock);

	/* somewhere to record spt info for final detach */
	if (sp->shm_sptinfo == NULL)
		sp->shm_sptinfo = kmem_zalloc(sizeof (sptinfo_t), KM_SLEEP);

	as_rangelock(as);

	if (useISM) {
		/*
		 * Handle ISM
		 */
		uint_t	share_szc;
		size_t	share_size;
		struct	shm_data ssd;
		uintptr_t align_hint;

		/*
		 * Pick a share pagesize to use, if (!isspt(sp)).
		 * Otherwise use the already chosen page size.
		 *
		 * For the initial shmat (!isspt(sp)), where sptcreate is
		 * called, map_pgsz is called to recommend a [D]ISM pagesize,
		 * important for systems which offer more than one potential
		 * [D]ISM pagesize.
		 * If the shmat is just to attach to an already created
		 * [D]ISM segment, then use the previously selected page size.
		 */
		if (!isspt(sp)) {
			share_size = map_pgsz(MAPPGSZ_ISM, pp, addr, size, 0);
			if (share_size == 0) {
				as_rangeunlock(as);
				error = EINVAL;
				goto errret;
			}
			share_szc = page_szc(share_size);
		} else {
			share_szc = sp->shm_sptseg->s_szc;
			share_size = page_get_pagesize(share_szc);
		}
		size = P2ROUNDUP(size, share_size);

		align_hint = share_size;
#if defined(__i386) || defined(__amd64)
		/*
		 * For x86, we want to share as much of the page table tree
		 * as possible.  We use a large align_hint at first, but
		 * if that fails, then the code below retries with align_hint
		 * set to share_size.
		 *
		 * The explicit extern here is due to the difficulties
		 * of getting to platform dependent includes.  When/if the
		 * platform dependent bits of this function are cleaned up,
		 * another way of doing this should be found.
		 */
		{
			extern uint_t ptes_per_table;

			while (size >= ptes_per_table * (uint64_t)align_hint)
				align_hint *= ptes_per_table;
		}
#endif /* __i386 || __amd64 */

#if defined(__sparcv9)
		if (addr == 0 && curproc->p_model == DATAMODEL_LP64) {
			/*
			 * If no address has been passed in, and this is a
			 * 64-bit process, we'll try to find an address
			 * in the predict-ISM zone.
			 */
			caddr_t predbase = (caddr_t)PREDISM_1T_BASE;
			size_t len = PREDISM_BOUND - PREDISM_1T_BASE;

			as_purge(as);
			if (as_gap(as, size + share_size, &predbase, &len,
			    AH_LO, (caddr_t)NULL) != -1) {
				/*
				 * We found an address which looks like a
				 * candidate.  We want to round it up, and
				 * then check that it's a valid user range.
				 * This assures that we won't fail below.
				 */
				addr = (caddr_t)P2ROUNDUP((uintptr_t)predbase,
				    share_size);

				if (valid_usr_range(addr, size, prot,
				    as, as->a_userlimit) != RANGE_OKAY) {
					addr = 0;
				}
			}
		}
#endif /* __sparcv9 */

		if (addr == 0) {
			/* Kernel picks the address, aligned to align_hint. */
			for (;;) {
				addr = (caddr_t)align_hint;
				map_addr(&addr, size, 0ll, 1, MAP_ALIGN);
				if (addr != NULL || align_hint == share_size)
					break;
				/* Big alignment failed; retry once smaller. */
				align_hint = share_size;
			}
			if (addr == NULL) {
				as_rangeunlock(as);
				error = ENOMEM;
				goto errret;
			}
			ASSERT(((uintptr_t)addr & (align_hint - 1)) == 0);
		} else {
			/* Use the user-supplied attach address */
			caddr_t base;
			size_t len;

			/*
			 * Check that the address range
			 *  1) is properly aligned
			 *  2) is correct in unix terms
			 *  3) is within an unmapped address segment
			 */
			base = addr;
			len = size;		/* use spt aligned size */
			/* XXX - in SunOS, is sp->shm_segsz */
			if ((uintptr_t)base & (share_size - 1)) {
				error = EINVAL;
				as_rangeunlock(as);
				goto errret;
			}
			result = valid_usr_range(base, len, prot, as,
			    as->a_userlimit);
			if (result == RANGE_BADPROT) {
				/*
				 * We try to accommodate processors which
				 * may not support execute permissions on
				 * all ISM segments by trying the check
				 * again but without PROT_EXEC.
				 */
				prot &= ~PROT_EXEC;
				result = valid_usr_range(base, len, prot, as,
				    as->a_userlimit);
			}
			as_purge(as);
			if (result != RANGE_OKAY ||
			    as_gap(as, len, &base, &len, AH_LO,
			    (caddr_t)NULL) != 0) {
				error = EINVAL;
				as_rangeunlock(as);
				goto errret;
			}
		}

		if (!isspt(sp)) {
			/* First [D]ISM attach: create the shared spt seg. */
			error = sptcreate(size, &segspt, sp->shm_amp, prot,
			    flags, share_szc);
			if (error) {
				as_rangeunlock(as);
				goto errret;
			}
			sp->shm_sptinfo->sptas = segspt->s_as;
			sp->shm_sptseg = segspt;
			sp->shm_sptprot = prot;
		} else if ((prot & sp->shm_sptprot) != sp->shm_sptprot) {
			/*
			 * Ensure we're attaching to an ISM segment with
			 * fewer or equal permissions than what we're
			 * allowed.  Fail if the segment has more
			 * permissions than what we're allowed.
			 */
			error = EACCES;
			as_rangeunlock(as);
			goto errret;
		}

		ssd.shm_sptseg = sp->shm_sptseg;
		ssd.shm_sptas = sp->shm_sptinfo->sptas;
		ssd.shm_amp = sp->shm_amp;
		error = as_map(as, addr, size, segspt_shmattach, &ssd);
		if (error == 0)
			sp->shm_ismattch++; /* keep count of ISM attaches */
	} else {

		/*
		 * Normal case: a plain (non-ISM) segvn shared mapping.
		 */
		if (flags & SHM_RDONLY)
			prot &= ~PROT_WRITE;

		if (addr == 0) {
			/* Let the system pick the attach address */
			map_addr(&addr, size, 0ll, 1, 0);
			if (addr == NULL) {
				as_rangeunlock(as);
				error = ENOMEM;
				goto errret;
			}
		} else {
			/* Use the user-supplied attach address */
			caddr_t base;
			size_t len;

			/* SHM_RND: round the address down to SHMLBA. */
			if (flags & SHM_RND)
				addr = (caddr_t)((uintptr_t)addr &
				    ~(SHMLBA - 1));
			/*
			 * Check that the address range
			 *  1) is properly aligned
			 *  2) is correct in unix terms
			 *  3) is within an unmapped address segment
			 */
			base = addr;
			len = size;		/* use aligned size */
			/* XXX - in SunOS, is sp->shm_segsz */
			if ((uintptr_t)base & PAGEOFFSET) {
				error = EINVAL;
				as_rangeunlock(as);
				goto errret;
			}
			result = valid_usr_range(base, len, prot, as,
			    as->a_userlimit);
			if (result == RANGE_BADPROT) {
				/* Retry without PROT_EXEC, as for ISM. */
				prot &= ~PROT_EXEC;
				result = valid_usr_range(base, len, prot, as,
				    as->a_userlimit);
			}
			as_purge(as);
			if (result != RANGE_OKAY ||
			    as_gap(as, len, &base, &len,
			    AH_LO, (caddr_t)NULL) != 0) {
				error = EINVAL;
				as_rangeunlock(as);
				goto errret;
			}
		}

		/* Initialize the create arguments and map the segment */
		crargs = *(struct segvn_crargs *)zfod_argsp;
		crargs.offset = 0;
		crargs.type = MAP_SHARED;
		crargs.amp = sp->shm_amp;
		crargs.prot = prot;
		crargs.maxprot = crargs.prot;
		crargs.flags = 0;

		error = as_map(as, addr, size, segvn_create, &crargs);
	}

	as_rangeunlock(as);
	if (error)
		goto errret;

	/* record shmem range for the detach */
	sa_add(pp, addr, (size_t)size, useISM ? SHMSA_ISM : 0, sp);
	*rvp = (uintptr_t)addr;

	sp->shm_atime = gethrestime_sec();
	sp->shm_lpid = pp->p_pid;
	/* Each attach holds a reference; released in shm_detach(). */
	ipc_hold(shm_svc, (kipc_perm_t *)sp);
errret:
	mutex_exit(lock);
	return (error);
}

/*
 * Destructor for a shared memory id, invoked via the ipc_service_t
 * registered in _init().  Unwinds any SHM_LOCK locking, tears down the
 * spt segment, drops the anon_map, and credits the segment's size back
 * to the project and zone shmmax accounting.
 */
static void
shm_dtor(kipc_perm_t *perm)
{
	kshmid_t *sp = (kshmid_t *)perm;
	uint_t cnt;
	size_t rsize;

	if (sp->shm_lkcnt > 0) {
		shmem_unlock(sp, sp->shm_amp);
		sp->shm_lkcnt = 0;
	}

	if (sp->shm_sptinfo) {
		if (isspt(sp))
			sptdestroy(sp->shm_sptinfo->sptas, sp->shm_amp);
		kmem_free(sp->shm_sptinfo, sizeof (sptinfo_t));
	}

	/* We must be dropping the final reference to the anon_map. */
	ANON_LOCK_ENTER(&sp->shm_amp->a_rwlock, RW_WRITER);
	cnt = --sp->shm_amp->refcnt;
	ANON_LOCK_EXIT(&sp->shm_amp->a_rwlock);
	ASSERT(cnt == 0);
	shm_rm_amp(sp);

	if (sp->shm_perm.ipc_id != IPC_ID_INVAL) {
		rsize = ptob(btopr(sp->shm_segsz));
		ipcs_lock(shm_svc);
		sp->shm_perm.ipc_proj->kpj_data.kpd_shmmax -= rsize;
		sp->shm_perm.ipc_zone->zone_shmmax -= rsize;
		ipcs_unlock(shm_svc);
	}
}

/* ARGSUSED */
static void
shm_rmid(kipc_perm_t *perm)
{
	/* nothing to do */
}

/*
 * Shmctl system call.
 */
/* ARGSUSED */
static int
shmctl(int shmid, int cmd, void *arg)
{
	kshmid_t		*sp;	/* shared memory header ptr */
	STRUCT_DECL(shmid_ds, ds);	/* for SVR4 IPC_SET */
	int			error = 0;
	struct cred		*cr = CRED();
	kmutex_t		*lock;
	model_t			mdl = get_udatamodel();
	struct shmid_ds64	ds64;
	shmatt_t		nattch;

	STRUCT_INIT(ds, mdl);

	/*
	 * Perform pre- or non-lookup actions (e.g. copyins, RMID).
	 */
	switch (cmd) {
	case IPC_SET:
		if (copyin(arg, STRUCT_BUF(ds), STRUCT_SIZE(ds)))
			return (EFAULT);
		break;

	case IPC_SET64:
		if (copyin(arg, &ds64, sizeof (struct shmid_ds64)))
			return (EFAULT);
		break;

	case IPC_RMID:
		return (ipc_rmid(shm_svc, shmid, cr));
	}

	if ((lock = ipc_lookup(shm_svc, shmid, (kipc_perm_t **)&sp)) == NULL)
		return (EINVAL);

	switch (cmd) {
	/* Set ownership and permissions. */
	case IPC_SET:
		if (error = ipcperm_set(shm_svc, cr, &sp->shm_perm,
		    &STRUCT_BUF(ds)->shm_perm, mdl))
			break;
		sp->shm_ctime = gethrestime_sec();
		break;

	case IPC_STAT:
		if (error = ipcperm_access(&sp->shm_perm, SHM_R, cr))
			break;

		/* ipc_ref includes the framework's own hold; see ipc_hold. */
		nattch = sp->shm_perm.ipc_ref - 1;

		ipcperm_stat(&STRUCT_BUF(ds)->shm_perm, &sp->shm_perm, mdl);
		STRUCT_FSET(ds, shm_segsz, sp->shm_segsz);
		STRUCT_FSETP(ds, shm_amp, NULL);	/* kernel addr */
		STRUCT_FSET(ds, shm_lkcnt, sp->shm_lkcnt);
		STRUCT_FSET(ds, shm_lpid, sp->shm_lpid);
		STRUCT_FSET(ds, shm_cpid, sp->shm_cpid);
		STRUCT_FSET(ds, shm_nattch, nattch);
		STRUCT_FSET(ds, shm_cnattch, sp->shm_ismattch);
		STRUCT_FSET(ds, shm_atime, sp->shm_atime);
		STRUCT_FSET(ds, shm_dtime, sp->shm_dtime);
		STRUCT_FSET(ds, shm_ctime, sp->shm_ctime);

		/* Drop the id lock before touching user memory. */
		mutex_exit(lock);
		if (copyout(STRUCT_BUF(ds), arg, STRUCT_SIZE(ds)))
			return (EFAULT);

		return (0);

	case IPC_SET64:
		if (error = ipcperm_set64(shm_svc, cr,
		    &sp->shm_perm, &ds64.shmx_perm))
			break;
		sp->shm_ctime = gethrestime_sec();
		break;

	case IPC_STAT64:
		nattch = sp->shm_perm.ipc_ref - 1;

		ipcperm_stat64(&ds64.shmx_perm, &sp->shm_perm);
		ds64.shmx_segsz = sp->shm_segsz;
		ds64.shmx_lkcnt = sp->shm_lkcnt;
		ds64.shmx_lpid = sp->shm_lpid;
		ds64.shmx_cpid = sp->shm_cpid;
		ds64.shmx_nattch = nattch;
		ds64.shmx_cnattch = sp->shm_ismattch;
		ds64.shmx_atime = sp->shm_atime;
		ds64.shmx_dtime = sp->shm_dtime;
		ds64.shmx_ctime = sp->shm_ctime;

		mutex_exit(lock);
		if (copyout(&ds64, arg, sizeof (struct shmid_ds64)))
			return (EFAULT);

		return (0);

	/* Lock segment in memory */
	case SHM_LOCK:
		if ((error = secpolicy_lock_memory(cr)) != 0)
			break;

		/* protect against overflow */
		if (sp->shm_lkcnt >= USHRT_MAX) {
			error = ENOMEM;
			break;
		}
		/* ISM segments are already locked; only lock on 0 -> 1. */
		if (!isspt(sp) && (sp->shm_lkcnt++ == 0)) {
			if (error = shmem_lock(sp, sp->shm_amp)) {
				ANON_LOCK_ENTER(&sp->shm_amp->a_rwlock,
				    RW_WRITER);
				cmn_err(CE_NOTE,
				    "shmctl - couldn't lock %ld pages into "
				    "memory", sp->shm_amp->size);
				ANON_LOCK_EXIT(&sp->shm_amp->a_rwlock);
				error = ENOMEM;
				sp->shm_lkcnt--;
			}
		}
		break;

	/* Unlock segment */
	case SHM_UNLOCK:
		if ((error = secpolicy_lock_memory(cr)) != 0)
			break;

		/* Only the final unlock actually releases the pages. */
		if (sp->shm_lkcnt && (--sp->shm_lkcnt == 0)) {
			shmem_unlock(sp, sp->shm_amp);
		}
		break;

	default:
		error = EINVAL;
		break;
	}
	mutex_exit(lock);
	return (error);
}

/*
 * Detach the segment described by sap from process pp: unmap it,
 * update the id's detach-time accounting, release the attach hold
 * taken in shmat(), and free the segacct record.  Caller has already
 * removed sap from pp's p_segacct tree.
 */
static void
shm_detach(proc_t *pp, segacct_t *sap)
{
	kshmid_t	*sp = sap->sa_id;
	size_t		len = sap->sa_len;
	caddr_t		addr = sap->sa_addr;

	/*
	 * Discard lwpchan mappings.
	 */
	if (pp->p_lcp != NULL)
		lwpchan_delete_mapping(pp, addr, addr + len);
	(void) as_unmap(pp->p_as, addr, len);

	/*
	 * Perform some detach-time accounting.
	 */
	(void) ipc_lock(shm_svc, sp->shm_perm.ipc_id);
	if (sap->sa_flags & SHMSA_ISM)
		sp->shm_ismattch--;
	sp->shm_dtime = gethrestime_sec();
	sp->shm_lpid = pp->p_pid;
	ipc_rele(shm_svc, (kipc_perm_t *)sp);	/* Drops lock */

	kmem_free(sap, sizeof (segacct_t));
}

/*
 * Shmdt (detach shared segment) system call.  addr must be exactly
 * the address returned by a prior shmat(); EINVAL otherwise.
 */
static int
shmdt(caddr_t addr)
{
	proc_t *pp = curproc;
	segacct_t *sap, template;

	mutex_enter(&pp->p_lock);
	prbarrier(pp);	/* block /proc.  See shmgetid(). */

	/* sa_len == 0 marks this as a search template; see shm_sacompar(). */
	template.sa_addr = addr;
	template.sa_len = 0;
	if ((pp->p_segacct == NULL) ||
	    ((sap = avl_find(pp->p_segacct, &template, NULL)) == NULL)) {
		mutex_exit(&pp->p_lock);
		return (EINVAL);
	}
	if (sap->sa_addr != addr) {
		mutex_exit(&pp->p_lock);
		return (EINVAL);
	}
	avl_remove(pp->p_segacct, sap);
	mutex_exit(&pp->p_lock);

	shm_detach(pp, sap);

	return (0);
}

/*
 * Remove all shared memory segments associated with a given zone.
 * Called by zone_shutdown when the zone is halted.
 */
/*ARGSUSED1*/
static void
shm_remove_zone(zoneid_t zoneid, void *arg)
{
	ipc_remove_zone(shm_svc, zoneid);
}

/*
 * Shmget (create new shmem) system call.
 */
static int
shmget(key_t key, size_t size, int shmflg, uintptr_t *rvp)
{
	proc_t		*pp = curproc;
	kshmid_t	*sp;
	kmutex_t	*lock;
	int		error;

top:
	/*
	 * NOTE(review): the mutex_exit(&pp->p_lock) calls below imply
	 * ipc_get() returns with pp->p_lock held (needed for rctl_test) --
	 * confirm against os/ipc.c.
	 */
	if (error = ipc_get(shm_svc, key, shmflg, (kipc_perm_t **)&sp, &lock))
		return (error);

	if (!IPC_FREE(&sp->shm_perm)) {
		/*
		 * A segment with the requested key exists.
		 */
		if (size > sp->shm_segsz) {
			mutex_exit(lock);
			return (EINVAL);
		}
	} else {
		/*
		 * A new segment should be created.
		 */
		size_t npages = btopr(size);
		size_t rsize = ptob(npages);

		/*
		 * Check rsize and the per-project and per-zone limit on
		 * shared memory.  Checking rsize handles both the size == 0
		 * case and the size < ULONG_MAX & PAGEMASK case (i.e.
		 * rounding up wraps a size_t).
		 */
		if (rsize == 0 ||
		    (rctl_test(rc_project_shmmax,
		    pp->p_task->tk_proj->kpj_rctls, pp, rsize,
		    RCA_SAFE) & RCT_DENY) ||
		    (rctl_test(rc_zone_shmmax,
		    pp->p_zone->zone_rctls, pp, rsize,
		    RCA_SAFE) & RCT_DENY)) {

			mutex_exit(&pp->p_lock);
			mutex_exit(lock);
			ipc_cleanup(shm_svc, (kipc_perm_t *)sp);
			return (EINVAL);
		}
		mutex_exit(&pp->p_lock);
		mutex_exit(lock);

		if (anon_resv(rsize) == 0) {
			ipc_cleanup(shm_svc, (kipc_perm_t *)sp);
			return (ENOMEM);
		}

		/*
		 * If any new failure points are introduced between the
		 * the above anon_resv() and the below ipc_commit_begin(),
		 * these failure points will need to unreserve the anon
		 * reserved using anon_unresv().
		 *
		 * Once ipc_commit_begin() is called, the anon reserved
		 * above will be automatically unreserved by future calls to
		 * ipcs_cleanup() -> shm_dtor() -> shm_rm_amp().  If
		 * ipc_commit_begin() fails, it internally calls shm_dtor(),
		 * unreserving the above anon, and freeing the below amp.
		 */

		sp->shm_amp = anonmap_alloc(rsize, rsize, ANON_SLEEP);
		sp->shm_amp->a_sp = sp;
		/*
		 * Store the original user's requested size, in bytes,
		 * rather than the page-aligned size.  The former is
		 * used for IPC_STAT and shmget() lookups.  The latter
		 * is saved in the anon_map structure and is used for
		 * calls to the vm layer.
		 */
		sp->shm_segsz = size;
		sp->shm_atime = sp->shm_dtime = 0;
		sp->shm_ctime = gethrestime_sec();
		sp->shm_lpid = (pid_t)0;
		sp->shm_cpid = curproc->p_pid;
		sp->shm_ismattch = 0;
		sp->shm_sptinfo = NULL;
		/*
		 * Check limits one last time, push id into global
		 * visibility, and update resource usage counts.
		 */
		if (error = ipc_commit_begin(shm_svc, key, shmflg,
		    (kipc_perm_t *)sp)) {
			if (error == EAGAIN)
				goto top;
			return (error);
		}

		if ((rctl_test(rc_project_shmmax,
		    sp->shm_perm.ipc_proj->kpj_rctls, pp, rsize,
		    RCA_SAFE) & RCT_DENY) ||
		    (rctl_test(rc_zone_shmmax,
		    sp->shm_perm.ipc_zone->zone_rctls, pp, rsize,
		    RCA_SAFE) & RCT_DENY)) {
			ipc_cleanup(shm_svc, (kipc_perm_t *)sp);
			return (EINVAL);
		}
		sp->shm_perm.ipc_proj->kpj_data.kpd_shmmax += rsize;
		sp->shm_perm.ipc_zone->zone_shmmax += rsize;

		lock = ipc_commit_end(shm_svc, &sp->shm_perm);
	}

	if (audit_active)
		audit_ipcget(AT_IPC_SHM, (void *)sp);

	*rvp = (uintptr_t)(sp->shm_perm.ipc_id);

	mutex_exit(lock);
	return (0);
}

/*
 * shmids system call.
 */
static int
shmids(int *buf, uint_t nids, uint_t *pnids)
{
	return (ipc_ids(shm_svc, buf, nids, pnids));
}

/*
 * System entry point for shmat, shmctl, shmdt, and shmget system calls.
 */
static uintptr_t
shmsys(int opcode, uintptr_t a0, uintptr_t a1, uintptr_t a2)
{
	int	error;
	uintptr_t r_val = 0;

	switch (opcode) {
	case SHMAT:
		error = shmat((int)a0, (caddr_t)a1, (int)a2, &r_val);
		break;
	case SHMCTL:
		error = shmctl((int)a0, (int)a1, (void *)a2);
		break;
	case SHMDT:
		error = shmdt((caddr_t)a0);
		break;
	case SHMGET:
		error = shmget((key_t)a0, (size_t)a1, (int)a2, &r_val);
		break;
	case SHMIDS:
		error = shmids((int *)a0, (uint_t)a1, (uint_t *)a2);
		break;
	default:
		error = EINVAL;
		break;
	}

	if (error)
		return ((uintptr_t)set_errno(error));

	return (r_val);
}

/*
 * segacct_t comparator
 * This works as expected, with one minor change: the first of two real
 * segments with equal addresses is considered to be 'greater than' the
 * second.
 * We only return equal when searching using a template, in
 * which case we explicitly set the template segment's length to 0
 * (which is invalid for a real segment).
 */
static int
shm_sacompar(const void *x, const void *y)
{
	segacct_t *sa1 = (segacct_t *)x;
	segacct_t *sa2 = (segacct_t *)y;

	if (sa1->sa_addr < sa2->sa_addr) {
		return (-1);
	} else if (sa2->sa_len != 0) {
		/* sa2 is a real segment (non-zero length). */
		if (sa1->sa_addr >= sa2->sa_addr + sa2->sa_len) {
			return (1);
		} else if (sa1->sa_len != 0) {
			/* real vs. real with overlapping range: "greater" */
			return (1);
		} else {
			/* template address falls inside sa2: match */
			return (0);
		}
	} else if (sa1->sa_addr > sa2->sa_addr) {
		return (1);
	} else {
		return (0);
	}
}

/*
 * add this record to the segacct list.
 */
static void
sa_add(struct proc *pp, caddr_t addr, size_t len, ulong_t flags, kshmid_t *id)
{
	segacct_t *nsap;
	avl_tree_t *tree = NULL;
	avl_index_t where;

	nsap = kmem_alloc(sizeof (segacct_t), KM_SLEEP);
	nsap->sa_addr = addr;
	nsap->sa_len = len;
	nsap->sa_flags = flags;
	nsap->sa_id = id;

	/*
	 * Pre-allocate the tree before taking p_lock, since we cannot
	 * do a KM_SLEEP allocation while holding it.
	 */
	if (pp->p_segacct == NULL)
		tree = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);

	mutex_enter(&pp->p_lock);
	prbarrier(pp);	/* block /proc.  See shmgetid(). */

	if (pp->p_segacct == NULL) {
		avl_create(tree, shm_sacompar, sizeof (segacct_t),
		    offsetof(segacct_t, sa_tree));
		pp->p_segacct = tree;
	} else if (tree) {
		/* Lost the race: someone else created the tree. */
		kmem_free(tree, sizeof (avl_tree_t));
	}

	/*
	 * We can ignore the result of avl_find, as the comparator will
	 * never return equal for segments with non-zero length.  This
	 * is a necessary hack to get around the fact that we do, in
	 * fact, have duplicate keys.
	 */
	(void) avl_find(pp->p_segacct, nsap, &where);
	avl_insert(pp->p_segacct, nsap, where);

	mutex_exit(&pp->p_lock);
}

/*
 * Duplicate parent's segacct records in child.
 */
void
shmfork(struct proc *ppp, struct proc *cpp)
{
	segacct_t *sap;
	kshmid_t *sp;
	kmutex_t *mp;

	ASSERT(ppp->p_segacct != NULL);

	/*
	 * We are the only lwp running in the parent so nobody can
	 * mess with our p_segacct list.  Thus it is safe to traverse
	 * the list without holding p_lock.  This is essential because
	 * we can't hold p_lock during a KM_SLEEP allocation.
	 */
	for (sap = (segacct_t *)avl_first(ppp->p_segacct); sap != NULL;
	    sap = (segacct_t *)AVL_NEXT(ppp->p_segacct, sap)) {
		sa_add(cpp, sap->sa_addr, sap->sa_len, sap->sa_flags,
		    sap->sa_id);
		sp = sap->sa_id;
		mp = ipc_lock(shm_svc, sp->shm_perm.ipc_id);
		if (sap->sa_flags & SHMSA_ISM)
			sp->shm_ismattch++;
		/* The child's attach holds its own reference on the id. */
		ipc_hold(shm_svc, (kipc_perm_t *)sp);
		mutex_exit(mp);
	}
}

/*
 * Detach shared memory segments from exiting process.
 */
void
shmexit(struct proc *pp)
{
	segacct_t	*sap;
	avl_tree_t	*tree;
	void		*cookie = NULL;

	ASSERT(pp->p_segacct != NULL);

	/* Unhook the tree under p_lock, then tear it down unlocked. */
	mutex_enter(&pp->p_lock);
	prbarrier(pp);
	tree = pp->p_segacct;
	pp->p_segacct = NULL;
	mutex_exit(&pp->p_lock);

	while ((sap = avl_destroy_nodes(tree, &cookie)) != NULL)
		(void) shm_detach(pp, sap);

	avl_destroy(tree);
	kmem_free(tree, sizeof (avl_tree_t));
}

/*
 * At this time pages should be in memory, so just lock them.
 */
static void
lock_again(size_t npages, kshmid_t *sp, struct anon_map *amp)
{
	struct anon *ap;
	struct page *pp;
	struct vnode *vp;
	u_offset_t off;
	ulong_t anon_idx;
	anon_sync_obj_t cookie;

	mutex_enter(&sp->shm_mlock);
	ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
	for (anon_idx = 0; npages != 0; anon_idx++, npages--) {

		anon_array_enter(amp, anon_idx, &cookie);
		ap = anon_get_ptr(amp->ahp, anon_idx);
		ASSERT(ap != NULL);
		swap_xlate(ap, &vp, &off);
		anon_array_exit(&cookie);

		pp = page_lookup(vp, off, SE_SHARED);
		if (pp == NULL) {
			panic("lock_again: page not in the system");
			/*NOTREACHED*/
		}
		/* page should already be locked by caller */
		ASSERT(pp->p_lckcnt > 0);
		(void) page_pp_lock(pp, 0, 0);
		page_unlock(pp);
	}
	ANON_LOCK_EXIT(&amp->a_rwlock);
	mutex_exit(&sp->shm_mlock);
}

/*
 * Attach the shared memory segment to the process
 * address space and lock the pages.
1150 */ 1151 static int 1152 shmem_lock(kshmid_t *sp, struct anon_map *amp) 1153 { 1154 size_t npages = btopr(amp->size); 1155 struct as *as; 1156 struct segvn_crargs crargs; 1157 uint_t error; 1158 1159 /* 1160 * A later ISM/DISM attach may increase the size of the amp, so 1161 * cache the number of pages locked for the future shmem_unlock() 1162 */ 1163 sp->shm_lkpages = npages; 1164 1165 as = as_alloc(); 1166 /* Initialize the create arguments and map the segment */ 1167 crargs = *(struct segvn_crargs *)zfod_argsp; /* structure copy */ 1168 crargs.offset = (u_offset_t)0; 1169 crargs.type = MAP_SHARED; 1170 crargs.amp = amp; 1171 crargs.prot = PROT_ALL; 1172 crargs.maxprot = crargs.prot; 1173 crargs.flags = 0; 1174 error = as_map(as, 0x0, amp->size, segvn_create, &crargs); 1175 if (!error) { 1176 if ((error = as_ctl(as, 0x0, amp->size, MC_LOCK, 0, 0, 1177 NULL, 0)) == 0) { 1178 lock_again(npages, sp, amp); 1179 } 1180 (void) as_unmap(as, 0x0, amp->size); 1181 } 1182 as_free(as); 1183 return (error); 1184 } 1185 1186 1187 /* 1188 * Unlock shared memory 1189 */ 1190 static void 1191 shmem_unlock(kshmid_t *sp, struct anon_map *amp) 1192 { 1193 struct anon *ap; 1194 pgcnt_t npages = sp->shm_lkpages; 1195 struct vnode *vp; 1196 struct page *pp; 1197 u_offset_t off; 1198 ulong_t anon_idx; 1199 size_t unlocked_bytes = 0; 1200 kproject_t *proj; 1201 anon_sync_obj_t cookie; 1202 1203 proj = sp->shm_perm.ipc_proj; 1204 mutex_enter(&sp->shm_mlock); 1205 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 1206 for (anon_idx = 0; anon_idx < npages; anon_idx++) { 1207 1208 anon_array_enter(amp, anon_idx, &cookie); 1209 if ((ap = anon_get_ptr(amp->ahp, anon_idx)) == NULL) { 1210 panic("shmem_unlock: null app"); 1211 /*NOTREACHED*/ 1212 } 1213 swap_xlate(ap, &vp, &off); 1214 anon_array_exit(&cookie); 1215 pp = page_lookup(vp, off, SE_SHARED); 1216 if (pp == NULL) { 1217 panic("shmem_unlock: page not in the system"); 1218 /*NOTREACHED*/ 1219 } 1220 /* 1221 * Page should at least have once 
lock from previous 1222 * shmem_lock 1223 */ 1224 ASSERT(pp->p_lckcnt > 0); 1225 page_pp_unlock(pp, 0, 0); 1226 if (pp->p_lckcnt == 0) 1227 unlocked_bytes += PAGESIZE; 1228 1229 page_unlock(pp); 1230 } 1231 1232 if (unlocked_bytes > 0) { 1233 rctl_decr_locked_mem(NULL, proj, unlocked_bytes, 0); 1234 } 1235 1236 ANON_LOCK_EXIT(&->a_rwlock); 1237 mutex_exit(&sp->shm_mlock); 1238 } 1239 1240 /* 1241 * We call this routine when we have removed all references to this 1242 * amp. This means all shmdt()s and the IPC_RMID have been done. 1243 */ 1244 static void 1245 shm_rm_amp(kshmid_t *sp) 1246 { 1247 struct anon_map *amp = sp->shm_amp; 1248 zone_t *zone; 1249 1250 zone = sp->shm_perm.ipc_zone; 1251 ASSERT(zone != NULL); 1252 /* 1253 * Free up the anon_map. 1254 */ 1255 lgrp_shm_policy_fini(amp, NULL); 1256 if (amp->a_szc != 0) { 1257 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 1258 anon_shmap_free_pages(amp, 0, amp->size); 1259 ANON_LOCK_EXIT(&->a_rwlock); 1260 } else { 1261 anon_free(amp->ahp, 0, amp->size); 1262 } 1263 anon_unresv_zone(amp->swresv, zone); 1264 anonmap_free(amp); 1265 } 1266 1267 /* 1268 * Return the shared memory id for the process's virtual address. 1269 * Return SHMID_NONE if addr is not within a SysV shared memory segment. 1270 * Return SHMID_FREE if addr's SysV shared memory segment's id has been freed. 1271 * 1272 * shmgetid() is called from code in /proc with the process locked but 1273 * with pp->p_lock not held. The address space lock is held, so we 1274 * cannot grab pp->p_lock here due to lock-ordering constraints. 1275 * Because of all this, modifications to the p_segacct list must only 1276 * be made after calling prbarrier() to ensure the process is not locked. 1277 * See shmdt() and sa_add(), above. shmgetid() may also be called on a 1278 * thread's own process without the process locked. 
1279 */ 1280 int 1281 shmgetid(proc_t *pp, caddr_t addr) 1282 { 1283 segacct_t *sap, template; 1284 1285 ASSERT(MUTEX_NOT_HELD(&pp->p_lock)); 1286 ASSERT((pp->p_proc_flag & P_PR_LOCK) || pp == curproc); 1287 1288 if (pp->p_segacct == NULL) 1289 return (SHMID_NONE); 1290 1291 template.sa_addr = addr; 1292 template.sa_len = 0; 1293 if ((sap = avl_find(pp->p_segacct, &template, NULL)) == NULL) 1294 return (SHMID_NONE); 1295 1296 if (IPC_FREE(&sap->sa_id->shm_perm)) 1297 return (SHMID_FREE); 1298 1299 return (sap->sa_id->shm_perm.ipc_id); 1300 } 1301