/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * Inter-Process Communication Shared Memory Facility.
 *
 * See os/ipc.c for a description of common IPC functionality.
 *
 * Resource controls
 * -----------------
 *
 * Control:	zone.max-shm-ids (rc_zone_shmmni)
 * Description:	Maximum number of shared memory ids allowed a zone.
 *
 *   When shmget() is used to allocate a shared memory segment, one id
 *   is allocated.  If the id allocation doesn't succeed, shmget()
 *   fails and errno is set to ENOSPC.  Upon successful shmctl(,
 *   IPC_RMID) the id is deallocated.
 *
 * Control:	project.max-shm-ids (rc_project_shmmni)
 * Description:	Maximum number of shared memory ids allowed a project.
 *
 *   When shmget() is used to allocate a shared memory segment, one id
 *   is allocated.  If the id allocation doesn't succeed, shmget()
 *   fails and errno is set to ENOSPC.  Upon successful shmctl(,
 *   IPC_RMID) the id is deallocated.
 *
 * Control:	zone.max-shm-memory (rc_zone_shmmax)
 * Description:	Total amount of shared memory allowed a zone.
 *
 *   When shmget() is used to allocate a shared memory segment, the
 *   segment's size is allocated against this limit.  If the space
 *   allocation doesn't succeed, shmget() fails and errno is set to
 *   EINVAL.  The size will be deallocated once the last process has
 *   detached the segment and the segment has been successfully
 *   shmctl(, IPC_RMID)ed.
 *
 * Control:	project.max-shm-memory (rc_project_shmmax)
 * Description:	Total amount of shared memory allowed a project.
 *
 *   When shmget() is used to allocate a shared memory segment, the
 *   segment's size is allocated against this limit.  If the space
 *   allocation doesn't succeed, shmget() fails and errno is set to
 *   EINVAL.  The size will be deallocated once the last process has
 *   detached the segment and the segment has been successfully
 *   shmctl(, IPC_RMID)ed.
 */
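
/*
 * Illustrative userland sketch (hypothetical, not part of this module)
 * of the id and size accounting described above: the id is held from
 * shmget() until a successful IPC_RMID, and the segment's size is
 * charged against the shm-memory rctls for the same interval.
 *
 *	#include <sys/ipc.h>
 *	#include <sys/shm.h>
 *	#include <errno.h>
 *
 *	int id = shmget(IPC_PRIVATE, 4096, IPC_CREAT | 0600);
 *	if (id == -1 && errno == ENOSPC)
 *		;	(failed: a max-shm-ids rctl would be exceeded)
 *	if (id == -1 && errno == EINVAL)
 *		;	(failed: a max-shm-memory rctl would be exceeded)
 *	void *p = shmat(id, NULL, 0);
 *	...
 *	(void) shmdt(p);
 *	(void) shmctl(id, IPC_RMID, NULL);	(id and size released)
 */
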
#include <sys/types.h>
#include <sys/param.h>
#include <sys/cred.h>
#include <sys/errno.h>
#include <sys/time.h>
#include <sys/kmem.h>
#include <sys/user.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/prsystm.h>
#include <sys/sysmacros.h>
#include <sys/tuneable.h>
#include <sys/vm.h>
#include <sys/mman.h>
#include <sys/swap.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/lwpchan_impl.h>
#include <sys/avl.h>
#include <sys/modctl.h>
#include <sys/syscall.h>
#include <sys/task.h>
#include <sys/project.h>
#include <sys/policy.h>
#include <sys/zone.h>
#include <sys/rctl.h>

#include <sys/ipc.h>
#include <sys/ipc_impl.h>
#include <sys/shm.h>
#include <sys/shm_impl.h>

#include <vm/hat.h>
#include <vm/seg.h>
#include <vm/as.h>
#include <vm/seg_vn.h>
#include <vm/anon.h>
#include <vm/page.h>
#include <vm/vpage.h>
#include <vm/seg_spt.h>

#include <c2/audit.h>

static int shmem_lock(kshmid_t *sp, struct anon_map *amp);
static void shmem_unlock(kshmid_t *sp, struct anon_map *amp);
static void sa_add(struct proc *pp, caddr_t addr, size_t len, ulong_t flags,
	kshmid_t *id);
static void shm_rm_amp(kshmid_t *sp);
static void shm_dtor(kipc_perm_t *);
static void shm_rmid(kipc_perm_t *);
static void shm_remove_zone(zoneid_t, void *);

/*
 * Semantics for share_page_table and ism_off:
 *
 * These are hooks in /etc/system - only for internal testing purposes.
 *
 * Setting share_page_table automatically turns on the SHM_SHARE_MMU (ISM)
 * flag in a call to shmat(2).  In other words, with share_page_table set,
 * you always get ISM, even if, say, DISM is specified.  It should really
 * be called "ism_on".
 *
 * Setting ism_off turns off the SHM_SHARE_MMU flag from the flags passed
 * to shmat(2).
 *
 * If both share_page_table and ism_off are set, share_page_table prevails.
 *
 * Although these tunables should probably be removed, they do have some
 * external exposure; as long as they exist, they should at least work
 * sensibly.
 */

int share_page_table;
int ism_off;

/*
 * The following tunables are obsolete.  Though for compatibility we
 * still read and interpret shminfo_shmmax and shminfo_shmmni (see
 * os/project.c), the preferred mechanism for administering the IPC
 * Shared Memory facility is through the resource controls described at
 * the top of this file.
 */
size_t	shminfo_shmmax = 0x800000;	/* (obsolete) */
int	shminfo_shmmni = 100;		/* (obsolete) */
size_t	shminfo_shmmin = 1;		/* (obsolete) */
int	shminfo_shmseg = 6;		/* (obsolete) */

extern rctl_hndl_t rc_zone_shmmax;
extern rctl_hndl_t rc_zone_shmmni;
extern rctl_hndl_t rc_project_shmmax;
extern rctl_hndl_t rc_project_shmmni;
static ipc_service_t *shm_svc;
static zone_key_t shm_zone_key;
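
/*
 * For reference, and assuming the usual /etc/system syntax for this
 * module (delivered as shmsys), the testing hooks above would be set
 * with entries such as:
 *
 *	set shmsys:share_page_table = 1
 *	set shmsys:ism_off = 1
 */
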
/*
 * Module linkage information for the kernel.
 */
static uintptr_t shmsys(int, uintptr_t, uintptr_t, uintptr_t);

static struct sysent ipcshm_sysent = {
	4,
#ifdef	_SYSCALL32_IMPL
	SE_ARGC | SE_NOUNLOAD | SE_64RVAL,
#else	/* _SYSCALL32_IMPL */
	SE_ARGC | SE_NOUNLOAD | SE_32RVAL1,
#endif	/* _SYSCALL32_IMPL */
	(int (*)())shmsys
};

#ifdef	_SYSCALL32_IMPL
static struct sysent ipcshm_sysent32 = {
	4,
	SE_ARGC | SE_NOUNLOAD | SE_32RVAL1,
	(int (*)())shmsys
};
#endif	/* _SYSCALL32_IMPL */

static struct modlsys modlsys = {
	&mod_syscallops, "System V shared memory", &ipcshm_sysent
};

#ifdef	_SYSCALL32_IMPL
static struct modlsys modlsys32 = {
	&mod_syscallops32, "32-bit System V shared memory", &ipcshm_sysent32
};
#endif	/* _SYSCALL32_IMPL */

static struct modlinkage modlinkage = {
	MODREV_1,
	&modlsys,
#ifdef	_SYSCALL32_IMPL
	&modlsys32,
#endif
	NULL
};


int
_init(void)
{
	int result;

	shm_svc = ipcs_create("shmids", rc_project_shmmni, rc_zone_shmmni,
	    sizeof (kshmid_t), shm_dtor, shm_rmid, AT_IPC_SHM,
	    offsetof(ipc_rqty_t, ipcq_shmmni));
	zone_key_create(&shm_zone_key, NULL, shm_remove_zone, NULL);

	if ((result = mod_install(&modlinkage)) == 0)
		return (0);

	(void) zone_key_delete(shm_zone_key);
	ipcs_destroy(shm_svc);

	return (result);
}

int
_fini(void)
{
	return (EBUSY);
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}
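
/*
 * Illustrative userland sketch (hypothetical): the attach flavor is
 * selected by the flags passed to shmat(2).  SHM_SHARE_MMU requests ISM
 * (locked pages, shared mapping structures); SHM_PAGEABLE requests DISM
 * (pageable); specifying both at once is rejected by spt_invalid().
 *
 *	void *ism  = shmat(id, NULL, SHM_SHARE_MMU);
 *	void *dism = shmat(id, NULL, SHM_PAGEABLE);
 */
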
/*
 * Shmat (attach shared segment) system call.
 */
static int
shmat(int shmid, caddr_t uaddr, int uflags, uintptr_t *rvp)
{
	kshmid_t *sp;	/* shared memory header ptr */
	size_t	size;
	int	error = 0;
	proc_t *pp = curproc;
	struct as *as = pp->p_as;
	struct segvn_crargs crargs;	/* segvn create arguments */
	kmutex_t *lock;
	struct seg *segspt = NULL;
	caddr_t addr = uaddr;
	int flags = (uflags & SHMAT_VALID_FLAGS_MASK);
	int useISM;
	uchar_t prot = PROT_ALL;
	int result;

	if ((lock = ipc_lookup(shm_svc, shmid, (kipc_perm_t **)&sp)) == NULL)
		return (EINVAL);
	if (error = ipcperm_access(&sp->shm_perm, SHM_R, CRED()))
		goto errret;
	if ((flags & SHM_RDONLY) == 0 &&
	    (error = ipcperm_access(&sp->shm_perm, SHM_W, CRED())))
		goto errret;
	if (spt_invalid(flags)) {
		error = EINVAL;
		goto errret;
	}
	if (ism_off)
		flags = flags & ~SHM_SHARE_MMU;
	if (share_page_table) {
		flags = flags & ~SHM_PAGEABLE;
		flags = flags | SHM_SHARE_MMU;
	}
	useISM = (spt_locked(flags) || spt_pageable(flags));
	if (useISM && (error = ipcperm_access(&sp->shm_perm, SHM_W, CRED())))
		goto errret;
	if (useISM && isspt(sp)) {
		uint_t newsptflags = flags | spt_flags(sp->shm_sptseg);
		/*
		 * If trying to change an existing {D}ISM segment from ISM
		 * to DISM or vice versa, return error.  Note that this
		 * validation of flags needs to be done after the effect of
		 * tunables such as ism_off and share_page_table, for
		 * semantics that are consistent with the tunables' settings.
		 */
		if (spt_invalid(newsptflags)) {
			error = EINVAL;
			goto errret;
		}
	}
	ANON_LOCK_ENTER(&sp->shm_amp->a_rwlock, RW_WRITER);
	size = sp->shm_amp->size;
	ANON_LOCK_EXIT(&sp->shm_amp->a_rwlock);

	/* somewhere to record spt info for final detach */
	if (sp->shm_sptinfo == NULL)
		sp->shm_sptinfo = kmem_zalloc(sizeof (sptinfo_t), KM_SLEEP);

	as_rangelock(as);

	if (useISM) {
		/*
		 * Handle ISM
		 */
		uint_t	share_szc;
		size_t	share_size;
		struct	shm_data ssd;
		uintptr_t align_hint;

		/*
		 * Pick a share pagesize to use, if (!isspt(sp)).
		 * Otherwise use the already chosen page size.
		 *
		 * For the initial shmat (!isspt(sp)), where sptcreate is
		 * called, map_pgsz is called to recommend a [D]ISM pagesize,
		 * important for systems which offer more than one potential
		 * [D]ISM pagesize.
		 * If the shmat is just to attach to an already created
		 * [D]ISM segment, then use the previously selected page size.
		 */
		if (!isspt(sp)) {
			share_size = map_pgsz(MAPPGSZ_ISM, pp, addr, size, 0);
			if (share_size == 0) {
				as_rangeunlock(as);
				error = EINVAL;
				goto errret;
			}
			share_szc = page_szc(share_size);
		} else {
			share_szc = sp->shm_sptseg->s_szc;
			share_size = page_get_pagesize(share_szc);
		}
		size = P2ROUNDUP(size, share_size);

		align_hint = share_size;
#if defined(__i386) || defined(__amd64)
		/*
		 * For x86, we want to share as much of the page table tree
		 * as possible.  We use a large align_hint at first, but
		 * if that fails, then the code below retries with align_hint
		 * set to share_size.
		 *
		 * The explicit extern here is due to the difficulties
		 * of getting to platform dependent includes.  When/if the
		 * platform dependent bits of this function are cleaned up,
		 * another way of doing this should be found.
		 */
		{
			extern uint_t ptes_per_table;

			while (size >= ptes_per_table * (uint64_t)align_hint)
				align_hint *= ptes_per_table;
		}
#endif /* __i386 || __amd64 */
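
		/*
		 * Worked example of the loop above (illustrative numbers,
		 * assuming an amd64-style ptes_per_table of 512 and a 2MB
		 * share_size): a 2GB segment satisfies
		 * 2GB >= 512 * 2MB = 1GB, so align_hint grows to 1GB;
		 * 2GB < 512 * 1GB, so the loop then stops, and the attach
		 * is first attempted at a 1GB-aligned address so that
		 * whole page table pages can be shared.
		 */
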
#if defined(__sparcv9)
		if (addr == 0 && curproc->p_model == DATAMODEL_LP64) {
			/*
			 * If no address has been passed in, and this is a
			 * 64-bit process, we'll try to find an address
			 * in the predict-ISM zone.
			 */
			caddr_t predbase = (caddr_t)PREDISM_1T_BASE;
			size_t len = PREDISM_BOUND - PREDISM_1T_BASE;

			as_purge(as);
			if (as_gap(as, size + share_size, &predbase, &len,
			    AH_LO, (caddr_t)NULL) != -1) {
				/*
				 * We found an address which looks like a
				 * candidate.  We want to round it up, and
				 * then check that it's a valid user range.
				 * This assures that we won't fail below.
				 */
				addr = (caddr_t)P2ROUNDUP((uintptr_t)predbase,
				    share_size);

				if (valid_usr_range(addr, size, prot,
				    as, as->a_userlimit) != RANGE_OKAY) {
					addr = 0;
				}
			}
		}
#endif /* __sparcv9 */

		if (addr == 0) {
			for (;;) {
				addr = (caddr_t)align_hint;
				map_addr(&addr, size, 0ll, 1, MAP_ALIGN);
				if (addr != NULL || align_hint == share_size)
					break;
				align_hint = share_size;
			}
			if (addr == NULL) {
				as_rangeunlock(as);
				error = ENOMEM;
				goto errret;
			}
			ASSERT(((uintptr_t)addr & (align_hint - 1)) == 0);
		} else {
			/* Use the user-supplied attach address */
			caddr_t base;
			size_t len;

			/*
			 * Check that the address range
			 *  1) is properly aligned
			 *  2) is correct in unix terms
			 *  3) is within an unmapped address segment
			 */
			base = addr;
			len = size;		/* use spt aligned size */
			/* XXX - in SunOS, is sp->shm_segsz */
			if ((uintptr_t)base & (share_size - 1)) {
				error = EINVAL;
				as_rangeunlock(as);
				goto errret;
			}
			result = valid_usr_range(base, len, prot, as,
			    as->a_userlimit);
			if (result == RANGE_BADPROT) {
				/*
				 * We try to accommodate processors which
				 * may not support execute permissions on
				 * all ISM segments by trying the check
				 * again but without PROT_EXEC.
				 */
				prot &= ~PROT_EXEC;
				result = valid_usr_range(base, len, prot, as,
				    as->a_userlimit);
			}
			as_purge(as);
			if (result != RANGE_OKAY ||
			    as_gap(as, len, &base, &len, AH_LO,
			    (caddr_t)NULL) != 0) {
				error = EINVAL;
				as_rangeunlock(as);
				goto errret;
			}
		}
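
		/*
		 * The first ISM/DISM attach for this id creates the
		 * kernel's shared spt segment (and its dedicated address
		 * space) via sptcreate(); every later attach of the same
		 * segment simply maps it via segspt_shmattach() below.
		 */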
		if (!isspt(sp)) {
			error = sptcreate(size, &segspt, sp->shm_amp, prot,
			    flags, share_szc);
			if (error) {
				as_rangeunlock(as);
				goto errret;
			}
			sp->shm_sptinfo->sptas = segspt->s_as;
			sp->shm_sptseg = segspt;
			sp->shm_sptprot = prot;
		} else if ((prot & sp->shm_sptprot) != sp->shm_sptprot) {
			/*
			 * Ensure we're attaching to an ISM segment with
			 * fewer or equal permissions than what we're
			 * allowed.  Fail if the segment has more
			 * permissions than what we're allowed.
			 */
			error = EACCES;
			as_rangeunlock(as);
			goto errret;
		}

		ssd.shm_sptseg = sp->shm_sptseg;
		ssd.shm_sptas = sp->shm_sptinfo->sptas;
		ssd.shm_amp = sp->shm_amp;
		error = as_map(as, addr, size, segspt_shmattach, &ssd);
		if (error == 0)
			sp->shm_ismattch++; /* keep count of ISM attaches */
	} else {

		/*
		 * Normal case.
		 */
		if (flags & SHM_RDONLY)
			prot &= ~PROT_WRITE;

		if (addr == 0) {
			/* Let the system pick the attach address */
			map_addr(&addr, size, 0ll, 1, 0);
			if (addr == NULL) {
				as_rangeunlock(as);
				error = ENOMEM;
				goto errret;
			}
		} else {
			/* Use the user-supplied attach address */
			caddr_t base;
			size_t len;

			if (flags & SHM_RND)
				addr = (caddr_t)((uintptr_t)addr &
				    ~(SHMLBA - 1));
			/*
			 * Check that the address range
			 *  1) is properly aligned
			 *  2) is correct in unix terms
			 *  3) is within an unmapped address segment
			 */
			base = addr;
			len = size;		/* use aligned size */
			/* XXX - in SunOS, is sp->shm_segsz */
			if ((uintptr_t)base & PAGEOFFSET) {
				error = EINVAL;
				as_rangeunlock(as);
				goto errret;
			}
			result = valid_usr_range(base, len, prot, as,
			    as->a_userlimit);
			if (result == RANGE_BADPROT) {
				prot &= ~PROT_EXEC;
				result = valid_usr_range(base, len, prot, as,
				    as->a_userlimit);
			}
			as_purge(as);
			if (result != RANGE_OKAY ||
			    as_gap(as, len, &base, &len,
			    AH_LO, (caddr_t)NULL) != 0) {
				error = EINVAL;
				as_rangeunlock(as);
				goto errret;
			}
		}

		/* Initialize the create arguments and map the segment */
		crargs = *(struct segvn_crargs *)zfod_argsp;
		crargs.offset = 0;
		crargs.type = MAP_SHARED;
		crargs.amp = sp->shm_amp;
		crargs.prot = prot;
		crargs.maxprot = crargs.prot;
		crargs.flags = 0;

		error = as_map(as, addr, size, segvn_create, &crargs);
	}

	as_rangeunlock(as);
	if (error)
		goto errret;

	/* record shmem range for the detach */
	sa_add(pp, addr, (size_t)size, useISM ? SHMSA_ISM : 0, sp);
	*rvp = (uintptr_t)addr;

	sp->shm_atime = gethrestime_sec();
	sp->shm_lpid = pp->p_pid;
	ipc_hold(shm_svc, (kipc_perm_t *)sp);
errret:
	mutex_exit(lock);
	return (error);
}

static void
shm_dtor(kipc_perm_t *perm)
{
	kshmid_t *sp = (kshmid_t *)perm;
	uint_t cnt;
	size_t rsize;

	if (sp->shm_lkcnt > 0) {
		shmem_unlock(sp, sp->shm_amp);
		sp->shm_lkcnt = 0;
	}

	if (sp->shm_sptinfo) {
		if (isspt(sp))
			sptdestroy(sp->shm_sptinfo->sptas, sp->shm_amp);
		kmem_free(sp->shm_sptinfo, sizeof (sptinfo_t));
	}

	ANON_LOCK_ENTER(&sp->shm_amp->a_rwlock, RW_WRITER);
	cnt = --sp->shm_amp->refcnt;
	ANON_LOCK_EXIT(&sp->shm_amp->a_rwlock);
	ASSERT(cnt == 0);
	shm_rm_amp(sp);

	if (sp->shm_perm.ipc_id != IPC_ID_INVAL) {
		rsize = ptob(btopr(sp->shm_segsz));
		ipcs_lock(shm_svc);
		sp->shm_perm.ipc_proj->kpj_data.kpd_shmmax -= rsize;
		sp->shm_perm.ipc_zone->zone_shmmax -= rsize;
		ipcs_unlock(shm_svc);
	}
}

/* ARGSUSED */
static void
shm_rmid(kipc_perm_t *perm)
{
	/* nothing to do */
}
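
/*
 * Illustrative userland sketch (hypothetical): reading segment
 * statistics and removing a segment with shmctl(2).  shm_nattch is
 * derived from the ipc_ref count maintained by this module.
 *
 *	struct shmid_ds ds;
 *	if (shmctl(id, IPC_STAT, &ds) == 0 && ds.shm_nattch == 0)
 *		(void) shmctl(id, IPC_RMID, NULL);
 */
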
/*
 * Shmctl system call.
 */
/* ARGSUSED */
static int
shmctl(int shmid, int cmd, void *arg)
{
	kshmid_t *sp;	/* shared memory header ptr */
	STRUCT_DECL(shmid_ds, ds);	/* for SVR4 IPC_SET */
	int error = 0;
	struct cred *cr = CRED();
	kmutex_t *lock;
	model_t mdl = get_udatamodel();
	struct shmid_ds64 ds64;
	shmatt_t nattch;

	STRUCT_INIT(ds, mdl);

	/*
	 * Perform pre- or non-lookup actions (e.g. copyins, RMID).
	 */
	switch (cmd) {
	case IPC_SET:
		if (copyin(arg, STRUCT_BUF(ds), STRUCT_SIZE(ds)))
			return (EFAULT);
		break;

	case IPC_SET64:
		if (copyin(arg, &ds64, sizeof (struct shmid_ds64)))
			return (EFAULT);
		break;

	case IPC_RMID:
		return (ipc_rmid(shm_svc, shmid, cr));
	}

	if ((lock = ipc_lookup(shm_svc, shmid, (kipc_perm_t **)&sp)) == NULL)
		return (EINVAL);

	switch (cmd) {
	/* Set ownership and permissions. */
	case IPC_SET:
		if (error = ipcperm_set(shm_svc, cr, &sp->shm_perm,
		    &STRUCT_BUF(ds)->shm_perm, mdl))
			break;
		sp->shm_ctime = gethrestime_sec();
		break;

	case IPC_STAT:
		if (error = ipcperm_access(&sp->shm_perm, SHM_R, cr))
			break;

		nattch = sp->shm_perm.ipc_ref - 1;

		ipcperm_stat(&STRUCT_BUF(ds)->shm_perm, &sp->shm_perm, mdl);
		STRUCT_FSET(ds, shm_segsz, sp->shm_segsz);
		STRUCT_FSETP(ds, shm_amp, NULL);	/* kernel addr */
		STRUCT_FSET(ds, shm_lkcnt, sp->shm_lkcnt);
		STRUCT_FSET(ds, shm_lpid, sp->shm_lpid);
		STRUCT_FSET(ds, shm_cpid, sp->shm_cpid);
		STRUCT_FSET(ds, shm_nattch, nattch);
		STRUCT_FSET(ds, shm_cnattch, sp->shm_ismattch);
		STRUCT_FSET(ds, shm_atime, sp->shm_atime);
		STRUCT_FSET(ds, shm_dtime, sp->shm_dtime);
		STRUCT_FSET(ds, shm_ctime, sp->shm_ctime);

		mutex_exit(lock);
		if (copyout(STRUCT_BUF(ds), arg, STRUCT_SIZE(ds)))
			return (EFAULT);

		return (0);

	case IPC_SET64:
		if (error = ipcperm_set64(shm_svc, cr,
		    &sp->shm_perm, &ds64.shmx_perm))
			break;
		sp->shm_ctime = gethrestime_sec();
		break;

	case IPC_STAT64:
		nattch = sp->shm_perm.ipc_ref - 1;

		ipcperm_stat64(&ds64.shmx_perm, &sp->shm_perm);
		ds64.shmx_segsz = sp->shm_segsz;
		ds64.shmx_lkcnt = sp->shm_lkcnt;
		ds64.shmx_lpid = sp->shm_lpid;
		ds64.shmx_cpid = sp->shm_cpid;
		ds64.shmx_nattch = nattch;
		ds64.shmx_cnattch = sp->shm_ismattch;
		ds64.shmx_atime = sp->shm_atime;
		ds64.shmx_dtime = sp->shm_dtime;
		ds64.shmx_ctime = sp->shm_ctime;

		mutex_exit(lock);
		if (copyout(&ds64, arg, sizeof (struct shmid_ds64)))
			return (EFAULT);

		return (0);

	/* Lock segment in memory */
	case SHM_LOCK:
		if ((error = secpolicy_lock_memory(cr)) != 0)
			break;

		/* protect against overflow */
		if (sp->shm_lkcnt >= USHRT_MAX) {
			error = ENOMEM;
			break;
		}
		if (!isspt(sp) && (sp->shm_lkcnt++ == 0)) {
			if (error = shmem_lock(sp, sp->shm_amp)) {
				ANON_LOCK_ENTER(&sp->shm_amp->a_rwlock,
				    RW_WRITER);
				cmn_err(CE_NOTE, "shmctl - couldn't lock %ld"
				    " pages into memory", sp->shm_amp->size);
				ANON_LOCK_EXIT(&sp->shm_amp->a_rwlock);
				error = ENOMEM;
				sp->shm_lkcnt--;
			}
		}
		break;

	/* Unlock segment */
	case SHM_UNLOCK:
		if ((error = secpolicy_lock_memory(cr)) != 0)
			break;

		if (sp->shm_lkcnt && (--sp->shm_lkcnt == 0)) {
			shmem_unlock(sp, sp->shm_amp);
		}
		break;

	default:
		error = EINVAL;
		break;
	}
	mutex_exit(lock);
	return (error);
}
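
/*
 * Illustrative userland sketch (hypothetical): SHM_LOCK and SHM_UNLOCK
 * requests nest via shm_lkcnt; the pages are unlocked only when the
 * count returns to zero, and both operations require the privilege
 * checked by secpolicy_lock_memory() above.
 *
 *	if (shmctl(id, SHM_LOCK, NULL) == 0) {
 *		...
 *		(void) shmctl(id, SHM_UNLOCK, NULL);
 *	}
 */
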
static void
shm_detach(proc_t *pp, segacct_t *sap)
{
	kshmid_t *sp = sap->sa_id;
	size_t	len = sap->sa_len;
	caddr_t	addr = sap->sa_addr;

	/*
	 * Discard lwpchan mappings.
	 */
	if (pp->p_lcp != NULL)
		lwpchan_delete_mapping(pp, addr, addr + len);
	(void) as_unmap(pp->p_as, addr, len);

	/*
	 * Perform some detach-time accounting.
	 */
	(void) ipc_lock(shm_svc, sp->shm_perm.ipc_id);
	if (sap->sa_flags & SHMSA_ISM)
		sp->shm_ismattch--;
	sp->shm_dtime = gethrestime_sec();
	sp->shm_lpid = pp->p_pid;
	ipc_rele(shm_svc, (kipc_perm_t *)sp);	/* Drops lock */

	kmem_free(sap, sizeof (segacct_t));
}

static int
shmdt(caddr_t addr)
{
	proc_t *pp = curproc;
	segacct_t *sap, template;

	mutex_enter(&pp->p_lock);
	prbarrier(pp);			/* block /proc.  See shmgetid(). */

	template.sa_addr = addr;
	template.sa_len = 0;
	if ((pp->p_segacct == NULL) ||
	    ((sap = avl_find(pp->p_segacct, &template, NULL)) == NULL)) {
		mutex_exit(&pp->p_lock);
		return (EINVAL);
	}
	if (sap->sa_addr != addr) {
		mutex_exit(&pp->p_lock);
		return (EINVAL);
	}
	avl_remove(pp->p_segacct, sap);
	mutex_exit(&pp->p_lock);

	shm_detach(pp, sap);

	return (0);
}

/*
 * Remove all shared memory segments associated with a given zone.
 * Called by zone_shutdown when the zone is halted.
 */
/*ARGSUSED1*/
static void
shm_remove_zone(zoneid_t zoneid, void *arg)
{
	ipc_remove_zone(shm_svc, zoneid);
}
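
/*
 * Illustrative userland sketch (hypothetical): the size passed to
 * shmget(2) is charged against the shm-memory rctls rounded up to a
 * whole number of pages, while IPC_STAT reports the caller's original
 * byte count (see the comment above the shm_segsz assignment below).
 *
 *	int id = shmget(IPC_PRIVATE, 1, IPC_CREAT | 0600);
 *	(PAGESIZE bytes are reserved, yet IPC_STAT shows shm_segsz == 1)
 */
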
/*
 * Shmget (create new shmem) system call.
 */
static int
shmget(key_t key, size_t size, int shmflg, uintptr_t *rvp)
{
	proc_t		*pp = curproc;
	kshmid_t	*sp;
	kmutex_t	*lock;
	int		error;

top:
	if (error = ipc_get(shm_svc, key, shmflg, (kipc_perm_t **)&sp, &lock))
		return (error);

	if (!IPC_FREE(&sp->shm_perm)) {
		/*
		 * A segment with the requested key exists.
		 */
		if (size > sp->shm_segsz) {
			mutex_exit(lock);
			return (EINVAL);
		}
	} else {
		/*
		 * A new segment should be created.
		 */
		size_t npages = btopr(size);
		size_t rsize = ptob(npages);

		/*
		 * Check rsize and the per-project and per-zone limit on
		 * shared memory.  Checking rsize handles both the size == 0
		 * case and the size < ULONG_MAX & PAGEMASK case (i.e.
		 * rounding up wraps a size_t).
		 */
		mutex_enter(&pp->p_lock);
		if (rsize == 0 ||
		    (rctl_test(rc_project_shmmax,
		    pp->p_task->tk_proj->kpj_rctls, pp, rsize,
		    RCA_SAFE) & RCT_DENY) ||
		    (rctl_test(rc_zone_shmmax,
		    pp->p_zone->zone_rctls, pp, rsize,
		    RCA_SAFE) & RCT_DENY)) {

			mutex_exit(&pp->p_lock);
			mutex_exit(lock);
			ipc_cleanup(shm_svc, (kipc_perm_t *)sp);
			return (EINVAL);
		}
		mutex_exit(&pp->p_lock);
		mutex_exit(lock);

		if (anon_resv(rsize) == 0) {
			ipc_cleanup(shm_svc, (kipc_perm_t *)sp);
			return (ENOMEM);
		}

		/*
		 * If any new failure points are introduced between the
		 * above anon_resv() and the below ipc_commit_begin(), these
		 * failure points will need to unreserve the anon reserved
		 * using anon_unresv().
		 *
		 * Once ipc_commit_begin() is called, the anon reserved
		 * above will be automatically unreserved by future calls to
		 * ipcs_cleanup() -> shm_dtor() -> shm_rm_amp().  If
		 * ipc_commit_begin() fails, it internally calls shm_dtor(),
		 * unreserving the above anon, and freeing the below amp.
		 */

		sp->shm_amp = anonmap_alloc(rsize, rsize, ANON_SLEEP);
		sp->shm_amp->a_sp = sp;
		/*
		 * Store the original user's requested size, in bytes,
		 * rather than the page-aligned size.  The former is
		 * used for IPC_STAT and shmget() lookups.  The latter
		 * is saved in the anon_map structure and is used for
		 * calls to the vm layer.
		 */
		sp->shm_segsz = size;
		sp->shm_atime = sp->shm_dtime = 0;
		sp->shm_ctime = gethrestime_sec();
		sp->shm_lpid = (pid_t)0;
		sp->shm_cpid = curproc->p_pid;
		sp->shm_ismattch = 0;
		sp->shm_sptinfo = NULL;
		/*
		 * Check limits one last time, push id into global
		 * visibility, and update resource usage counts.
		 */
		if (error = ipc_commit_begin(shm_svc, key, shmflg,
		    (kipc_perm_t *)sp)) {
			if (error == EAGAIN)
				goto top;
			return (error);
		}

		if ((rctl_test(rc_project_shmmax,
		    sp->shm_perm.ipc_proj->kpj_rctls, pp, rsize,
		    RCA_SAFE) & RCT_DENY) ||
		    (rctl_test(rc_zone_shmmax,
		    sp->shm_perm.ipc_zone->zone_rctls, pp, rsize,
		    RCA_SAFE) & RCT_DENY)) {
			ipc_cleanup(shm_svc, (kipc_perm_t *)sp);
			return (EINVAL);
		}
		sp->shm_perm.ipc_proj->kpj_data.kpd_shmmax += rsize;
		sp->shm_perm.ipc_zone->zone_shmmax += rsize;

		lock = ipc_commit_end(shm_svc, &sp->shm_perm);
	}

#ifdef C2_AUDIT
	if (audit_active)
		audit_ipcget(AT_IPC_SHM, (void *)sp);
#endif

	*rvp = (uintptr_t)(sp->shm_perm.ipc_id);

	mutex_exit(lock);
	return (0);
}

/*
 * shmids system call.
 */
static int
shmids(int *buf, uint_t nids, uint_t *pnids)
{
	return (ipc_ids(shm_svc, buf, nids, pnids));
}

/*
 * System entry point for shmat, shmctl, shmdt, and shmget system calls.
 */
static uintptr_t
shmsys(int opcode, uintptr_t a0, uintptr_t a1, uintptr_t a2)
{
	int	error;
	uintptr_t r_val = 0;

	switch (opcode) {
	case SHMAT:
		error = shmat((int)a0, (caddr_t)a1, (int)a2, &r_val);
		break;
	case SHMCTL:
		error = shmctl((int)a0, (int)a1, (void *)a2);
		break;
	case SHMDT:
		error = shmdt((caddr_t)a0);
		break;
	case SHMGET:
		error = shmget((key_t)a0, (size_t)a1, (int)a2, &r_val);
		break;
	case SHMIDS:
		error = shmids((int *)a0, (uint_t)a1, (uint_t *)a2);
		break;
	default:
		error = EINVAL;
		break;
	}

	if (error)
		return ((uintptr_t)set_errno(error));

	return (r_val);
}

/*
 * segacct_t comparator
 * This works as expected, with one minor change: the first of two real
 * segments with equal addresses is considered to be 'greater than' the
 * second.  We only return equal when searching using a template, in
 * which case we explicitly set the template segment's length to 0
 * (which is invalid for a real segment).
 */
static int
shm_sacompar(const void *x, const void *y)
{
	segacct_t *sa1 = (segacct_t *)x;
	segacct_t *sa2 = (segacct_t *)y;

	if (sa1->sa_addr < sa2->sa_addr) {
		return (-1);
	} else if (sa2->sa_len != 0) {
		if (sa1->sa_addr >= sa2->sa_addr + sa2->sa_len) {
			return (1);
		} else if (sa1->sa_len != 0) {
			return (1);
		} else {
			return (0);
		}
	} else if (sa1->sa_addr > sa2->sa_addr) {
		return (1);
	} else {
		return (0);
	}
}
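
/*
 * Worked example of the comparator (illustrative addresses): with a
 * real node covering [0x10000, 0x12000) in the tree, a template lookup
 * for sa_addr == 0x11000 with sa_len == 0 falls inside the node's range
 * and compares equal, so avl_find() returns the enclosing attachment.
 * A real segment (sa_len != 0) at the same sa_addr instead compares
 * 'greater than', keeping duplicate keys ordered rather than equal.
 */
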
/*
 * add this record to the segacct list.
 */
static void
sa_add(struct proc *pp, caddr_t addr, size_t len, ulong_t flags, kshmid_t *id)
{
	segacct_t *nsap;
	avl_tree_t *tree = NULL;
	avl_index_t where;

	nsap = kmem_alloc(sizeof (segacct_t), KM_SLEEP);
	nsap->sa_addr = addr;
	nsap->sa_len = len;
	nsap->sa_flags = flags;
	nsap->sa_id = id;

	if (pp->p_segacct == NULL)
		tree = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);

	mutex_enter(&pp->p_lock);
	prbarrier(pp);			/* block /proc.  See shmgetid(). */

	if (pp->p_segacct == NULL) {
		avl_create(tree, shm_sacompar, sizeof (segacct_t),
		    offsetof(segacct_t, sa_tree));
		pp->p_segacct = tree;
	} else if (tree) {
		kmem_free(tree, sizeof (avl_tree_t));
	}

	/*
	 * We can ignore the result of avl_find, as the comparator will
	 * never return equal for segments with non-zero length.  This
	 * is a necessary hack to get around the fact that we do, in
	 * fact, have duplicate keys.
	 */
	(void) avl_find(pp->p_segacct, nsap, &where);
	avl_insert(pp->p_segacct, nsap, where);

	mutex_exit(&pp->p_lock);
}

/*
 * Duplicate parent's segacct records in child.
 */
void
shmfork(struct proc *ppp, struct proc *cpp)
{
	segacct_t *sap;
	kshmid_t *sp;
	kmutex_t *mp;

	ASSERT(ppp->p_segacct != NULL);

	/*
	 * We are the only lwp running in the parent so nobody can
	 * mess with our p_segacct list.  Thus it is safe to traverse
	 * the list without holding p_lock.  This is essential because
	 * we can't hold p_lock during a KM_SLEEP allocation.
	 */
	for (sap = (segacct_t *)avl_first(ppp->p_segacct); sap != NULL;
	    sap = (segacct_t *)AVL_NEXT(ppp->p_segacct, sap)) {
		sa_add(cpp, sap->sa_addr, sap->sa_len, sap->sa_flags,
		    sap->sa_id);
		sp = sap->sa_id;
		mp = ipc_lock(shm_svc, sp->shm_perm.ipc_id);
		if (sap->sa_flags & SHMSA_ISM)
			sp->shm_ismattch++;
		ipc_hold(shm_svc, (kipc_perm_t *)sp);
		mutex_exit(mp);
	}
}

/*
 * Detach shared memory segments from exiting process.
 */
void
shmexit(struct proc *pp)
{
	segacct_t *sap;
	avl_tree_t *tree;
	void *cookie = NULL;

	ASSERT(pp->p_segacct != NULL);

	mutex_enter(&pp->p_lock);
	prbarrier(pp);
	tree = pp->p_segacct;
	pp->p_segacct = NULL;
	mutex_exit(&pp->p_lock);

	while ((sap = avl_destroy_nodes(tree, &cookie)) != NULL)
		(void) shm_detach(pp, sap);

	avl_destroy(tree);
	kmem_free(tree, sizeof (avl_tree_t));
}
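
/*
 * How a SHM_LOCKed segment stays locked: shmem_lock() maps the amp into
 * a scratch address space and locks the pages with as_ctl(MC_LOCK);
 * lock_again() then takes an additional page_pp_lock() hold on each
 * page, so the pages remain locked after the scratch mappings are torn
 * down by as_unmap().
 */
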
/*
 * At this time pages should be in memory, so just lock them.
 */
static void
lock_again(size_t npages, kshmid_t *sp, struct anon_map *amp)
{
	struct anon *ap;
	struct page *pp;
	struct vnode *vp;
	u_offset_t off;
	ulong_t anon_idx;
	anon_sync_obj_t cookie;

	mutex_enter(&sp->shm_mlock);
	ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
	for (anon_idx = 0; npages != 0; anon_idx++, npages--) {

		anon_array_enter(amp, anon_idx, &cookie);
		ap = anon_get_ptr(amp->ahp, anon_idx);
		ASSERT(ap != NULL);
		swap_xlate(ap, &vp, &off);
		anon_array_exit(&cookie);

		pp = page_lookup(vp, off, SE_SHARED);
		if (pp == NULL) {
			panic("lock_again: page not in the system");
			/*NOTREACHED*/
		}
		/* page should already be locked by caller */
		ASSERT(pp->p_lckcnt > 0);
		(void) page_pp_lock(pp, 0, 0);
		page_unlock(pp);
	}
	ANON_LOCK_EXIT(&amp->a_rwlock);
	mutex_exit(&sp->shm_mlock);
}

/*
 * Attach the shared memory segment to the process
 * address space and lock the pages.
 */
static int
shmem_lock(kshmid_t *sp, struct anon_map *amp)
{
	size_t npages = btopr(amp->size);
	struct as *as;
	struct segvn_crargs crargs;
	uint_t error;

	/*
	 * A later ISM/DISM attach may increase the size of the amp, so
	 * cache the number of pages locked for the future shmem_unlock()
	 */
	sp->shm_lkpages = npages;

	as = as_alloc();
	/* Initialize the create arguments and map the segment */
	crargs = *(struct segvn_crargs *)zfod_argsp;	/* structure copy */
	crargs.offset = (u_offset_t)0;
	crargs.type = MAP_SHARED;
	crargs.amp = amp;
	crargs.prot = PROT_ALL;
	crargs.maxprot = crargs.prot;
	crargs.flags = 0;
	error = as_map(as, 0x0, amp->size, segvn_create, &crargs);
	if (!error) {
		if ((error = as_ctl(as, 0x0, amp->size, MC_LOCK, 0, 0,
		    NULL, 0)) == 0) {
			lock_again(npages, sp, amp);
		}
		(void) as_unmap(as, 0x0, amp->size);
	}
	as_free(as);
	return (error);
}


/*
 * Unlock shared memory
 */
static void
shmem_unlock(kshmid_t *sp, struct anon_map *amp)
{
	struct anon *ap;
	pgcnt_t npages = sp->shm_lkpages;
	struct vnode *vp;
	struct page *pp;
	u_offset_t off;
	ulong_t anon_idx;
	size_t unlocked_bytes = 0;
	kproject_t *proj;
	anon_sync_obj_t cookie;

	proj = sp->shm_perm.ipc_proj;
	mutex_enter(&sp->shm_mlock);
	ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
	for (anon_idx = 0; anon_idx < npages; anon_idx++) {

		anon_array_enter(amp, anon_idx, &cookie);
		if ((ap = anon_get_ptr(amp->ahp, anon_idx)) == NULL) {
			panic("shmem_unlock: null app");
			/*NOTREACHED*/
		}
		swap_xlate(ap, &vp, &off);
		anon_array_exit(&cookie);
		pp = page_lookup(vp, off, SE_SHARED);
		if (pp == NULL) {
			panic("shmem_unlock: page not in the system");
			/*NOTREACHED*/
		}
		/*
		 * Page should have at least one lock from the previous
		 * shmem_lock.
		 */
		ASSERT(pp->p_lckcnt > 0);
		page_pp_unlock(pp, 0, 0);
		if (pp->p_lckcnt == 0)
			unlocked_bytes += PAGESIZE;

		page_unlock(pp);
	}

	if (unlocked_bytes > 0) {
		rctl_decr_locked_mem(NULL, proj, unlocked_bytes, 0);
	}

	ANON_LOCK_EXIT(&amp->a_rwlock);
	mutex_exit(&sp->shm_mlock);
}

/*
 * We call this routine when we have removed all references to this
 * amp.  This means all shmdt()s and the IPC_RMID have been done.
 */
static void
shm_rm_amp(kshmid_t *sp)
{
	struct anon_map *amp = sp->shm_amp;
	zone_t *zone;

	zone = sp->shm_perm.ipc_zone;
	ASSERT(zone != NULL);
	/*
	 * Free up the anon_map.
	 */
	lgrp_shm_policy_fini(amp, NULL);
	if (amp->a_szc != 0) {
		ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
		anon_shmap_free_pages(amp, 0, amp->size);
		ANON_LOCK_EXIT(&amp->a_rwlock);
	} else {
		anon_free(amp->ahp, 0, amp->size);
	}
	anon_unresv_zone(amp->swresv, zone);
	anonmap_free(amp);
}

/*
 * Return the shared memory id for the process's virtual address.
 * Return SHMID_NONE if addr is not within a SysV shared memory segment.
 * Return SHMID_FREE if addr's SysV shared memory segment's id has been freed.
 *
 * shmgetid() is called from code in /proc with the process locked but
 * with pp->p_lock not held.  The address space lock is held, so we
 * cannot grab pp->p_lock here due to lock-ordering constraints.
 * Because of all this, modifications to the p_segacct list must only
 * be made after calling prbarrier() to ensure the process is not locked.
 * See shmdt() and sa_add(), above.  shmgetid() may also be called on a
 * thread's own process without the process locked.
 */
int
shmgetid(proc_t *pp, caddr_t addr)
{
	segacct_t *sap, template;

	ASSERT(MUTEX_NOT_HELD(&pp->p_lock));
	ASSERT((pp->p_proc_flag & P_PR_LOCK) || pp == curproc);

	if (pp->p_segacct == NULL)
		return (SHMID_NONE);

	template.sa_addr = addr;
	template.sa_len = 0;
	if ((sap = avl_find(pp->p_segacct, &template, NULL)) == NULL)
		return (SHMID_NONE);

	if (IPC_FREE(&sap->sa_id->shm_perm))
		return (SHMID_FREE);

	return (sap->sa_id->shm_perm.ipc_id);
}