1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ 27 /* All Rights Reserved */ 28 29 /* 30 * University Copyright- Copyright (c) 1982, 1986, 1988 31 * The Regents of the University of California 32 * All Rights Reserved 33 * 34 * University Acknowledgment- Portions of this document are derived from 35 * software developed by the University of California, Berkeley, and its 36 * contributors. 37 */ 38 39 #pragma ident "%Z%%M% %I% %E% SMI" 40 41 /* 42 * Inter-Process Communication Shared Memory Facility. 43 * 44 * See os/ipc.c for a description of common IPC functionality. 45 * 46 * Resource controls 47 * ----------------- 48 * 49 * Control: zone.max-shm-ids (rc_zone_shmmni) 50 * Description: Maximum number of shared memory ids allowed a zone. 51 * 52 * When shmget() is used to allocate a shared memory segment, one id 53 * is allocated. If the id allocation doesn't succeed, shmget() 54 * fails and errno is set to ENOSPC. Upon successful shmctl(, 55 * IPC_RMID) the id is deallocated. 56 * 57 * Control: project.max-shm-ids (rc_project_shmmni) 58 * Description: Maximum number of shared memory ids allowed a project. 59 * 60 * When shmget() is used to allocate a shared memory segment, one id 61 * is allocated. If the id allocation doesn't succeed, shmget() 62 * fails and errno is set to ENOSPC. Upon successful shmctl(, 63 * IPC_RMID) the id is deallocated. 64 * 65 * Control: zone.max-shm-memory (rc_zone_shmmax) 66 * Description: Total amount of shared memory allowed a zone. 67 * 68 * When shmget() is used to allocate a shared memory segment, the 69 * segment's size is allocated against this limit. If the space 70 * allocation doesn't succeed, shmget() fails and errno is set to 71 * EINVAL. The size will be deallocated once the last process has 72 * detached the segment and the segment has been successfully 73 * shmctl(, IPC_RMID)ed. 74 * 75 * Control: project.max-shm-memory (rc_project_shmmax) 76 * Description: Total amount of shared memory allowed a project. 77 * 78 * When shmget() is used to allocate a shared memory segment, the 79 * segment's size is allocated against this limit. If the space 80 * allocation doesn't succeed, shmget() fails and errno is set to 81 * EINVAL. The size will be deallocated once the last process has 82 * detached the segment and the segment has been successfully 83 * shmctl(, IPC_RMID)ed. 84 */ 85 86 #include <sys/types.h> 87 #include <sys/param.h> 88 #include <sys/cred.h> 89 #include <sys/errno.h> 90 #include <sys/time.h> 91 #include <sys/kmem.h> 92 #include <sys/user.h> 93 #include <sys/proc.h> 94 #include <sys/systm.h> 95 #include <sys/prsystm.h> 96 #include <sys/sysmacros.h> 97 #include <sys/tuneable.h> 98 #include <sys/vm.h> 99 #include <sys/mman.h> 100 #include <sys/swap.h> 101 #include <sys/cmn_err.h> 102 #include <sys/debug.h> 103 #include <sys/lwpchan_impl.h> 104 #include <sys/avl.h> 105 #include <sys/modctl.h> 106 #include <sys/syscall.h> 107 #include <sys/task.h> 108 #include <sys/project.h> 109 #include <sys/policy.h> 110 #include <sys/zone.h> 111 #include <sys/rctl.h> 112 113 #include <sys/ipc.h> 114 #include <sys/ipc_impl.h> 115 #include <sys/shm.h> 116 #include <sys/shm_impl.h> 117 118 #include <vm/hat.h> 119 #include <vm/seg.h> 120 #include <vm/as.h> 121 #include <vm/seg_vn.h> 122 #include <vm/anon.h> 123 #include <vm/page.h> 124 #include <vm/vpage.h> 125 #include <vm/seg_spt.h> 126 127 #include <c2/audit.h> 128 129 static int shmem_lock(kshmid_t *sp, struct anon_map *amp); 130 static void shmem_unlock(kshmid_t *sp, struct anon_map *amp); 131 static void sa_add(struct proc *pp, caddr_t addr, size_t len, ulong_t flags, 132 kshmid_t *id); 133 static void shm_rm_amp(struct anon_map *amp); 134 static void shm_dtor(kipc_perm_t *); 135 static void shm_rmid(kipc_perm_t *); 136 static void shm_remove_zone(zoneid_t, void *); 137 138 /* 139 * Semantics for share_page_table and ism_off: 140 * 141 * These are hooks in /etc/system - only for internal testing purpose. 142 * 143 * Setting share_page_table automatically turns on the SHM_SHARE_MMU (ISM) flag 144 * in a call to shmat(2). In other words, with share_page_table set, you always 145 * get ISM, even if say, DISM is specified. It should really be called "ism_on". 146 * 147 * Setting ism_off turns off the SHM_SHARE_MMU flag from the flags passed to 148 * shmat(2). 149 * 150 * If both share_page_table and ism_off are set, share_page_table prevails. 151 * 152 * Although these tunables should probably be removed, they do have some 153 * external exposure; as long as they exist, they should at least work sensibly. 154 */ 155 156 int share_page_table; 157 int ism_off; 158 159 /* 160 * The following tunables are obsolete. Though for compatibility we 161 * still read and interpret shminfo_shmmax and shminfo_shmmni (see 162 * os/project.c), the preferred mechanism for administrating the IPC 163 * Shared Memory facility is through the resource controls described at 164 * the top of this file. 165 */ 166 size_t shminfo_shmmax = 0x800000; /* (obsolete) */ 167 int shminfo_shmmni = 100; /* (obsolete) */ 168 size_t shminfo_shmmin = 1; /* (obsolete) */ 169 int shminfo_shmseg = 6; /* (obsolete) */ 170 171 extern rctl_hndl_t rc_zone_shmmax; 172 extern rctl_hndl_t rc_zone_shmmni; 173 extern rctl_hndl_t rc_project_shmmax; 174 extern rctl_hndl_t rc_project_shmmni; 175 static ipc_service_t *shm_svc; 176 static zone_key_t shm_zone_key; 177 178 /* 179 * Module linkage information for the kernel. 180 */ 181 static uintptr_t shmsys(int, uintptr_t, uintptr_t, uintptr_t); 182 183 static struct sysent ipcshm_sysent = { 184 4, 185 #ifdef _SYSCALL32_IMPL 186 SE_ARGC | SE_NOUNLOAD | SE_64RVAL, 187 #else /* _SYSCALL32_IMPL */ 188 SE_ARGC | SE_NOUNLOAD | SE_32RVAL1, 189 #endif /* _SYSCALL32_IMPL */ 190 (int (*)())shmsys 191 }; 192 193 #ifdef _SYSCALL32_IMPL 194 static struct sysent ipcshm_sysent32 = { 195 4, 196 SE_ARGC | SE_NOUNLOAD | SE_32RVAL1, 197 (int (*)())shmsys 198 }; 199 #endif /* _SYSCALL32_IMPL */ 200 201 static struct modlsys modlsys = { 202 &mod_syscallops, "System V shared memory", &ipcshm_sysent 203 }; 204 205 #ifdef _SYSCALL32_IMPL 206 static struct modlsys modlsys32 = { 207 &mod_syscallops32, "32-bit System V shared memory", &ipcshm_sysent32 208 }; 209 #endif /* _SYSCALL32_IMPL */ 210 211 static struct modlinkage modlinkage = { 212 MODREV_1, 213 &modlsys, 214 #ifdef _SYSCALL32_IMPL 215 &modlsys32, 216 #endif 217 NULL 218 }; 219 220 221 int 222 _init(void) 223 { 224 int result; 225 226 shm_svc = ipcs_create("shmids", rc_project_shmmni, rc_zone_shmmni, 227 sizeof (kshmid_t), shm_dtor, shm_rmid, AT_IPC_SHM, 228 offsetof(ipc_rqty_t, ipcq_shmmni)); 229 zone_key_create(&shm_zone_key, NULL, shm_remove_zone, NULL); 230 231 if ((result = mod_install(&modlinkage)) == 0) 232 return (0); 233 234 (void) zone_key_delete(shm_zone_key); 235 ipcs_destroy(shm_svc); 236 237 return (result); 238 } 239 240 int 241 _fini(void) 242 { 243 return (EBUSY); 244 } 245 246 int 247 _info(struct modinfo *modinfop) 248 { 249 return (mod_info(&modlinkage, modinfop)); 250 } 251 252 /* 253 * Shmat (attach shared segment) system call. 254 */ 255 static int 256 shmat(int shmid, caddr_t uaddr, int uflags, uintptr_t *rvp) 257 { 258 kshmid_t *sp; /* shared memory header ptr */ 259 size_t size; 260 int error = 0; 261 proc_t *pp = curproc; 262 struct as *as = pp->p_as; 263 struct segvn_crargs crargs; /* segvn create arguments */ 264 kmutex_t *lock; 265 struct seg *segspt = NULL; 266 caddr_t addr = uaddr; 267 int flags = (uflags & SHMAT_VALID_FLAGS_MASK); 268 int useISM; 269 uchar_t prot = PROT_ALL; 270 int result; 271 272 if ((lock = ipc_lookup(shm_svc, shmid, (kipc_perm_t **)&sp)) == NULL) 273 return (EINVAL); 274 if (error = ipcperm_access(&sp->shm_perm, SHM_R, CRED())) 275 goto errret; 276 if ((flags & SHM_RDONLY) == 0 && 277 (error = ipcperm_access(&sp->shm_perm, SHM_W, CRED()))) 278 goto errret; 279 if (spt_invalid(flags)) { 280 error = EINVAL; 281 goto errret; 282 } 283 if (ism_off) 284 flags = flags & ~SHM_SHARE_MMU; 285 if (share_page_table) { 286 flags = flags & ~SHM_PAGEABLE; 287 flags = flags | SHM_SHARE_MMU; 288 } 289 useISM = (spt_locked(flags) || spt_pageable(flags)); 290 if (useISM && (error = ipcperm_access(&sp->shm_perm, SHM_W, CRED()))) 291 goto errret; 292 if (useISM && isspt(sp)) { 293 uint_t newsptflags = flags | spt_flags(sp->shm_sptseg); 294 /* 295 * If trying to change an existing {D}ISM segment from ISM 296 * to DISM or vice versa, return error. Note that this 297 * validation of flags needs to be done after the effect of 298 * tunables such as ism_off and share_page_table, for 299 * semantics that are consistent with the tunables' settings. 300 */ 301 if (spt_invalid(newsptflags)) { 302 error = EINVAL; 303 goto errret; 304 } 305 } 306 ANON_LOCK_ENTER(&sp->shm_amp->a_rwlock, RW_WRITER); 307 size = sp->shm_amp->size; 308 ANON_LOCK_EXIT(&sp->shm_amp->a_rwlock); 309 310 /* somewhere to record spt info for final detach */ 311 if (sp->shm_sptinfo == NULL) 312 sp->shm_sptinfo = kmem_zalloc(sizeof (sptinfo_t), KM_SLEEP); 313 314 as_rangelock(as); 315 316 if (useISM) { 317 /* 318 * Handle ISM 319 */ 320 uint_t n, share_szc; 321 size_t share_size; 322 struct shm_data ssd; 323 uintptr_t align_hint; 324 325 n = page_num_pagesizes(); 326 if (n < 2) { /* large pages aren't supported */ 327 as_rangeunlock(as); 328 error = EINVAL; 329 goto errret; 330 } 331 332 /* 333 * Pick a share pagesize to use, if (!isspt(sp)). 334 * Otherwise use the already chosen page size. 335 * 336 * For the initial shmat (!isspt(sp)), where sptcreate is 337 * called, map_pgsz is called to recommend a [D]ISM pagesize, 338 * important for systems which offer more than one potential 339 * [D]ISM pagesize. 340 * If the shmat is just to attach to an already created 341 * [D]ISM segment, then use the previously selected page size. 342 */ 343 if (!isspt(sp)) { 344 share_size = map_pgsz(MAPPGSZ_ISM, 345 pp, addr, size, NULL); 346 if (share_size == 0) { 347 as_rangeunlock(as); 348 error = EINVAL; 349 goto errret; 350 } 351 share_szc = page_szc(share_size); 352 } else { 353 share_szc = sp->shm_sptseg->s_szc; 354 share_size = page_get_pagesize(share_szc); 355 } 356 size = P2ROUNDUP(size, share_size); 357 358 align_hint = share_size; 359 #if defined(__i386) || defined(__amd64) 360 /* 361 * For 64 bit amd64, we want to share an entire page table 362 * if possible. We know (ugh) that there are 512 entries in 363 * in a page table. The number for 32 bit non-PAE should be 364 * 1024, but I'm not going to special case that. Note using 512 365 * won't cause a failure below. It retries with align_hint set 366 * to share_size 367 */ 368 while (size >= 512 * (uint64_t)align_hint) 369 align_hint *= 512; 370 #endif /* __i386 || __amd64 */ 371 372 #if defined(__sparcv9) 373 if (addr == 0 && curproc->p_model == DATAMODEL_LP64) { 374 /* 375 * If no address has been passed in, and this is a 376 * 64-bit process, we'll try to find an address 377 * in the predict-ISM zone. 378 */ 379 caddr_t predbase = (caddr_t)PREDISM_1T_BASE; 380 size_t len = PREDISM_BOUND - PREDISM_1T_BASE; 381 382 as_purge(as); 383 if (as_gap(as, size + share_size, &predbase, &len, 384 AH_LO, (caddr_t)NULL) != -1) { 385 /* 386 * We found an address which looks like a 387 * candidate. We want to round it up, and 388 * then check that it's a valid user range. 389 * This assures that we won't fail below. 390 */ 391 addr = (caddr_t)P2ROUNDUP((uintptr_t)predbase, 392 share_size); 393 394 if (valid_usr_range(addr, size, prot, 395 as, as->a_userlimit) != RANGE_OKAY) { 396 addr = 0; 397 } 398 } 399 } 400 #endif /* __sparcv9 */ 401 402 if (addr == 0) { 403 for (;;) { 404 addr = (caddr_t)align_hint; 405 map_addr(&addr, size, 0ll, 1, MAP_ALIGN); 406 if (addr != NULL || align_hint == share_size) 407 break; 408 align_hint = share_size; 409 } 410 if (addr == NULL) { 411 as_rangeunlock(as); 412 error = ENOMEM; 413 goto errret; 414 } 415 ASSERT(((uintptr_t)addr & (align_hint - 1)) == 0); 416 } else { 417 /* Use the user-supplied attach address */ 418 caddr_t base; 419 size_t len; 420 421 /* 422 * Check that the address range 423 * 1) is properly aligned 424 * 2) is correct in unix terms 425 * 3) is within an unmapped address segment 426 */ 427 base = addr; 428 len = size; /* use spt aligned size */ 429 /* XXX - in SunOS, is sp->shm_segsz */ 430 if ((uintptr_t)base & (share_size - 1)) { 431 error = EINVAL; 432 as_rangeunlock(as); 433 goto errret; 434 } 435 result = valid_usr_range(base, len, prot, as, 436 as->a_userlimit); 437 if (result == RANGE_BADPROT) { 438 /* 439 * We try to accomodate processors which 440 * may not support execute permissions on 441 * all ISM segments by trying the check 442 * again but without PROT_EXEC. 443 */ 444 prot &= ~PROT_EXEC; 445 result = valid_usr_range(base, len, prot, as, 446 as->a_userlimit); 447 } 448 as_purge(as); 449 if (result != RANGE_OKAY || 450 as_gap(as, len, &base, &len, AH_LO, 451 (caddr_t)NULL) != 0) { 452 error = EINVAL; 453 as_rangeunlock(as); 454 goto errret; 455 } 456 } 457 458 if (!isspt(sp)) { 459 error = sptcreate(size, &segspt, sp->shm_amp, prot, 460 flags, share_szc); 461 if (error) { 462 as_rangeunlock(as); 463 goto errret; 464 } 465 sp->shm_sptinfo->sptas = segspt->s_as; 466 sp->shm_sptseg = segspt; 467 sp->shm_sptprot = prot; 468 } else if ((prot & sp->shm_sptprot) != sp->shm_sptprot) { 469 /* 470 * Ensure we're attaching to an ISM segment with 471 * fewer or equal permissions than what we're 472 * allowed. Fail if the segment has more 473 * permissions than what we're allowed. 474 */ 475 error = EACCES; 476 as_rangeunlock(as); 477 goto errret; 478 } 479 480 ssd.shm_sptseg = sp->shm_sptseg; 481 ssd.shm_sptas = sp->shm_sptinfo->sptas; 482 ssd.shm_amp = sp->shm_amp; 483 error = as_map(as, addr, size, segspt_shmattach, &ssd); 484 if (error == 0) 485 sp->shm_ismattch++; /* keep count of ISM attaches */ 486 } else { 487 488 /* 489 * Normal case. 490 */ 491 if (flags & SHM_RDONLY) 492 prot &= ~PROT_WRITE; 493 494 if (addr == 0) { 495 /* Let the system pick the attach address */ 496 map_addr(&addr, size, 0ll, 1, 0); 497 if (addr == NULL) { 498 as_rangeunlock(as); 499 error = ENOMEM; 500 goto errret; 501 } 502 } else { 503 /* Use the user-supplied attach address */ 504 caddr_t base; 505 size_t len; 506 507 if (flags & SHM_RND) 508 addr = (caddr_t)((uintptr_t)addr & 509 ~(SHMLBA - 1)); 510 /* 511 * Check that the address range 512 * 1) is properly aligned 513 * 2) is correct in unix terms 514 * 3) is within an unmapped address segment 515 */ 516 base = addr; 517 len = size; /* use aligned size */ 518 /* XXX - in SunOS, is sp->shm_segsz */ 519 if ((uintptr_t)base & PAGEOFFSET) { 520 error = EINVAL; 521 as_rangeunlock(as); 522 goto errret; 523 } 524 result = valid_usr_range(base, len, prot, as, 525 as->a_userlimit); 526 if (result == RANGE_BADPROT) { 527 prot &= ~PROT_EXEC; 528 result = valid_usr_range(base, len, prot, as, 529 as->a_userlimit); 530 } 531 as_purge(as); 532 if (result != RANGE_OKAY || 533 as_gap(as, len, &base, &len, 534 AH_LO, (caddr_t)NULL) != 0) { 535 error = EINVAL; 536 as_rangeunlock(as); 537 goto errret; 538 } 539 } 540 541 /* Initialize the create arguments and map the segment */ 542 crargs = *(struct segvn_crargs *)zfod_argsp; 543 crargs.offset = 0; 544 crargs.type = MAP_SHARED; 545 crargs.amp = sp->shm_amp; 546 crargs.prot = prot; 547 crargs.maxprot = crargs.prot; 548 crargs.flags = 0; 549 550 error = as_map(as, addr, size, segvn_create, &crargs); 551 } 552 553 as_rangeunlock(as); 554 if (error) 555 goto errret; 556 557 /* record shmem range for the detach */ 558 sa_add(pp, addr, (size_t)size, useISM ? SHMSA_ISM : 0, sp); 559 *rvp = (uintptr_t)addr; 560 561 sp->shm_atime = gethrestime_sec(); 562 sp->shm_lpid = pp->p_pid; 563 ipc_hold(shm_svc, (kipc_perm_t *)sp); 564 errret: 565 mutex_exit(lock); 566 return (error); 567 } 568 569 static void 570 shm_dtor(kipc_perm_t *perm) 571 { 572 kshmid_t *sp = (kshmid_t *)perm; 573 uint_t cnt; 574 size_t rsize; 575 576 if (sp->shm_lkcnt > 0) { 577 shmem_unlock(sp, sp->shm_amp); 578 sp->shm_lkcnt = 0; 579 } 580 581 if (sp->shm_sptinfo) { 582 if (isspt(sp)) 583 sptdestroy(sp->shm_sptinfo->sptas, sp->shm_amp); 584 kmem_free(sp->shm_sptinfo, sizeof (sptinfo_t)); 585 } 586 587 ANON_LOCK_ENTER(&sp->shm_amp->a_rwlock, RW_WRITER); 588 cnt = --sp->shm_amp->refcnt; 589 ANON_LOCK_EXIT(&sp->shm_amp->a_rwlock); 590 ASSERT(cnt == 0); 591 shm_rm_amp(sp->shm_amp); 592 593 if (sp->shm_perm.ipc_id != IPC_ID_INVAL) { 594 rsize = ptob(btopr(sp->shm_segsz)); 595 ipcs_lock(shm_svc); 596 sp->shm_perm.ipc_proj->kpj_data.kpd_shmmax -= rsize; 597 sp->shm_perm.ipc_zone->zone_shmmax -= rsize; 598 ipcs_unlock(shm_svc); 599 } 600 } 601 602 /* ARGSUSED */ 603 static void 604 shm_rmid(kipc_perm_t *perm) 605 { 606 /* nothing to do */ 607 } 608 609 /* 610 * Shmctl system call. 611 */ 612 /* ARGSUSED */ 613 static int 614 shmctl(int shmid, int cmd, void *arg) 615 { 616 kshmid_t *sp; /* shared memory header ptr */ 617 STRUCT_DECL(shmid_ds, ds); /* for SVR4 IPC_SET */ 618 int error = 0; 619 struct cred *cr = CRED(); 620 kmutex_t *lock; 621 model_t mdl = get_udatamodel(); 622 struct shmid_ds64 ds64; 623 shmatt_t nattch; 624 625 STRUCT_INIT(ds, mdl); 626 627 /* 628 * Perform pre- or non-lookup actions (e.g. copyins, RMID). 629 */ 630 switch (cmd) { 631 case IPC_SET: 632 if (copyin(arg, STRUCT_BUF(ds), STRUCT_SIZE(ds))) 633 return (EFAULT); 634 break; 635 636 case IPC_SET64: 637 if (copyin(arg, &ds64, sizeof (struct shmid_ds64))) 638 return (EFAULT); 639 break; 640 641 case IPC_RMID: 642 return (ipc_rmid(shm_svc, shmid, cr)); 643 } 644 645 if ((lock = ipc_lookup(shm_svc, shmid, (kipc_perm_t **)&sp)) == NULL) 646 return (EINVAL); 647 648 switch (cmd) { 649 /* Set ownership and permissions. */ 650 case IPC_SET: 651 if (error = ipcperm_set(shm_svc, cr, &sp->shm_perm, 652 &STRUCT_BUF(ds)->shm_perm, mdl)) 653 break; 654 sp->shm_ctime = gethrestime_sec(); 655 break; 656 657 case IPC_STAT: 658 if (error = ipcperm_access(&sp->shm_perm, SHM_R, cr)) 659 break; 660 661 nattch = sp->shm_perm.ipc_ref - 1; 662 663 ipcperm_stat(&STRUCT_BUF(ds)->shm_perm, &sp->shm_perm, mdl); 664 STRUCT_FSET(ds, shm_segsz, sp->shm_segsz); 665 STRUCT_FSETP(ds, shm_amp, NULL); /* kernel addr */ 666 STRUCT_FSET(ds, shm_lkcnt, sp->shm_lkcnt); 667 STRUCT_FSET(ds, shm_lpid, sp->shm_lpid); 668 STRUCT_FSET(ds, shm_cpid, sp->shm_cpid); 669 STRUCT_FSET(ds, shm_nattch, nattch); 670 STRUCT_FSET(ds, shm_cnattch, sp->shm_ismattch); 671 STRUCT_FSET(ds, shm_atime, sp->shm_atime); 672 STRUCT_FSET(ds, shm_dtime, sp->shm_dtime); 673 STRUCT_FSET(ds, shm_ctime, sp->shm_ctime); 674 675 mutex_exit(lock); 676 if (copyout(STRUCT_BUF(ds), arg, STRUCT_SIZE(ds))) 677 return (EFAULT); 678 679 return (0); 680 681 case IPC_SET64: 682 if (error = ipcperm_set64(shm_svc, cr, 683 &sp->shm_perm, &ds64.shmx_perm)) 684 break; 685 sp->shm_ctime = gethrestime_sec(); 686 break; 687 688 case IPC_STAT64: 689 nattch = sp->shm_perm.ipc_ref - 1; 690 691 ipcperm_stat64(&ds64.shmx_perm, &sp->shm_perm); 692 ds64.shmx_segsz = sp->shm_segsz; 693 ds64.shmx_lkcnt = sp->shm_lkcnt; 694 ds64.shmx_lpid = sp->shm_lpid; 695 ds64.shmx_cpid = sp->shm_cpid; 696 ds64.shmx_nattch = nattch; 697 ds64.shmx_cnattch = sp->shm_ismattch; 698 ds64.shmx_atime = sp->shm_atime; 699 ds64.shmx_dtime = sp->shm_dtime; 700 ds64.shmx_ctime = sp->shm_ctime; 701 702 mutex_exit(lock); 703 if (copyout(&ds64, arg, sizeof (struct shmid_ds64))) 704 return (EFAULT); 705 706 return (0); 707 708 /* Lock segment in memory */ 709 case SHM_LOCK: 710 if ((error = secpolicy_lock_memory(cr)) != 0) 711 break; 712 713 /* protect against overflow */ 714 if (sp->shm_lkcnt >= USHRT_MAX) { 715 error = ENOMEM; 716 break; 717 } 718 if (!isspt(sp) && (sp->shm_lkcnt++ == 0)) { 719 if (error = shmem_lock(sp, sp->shm_amp)) { 720 ANON_LOCK_ENTER(&sp->shm_amp->a_rwlock, RW_WRITER); 721 cmn_err(CE_NOTE, 722 "shmctl - couldn't lock %ld pages into memory", 723 sp->shm_amp->size); 724 ANON_LOCK_EXIT(&sp->shm_amp->a_rwlock); 725 error = ENOMEM; 726 sp->shm_lkcnt--; 727 } 728 } 729 break; 730 731 /* Unlock segment */ 732 case SHM_UNLOCK: 733 if ((error = secpolicy_lock_memory(cr)) != 0) 734 break; 735 736 if (sp->shm_lkcnt && (--sp->shm_lkcnt == 0)) { 737 shmem_unlock(sp, sp->shm_amp); 738 } 739 break; 740 741 default: 742 error = EINVAL; 743 break; 744 } 745 mutex_exit(lock); 746 return (error); 747 } 748 749 static void 750 shm_detach(proc_t *pp, segacct_t *sap) 751 { 752 kshmid_t *sp = sap->sa_id; 753 size_t len = sap->sa_len; 754 caddr_t addr = sap->sa_addr; 755 756 /* 757 * Discard lwpchan mappings. 758 */ 759 if (pp->p_lcp != NULL) 760 lwpchan_delete_mapping(pp, addr, addr + len); 761 (void) as_unmap(pp->p_as, addr, len); 762 763 /* 764 * Perform some detach-time accounting. 765 */ 766 (void) ipc_lock(shm_svc, sp->shm_perm.ipc_id); 767 if (sap->sa_flags & SHMSA_ISM) 768 sp->shm_ismattch--; 769 sp->shm_dtime = gethrestime_sec(); 770 sp->shm_lpid = pp->p_pid; 771 ipc_rele(shm_svc, (kipc_perm_t *)sp); /* Drops lock */ 772 773 kmem_free(sap, sizeof (segacct_t)); 774 } 775 776 static int 777 shmdt(caddr_t addr) 778 { 779 proc_t *pp = curproc; 780 segacct_t *sap, template; 781 782 mutex_enter(&pp->p_lock); 783 prbarrier(pp); /* block /proc. See shmgetid(). */ 784 785 template.sa_addr = addr; 786 template.sa_len = 0; 787 if ((pp->p_segacct == NULL) || 788 ((sap = avl_find(pp->p_segacct, &template, NULL)) == NULL)) { 789 mutex_exit(&pp->p_lock); 790 return (EINVAL); 791 } 792 if (sap->sa_addr != addr) { 793 mutex_exit(&pp->p_lock); 794 return (EINVAL); 795 } 796 avl_remove(pp->p_segacct, sap); 797 mutex_exit(&pp->p_lock); 798 799 shm_detach(pp, sap); 800 801 return (0); 802 } 803 804 /* 805 * Remove all shared memory segments associated with a given zone. 806 * Called by zone_shutdown when the zone is halted. 807 */ 808 /*ARGSUSED1*/ 809 static void 810 shm_remove_zone(zoneid_t zoneid, void *arg) 811 { 812 ipc_remove_zone(shm_svc, zoneid); 813 } 814 815 /* 816 * Shmget (create new shmem) system call. 817 */ 818 static int 819 shmget(key_t key, size_t size, int shmflg, uintptr_t *rvp) 820 { 821 proc_t *pp = curproc; 822 kshmid_t *sp; 823 kmutex_t *lock; 824 int error; 825 826 top: 827 if (error = ipc_get(shm_svc, key, shmflg, (kipc_perm_t **)&sp, &lock)) 828 return (error); 829 830 if (!IPC_FREE(&sp->shm_perm)) { 831 /* 832 * A segment with the requested key exists. 833 */ 834 if (size > sp->shm_segsz) { 835 mutex_exit(lock); 836 return (EINVAL); 837 } 838 } else { 839 /* 840 * A new segment should be created. 841 */ 842 size_t npages = btopr(size); 843 size_t rsize = ptob(npages); 844 845 /* 846 * Check rsize and the per-project and per-zone limit on 847 * shared memory. Checking rsize handles both the size == 0 848 * case and the size < ULONG_MAX & PAGEMASK case (i.e. 849 * rounding up wraps a size_t). 850 */ 851 if (rsize == 0 || 852 (rctl_test(rc_project_shmmax, 853 pp->p_task->tk_proj->kpj_rctls, pp, rsize, 854 RCA_SAFE) & RCT_DENY) || 855 (rctl_test(rc_zone_shmmax, 856 pp->p_zone->zone_rctls, pp, rsize, 857 RCA_SAFE) & RCT_DENY)) { 858 859 mutex_exit(&pp->p_lock); 860 mutex_exit(lock); 861 ipc_cleanup(shm_svc, (kipc_perm_t *)sp); 862 return (EINVAL); 863 } 864 mutex_exit(&pp->p_lock); 865 mutex_exit(lock); 866 867 if (anon_resv(rsize) == 0) { 868 ipc_cleanup(shm_svc, (kipc_perm_t *)sp); 869 return (ENOMEM); 870 } 871 872 sp->shm_amp = anonmap_alloc(rsize, rsize); 873 sp->shm_amp->a_sp = sp; 874 /* 875 * Store the original user's requested size, in bytes, 876 * rather than the page-aligned size. The former is 877 * used for IPC_STAT and shmget() lookups. The latter 878 * is saved in the anon_map structure and is used for 879 * calls to the vm layer. 880 */ 881 sp->shm_segsz = size; 882 sp->shm_atime = sp->shm_dtime = 0; 883 sp->shm_ctime = gethrestime_sec(); 884 sp->shm_lpid = (pid_t)0; 885 sp->shm_cpid = curproc->p_pid; 886 sp->shm_ismattch = 0; 887 sp->shm_sptinfo = NULL; 888 /* 889 * Check limits one last time, push id into global 890 * visibility, and update resource usage counts. 891 */ 892 if (error = ipc_commit_begin(shm_svc, key, shmflg, 893 (kipc_perm_t *)sp)) { 894 if (error == EAGAIN) 895 goto top; 896 return (error); 897 } 898 899 if ((rctl_test(rc_project_shmmax, 900 sp->shm_perm.ipc_proj->kpj_rctls, pp, rsize, 901 RCA_SAFE) & RCT_DENY) || 902 (rctl_test(rc_zone_shmmax, 903 sp->shm_perm.ipc_zone->zone_rctls, pp, rsize, 904 RCA_SAFE) & RCT_DENY)) { 905 ipc_cleanup(shm_svc, (kipc_perm_t *)sp); 906 return (EINVAL); 907 } 908 sp->shm_perm.ipc_proj->kpj_data.kpd_shmmax += rsize; 909 sp->shm_perm.ipc_zone->zone_shmmax += rsize; 910 911 lock = ipc_commit_end(shm_svc, &sp->shm_perm); 912 } 913 914 #ifdef C2_AUDIT 915 if (audit_active) 916 audit_ipcget(AT_IPC_SHM, (void *)sp); 917 #endif 918 919 *rvp = (uintptr_t)(sp->shm_perm.ipc_id); 920 921 mutex_exit(lock); 922 return (0); 923 } 924 925 /* 926 * shmids system call. 927 */ 928 static int 929 shmids(int *buf, uint_t nids, uint_t *pnids) 930 { 931 return (ipc_ids(shm_svc, buf, nids, pnids)); 932 } 933 934 /* 935 * System entry point for shmat, shmctl, shmdt, and shmget system calls. 936 */ 937 static uintptr_t 938 shmsys(int opcode, uintptr_t a0, uintptr_t a1, uintptr_t a2) 939 { 940 int error; 941 uintptr_t r_val = 0; 942 943 switch (opcode) { 944 case SHMAT: 945 error = shmat((int)a0, (caddr_t)a1, (int)a2, &r_val); 946 break; 947 case SHMCTL: 948 error = shmctl((int)a0, (int)a1, (void *)a2); 949 break; 950 case SHMDT: 951 error = shmdt((caddr_t)a0); 952 break; 953 case SHMGET: 954 error = shmget((key_t)a0, (size_t)a1, (int)a2, &r_val); 955 break; 956 case SHMIDS: 957 error = shmids((int *)a0, (uint_t)a1, (uint_t *)a2); 958 break; 959 default: 960 error = EINVAL; 961 break; 962 } 963 964 if (error) 965 return ((uintptr_t)set_errno(error)); 966 967 return (r_val); 968 } 969 970 /* 971 * segacct_t comparator 972 * This works as expected, with one minor change: the first of two real 973 * segments with equal addresses is considered to be 'greater than' the 974 * second. We only return equal when searching using a template, in 975 * which case we explicitly set the template segment's length to 0 976 * (which is invalid for a real segment). 977 */ 978 static int 979 shm_sacompar(const void *x, const void *y) 980 { 981 segacct_t *sa1 = (segacct_t *)x; 982 segacct_t *sa2 = (segacct_t *)y; 983 984 if (sa1->sa_addr < sa2->sa_addr) { 985 return (-1); 986 } else if (sa2->sa_len != 0) { 987 if (sa1->sa_addr >= sa2->sa_addr + sa2->sa_len) { 988 return (1); 989 } else if (sa1->sa_len != 0) { 990 return (1); 991 } else { 992 return (0); 993 } 994 } else if (sa1->sa_addr > sa2->sa_addr) { 995 return (1); 996 } else { 997 return (0); 998 } 999 } 1000 1001 /* 1002 * add this record to the segacct list. 1003 */ 1004 static void 1005 sa_add(struct proc *pp, caddr_t addr, size_t len, ulong_t flags, kshmid_t *id) 1006 { 1007 segacct_t *nsap; 1008 avl_tree_t *tree = NULL; 1009 avl_index_t where; 1010 1011 nsap = kmem_alloc(sizeof (segacct_t), KM_SLEEP); 1012 nsap->sa_addr = addr; 1013 nsap->sa_len = len; 1014 nsap->sa_flags = flags; 1015 nsap->sa_id = id; 1016 1017 if (pp->p_segacct == NULL) 1018 tree = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP); 1019 1020 mutex_enter(&pp->p_lock); 1021 prbarrier(pp); /* block /proc. See shmgetid(). */ 1022 1023 if (pp->p_segacct == NULL) { 1024 avl_create(tree, shm_sacompar, sizeof (segacct_t), 1025 offsetof(segacct_t, sa_tree)); 1026 pp->p_segacct = tree; 1027 } else if (tree) { 1028 kmem_free(tree, sizeof (avl_tree_t)); 1029 } 1030 1031 /* 1032 * We can ignore the result of avl_find, as the comparator will 1033 * never return equal for segments with non-zero length. This 1034 * is a necessary hack to get around the fact that we do, in 1035 * fact, have duplicate keys. 1036 */ 1037 (void) avl_find(pp->p_segacct, nsap, &where); 1038 avl_insert(pp->p_segacct, nsap, where); 1039 1040 mutex_exit(&pp->p_lock); 1041 } 1042 1043 /* 1044 * Duplicate parent's segacct records in child. 1045 */ 1046 void 1047 shmfork(struct proc *ppp, struct proc *cpp) 1048 { 1049 segacct_t *sap; 1050 kshmid_t *sp; 1051 kmutex_t *mp; 1052 1053 ASSERT(ppp->p_segacct != NULL); 1054 1055 /* 1056 * We are the only lwp running in the parent so nobody can 1057 * mess with our p_segacct list. Thus it is safe to traverse 1058 * the list without holding p_lock. This is essential because 1059 * we can't hold p_lock during a KM_SLEEP allocation. 1060 */ 1061 for (sap = (segacct_t *)avl_first(ppp->p_segacct); sap != NULL; 1062 sap = (segacct_t *)AVL_NEXT(ppp->p_segacct, sap)) { 1063 sa_add(cpp, sap->sa_addr, sap->sa_len, sap->sa_flags, 1064 sap->sa_id); 1065 sp = sap->sa_id; 1066 mp = ipc_lock(shm_svc, sp->shm_perm.ipc_id); 1067 if (sap->sa_flags & SHMSA_ISM) 1068 sp->shm_ismattch++; 1069 ipc_hold(shm_svc, (kipc_perm_t *)sp); 1070 mutex_exit(mp); 1071 } 1072 } 1073 1074 /* 1075 * Detach shared memory segments from exiting process. 1076 */ 1077 void 1078 shmexit(struct proc *pp) 1079 { 1080 segacct_t *sap; 1081 avl_tree_t *tree; 1082 void *cookie = NULL; 1083 1084 ASSERT(pp->p_segacct != NULL); 1085 1086 mutex_enter(&pp->p_lock); 1087 prbarrier(pp); 1088 tree = pp->p_segacct; 1089 pp->p_segacct = NULL; 1090 mutex_exit(&pp->p_lock); 1091 1092 while ((sap = avl_destroy_nodes(tree, &cookie)) != NULL) 1093 (void) shm_detach(pp, sap); 1094 1095 avl_destroy(tree); 1096 kmem_free(tree, sizeof (avl_tree_t)); 1097 } 1098 1099 /* 1100 * At this time pages should be in memory, so just lock them. 1101 */ 1102 static void 1103 lock_again(size_t npages, kshmid_t *sp, struct anon_map *amp) 1104 { 1105 struct anon *ap; 1106 struct page *pp; 1107 struct vnode *vp; 1108 u_offset_t off; 1109 ulong_t anon_idx; 1110 anon_sync_obj_t cookie; 1111 1112 mutex_enter(&sp->shm_mlock); 1113 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 1114 for (anon_idx = 0; npages != 0; anon_idx++, npages--) { 1115 1116 anon_array_enter(amp, anon_idx, &cookie); 1117 ap = anon_get_ptr(amp->ahp, anon_idx); 1118 ASSERT(ap != NULL); 1119 swap_xlate(ap, &vp, &off); 1120 anon_array_exit(&cookie); 1121 1122 pp = page_lookup(vp, off, SE_SHARED); 1123 if (pp == NULL) { 1124 panic("lock_again: page not in the system"); 1125 /*NOTREACHED*/ 1126 } 1127 /* page should already be locked by caller */ 1128 ASSERT(pp->p_lckcnt > 0); 1129 (void) page_pp_lock(pp, 0, 0); 1130 page_unlock(pp); 1131 } 1132 ANON_LOCK_EXIT(&->a_rwlock); 1133 mutex_exit(&sp->shm_mlock); 1134 } 1135 1136 /* 1137 * Attach the shared memory segment to the process 1138 * address space and lock the pages. 1139 */ 1140 static int 1141 shmem_lock(kshmid_t *sp, struct anon_map *amp) 1142 { 1143 size_t npages = btopr(amp->size); 1144 struct as *as; 1145 struct segvn_crargs crargs; 1146 uint_t error; 1147 1148 /* 1149 * A later ISM/DISM attach may increase the size of the amp, so 1150 * cache the number of pages locked for the future shmem_unlock() 1151 */ 1152 sp->shm_lkpages = npages; 1153 1154 as = as_alloc(); 1155 /* Initialize the create arguments and map the segment */ 1156 crargs = *(struct segvn_crargs *)zfod_argsp; /* structure copy */ 1157 crargs.offset = (u_offset_t)0; 1158 crargs.type = MAP_SHARED; 1159 crargs.amp = amp; 1160 crargs.prot = PROT_ALL; 1161 crargs.maxprot = crargs.prot; 1162 crargs.flags = 0; 1163 error = as_map(as, 0x0, amp->size, segvn_create, &crargs); 1164 if (!error) { 1165 if ((error = as_ctl(as, 0x0, amp->size, MC_LOCK, 0, 0, 1166 NULL, 0)) == 0) { 1167 lock_again(npages, sp, amp); 1168 } 1169 (void) as_unmap(as, 0x0, amp->size); 1170 } 1171 as_free(as); 1172 return (error); 1173 } 1174 1175 1176 /* 1177 * Unlock shared memory 1178 */ 1179 static void 1180 shmem_unlock(kshmid_t *sp, struct anon_map *amp) 1181 { 1182 struct anon *ap; 1183 pgcnt_t npages = sp->shm_lkpages; 1184 struct vnode *vp; 1185 struct page *pp; 1186 u_offset_t off; 1187 ulong_t anon_idx; 1188 size_t unlocked_bytes = 0; 1189 kproject_t *proj; 1190 anon_sync_obj_t cookie; 1191 1192 proj = sp->shm_perm.ipc_proj; 1193 mutex_enter(&sp->shm_mlock); 1194 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 1195 for (anon_idx = 0; anon_idx < npages; anon_idx++) { 1196 1197 anon_array_enter(amp, anon_idx, &cookie); 1198 if ((ap = anon_get_ptr(amp->ahp, anon_idx)) == NULL) { 1199 panic("shmem_unlock: null app"); 1200 /*NOTREACHED*/ 1201 } 1202 swap_xlate(ap, &vp, &off); 1203 anon_array_exit(&cookie); 1204 pp = page_lookup(vp, off, SE_SHARED); 1205 if (pp == NULL) { 1206 panic("shmem_unlock: page not in the system"); 1207 /*NOTREACHED*/ 1208 } 1209 /* 1210 * Page should at least have once lock from previous 1211 * shmem_lock 1212 */ 1213 ASSERT(pp->p_lckcnt > 0); 1214 page_pp_unlock(pp, 0, 0); 1215 if (pp->p_lckcnt == 0) 1216 unlocked_bytes += PAGESIZE; 1217 1218 page_unlock(pp); 1219 } 1220 1221 if (unlocked_bytes > 0) { 1222 rctl_decr_locked_mem(NULL, proj, unlocked_bytes, 0); 1223 } 1224 1225 ANON_LOCK_EXIT(&->a_rwlock); 1226 mutex_exit(&sp->shm_mlock); 1227 } 1228 1229 /* 1230 * We call this routine when we have removed all references to this 1231 * amp. This means all shmdt()s and the IPC_RMID have been done. 1232 */ 1233 static void 1234 shm_rm_amp(struct anon_map *amp) 1235 { 1236 /* 1237 * Free up the anon_map. 1238 */ 1239 lgrp_shm_policy_fini(amp, NULL); 1240 if (amp->a_szc != 0) { 1241 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 1242 anon_shmap_free_pages(amp, 0, amp->size); 1243 ANON_LOCK_EXIT(&->a_rwlock); 1244 } else { 1245 anon_free(amp->ahp, 0, amp->size); 1246 } 1247 anon_unresv(amp->swresv); 1248 anonmap_free(amp); 1249 } 1250 1251 /* 1252 * Return the shared memory id for the process's virtual address. 1253 * Return SHMID_NONE if addr is not within a SysV shared memory segment. 1254 * Return SHMID_FREE if addr's SysV shared memory segment's id has been freed. 1255 * 1256 * shmgetid() is called from code in /proc with the process locked but 1257 * with pp->p_lock not held. The address space lock is held, so we 1258 * cannot grab pp->p_lock here due to lock-ordering constraints. 1259 * Because of all this, modifications to the p_segacct list must only 1260 * be made after calling prbarrier() to ensure the process is not locked. 1261 * See shmdt() and sa_add(), above. shmgetid() may also be called on a 1262 * thread's own process without the process locked. 1263 */ 1264 int 1265 shmgetid(proc_t *pp, caddr_t addr) 1266 { 1267 segacct_t *sap, template; 1268 1269 ASSERT(MUTEX_NOT_HELD(&pp->p_lock)); 1270 ASSERT((pp->p_proc_flag & P_PR_LOCK) || pp == curproc); 1271 1272 if (pp->p_segacct == NULL) 1273 return (SHMID_NONE); 1274 1275 template.sa_addr = addr; 1276 template.sa_len = 0; 1277 if ((sap = avl_find(pp->p_segacct, &template, NULL)) == NULL) 1278 return (SHMID_NONE); 1279 1280 if (IPC_FREE(&sap->sa_id->shm_perm)) 1281 return (SHMID_FREE); 1282 1283 return (sap->sa_id->shm_perm.ipc_id); 1284 } 1285