1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ 27 /* All Rights Reserved */ 28 29 /* 30 * University Copyright- Copyright (c) 1982, 1986, 1988 31 * The Regents of the University of California 32 * All Rights Reserved 33 * 34 * University Acknowledgment- Portions of this document are derived from 35 * software developed by the University of California, Berkeley, and its 36 * contributors. 37 */ 38 39 #pragma ident "%Z%%M% %I% %E% SMI" 40 41 /* 42 * Inter-Process Communication Shared Memory Facility. 43 * 44 * See os/ipc.c for a description of common IPC functionality. 45 * 46 * Resource controls 47 * ----------------- 48 * 49 * Control: zone.max-shm-ids (rc_zone_shmmni) 50 * Description: Maximum number of shared memory ids allowed a zone. 51 * 52 * When shmget() is used to allocate a shared memory segment, one id 53 * is allocated. If the id allocation doesn't succeed, shmget() 54 * fails and errno is set to ENOSPC. Upon successful shmctl(, 55 * IPC_RMID) the id is deallocated. 56 * 57 * Control: project.max-shm-ids (rc_project_shmmni) 58 * Description: Maximum number of shared memory ids allowed a project. 59 * 60 * When shmget() is used to allocate a shared memory segment, one id 61 * is allocated. If the id allocation doesn't succeed, shmget() 62 * fails and errno is set to ENOSPC. Upon successful shmctl(, 63 * IPC_RMID) the id is deallocated. 64 * 65 * Control: zone.max-shm-memory (rc_zone_shmmax) 66 * Description: Total amount of shared memory allowed a zone. 67 * 68 * When shmget() is used to allocate a shared memory segment, the 69 * segment's size is allocated against this limit. If the space 70 * allocation doesn't succeed, shmget() fails and errno is set to 71 * EINVAL. The size will be deallocated once the last process has 72 * detached the segment and the segment has been successfully 73 * shmctl(, IPC_RMID)ed. 74 * 75 * Control: project.max-shm-memory (rc_project_shmmax) 76 * Description: Total amount of shared memory allowed a project. 77 * 78 * When shmget() is used to allocate a shared memory segment, the 79 * segment's size is allocated against this limit. If the space 80 * allocation doesn't succeed, shmget() fails and errno is set to 81 * EINVAL. The size will be deallocated once the last process has 82 * detached the segment and the segment has been successfully 83 * shmctl(, IPC_RMID)ed. 84 */ 85 86 #include <sys/types.h> 87 #include <sys/param.h> 88 #include <sys/cred.h> 89 #include <sys/errno.h> 90 #include <sys/time.h> 91 #include <sys/kmem.h> 92 #include <sys/user.h> 93 #include <sys/proc.h> 94 #include <sys/systm.h> 95 #include <sys/prsystm.h> 96 #include <sys/sysmacros.h> 97 #include <sys/tuneable.h> 98 #include <sys/vm.h> 99 #include <sys/mman.h> 100 #include <sys/swap.h> 101 #include <sys/cmn_err.h> 102 #include <sys/debug.h> 103 #include <sys/lwpchan_impl.h> 104 #include <sys/avl.h> 105 #include <sys/modctl.h> 106 #include <sys/syscall.h> 107 #include <sys/task.h> 108 #include <sys/project.h> 109 #include <sys/policy.h> 110 #include <sys/zone.h> 111 #include <sys/rctl.h> 112 113 #include <sys/ipc.h> 114 #include <sys/ipc_impl.h> 115 #include <sys/shm.h> 116 #include <sys/shm_impl.h> 117 118 #include <vm/hat.h> 119 #include <vm/seg.h> 120 #include <vm/as.h> 121 #include <vm/seg_vn.h> 122 #include <vm/anon.h> 123 #include <vm/page.h> 124 #include <vm/vpage.h> 125 #include <vm/seg_spt.h> 126 127 #include <c2/audit.h> 128 129 static int shmem_lock(kshmid_t *sp, struct anon_map *amp); 130 static void shmem_unlock(kshmid_t *sp, struct anon_map *amp); 131 static void sa_add(struct proc *pp, caddr_t addr, size_t len, ulong_t flags, 132 kshmid_t *id); 133 static void shm_rm_amp(struct anon_map *amp); 134 static void shm_dtor(kipc_perm_t *); 135 static void shm_rmid(kipc_perm_t *); 136 static void shm_remove_zone(zoneid_t, void *); 137 138 /* 139 * Semantics for share_page_table and ism_off: 140 * 141 * These are hooks in /etc/system - only for internal testing purpose. 142 * 143 * Setting share_page_table automatically turns on the SHM_SHARE_MMU (ISM) flag 144 * in a call to shmat(2). In other words, with share_page_table set, you always 145 * get ISM, even if say, DISM is specified. It should really be called "ism_on". 146 * 147 * Setting ism_off turns off the SHM_SHARE_MMU flag from the flags passed to 148 * shmat(2). 149 * 150 * If both share_page_table and ism_off are set, share_page_table prevails. 151 * 152 * Although these tunables should probably be removed, they do have some 153 * external exposure; as long as they exist, they should at least work sensibly. 154 */ 155 156 int share_page_table; 157 int ism_off; 158 159 /* 160 * The following tunables are obsolete. Though for compatibility we 161 * still read and interpret shminfo_shmmax and shminfo_shmmni (see 162 * os/project.c), the preferred mechanism for administrating the IPC 163 * Shared Memory facility is through the resource controls described at 164 * the top of this file. 165 */ 166 size_t shminfo_shmmax = 0x800000; /* (obsolete) */ 167 int shminfo_shmmni = 100; /* (obsolete) */ 168 size_t shminfo_shmmin = 1; /* (obsolete) */ 169 int shminfo_shmseg = 6; /* (obsolete) */ 170 171 extern rctl_hndl_t rc_zone_shmmax; 172 extern rctl_hndl_t rc_zone_shmmni; 173 extern rctl_hndl_t rc_project_shmmax; 174 extern rctl_hndl_t rc_project_shmmni; 175 static ipc_service_t *shm_svc; 176 static zone_key_t shm_zone_key; 177 178 /* 179 * Module linkage information for the kernel. 180 */ 181 static uintptr_t shmsys(int, uintptr_t, uintptr_t, uintptr_t); 182 183 static struct sysent ipcshm_sysent = { 184 4, 185 #ifdef _SYSCALL32_IMPL 186 SE_ARGC | SE_NOUNLOAD | SE_64RVAL, 187 #else /* _SYSCALL32_IMPL */ 188 SE_ARGC | SE_NOUNLOAD | SE_32RVAL1, 189 #endif /* _SYSCALL32_IMPL */ 190 (int (*)())shmsys 191 }; 192 193 #ifdef _SYSCALL32_IMPL 194 static struct sysent ipcshm_sysent32 = { 195 4, 196 SE_ARGC | SE_NOUNLOAD | SE_32RVAL1, 197 (int (*)())shmsys 198 }; 199 #endif /* _SYSCALL32_IMPL */ 200 201 static struct modlsys modlsys = { 202 &mod_syscallops, "System V shared memory", &ipcshm_sysent 203 }; 204 205 #ifdef _SYSCALL32_IMPL 206 static struct modlsys modlsys32 = { 207 &mod_syscallops32, "32-bit System V shared memory", &ipcshm_sysent32 208 }; 209 #endif /* _SYSCALL32_IMPL */ 210 211 static struct modlinkage modlinkage = { 212 MODREV_1, 213 &modlsys, 214 #ifdef _SYSCALL32_IMPL 215 &modlsys32, 216 #endif 217 NULL 218 }; 219 220 221 int 222 _init(void) 223 { 224 int result; 225 226 shm_svc = ipcs_create("shmids", rc_project_shmmni, rc_zone_shmmni, 227 sizeof (kshmid_t), shm_dtor, shm_rmid, AT_IPC_SHM, 228 offsetof(ipc_rqty_t, ipcq_shmmni)); 229 zone_key_create(&shm_zone_key, NULL, shm_remove_zone, NULL); 230 231 if ((result = mod_install(&modlinkage)) == 0) 232 return (0); 233 234 (void) zone_key_delete(shm_zone_key); 235 ipcs_destroy(shm_svc); 236 237 return (result); 238 } 239 240 int 241 _fini(void) 242 { 243 return (EBUSY); 244 } 245 246 int 247 _info(struct modinfo *modinfop) 248 { 249 return (mod_info(&modlinkage, modinfop)); 250 } 251 252 /* 253 * Shmat (attach shared segment) system call. 254 */ 255 static int 256 shmat(int shmid, caddr_t uaddr, int uflags, uintptr_t *rvp) 257 { 258 kshmid_t *sp; /* shared memory header ptr */ 259 size_t size; 260 int error = 0; 261 proc_t *pp = curproc; 262 struct as *as = pp->p_as; 263 struct segvn_crargs crargs; /* segvn create arguments */ 264 kmutex_t *lock; 265 struct seg *segspt = NULL; 266 caddr_t addr = uaddr; 267 int flags = (uflags & SHMAT_VALID_FLAGS_MASK); 268 int useISM; 269 uchar_t prot = PROT_ALL; 270 int result; 271 272 if ((lock = ipc_lookup(shm_svc, shmid, (kipc_perm_t **)&sp)) == NULL) 273 return (EINVAL); 274 if (error = ipcperm_access(&sp->shm_perm, SHM_R, CRED())) 275 goto errret; 276 if ((flags & SHM_RDONLY) == 0 && 277 (error = ipcperm_access(&sp->shm_perm, SHM_W, CRED()))) 278 goto errret; 279 if (spt_invalid(flags)) { 280 error = EINVAL; 281 goto errret; 282 } 283 if (ism_off) 284 flags = flags & ~SHM_SHARE_MMU; 285 if (share_page_table) { 286 flags = flags & ~SHM_PAGEABLE; 287 flags = flags | SHM_SHARE_MMU; 288 } 289 useISM = (spt_locked(flags) || spt_pageable(flags)); 290 if (useISM && (error = ipcperm_access(&sp->shm_perm, SHM_W, CRED()))) 291 goto errret; 292 if (useISM && isspt(sp)) { 293 uint_t newsptflags = flags | spt_flags(sp->shm_sptseg); 294 /* 295 * If trying to change an existing {D}ISM segment from ISM 296 * to DISM or vice versa, return error. Note that this 297 * validation of flags needs to be done after the effect of 298 * tunables such as ism_off and share_page_table, for 299 * semantics that are consistent with the tunables' settings. 300 */ 301 if (spt_invalid(newsptflags)) { 302 error = EINVAL; 303 goto errret; 304 } 305 } 306 ANON_LOCK_ENTER(&sp->shm_amp->a_rwlock, RW_WRITER); 307 size = sp->shm_amp->size; 308 ANON_LOCK_EXIT(&sp->shm_amp->a_rwlock); 309 310 /* somewhere to record spt info for final detach */ 311 if (sp->shm_sptinfo == NULL) 312 sp->shm_sptinfo = kmem_zalloc(sizeof (sptinfo_t), KM_SLEEP); 313 314 as_rangelock(as); 315 316 if (useISM) { 317 /* 318 * Handle ISM 319 */ 320 uint_t n, share_szc; 321 size_t share_size; 322 struct shm_data ssd; 323 uintptr_t align_hint; 324 325 n = page_num_pagesizes(); 326 if (n < 2) { /* large pages aren't supported */ 327 as_rangeunlock(as); 328 error = EINVAL; 329 goto errret; 330 } 331 332 /* 333 * Pick a share pagesize to use, if (!isspt(sp)). 334 * Otherwise use the already chosen page size. 335 * 336 * For the initial shmat (!isspt(sp)), where sptcreate is 337 * called, map_pgsz is called to recommend a [D]ISM pagesize, 338 * important for systems which offer more than one potential 339 * [D]ISM pagesize. 340 * If the shmat is just to attach to an already created 341 * [D]ISM segment, then use the previously selected page size. 342 */ 343 if (!isspt(sp)) { 344 share_size = map_pgsz(MAPPGSZ_ISM, pp, addr, size, 0); 345 if (share_size == 0) { 346 as_rangeunlock(as); 347 error = EINVAL; 348 goto errret; 349 } 350 share_szc = page_szc(share_size); 351 } else { 352 share_szc = sp->shm_sptseg->s_szc; 353 share_size = page_get_pagesize(share_szc); 354 } 355 size = P2ROUNDUP(size, share_size); 356 357 align_hint = share_size; 358 #if defined(__i386) || defined(__amd64) 359 /* 360 * For 64 bit amd64, we want to share an entire page table 361 * if possible. We know (ugh) that there are 512 entries in 362 * in a page table. The number for 32 bit non-PAE should be 363 * 1024, but I'm not going to special case that. Note using 512 364 * won't cause a failure below. It retries with align_hint set 365 * to share_size 366 */ 367 while (size >= 512 * (uint64_t)align_hint) 368 align_hint *= 512; 369 #endif /* __i386 || __amd64 */ 370 371 #if defined(__sparcv9) 372 if (addr == 0 && curproc->p_model == DATAMODEL_LP64) { 373 /* 374 * If no address has been passed in, and this is a 375 * 64-bit process, we'll try to find an address 376 * in the predict-ISM zone. 377 */ 378 caddr_t predbase = (caddr_t)PREDISM_1T_BASE; 379 size_t len = PREDISM_BOUND - PREDISM_1T_BASE; 380 381 as_purge(as); 382 if (as_gap(as, size + share_size, &predbase, &len, 383 AH_LO, (caddr_t)NULL) != -1) { 384 /* 385 * We found an address which looks like a 386 * candidate. We want to round it up, and 387 * then check that it's a valid user range. 388 * This assures that we won't fail below. 389 */ 390 addr = (caddr_t)P2ROUNDUP((uintptr_t)predbase, 391 share_size); 392 393 if (valid_usr_range(addr, size, prot, 394 as, as->a_userlimit) != RANGE_OKAY) { 395 addr = 0; 396 } 397 } 398 } 399 #endif /* __sparcv9 */ 400 401 if (addr == 0) { 402 for (;;) { 403 addr = (caddr_t)align_hint; 404 map_addr(&addr, size, 0ll, 1, MAP_ALIGN); 405 if (addr != NULL || align_hint == share_size) 406 break; 407 align_hint = share_size; 408 } 409 if (addr == NULL) { 410 as_rangeunlock(as); 411 error = ENOMEM; 412 goto errret; 413 } 414 ASSERT(((uintptr_t)addr & (align_hint - 1)) == 0); 415 } else { 416 /* Use the user-supplied attach address */ 417 caddr_t base; 418 size_t len; 419 420 /* 421 * Check that the address range 422 * 1) is properly aligned 423 * 2) is correct in unix terms 424 * 3) is within an unmapped address segment 425 */ 426 base = addr; 427 len = size; /* use spt aligned size */ 428 /* XXX - in SunOS, is sp->shm_segsz */ 429 if ((uintptr_t)base & (share_size - 1)) { 430 error = EINVAL; 431 as_rangeunlock(as); 432 goto errret; 433 } 434 result = valid_usr_range(base, len, prot, as, 435 as->a_userlimit); 436 if (result == RANGE_BADPROT) { 437 /* 438 * We try to accomodate processors which 439 * may not support execute permissions on 440 * all ISM segments by trying the check 441 * again but without PROT_EXEC. 442 */ 443 prot &= ~PROT_EXEC; 444 result = valid_usr_range(base, len, prot, as, 445 as->a_userlimit); 446 } 447 as_purge(as); 448 if (result != RANGE_OKAY || 449 as_gap(as, len, &base, &len, AH_LO, 450 (caddr_t)NULL) != 0) { 451 error = EINVAL; 452 as_rangeunlock(as); 453 goto errret; 454 } 455 } 456 457 if (!isspt(sp)) { 458 error = sptcreate(size, &segspt, sp->shm_amp, prot, 459 flags, share_szc); 460 if (error) { 461 as_rangeunlock(as); 462 goto errret; 463 } 464 sp->shm_sptinfo->sptas = segspt->s_as; 465 sp->shm_sptseg = segspt; 466 sp->shm_sptprot = prot; 467 } else if ((prot & sp->shm_sptprot) != sp->shm_sptprot) { 468 /* 469 * Ensure we're attaching to an ISM segment with 470 * fewer or equal permissions than what we're 471 * allowed. Fail if the segment has more 472 * permissions than what we're allowed. 473 */ 474 error = EACCES; 475 as_rangeunlock(as); 476 goto errret; 477 } 478 479 ssd.shm_sptseg = sp->shm_sptseg; 480 ssd.shm_sptas = sp->shm_sptinfo->sptas; 481 ssd.shm_amp = sp->shm_amp; 482 error = as_map(as, addr, size, segspt_shmattach, &ssd); 483 if (error == 0) 484 sp->shm_ismattch++; /* keep count of ISM attaches */ 485 } else { 486 487 /* 488 * Normal case. 489 */ 490 if (flags & SHM_RDONLY) 491 prot &= ~PROT_WRITE; 492 493 if (addr == 0) { 494 /* Let the system pick the attach address */ 495 map_addr(&addr, size, 0ll, 1, 0); 496 if (addr == NULL) { 497 as_rangeunlock(as); 498 error = ENOMEM; 499 goto errret; 500 } 501 } else { 502 /* Use the user-supplied attach address */ 503 caddr_t base; 504 size_t len; 505 506 if (flags & SHM_RND) 507 addr = (caddr_t)((uintptr_t)addr & 508 ~(SHMLBA - 1)); 509 /* 510 * Check that the address range 511 * 1) is properly aligned 512 * 2) is correct in unix terms 513 * 3) is within an unmapped address segment 514 */ 515 base = addr; 516 len = size; /* use aligned size */ 517 /* XXX - in SunOS, is sp->shm_segsz */ 518 if ((uintptr_t)base & PAGEOFFSET) { 519 error = EINVAL; 520 as_rangeunlock(as); 521 goto errret; 522 } 523 result = valid_usr_range(base, len, prot, as, 524 as->a_userlimit); 525 if (result == RANGE_BADPROT) { 526 prot &= ~PROT_EXEC; 527 result = valid_usr_range(base, len, prot, as, 528 as->a_userlimit); 529 } 530 as_purge(as); 531 if (result != RANGE_OKAY || 532 as_gap(as, len, &base, &len, 533 AH_LO, (caddr_t)NULL) != 0) { 534 error = EINVAL; 535 as_rangeunlock(as); 536 goto errret; 537 } 538 } 539 540 /* Initialize the create arguments and map the segment */ 541 crargs = *(struct segvn_crargs *)zfod_argsp; 542 crargs.offset = 0; 543 crargs.type = MAP_SHARED; 544 crargs.amp = sp->shm_amp; 545 crargs.prot = prot; 546 crargs.maxprot = crargs.prot; 547 crargs.flags = 0; 548 549 error = as_map(as, addr, size, segvn_create, &crargs); 550 } 551 552 as_rangeunlock(as); 553 if (error) 554 goto errret; 555 556 /* record shmem range for the detach */ 557 sa_add(pp, addr, (size_t)size, useISM ? SHMSA_ISM : 0, sp); 558 *rvp = (uintptr_t)addr; 559 560 sp->shm_atime = gethrestime_sec(); 561 sp->shm_lpid = pp->p_pid; 562 ipc_hold(shm_svc, (kipc_perm_t *)sp); 563 errret: 564 mutex_exit(lock); 565 return (error); 566 } 567 568 static void 569 shm_dtor(kipc_perm_t *perm) 570 { 571 kshmid_t *sp = (kshmid_t *)perm; 572 uint_t cnt; 573 size_t rsize; 574 575 if (sp->shm_lkcnt > 0) { 576 shmem_unlock(sp, sp->shm_amp); 577 sp->shm_lkcnt = 0; 578 } 579 580 if (sp->shm_sptinfo) { 581 if (isspt(sp)) 582 sptdestroy(sp->shm_sptinfo->sptas, sp->shm_amp); 583 kmem_free(sp->shm_sptinfo, sizeof (sptinfo_t)); 584 } 585 586 ANON_LOCK_ENTER(&sp->shm_amp->a_rwlock, RW_WRITER); 587 cnt = --sp->shm_amp->refcnt; 588 ANON_LOCK_EXIT(&sp->shm_amp->a_rwlock); 589 ASSERT(cnt == 0); 590 shm_rm_amp(sp->shm_amp); 591 592 if (sp->shm_perm.ipc_id != IPC_ID_INVAL) { 593 rsize = ptob(btopr(sp->shm_segsz)); 594 ipcs_lock(shm_svc); 595 sp->shm_perm.ipc_proj->kpj_data.kpd_shmmax -= rsize; 596 sp->shm_perm.ipc_zone->zone_shmmax -= rsize; 597 ipcs_unlock(shm_svc); 598 } 599 } 600 601 /* ARGSUSED */ 602 static void 603 shm_rmid(kipc_perm_t *perm) 604 { 605 /* nothing to do */ 606 } 607 608 /* 609 * Shmctl system call. 610 */ 611 /* ARGSUSED */ 612 static int 613 shmctl(int shmid, int cmd, void *arg) 614 { 615 kshmid_t *sp; /* shared memory header ptr */ 616 STRUCT_DECL(shmid_ds, ds); /* for SVR4 IPC_SET */ 617 int error = 0; 618 struct cred *cr = CRED(); 619 kmutex_t *lock; 620 model_t mdl = get_udatamodel(); 621 struct shmid_ds64 ds64; 622 shmatt_t nattch; 623 624 STRUCT_INIT(ds, mdl); 625 626 /* 627 * Perform pre- or non-lookup actions (e.g. copyins, RMID). 628 */ 629 switch (cmd) { 630 case IPC_SET: 631 if (copyin(arg, STRUCT_BUF(ds), STRUCT_SIZE(ds))) 632 return (EFAULT); 633 break; 634 635 case IPC_SET64: 636 if (copyin(arg, &ds64, sizeof (struct shmid_ds64))) 637 return (EFAULT); 638 break; 639 640 case IPC_RMID: 641 return (ipc_rmid(shm_svc, shmid, cr)); 642 } 643 644 if ((lock = ipc_lookup(shm_svc, shmid, (kipc_perm_t **)&sp)) == NULL) 645 return (EINVAL); 646 647 switch (cmd) { 648 /* Set ownership and permissions. */ 649 case IPC_SET: 650 if (error = ipcperm_set(shm_svc, cr, &sp->shm_perm, 651 &STRUCT_BUF(ds)->shm_perm, mdl)) 652 break; 653 sp->shm_ctime = gethrestime_sec(); 654 break; 655 656 case IPC_STAT: 657 if (error = ipcperm_access(&sp->shm_perm, SHM_R, cr)) 658 break; 659 660 nattch = sp->shm_perm.ipc_ref - 1; 661 662 ipcperm_stat(&STRUCT_BUF(ds)->shm_perm, &sp->shm_perm, mdl); 663 STRUCT_FSET(ds, shm_segsz, sp->shm_segsz); 664 STRUCT_FSETP(ds, shm_amp, NULL); /* kernel addr */ 665 STRUCT_FSET(ds, shm_lkcnt, sp->shm_lkcnt); 666 STRUCT_FSET(ds, shm_lpid, sp->shm_lpid); 667 STRUCT_FSET(ds, shm_cpid, sp->shm_cpid); 668 STRUCT_FSET(ds, shm_nattch, nattch); 669 STRUCT_FSET(ds, shm_cnattch, sp->shm_ismattch); 670 STRUCT_FSET(ds, shm_atime, sp->shm_atime); 671 STRUCT_FSET(ds, shm_dtime, sp->shm_dtime); 672 STRUCT_FSET(ds, shm_ctime, sp->shm_ctime); 673 674 mutex_exit(lock); 675 if (copyout(STRUCT_BUF(ds), arg, STRUCT_SIZE(ds))) 676 return (EFAULT); 677 678 return (0); 679 680 case IPC_SET64: 681 if (error = ipcperm_set64(shm_svc, cr, 682 &sp->shm_perm, &ds64.shmx_perm)) 683 break; 684 sp->shm_ctime = gethrestime_sec(); 685 break; 686 687 case IPC_STAT64: 688 nattch = sp->shm_perm.ipc_ref - 1; 689 690 ipcperm_stat64(&ds64.shmx_perm, &sp->shm_perm); 691 ds64.shmx_segsz = sp->shm_segsz; 692 ds64.shmx_lkcnt = sp->shm_lkcnt; 693 ds64.shmx_lpid = sp->shm_lpid; 694 ds64.shmx_cpid = sp->shm_cpid; 695 ds64.shmx_nattch = nattch; 696 ds64.shmx_cnattch = sp->shm_ismattch; 697 ds64.shmx_atime = sp->shm_atime; 698 ds64.shmx_dtime = sp->shm_dtime; 699 ds64.shmx_ctime = sp->shm_ctime; 700 701 mutex_exit(lock); 702 if (copyout(&ds64, arg, sizeof (struct shmid_ds64))) 703 return (EFAULT); 704 705 return (0); 706 707 /* Lock segment in memory */ 708 case SHM_LOCK: 709 if ((error = secpolicy_lock_memory(cr)) != 0) 710 break; 711 712 /* protect against overflow */ 713 if (sp->shm_lkcnt >= USHRT_MAX) { 714 error = ENOMEM; 715 break; 716 } 717 if (!isspt(sp) && (sp->shm_lkcnt++ == 0)) { 718 if (error = shmem_lock(sp, sp->shm_amp)) { 719 ANON_LOCK_ENTER(&sp->shm_amp->a_rwlock, RW_WRITER); 720 cmn_err(CE_NOTE, 721 "shmctl - couldn't lock %ld pages into memory", 722 sp->shm_amp->size); 723 ANON_LOCK_EXIT(&sp->shm_amp->a_rwlock); 724 error = ENOMEM; 725 sp->shm_lkcnt--; 726 } 727 } 728 break; 729 730 /* Unlock segment */ 731 case SHM_UNLOCK: 732 if ((error = secpolicy_lock_memory(cr)) != 0) 733 break; 734 735 if (sp->shm_lkcnt && (--sp->shm_lkcnt == 0)) { 736 shmem_unlock(sp, sp->shm_amp); 737 } 738 break; 739 740 default: 741 error = EINVAL; 742 break; 743 } 744 mutex_exit(lock); 745 return (error); 746 } 747 748 static void 749 shm_detach(proc_t *pp, segacct_t *sap) 750 { 751 kshmid_t *sp = sap->sa_id; 752 size_t len = sap->sa_len; 753 caddr_t addr = sap->sa_addr; 754 755 /* 756 * Discard lwpchan mappings. 757 */ 758 if (pp->p_lcp != NULL) 759 lwpchan_delete_mapping(pp, addr, addr + len); 760 (void) as_unmap(pp->p_as, addr, len); 761 762 /* 763 * Perform some detach-time accounting. 764 */ 765 (void) ipc_lock(shm_svc, sp->shm_perm.ipc_id); 766 if (sap->sa_flags & SHMSA_ISM) 767 sp->shm_ismattch--; 768 sp->shm_dtime = gethrestime_sec(); 769 sp->shm_lpid = pp->p_pid; 770 ipc_rele(shm_svc, (kipc_perm_t *)sp); /* Drops lock */ 771 772 kmem_free(sap, sizeof (segacct_t)); 773 } 774 775 static int 776 shmdt(caddr_t addr) 777 { 778 proc_t *pp = curproc; 779 segacct_t *sap, template; 780 781 mutex_enter(&pp->p_lock); 782 prbarrier(pp); /* block /proc. See shmgetid(). */ 783 784 template.sa_addr = addr; 785 template.sa_len = 0; 786 if ((pp->p_segacct == NULL) || 787 ((sap = avl_find(pp->p_segacct, &template, NULL)) == NULL)) { 788 mutex_exit(&pp->p_lock); 789 return (EINVAL); 790 } 791 if (sap->sa_addr != addr) { 792 mutex_exit(&pp->p_lock); 793 return (EINVAL); 794 } 795 avl_remove(pp->p_segacct, sap); 796 mutex_exit(&pp->p_lock); 797 798 shm_detach(pp, sap); 799 800 return (0); 801 } 802 803 /* 804 * Remove all shared memory segments associated with a given zone. 805 * Called by zone_shutdown when the zone is halted. 806 */ 807 /*ARGSUSED1*/ 808 static void 809 shm_remove_zone(zoneid_t zoneid, void *arg) 810 { 811 ipc_remove_zone(shm_svc, zoneid); 812 } 813 814 /* 815 * Shmget (create new shmem) system call. 816 */ 817 static int 818 shmget(key_t key, size_t size, int shmflg, uintptr_t *rvp) 819 { 820 proc_t *pp = curproc; 821 kshmid_t *sp; 822 kmutex_t *lock; 823 int error; 824 825 top: 826 if (error = ipc_get(shm_svc, key, shmflg, (kipc_perm_t **)&sp, &lock)) 827 return (error); 828 829 if (!IPC_FREE(&sp->shm_perm)) { 830 /* 831 * A segment with the requested key exists. 832 */ 833 if (size > sp->shm_segsz) { 834 mutex_exit(lock); 835 return (EINVAL); 836 } 837 } else { 838 /* 839 * A new segment should be created. 840 */ 841 size_t npages = btopr(size); 842 size_t rsize = ptob(npages); 843 844 /* 845 * Check rsize and the per-project and per-zone limit on 846 * shared memory. Checking rsize handles both the size == 0 847 * case and the size < ULONG_MAX & PAGEMASK case (i.e. 848 * rounding up wraps a size_t). 849 */ 850 if (rsize == 0 || 851 (rctl_test(rc_project_shmmax, 852 pp->p_task->tk_proj->kpj_rctls, pp, rsize, 853 RCA_SAFE) & RCT_DENY) || 854 (rctl_test(rc_zone_shmmax, 855 pp->p_zone->zone_rctls, pp, rsize, 856 RCA_SAFE) & RCT_DENY)) { 857 858 mutex_exit(&pp->p_lock); 859 mutex_exit(lock); 860 ipc_cleanup(shm_svc, (kipc_perm_t *)sp); 861 return (EINVAL); 862 } 863 mutex_exit(&pp->p_lock); 864 mutex_exit(lock); 865 866 if (anon_resv(rsize) == 0) { 867 ipc_cleanup(shm_svc, (kipc_perm_t *)sp); 868 return (ENOMEM); 869 } 870 871 sp->shm_amp = anonmap_alloc(rsize, rsize); 872 sp->shm_amp->a_sp = sp; 873 /* 874 * Store the original user's requested size, in bytes, 875 * rather than the page-aligned size. The former is 876 * used for IPC_STAT and shmget() lookups. The latter 877 * is saved in the anon_map structure and is used for 878 * calls to the vm layer. 879 */ 880 sp->shm_segsz = size; 881 sp->shm_atime = sp->shm_dtime = 0; 882 sp->shm_ctime = gethrestime_sec(); 883 sp->shm_lpid = (pid_t)0; 884 sp->shm_cpid = curproc->p_pid; 885 sp->shm_ismattch = 0; 886 sp->shm_sptinfo = NULL; 887 /* 888 * Check limits one last time, push id into global 889 * visibility, and update resource usage counts. 890 */ 891 if (error = ipc_commit_begin(shm_svc, key, shmflg, 892 (kipc_perm_t *)sp)) { 893 if (error == EAGAIN) 894 goto top; 895 return (error); 896 } 897 898 if ((rctl_test(rc_project_shmmax, 899 sp->shm_perm.ipc_proj->kpj_rctls, pp, rsize, 900 RCA_SAFE) & RCT_DENY) || 901 (rctl_test(rc_zone_shmmax, 902 sp->shm_perm.ipc_zone->zone_rctls, pp, rsize, 903 RCA_SAFE) & RCT_DENY)) { 904 ipc_cleanup(shm_svc, (kipc_perm_t *)sp); 905 return (EINVAL); 906 } 907 sp->shm_perm.ipc_proj->kpj_data.kpd_shmmax += rsize; 908 sp->shm_perm.ipc_zone->zone_shmmax += rsize; 909 910 lock = ipc_commit_end(shm_svc, &sp->shm_perm); 911 } 912 913 #ifdef C2_AUDIT 914 if (audit_active) 915 audit_ipcget(AT_IPC_SHM, (void *)sp); 916 #endif 917 918 *rvp = (uintptr_t)(sp->shm_perm.ipc_id); 919 920 mutex_exit(lock); 921 return (0); 922 } 923 924 /* 925 * shmids system call. 926 */ 927 static int 928 shmids(int *buf, uint_t nids, uint_t *pnids) 929 { 930 return (ipc_ids(shm_svc, buf, nids, pnids)); 931 } 932 933 /* 934 * System entry point for shmat, shmctl, shmdt, and shmget system calls. 935 */ 936 static uintptr_t 937 shmsys(int opcode, uintptr_t a0, uintptr_t a1, uintptr_t a2) 938 { 939 int error; 940 uintptr_t r_val = 0; 941 942 switch (opcode) { 943 case SHMAT: 944 error = shmat((int)a0, (caddr_t)a1, (int)a2, &r_val); 945 break; 946 case SHMCTL: 947 error = shmctl((int)a0, (int)a1, (void *)a2); 948 break; 949 case SHMDT: 950 error = shmdt((caddr_t)a0); 951 break; 952 case SHMGET: 953 error = shmget((key_t)a0, (size_t)a1, (int)a2, &r_val); 954 break; 955 case SHMIDS: 956 error = shmids((int *)a0, (uint_t)a1, (uint_t *)a2); 957 break; 958 default: 959 error = EINVAL; 960 break; 961 } 962 963 if (error) 964 return ((uintptr_t)set_errno(error)); 965 966 return (r_val); 967 } 968 969 /* 970 * segacct_t comparator 971 * This works as expected, with one minor change: the first of two real 972 * segments with equal addresses is considered to be 'greater than' the 973 * second. We only return equal when searching using a template, in 974 * which case we explicitly set the template segment's length to 0 975 * (which is invalid for a real segment). 976 */ 977 static int 978 shm_sacompar(const void *x, const void *y) 979 { 980 segacct_t *sa1 = (segacct_t *)x; 981 segacct_t *sa2 = (segacct_t *)y; 982 983 if (sa1->sa_addr < sa2->sa_addr) { 984 return (-1); 985 } else if (sa2->sa_len != 0) { 986 if (sa1->sa_addr >= sa2->sa_addr + sa2->sa_len) { 987 return (1); 988 } else if (sa1->sa_len != 0) { 989 return (1); 990 } else { 991 return (0); 992 } 993 } else if (sa1->sa_addr > sa2->sa_addr) { 994 return (1); 995 } else { 996 return (0); 997 } 998 } 999 1000 /* 1001 * add this record to the segacct list. 1002 */ 1003 static void 1004 sa_add(struct proc *pp, caddr_t addr, size_t len, ulong_t flags, kshmid_t *id) 1005 { 1006 segacct_t *nsap; 1007 avl_tree_t *tree = NULL; 1008 avl_index_t where; 1009 1010 nsap = kmem_alloc(sizeof (segacct_t), KM_SLEEP); 1011 nsap->sa_addr = addr; 1012 nsap->sa_len = len; 1013 nsap->sa_flags = flags; 1014 nsap->sa_id = id; 1015 1016 if (pp->p_segacct == NULL) 1017 tree = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP); 1018 1019 mutex_enter(&pp->p_lock); 1020 prbarrier(pp); /* block /proc. See shmgetid(). */ 1021 1022 if (pp->p_segacct == NULL) { 1023 avl_create(tree, shm_sacompar, sizeof (segacct_t), 1024 offsetof(segacct_t, sa_tree)); 1025 pp->p_segacct = tree; 1026 } else if (tree) { 1027 kmem_free(tree, sizeof (avl_tree_t)); 1028 } 1029 1030 /* 1031 * We can ignore the result of avl_find, as the comparator will 1032 * never return equal for segments with non-zero length. This 1033 * is a necessary hack to get around the fact that we do, in 1034 * fact, have duplicate keys. 1035 */ 1036 (void) avl_find(pp->p_segacct, nsap, &where); 1037 avl_insert(pp->p_segacct, nsap, where); 1038 1039 mutex_exit(&pp->p_lock); 1040 } 1041 1042 /* 1043 * Duplicate parent's segacct records in child. 1044 */ 1045 void 1046 shmfork(struct proc *ppp, struct proc *cpp) 1047 { 1048 segacct_t *sap; 1049 kshmid_t *sp; 1050 kmutex_t *mp; 1051 1052 ASSERT(ppp->p_segacct != NULL); 1053 1054 /* 1055 * We are the only lwp running in the parent so nobody can 1056 * mess with our p_segacct list. Thus it is safe to traverse 1057 * the list without holding p_lock. This is essential because 1058 * we can't hold p_lock during a KM_SLEEP allocation. 1059 */ 1060 for (sap = (segacct_t *)avl_first(ppp->p_segacct); sap != NULL; 1061 sap = (segacct_t *)AVL_NEXT(ppp->p_segacct, sap)) { 1062 sa_add(cpp, sap->sa_addr, sap->sa_len, sap->sa_flags, 1063 sap->sa_id); 1064 sp = sap->sa_id; 1065 mp = ipc_lock(shm_svc, sp->shm_perm.ipc_id); 1066 if (sap->sa_flags & SHMSA_ISM) 1067 sp->shm_ismattch++; 1068 ipc_hold(shm_svc, (kipc_perm_t *)sp); 1069 mutex_exit(mp); 1070 } 1071 } 1072 1073 /* 1074 * Detach shared memory segments from exiting process. 1075 */ 1076 void 1077 shmexit(struct proc *pp) 1078 { 1079 segacct_t *sap; 1080 avl_tree_t *tree; 1081 void *cookie = NULL; 1082 1083 ASSERT(pp->p_segacct != NULL); 1084 1085 mutex_enter(&pp->p_lock); 1086 prbarrier(pp); 1087 tree = pp->p_segacct; 1088 pp->p_segacct = NULL; 1089 mutex_exit(&pp->p_lock); 1090 1091 while ((sap = avl_destroy_nodes(tree, &cookie)) != NULL) 1092 (void) shm_detach(pp, sap); 1093 1094 avl_destroy(tree); 1095 kmem_free(tree, sizeof (avl_tree_t)); 1096 } 1097 1098 /* 1099 * At this time pages should be in memory, so just lock them. 1100 */ 1101 static void 1102 lock_again(size_t npages, kshmid_t *sp, struct anon_map *amp) 1103 { 1104 struct anon *ap; 1105 struct page *pp; 1106 struct vnode *vp; 1107 u_offset_t off; 1108 ulong_t anon_idx; 1109 anon_sync_obj_t cookie; 1110 1111 mutex_enter(&sp->shm_mlock); 1112 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 1113 for (anon_idx = 0; npages != 0; anon_idx++, npages--) { 1114 1115 anon_array_enter(amp, anon_idx, &cookie); 1116 ap = anon_get_ptr(amp->ahp, anon_idx); 1117 ASSERT(ap != NULL); 1118 swap_xlate(ap, &vp, &off); 1119 anon_array_exit(&cookie); 1120 1121 pp = page_lookup(vp, off, SE_SHARED); 1122 if (pp == NULL) { 1123 panic("lock_again: page not in the system"); 1124 /*NOTREACHED*/ 1125 } 1126 /* page should already be locked by caller */ 1127 ASSERT(pp->p_lckcnt > 0); 1128 (void) page_pp_lock(pp, 0, 0); 1129 page_unlock(pp); 1130 } 1131 ANON_LOCK_EXIT(&->a_rwlock); 1132 mutex_exit(&sp->shm_mlock); 1133 } 1134 1135 /* 1136 * Attach the shared memory segment to the process 1137 * address space and lock the pages. 1138 */ 1139 static int 1140 shmem_lock(kshmid_t *sp, struct anon_map *amp) 1141 { 1142 size_t npages = btopr(amp->size); 1143 struct as *as; 1144 struct segvn_crargs crargs; 1145 uint_t error; 1146 1147 /* 1148 * A later ISM/DISM attach may increase the size of the amp, so 1149 * cache the number of pages locked for the future shmem_unlock() 1150 */ 1151 sp->shm_lkpages = npages; 1152 1153 as = as_alloc(); 1154 /* Initialize the create arguments and map the segment */ 1155 crargs = *(struct segvn_crargs *)zfod_argsp; /* structure copy */ 1156 crargs.offset = (u_offset_t)0; 1157 crargs.type = MAP_SHARED; 1158 crargs.amp = amp; 1159 crargs.prot = PROT_ALL; 1160 crargs.maxprot = crargs.prot; 1161 crargs.flags = 0; 1162 error = as_map(as, 0x0, amp->size, segvn_create, &crargs); 1163 if (!error) { 1164 if ((error = as_ctl(as, 0x0, amp->size, MC_LOCK, 0, 0, 1165 NULL, 0)) == 0) { 1166 lock_again(npages, sp, amp); 1167 } 1168 (void) as_unmap(as, 0x0, amp->size); 1169 } 1170 as_free(as); 1171 return (error); 1172 } 1173 1174 1175 /* 1176 * Unlock shared memory 1177 */ 1178 static void 1179 shmem_unlock(kshmid_t *sp, struct anon_map *amp) 1180 { 1181 struct anon *ap; 1182 pgcnt_t npages = sp->shm_lkpages; 1183 struct vnode *vp; 1184 struct page *pp; 1185 u_offset_t off; 1186 ulong_t anon_idx; 1187 size_t unlocked_bytes = 0; 1188 kproject_t *proj; 1189 anon_sync_obj_t cookie; 1190 1191 proj = sp->shm_perm.ipc_proj; 1192 mutex_enter(&sp->shm_mlock); 1193 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 1194 for (anon_idx = 0; anon_idx < npages; anon_idx++) { 1195 1196 anon_array_enter(amp, anon_idx, &cookie); 1197 if ((ap = anon_get_ptr(amp->ahp, anon_idx)) == NULL) { 1198 panic("shmem_unlock: null app"); 1199 /*NOTREACHED*/ 1200 } 1201 swap_xlate(ap, &vp, &off); 1202 anon_array_exit(&cookie); 1203 pp = page_lookup(vp, off, SE_SHARED); 1204 if (pp == NULL) { 1205 panic("shmem_unlock: page not in the system"); 1206 /*NOTREACHED*/ 1207 } 1208 /* 1209 * Page should at least have once lock from previous 1210 * shmem_lock 1211 */ 1212 ASSERT(pp->p_lckcnt > 0); 1213 page_pp_unlock(pp, 0, 0); 1214 if (pp->p_lckcnt == 0) 1215 unlocked_bytes += PAGESIZE; 1216 1217 page_unlock(pp); 1218 } 1219 1220 if (unlocked_bytes > 0) { 1221 rctl_decr_locked_mem(NULL, proj, unlocked_bytes, 0); 1222 } 1223 1224 ANON_LOCK_EXIT(&->a_rwlock); 1225 mutex_exit(&sp->shm_mlock); 1226 } 1227 1228 /* 1229 * We call this routine when we have removed all references to this 1230 * amp. This means all shmdt()s and the IPC_RMID have been done. 1231 */ 1232 static void 1233 shm_rm_amp(struct anon_map *amp) 1234 { 1235 /* 1236 * Free up the anon_map. 1237 */ 1238 lgrp_shm_policy_fini(amp, NULL); 1239 if (amp->a_szc != 0) { 1240 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 1241 anon_shmap_free_pages(amp, 0, amp->size); 1242 ANON_LOCK_EXIT(&->a_rwlock); 1243 } else { 1244 anon_free(amp->ahp, 0, amp->size); 1245 } 1246 anon_unresv(amp->swresv); 1247 anonmap_free(amp); 1248 } 1249 1250 /* 1251 * Return the shared memory id for the process's virtual address. 1252 * Return SHMID_NONE if addr is not within a SysV shared memory segment. 1253 * Return SHMID_FREE if addr's SysV shared memory segment's id has been freed. 1254 * 1255 * shmgetid() is called from code in /proc with the process locked but 1256 * with pp->p_lock not held. The address space lock is held, so we 1257 * cannot grab pp->p_lock here due to lock-ordering constraints. 1258 * Because of all this, modifications to the p_segacct list must only 1259 * be made after calling prbarrier() to ensure the process is not locked. 1260 * See shmdt() and sa_add(), above. shmgetid() may also be called on a 1261 * thread's own process without the process locked. 1262 */ 1263 int 1264 shmgetid(proc_t *pp, caddr_t addr) 1265 { 1266 segacct_t *sap, template; 1267 1268 ASSERT(MUTEX_NOT_HELD(&pp->p_lock)); 1269 ASSERT((pp->p_proc_flag & P_PR_LOCK) || pp == curproc); 1270 1271 if (pp->p_segacct == NULL) 1272 return (SHMID_NONE); 1273 1274 template.sa_addr = addr; 1275 template.sa_len = 0; 1276 if ((sap = avl_find(pp->p_segacct, &template, NULL)) == NULL) 1277 return (SHMID_NONE); 1278 1279 if (IPC_FREE(&sap->sa_id->shm_perm)) 1280 return (SHMID_FREE); 1281 1282 return (sap->sa_id->shm_perm.ipc_id); 1283 } 1284