1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 * Copyright 2014 Joyent, Inc. All rights reserved. 25 * Copyright 2022 Garrett D'Amore <garrett@damore.org> 26 */ 27 28 #include <sys/types.h> 29 #include <sys/sysmacros.h> 30 #include <sys/kmem.h> 31 #include <sys/param.h> 32 #include <sys/systm.h> 33 #include <sys/errno.h> 34 #include <sys/mman.h> 35 #include <sys/cmn_err.h> 36 #include <sys/cred.h> 37 #include <sys/vmsystm.h> 38 #include <sys/machsystm.h> 39 #include <sys/debug.h> 40 #include <vm/as.h> 41 #include <vm/seg.h> 42 #include <sys/vmparam.h> 43 #include <sys/vfs.h> 44 #include <sys/elf.h> 45 #include <sys/machelf.h> 46 #include <sys/corectl.h> 47 #include <sys/exec.h> 48 #include <sys/exechdr.h> 49 #include <sys/autoconf.h> 50 #include <sys/mem.h> 51 #include <vm/seg_dev.h> 52 #include <sys/vmparam.h> 53 #include <sys/mmapobj.h> 54 #include <sys/atomic.h> 55 56 /* 57 * Theory statement: 58 * 59 * The main driving force behind mmapobj is to interpret and map ELF files 60 * inside of the kernel instead of having the linker be responsible for this. 61 * 62 * mmapobj also supports the AOUT 4.x binary format as well as flat files in 63 * a read only manner. 64 * 65 * When interpreting and mapping an ELF file, mmapobj will map each PT_LOAD 66 * or PT_SUNWBSS segment according to the ELF standard. Refer to the "Linker 67 * and Libraries Guide" for more information about the standard and mapping 68 * rules. 69 * 70 * Having mmapobj interpret and map objects will allow the kernel to make the 71 * best decision for where to place the mappings for said objects. Thus, we 72 * can make optimizations inside of the kernel for specific platforms or cache 73 * mapping information to make mapping objects faster. The cache is ignored 74 * if ASLR is enabled. 75 * 76 * The lib_va_hash will be one such optimization. For each ELF object that 77 * mmapobj is asked to interpret, we will attempt to cache the information 78 * about the PT_LOAD and PT_SUNWBSS sections to speed up future mappings of 79 * the same objects. We will cache up to LIBVA_CACHED_SEGS (see below) program 80 * headers which should cover a majority of the libraries out there without 81 * wasting space. In order to make sure that the cached information is valid, 82 * we check the passed in vnode's mtime and ctime to make sure the vnode 83 * has not been modified since the last time we used it. 84 * 85 * In addition, the lib_va_hash may contain a preferred starting VA for the 86 * object which can be useful for platforms which support a shared context. 
87 * This will increase the likelyhood that library text can be shared among 88 * many different processes. We limit the reserved VA space for 32 bit objects 89 * in order to minimize fragmenting the processes address space. 90 * 91 * In addition to the above, the mmapobj interface allows for padding to be 92 * requested before the first mapping and after the last mapping created. 93 * When padding is requested, no additional optimizations will be made for 94 * that request. 95 */ 96 97 /* 98 * Threshold to prevent allocating too much kernel memory to read in the 99 * program headers for an object. If it requires more than below, 100 * we will use a KM_NOSLEEP allocation to allocate memory to hold all of the 101 * program headers which could possibly fail. If less memory than below is 102 * needed, then we use a KM_SLEEP allocation and are willing to wait for the 103 * memory if we need to. 104 */ 105 size_t mmapobj_alloc_threshold = 65536; 106 107 /* Debug stats for test coverage */ 108 #ifdef DEBUG 109 struct mobj_stats { 110 uint_t mobjs_unmap_called; 111 uint_t mobjs_remap_devnull; 112 uint_t mobjs_lookup_start; 113 uint_t mobjs_alloc_start; 114 uint_t mobjs_alloc_vmem; 115 uint_t mobjs_add_collision; 116 uint_t mobjs_get_addr; 117 uint_t mobjs_map_flat_no_padding; 118 uint_t mobjs_map_flat_padding; 119 uint_t mobjs_map_ptload_text; 120 uint_t mobjs_map_ptload_initdata; 121 uint_t mobjs_map_ptload_preread; 122 uint_t mobjs_map_ptload_unaligned_text; 123 uint_t mobjs_map_ptload_unaligned_map_fail; 124 uint_t mobjs_map_ptload_unaligned_read_fail; 125 uint_t mobjs_zfoddiff; 126 uint_t mobjs_zfoddiff_nowrite; 127 uint_t mobjs_zfodextra; 128 uint_t mobjs_ptload_failed; 129 uint_t mobjs_map_elf_no_holes; 130 uint_t mobjs_unmap_hole; 131 uint_t mobjs_nomem_header; 132 uint_t mobjs_inval_header; 133 uint_t mobjs_overlap_header; 134 uint_t mobjs_np2_align; 135 uint_t mobjs_np2_align_overflow; 136 uint_t mobjs_exec_padding; 137 uint_t mobjs_exec_addr_mapped; 138 uint_t mobjs_exec_addr_devnull; 139 uint_t mobjs_exec_addr_in_use; 140 uint_t mobjs_lvp_found; 141 uint_t mobjs_no_loadable_yet; 142 uint_t mobjs_nothing_to_map; 143 uint_t mobjs_e2big; 144 uint_t mobjs_dyn_pad_align; 145 uint_t mobjs_dyn_pad_noalign; 146 uint_t mobjs_alloc_start_fail; 147 uint_t mobjs_lvp_nocache; 148 uint_t mobjs_extra_padding; 149 uint_t mobjs_lvp_not_needed; 150 uint_t mobjs_no_mem_map_sz; 151 uint_t mobjs_check_exec_failed; 152 uint_t mobjs_lvp_used; 153 uint_t mobjs_wrong_model; 154 uint_t mobjs_noexec_fs; 155 uint_t mobjs_e2big_et_rel; 156 uint_t mobjs_et_rel_mapped; 157 uint_t mobjs_unknown_elf_type; 158 uint_t mobjs_phent32_too_small; 159 uint_t mobjs_phent64_too_small; 160 uint_t mobjs_inval_elf_class; 161 uint_t mobjs_too_many_phdrs; 162 uint_t mobjs_no_phsize; 163 uint_t mobjs_phsize_large; 164 uint_t mobjs_phsize_xtralarge; 165 uint_t mobjs_fast_wrong_model; 166 uint_t mobjs_fast_e2big; 167 uint_t mobjs_fast; 168 uint_t mobjs_fast_success; 169 uint_t mobjs_fast_not_now; 170 uint_t mobjs_small_file; 171 uint_t mobjs_read_error; 172 uint_t mobjs_unsupported; 173 uint_t mobjs_flat_e2big; 174 uint_t mobjs_phent_align32; 175 uint_t mobjs_phent_align64; 176 uint_t mobjs_lib_va_find_hit; 177 uint_t mobjs_lib_va_find_delay_delete; 178 uint_t mobjs_lib_va_find_delete; 179 uint_t mobjs_lib_va_add_delay_delete; 180 uint_t mobjs_lib_va_add_delete; 181 uint_t mobjs_lib_va_create_failure; 182 uint_t mobjs_min_align; 183 } mobj_stats; 184 185 #define MOBJ_STAT_ADD(stat) ((mobj_stats.mobjs_##stat)++) 186 #else 187 #define 
MOBJ_STAT_ADD(stat) 188 #endif 189 190 /* 191 * Check if addr is at or above the address space reserved for the stack. 192 * The stack is at the top of the address space for all sparc processes 193 * and 64 bit x86 processes. For 32 bit x86, the stack is not at the top 194 * of the address space and thus this check wil always return false for 195 * 32 bit x86 processes. 196 */ 197 #if defined(__sparc) 198 #define OVERLAPS_STACK(addr, p) \ 199 (addr >= (p->p_usrstack - ((p->p_stk_ctl + PAGEOFFSET) & PAGEMASK))) 200 #elif defined(__amd64) 201 #define OVERLAPS_STACK(addr, p) \ 202 ((p->p_model == DATAMODEL_LP64) && \ 203 (addr >= (p->p_usrstack - ((p->p_stk_ctl + PAGEOFFSET) & PAGEMASK)))) 204 #endif 205 206 /* lv_flags values - bitmap */ 207 #define LV_ELF32 0x1 /* 32 bit ELF file */ 208 #define LV_ELF64 0x2 /* 64 bit ELF file */ 209 #define LV_DEL 0x4 /* delete when lv_refcnt hits zero */ 210 211 /* 212 * Note: lv_num_segs will denote how many segments this file has and will 213 * only be set after the lv_mps array has been filled out. 214 * lv_mps can only be valid if lv_num_segs is non-zero. 215 */ 216 struct lib_va { 217 struct lib_va *lv_next; 218 caddr_t lv_base_va; /* start va for library */ 219 ssize_t lv_len; /* total va span of library */ 220 size_t lv_align; /* minimum alignment */ 221 uint64_t lv_nodeid; /* filesystem node id */ 222 uint64_t lv_fsid; /* filesystem id */ 223 timestruc_t lv_ctime; /* last time file was changed */ 224 timestruc_t lv_mtime; /* or modified */ 225 mmapobj_result_t lv_mps[LIBVA_CACHED_SEGS]; /* cached pheaders */ 226 int lv_num_segs; /* # segs for this file */ 227 int lv_flags; 228 uint_t lv_refcnt; /* number of holds on struct */ 229 }; 230 231 #define LIB_VA_SIZE 1024 232 #define LIB_VA_MASK (LIB_VA_SIZE - 1) 233 #define LIB_VA_MUTEX_SHIFT 3 234 235 #if (LIB_VA_SIZE & (LIB_VA_SIZE - 1)) 236 #error "LIB_VA_SIZE is not a power of 2" 237 #endif 238 239 static struct lib_va *lib_va_hash[LIB_VA_SIZE]; 240 static kmutex_t lib_va_hash_mutex[LIB_VA_SIZE >> LIB_VA_MUTEX_SHIFT]; 241 242 #define LIB_VA_HASH_MUTEX(index) \ 243 (&lib_va_hash_mutex[index >> LIB_VA_MUTEX_SHIFT]) 244 245 #define LIB_VA_HASH(nodeid) \ 246 (((nodeid) ^ ((nodeid) << 7) ^ ((nodeid) << 13)) & LIB_VA_MASK) 247 248 #define LIB_VA_MATCH_ID(arg1, arg2) \ 249 ((arg1)->lv_nodeid == (arg2)->va_nodeid && \ 250 (arg1)->lv_fsid == (arg2)->va_fsid) 251 252 #define LIB_VA_MATCH_TIME(arg1, arg2) \ 253 ((arg1)->lv_ctime.tv_sec == (arg2)->va_ctime.tv_sec && \ 254 (arg1)->lv_mtime.tv_sec == (arg2)->va_mtime.tv_sec && \ 255 (arg1)->lv_ctime.tv_nsec == (arg2)->va_ctime.tv_nsec && \ 256 (arg1)->lv_mtime.tv_nsec == (arg2)->va_mtime.tv_nsec) 257 258 #define LIB_VA_MATCH(arg1, arg2) \ 259 (LIB_VA_MATCH_ID(arg1, arg2) && LIB_VA_MATCH_TIME(arg1, arg2)) 260 261 /* 262 * lib_va will be used for optimized allocation of address ranges for 263 * libraries, such that subsequent mappings of the same library will attempt 264 * to use the same VA as previous mappings of that library. 265 * In order to map libraries at the same VA in many processes, we need to carve 266 * out our own address space for them which is unique across many processes. 267 * We use different arenas for 32 bit and 64 bit libraries. 268 * 269 * Since the 32 bit address space is relatively small, we limit the number of 270 * libraries which try to use consistent virtual addresses to lib_threshold. 271 * For 64 bit libraries there is no such limit since the address space is large. 
272 */ 273 static vmem_t *lib_va_32_arena; 274 static vmem_t *lib_va_64_arena; 275 uint_t lib_threshold = 20; /* modifiable via /etc/system */ 276 277 static kmutex_t lib_va_init_mutex; /* no need to initialize */ 278 279 /* 280 * Number of 32 bit and 64 bit libraries in lib_va hash. 281 */ 282 static uint_t libs_mapped_32 = 0; 283 static uint_t libs_mapped_64 = 0; 284 285 /* 286 * Free up the resources associated with lvp as well as lvp itself. 287 * We also decrement the number of libraries mapped via a lib_va 288 * cached virtual address. 289 */ 290 void 291 lib_va_free(struct lib_va *lvp) 292 { 293 int is_64bit = lvp->lv_flags & LV_ELF64; 294 ASSERT(lvp->lv_refcnt == 0); 295 296 if (lvp->lv_base_va != NULL) { 297 vmem_xfree(is_64bit ? lib_va_64_arena : lib_va_32_arena, 298 lvp->lv_base_va, lvp->lv_len); 299 if (is_64bit) { 300 atomic_dec_32(&libs_mapped_64); 301 } else { 302 atomic_dec_32(&libs_mapped_32); 303 } 304 } 305 kmem_free(lvp, sizeof (struct lib_va)); 306 } 307 308 /* 309 * See if the file associated with the vap passed in is in the lib_va hash. 310 * If it is and the file has not been modified since last use, then 311 * return a pointer to that data. Otherwise, return NULL if the file has 312 * changed or the file was not found in the hash. 313 */ 314 static struct lib_va * 315 lib_va_find(vattr_t *vap) 316 { 317 struct lib_va *lvp; 318 struct lib_va *del = NULL; 319 struct lib_va **tmp; 320 uint_t index; 321 index = LIB_VA_HASH(vap->va_nodeid); 322 323 mutex_enter(LIB_VA_HASH_MUTEX(index)); 324 tmp = &lib_va_hash[index]; 325 while (*tmp != NULL) { 326 lvp = *tmp; 327 if (LIB_VA_MATCH_ID(lvp, vap)) { 328 if (LIB_VA_MATCH_TIME(lvp, vap)) { 329 ASSERT((lvp->lv_flags & LV_DEL) == 0); 330 lvp->lv_refcnt++; 331 MOBJ_STAT_ADD(lib_va_find_hit); 332 } else { 333 /* 334 * file was updated since last use. 335 * need to remove it from list. 336 */ 337 del = lvp; 338 *tmp = del->lv_next; 339 del->lv_next = NULL; 340 /* 341 * If we can't delete it now, mark it for later 342 */ 343 if (del->lv_refcnt) { 344 MOBJ_STAT_ADD(lib_va_find_delay_delete); 345 del->lv_flags |= LV_DEL; 346 del = NULL; 347 } 348 lvp = NULL; 349 } 350 mutex_exit(LIB_VA_HASH_MUTEX(index)); 351 if (del) { 352 ASSERT(del->lv_refcnt == 0); 353 MOBJ_STAT_ADD(lib_va_find_delete); 354 lib_va_free(del); 355 } 356 return (lvp); 357 } 358 tmp = &lvp->lv_next; 359 } 360 mutex_exit(LIB_VA_HASH_MUTEX(index)); 361 return (NULL); 362 } 363 364 /* 365 * Add a new entry to the lib_va hash. 366 * Search the hash while holding the appropriate mutex to make sure that the 367 * data is not already in the cache. If we find data that is in the cache 368 * already and has not been modified since last use, we return NULL. If it 369 * has been modified since last use, we will remove that entry from 370 * the hash and it will be deleted once it's reference count reaches zero. 371 * If there is no current entry in the hash we will add the new entry and 372 * return it to the caller who is responsible for calling lib_va_release to 373 * drop their reference count on it. 374 * 375 * lv_num_segs will be set to zero since the caller needs to add that 376 * information to the data structure. 
377 */ 378 static struct lib_va * 379 lib_va_add_hash(caddr_t base_va, ssize_t len, size_t align, vattr_t *vap) 380 { 381 struct lib_va *lvp; 382 uint_t index; 383 model_t model; 384 struct lib_va **tmp; 385 struct lib_va *del = NULL; 386 387 model = get_udatamodel(); 388 index = LIB_VA_HASH(vap->va_nodeid); 389 390 lvp = kmem_alloc(sizeof (struct lib_va), KM_SLEEP); 391 392 mutex_enter(LIB_VA_HASH_MUTEX(index)); 393 394 /* 395 * Make sure not adding same data a second time. 396 * The hash chains should be relatively short and adding 397 * is a relatively rare event, so it's worth the check. 398 */ 399 tmp = &lib_va_hash[index]; 400 while (*tmp != NULL) { 401 if (LIB_VA_MATCH_ID(*tmp, vap)) { 402 if (LIB_VA_MATCH_TIME(*tmp, vap)) { 403 mutex_exit(LIB_VA_HASH_MUTEX(index)); 404 kmem_free(lvp, sizeof (struct lib_va)); 405 return (NULL); 406 } 407 408 /* 409 * We have the same nodeid and fsid but the file has 410 * been modified since we last saw it. 411 * Need to remove the old node and add this new 412 * one. 413 * Could probably use a callback mechanism to make 414 * this cleaner. 415 */ 416 ASSERT(del == NULL); 417 del = *tmp; 418 *tmp = del->lv_next; 419 del->lv_next = NULL; 420 421 /* 422 * Check to see if we can free it. If lv_refcnt 423 * is greater than zero, than some other thread 424 * has a reference to the one we want to delete 425 * and we can not delete it. All of this is done 426 * under the lib_va_hash_mutex lock so it is atomic. 427 */ 428 if (del->lv_refcnt) { 429 MOBJ_STAT_ADD(lib_va_add_delay_delete); 430 del->lv_flags |= LV_DEL; 431 del = NULL; 432 } 433 /* tmp is already advanced */ 434 continue; 435 } 436 tmp = &((*tmp)->lv_next); 437 } 438 439 lvp->lv_base_va = base_va; 440 lvp->lv_len = len; 441 lvp->lv_align = align; 442 lvp->lv_nodeid = vap->va_nodeid; 443 lvp->lv_fsid = vap->va_fsid; 444 lvp->lv_ctime.tv_sec = vap->va_ctime.tv_sec; 445 lvp->lv_ctime.tv_nsec = vap->va_ctime.tv_nsec; 446 lvp->lv_mtime.tv_sec = vap->va_mtime.tv_sec; 447 lvp->lv_mtime.tv_nsec = vap->va_mtime.tv_nsec; 448 lvp->lv_next = NULL; 449 lvp->lv_refcnt = 1; 450 451 /* Caller responsible for filling this and lv_mps out */ 452 lvp->lv_num_segs = 0; 453 454 if (model == DATAMODEL_LP64) { 455 lvp->lv_flags = LV_ELF64; 456 } else { 457 ASSERT(model == DATAMODEL_ILP32); 458 lvp->lv_flags = LV_ELF32; 459 } 460 461 if (base_va != NULL) { 462 if (model == DATAMODEL_LP64) { 463 atomic_inc_32(&libs_mapped_64); 464 } else { 465 ASSERT(model == DATAMODEL_ILP32); 466 atomic_inc_32(&libs_mapped_32); 467 } 468 } 469 ASSERT(*tmp == NULL); 470 *tmp = lvp; 471 mutex_exit(LIB_VA_HASH_MUTEX(index)); 472 if (del) { 473 ASSERT(del->lv_refcnt == 0); 474 MOBJ_STAT_ADD(lib_va_add_delete); 475 lib_va_free(del); 476 } 477 return (lvp); 478 } 479 480 /* 481 * Release the hold on lvp which was acquired by lib_va_find or lib_va_add_hash. 482 * In addition, if this is the last hold and lvp is marked for deletion, 483 * free up it's reserved address space and free the structure. 
484 */ 485 static void 486 lib_va_release(struct lib_va *lvp) 487 { 488 uint_t index; 489 int to_del = 0; 490 491 ASSERT(lvp->lv_refcnt > 0); 492 493 index = LIB_VA_HASH(lvp->lv_nodeid); 494 mutex_enter(LIB_VA_HASH_MUTEX(index)); 495 if (--lvp->lv_refcnt == 0 && (lvp->lv_flags & LV_DEL)) { 496 to_del = 1; 497 } 498 mutex_exit(LIB_VA_HASH_MUTEX(index)); 499 if (to_del) { 500 ASSERT(lvp->lv_next == 0); 501 lib_va_free(lvp); 502 } 503 } 504 505 /* 506 * Dummy function for mapping through /dev/null 507 * Normally I would have used mmmmap in common/io/mem.c 508 * but that is a static function, and for /dev/null, it 509 * just returns -1. 510 */ 511 /* ARGSUSED */ 512 static int 513 mmapobj_dummy(dev_t dev, off_t off, int prot) 514 { 515 return (-1); 516 } 517 518 /* 519 * Called when an error occurred which requires mmapobj to return failure. 520 * All mapped objects will be unmapped and /dev/null mappings will be 521 * reclaimed if necessary. 522 * num_mapped is the number of elements of mrp which have been mapped, and 523 * num_segs is the total number of elements in mrp. 524 * For e_type ET_EXEC, we need to unmap all of the elements in mrp since 525 * we had already made reservations for them. 526 * If num_mapped equals num_segs, then we know that we had fully mapped 527 * the file and only need to clean up the segments described. 528 * If they are not equal, then for ET_DYN we will unmap the range from the 529 * end of the last mapped segment to the end of the last segment in mrp 530 * since we would have made a reservation for that memory earlier. 531 * If e_type is passed in as zero, num_mapped must equal num_segs. 532 */ 533 void 534 mmapobj_unmap(mmapobj_result_t *mrp, int num_mapped, int num_segs, 535 ushort_t e_type) 536 { 537 int i; 538 struct as *as = curproc->p_as; 539 caddr_t addr; 540 size_t size; 541 542 if (e_type == ET_EXEC) { 543 num_mapped = num_segs; 544 } 545 #ifdef DEBUG 546 if (e_type == 0) { 547 ASSERT(num_mapped == num_segs); 548 } 549 #endif 550 551 MOBJ_STAT_ADD(unmap_called); 552 for (i = 0; i < num_mapped; i++) { 553 554 /* 555 * If we are going to have to create a mapping we need to 556 * make sure that no one else will use the address we 557 * need to remap between the time it is unmapped and 558 * mapped below. 559 */ 560 if (mrp[i].mr_flags & MR_RESV) { 561 as_rangelock(as); 562 } 563 /* Always need to unmap what we mapped */ 564 (void) as_unmap(as, mrp[i].mr_addr, mrp[i].mr_msize); 565 566 /* Need to reclaim /dev/null reservation from earlier */ 567 if (mrp[i].mr_flags & MR_RESV) { 568 struct segdev_crargs dev_a; 569 570 ASSERT(e_type != ET_DYN); 571 /* 572 * Use seg_dev segment driver for /dev/null mapping. 
573 */ 574 dev_a.mapfunc = mmapobj_dummy; 575 dev_a.dev = makedevice(mm_major, M_NULL); 576 dev_a.offset = 0; 577 dev_a.type = 0; /* neither PRIVATE nor SHARED */ 578 dev_a.prot = dev_a.maxprot = (uchar_t)PROT_NONE; 579 dev_a.hat_attr = 0; 580 dev_a.hat_flags = 0; 581 582 (void) as_map(as, mrp[i].mr_addr, mrp[i].mr_msize, 583 segdev_create, &dev_a); 584 MOBJ_STAT_ADD(remap_devnull); 585 as_rangeunlock(as); 586 } 587 } 588 589 if (num_mapped != num_segs) { 590 ASSERT(e_type == ET_DYN); 591 /* Need to unmap any reservation made after last mapped seg */ 592 if (num_mapped == 0) { 593 addr = mrp[0].mr_addr; 594 } else { 595 addr = mrp[num_mapped - 1].mr_addr + 596 mrp[num_mapped - 1].mr_msize; 597 } 598 size = (size_t)mrp[num_segs - 1].mr_addr + 599 mrp[num_segs - 1].mr_msize - (size_t)addr; 600 (void) as_unmap(as, addr, size); 601 602 /* 603 * Now we need to unmap the holes between mapped segs. 604 * Note that we have not mapped all of the segments and thus 605 * the holes between segments would not have been unmapped 606 * yet. If num_mapped == num_segs, then all of the holes 607 * between segments would have already been unmapped. 608 */ 609 610 for (i = 1; i < num_mapped; i++) { 611 addr = mrp[i - 1].mr_addr + mrp[i - 1].mr_msize; 612 size = mrp[i].mr_addr - addr; 613 (void) as_unmap(as, addr, size); 614 } 615 } 616 } 617 618 /* 619 * We need to add the start address into mrp so that the unmap function 620 * has absolute addresses to use. 621 */ 622 static void 623 mmapobj_unmap_exec(mmapobj_result_t *mrp, int num_mapped, caddr_t start_addr) 624 { 625 int i; 626 627 for (i = 0; i < num_mapped; i++) { 628 mrp[i].mr_addr += (size_t)start_addr; 629 } 630 mmapobj_unmap(mrp, num_mapped, num_mapped, ET_EXEC); 631 } 632 633 static caddr_t 634 mmapobj_lookup_start_addr(struct lib_va *lvp) 635 { 636 proc_t *p = curproc; 637 struct as *as = p->p_as; 638 struct segvn_crargs crargs = SEGVN_ZFOD_ARGS(PROT_USER, PROT_ALL); 639 int error; 640 uint_t ma_flags = _MAP_LOW32; 641 caddr_t base = NULL; 642 size_t len; 643 size_t align; 644 645 ASSERT(lvp != NULL); 646 MOBJ_STAT_ADD(lookup_start); 647 648 as_rangelock(as); 649 650 base = lvp->lv_base_va; 651 len = lvp->lv_len; 652 653 /* 654 * If we don't have an expected base address, or the one that we want 655 * to use is not available or acceptable, go get an acceptable 656 * address range. 657 */ 658 if (base == NULL || as_gap(as, len, &base, &len, 0, NULL) || 659 valid_usr_range(base, len, PROT_ALL, as, as->a_userlimit) != 660 RANGE_OKAY || OVERLAPS_STACK(base + len, p)) { 661 if (lvp->lv_flags & LV_ELF64) { 662 ma_flags = 0; 663 } 664 665 align = lvp->lv_align; 666 if (align > 1) { 667 ma_flags |= MAP_ALIGN; 668 } 669 670 base = (caddr_t)align; 671 map_addr(&base, len, 0, 1, ma_flags); 672 } 673 674 /* 675 * Need to reserve the address space we're going to use. 676 * Don't reserve swap space since we'll be mapping over this. 677 */ 678 if (base != NULL) { 679 crargs.flags |= MAP_NORESERVE; 680 error = as_map(as, base, len, segvn_create, &crargs); 681 if (error) { 682 base = NULL; 683 } 684 } 685 686 as_rangeunlock(as); 687 return (base); 688 } 689 690 /* 691 * Get the starting address for a given file to be mapped and return it 692 * to the caller. If we're using lib_va and we need to allocate an address, 693 * we will attempt to allocate it from the global reserved pool such that the 694 * same address can be used in the future for this file. If we can't use the 695 * reserved address then we just get one that will fit in our address space. 
696 * 697 * Returns the starting virtual address for the range to be mapped or NULL 698 * if an error is encountered. If we successfully insert the requested info 699 * into the lib_va hash, then *lvpp will be set to point to this lib_va 700 * structure. The structure will have a hold on it and thus lib_va_release 701 * needs to be called on it by the caller. This function will not fill out 702 * lv_mps or lv_num_segs since it does not have enough information to do so. 703 * The caller is responsible for doing this making sure that any modifications 704 * to lv_mps are visible before setting lv_num_segs. 705 */ 706 static caddr_t 707 mmapobj_alloc_start_addr(struct lib_va **lvpp, size_t len, int use_lib_va, 708 int randomize, size_t align, vattr_t *vap) 709 { 710 proc_t *p = curproc; 711 struct as *as = p->p_as; 712 struct segvn_crargs crargs = SEGVN_ZFOD_ARGS(PROT_USER, PROT_ALL); 713 int error; 714 model_t model; 715 uint_t ma_flags = _MAP_LOW32; 716 caddr_t base = NULL; 717 vmem_t *model_vmem; 718 size_t lib_va_start; 719 size_t lib_va_end; 720 size_t lib_va_len; 721 722 ASSERT(lvpp != NULL); 723 ASSERT((randomize & use_lib_va) != 1); 724 725 MOBJ_STAT_ADD(alloc_start); 726 model = get_udatamodel(); 727 728 if (model == DATAMODEL_LP64) { 729 ma_flags = 0; 730 model_vmem = lib_va_64_arena; 731 } else { 732 ASSERT(model == DATAMODEL_ILP32); 733 model_vmem = lib_va_32_arena; 734 } 735 736 if (align > 1) { 737 ma_flags |= MAP_ALIGN; 738 } 739 740 if (randomize != 0) 741 ma_flags |= _MAP_RANDOMIZE; 742 743 if (use_lib_va) { 744 /* 745 * The first time through, we need to setup the lib_va arenas. 746 * We call map_addr to find a suitable range of memory to map 747 * the given library, and we will set the highest address 748 * in our vmem arena to the end of this adddress range. 749 * We allow up to half of the address space to be used 750 * for lib_va addresses but we do not prevent any allocations 751 * in this range from other allocation paths. 752 */ 753 if (lib_va_64_arena == NULL && model == DATAMODEL_LP64) { 754 mutex_enter(&lib_va_init_mutex); 755 if (lib_va_64_arena == NULL) { 756 base = (caddr_t)align; 757 as_rangelock(as); 758 map_addr(&base, len, 0, 1, ma_flags); 759 as_rangeunlock(as); 760 if (base == NULL) { 761 mutex_exit(&lib_va_init_mutex); 762 MOBJ_STAT_ADD(lib_va_create_failure); 763 goto nolibva; 764 } 765 lib_va_end = (size_t)base + len; 766 lib_va_len = lib_va_end >> 1; 767 lib_va_len = P2ROUNDUP(lib_va_len, PAGESIZE); 768 lib_va_start = lib_va_end - lib_va_len; 769 770 /* 771 * Need to make sure we avoid the address hole. 772 * We know lib_va_end is valid but we need to 773 * make sure lib_va_start is as well. 
774 */ 775 if ((lib_va_end > (size_t)hole_end) && 776 (lib_va_start < (size_t)hole_end)) { 777 lib_va_start = P2ROUNDUP( 778 (size_t)hole_end, PAGESIZE); 779 lib_va_len = lib_va_end - lib_va_start; 780 } 781 lib_va_64_arena = vmem_create("lib_va_64", 782 (void *)lib_va_start, lib_va_len, PAGESIZE, 783 NULL, NULL, NULL, 0, 784 VM_NOSLEEP | VMC_IDENTIFIER); 785 if (lib_va_64_arena == NULL) { 786 mutex_exit(&lib_va_init_mutex); 787 goto nolibva; 788 } 789 } 790 model_vmem = lib_va_64_arena; 791 mutex_exit(&lib_va_init_mutex); 792 } else if (lib_va_32_arena == NULL && 793 model == DATAMODEL_ILP32) { 794 mutex_enter(&lib_va_init_mutex); 795 if (lib_va_32_arena == NULL) { 796 base = (caddr_t)align; 797 as_rangelock(as); 798 map_addr(&base, len, 0, 1, ma_flags); 799 as_rangeunlock(as); 800 if (base == NULL) { 801 mutex_exit(&lib_va_init_mutex); 802 MOBJ_STAT_ADD(lib_va_create_failure); 803 goto nolibva; 804 } 805 lib_va_end = (size_t)base + len; 806 lib_va_len = lib_va_end >> 1; 807 lib_va_len = P2ROUNDUP(lib_va_len, PAGESIZE); 808 lib_va_start = lib_va_end - lib_va_len; 809 lib_va_32_arena = vmem_create("lib_va_32", 810 (void *)lib_va_start, lib_va_len, PAGESIZE, 811 NULL, NULL, NULL, 0, 812 VM_NOSLEEP | VMC_IDENTIFIER); 813 if (lib_va_32_arena == NULL) { 814 mutex_exit(&lib_va_init_mutex); 815 goto nolibva; 816 } 817 } 818 model_vmem = lib_va_32_arena; 819 mutex_exit(&lib_va_init_mutex); 820 } 821 822 if (model == DATAMODEL_LP64 || libs_mapped_32 < lib_threshold) { 823 base = vmem_xalloc(model_vmem, len, align, 0, 0, NULL, 824 NULL, VM_NOSLEEP | VM_ENDALLOC); 825 MOBJ_STAT_ADD(alloc_vmem); 826 } 827 828 /* 829 * Even if the address fails to fit in our address space, 830 * or we can't use a reserved address, 831 * we should still save it off in lib_va_hash. 832 */ 833 *lvpp = lib_va_add_hash(base, len, align, vap); 834 835 /* 836 * Check for collision on insertion and free up our VA space. 837 * This is expected to be rare, so we'll just reset base to 838 * NULL instead of looking it up in the lib_va hash. 839 */ 840 if (*lvpp == NULL) { 841 if (base != NULL) { 842 vmem_xfree(model_vmem, base, len); 843 base = NULL; 844 MOBJ_STAT_ADD(add_collision); 845 } 846 } 847 } 848 849 nolibva: 850 as_rangelock(as); 851 852 /* 853 * If we don't have an expected base address, or the one that we want 854 * to use is not available or acceptable, go get an acceptable 855 * address range. 856 * 857 * If ASLR is enabled, we should never have used the cache, and should 858 * also start our real work here, in the consequent of the next 859 * condition. 860 */ 861 if (randomize != 0) 862 ASSERT(base == NULL); 863 864 if (base == NULL || as_gap(as, len, &base, &len, 0, NULL) || 865 valid_usr_range(base, len, PROT_ALL, as, as->a_userlimit) != 866 RANGE_OKAY || OVERLAPS_STACK(base + len, p)) { 867 MOBJ_STAT_ADD(get_addr); 868 base = (caddr_t)align; 869 map_addr(&base, len, 0, 1, ma_flags); 870 } 871 872 /* 873 * Need to reserve the address space we're going to use. 874 * Don't reserve swap space since we'll be mapping over this. 875 */ 876 if (base != NULL) { 877 /* Don't reserve swap space since we'll be mapping over this */ 878 crargs.flags |= MAP_NORESERVE; 879 error = as_map(as, base, len, segvn_create, &crargs); 880 if (error) { 881 base = NULL; 882 } 883 } 884 885 as_rangeunlock(as); 886 return (base); 887 } 888 889 /* 890 * Map the file associated with vp into the address space as a single 891 * read only private mapping. 892 * Returns 0 for success, and non-zero for failure to map the file. 
893 */ 894 static int 895 mmapobj_map_flat(vnode_t *vp, mmapobj_result_t *mrp, size_t padding, 896 cred_t *fcred) 897 { 898 int error = 0; 899 struct as *as = curproc->p_as; 900 caddr_t addr = NULL; 901 caddr_t start_addr; 902 size_t len; 903 size_t pad_len; 904 int prot = PROT_USER | PROT_READ; 905 uint_t ma_flags = _MAP_LOW32; 906 vattr_t vattr; 907 struct segvn_crargs crargs = SEGVN_ZFOD_ARGS(PROT_USER, PROT_ALL); 908 909 if (get_udatamodel() == DATAMODEL_LP64) { 910 ma_flags = 0; 911 } 912 913 vattr.va_mask = AT_SIZE; 914 error = VOP_GETATTR(vp, &vattr, 0, fcred, NULL); 915 if (error) { 916 return (error); 917 } 918 919 len = vattr.va_size; 920 921 ma_flags |= MAP_PRIVATE; 922 if (padding == 0) { 923 MOBJ_STAT_ADD(map_flat_no_padding); 924 error = VOP_MAP(vp, 0, as, &addr, len, prot, PROT_ALL, 925 ma_flags, fcred, NULL); 926 if (error == 0) { 927 mrp[0].mr_addr = addr; 928 mrp[0].mr_msize = len; 929 mrp[0].mr_fsize = len; 930 mrp[0].mr_offset = 0; 931 mrp[0].mr_prot = prot; 932 mrp[0].mr_flags = 0; 933 } 934 return (error); 935 } 936 937 /* padding was requested so there's more work to be done */ 938 MOBJ_STAT_ADD(map_flat_padding); 939 940 /* No need to reserve swap space now since it will be reserved later */ 941 crargs.flags |= MAP_NORESERVE; 942 943 /* Need to setup padding which can only be in PAGESIZE increments. */ 944 ASSERT((padding & PAGEOFFSET) == 0); 945 pad_len = len + (2 * padding); 946 947 as_rangelock(as); 948 map_addr(&addr, pad_len, 0, 1, ma_flags); 949 error = as_map(as, addr, pad_len, segvn_create, &crargs); 950 as_rangeunlock(as); 951 if (error) { 952 return (error); 953 } 954 start_addr = addr; 955 addr += padding; 956 ma_flags |= MAP_FIXED; 957 error = VOP_MAP(vp, 0, as, &addr, len, prot, PROT_ALL, ma_flags, 958 fcred, NULL); 959 if (error == 0) { 960 mrp[0].mr_addr = start_addr; 961 mrp[0].mr_msize = padding; 962 mrp[0].mr_fsize = 0; 963 mrp[0].mr_offset = 0; 964 mrp[0].mr_prot = 0; 965 mrp[0].mr_flags = MR_PADDING; 966 967 mrp[1].mr_addr = addr; 968 mrp[1].mr_msize = len; 969 mrp[1].mr_fsize = len; 970 mrp[1].mr_offset = 0; 971 mrp[1].mr_prot = prot; 972 mrp[1].mr_flags = 0; 973 974 mrp[2].mr_addr = addr + P2ROUNDUP(len, PAGESIZE); 975 mrp[2].mr_msize = padding; 976 mrp[2].mr_fsize = 0; 977 mrp[2].mr_offset = 0; 978 mrp[2].mr_prot = 0; 979 mrp[2].mr_flags = MR_PADDING; 980 } else { 981 /* Need to cleanup the as_map from earlier */ 982 (void) as_unmap(as, start_addr, pad_len); 983 } 984 return (error); 985 } 986 987 /* 988 * Map a PT_LOAD or PT_SUNWBSS section of an executable file into the user's 989 * address space. 990 * vp - vnode to be mapped in 991 * addr - start address 992 * len - length of vp to be mapped 993 * zfodlen - length of zero filled memory after len above 994 * offset - offset into file where mapping should start 995 * prot - protections for this mapping 996 * fcred - credentials for the file associated with vp at open time. 997 */ 998 static int 999 mmapobj_map_ptload(struct vnode *vp, caddr_t addr, size_t len, 1000 volatile size_t zfodlen, off_t offset, int prot, cred_t *fcred) 1001 { 1002 int error = 0; 1003 caddr_t zfodbase, oldaddr; 1004 size_t oldlen; 1005 size_t end; 1006 size_t zfoddiff; 1007 label_t ljb; 1008 struct as *as = curproc->p_as; 1009 model_t model; 1010 int full_page; 1011 1012 /* 1013 * See if addr and offset are aligned such that we can map in 1014 * full pages instead of partial pages. 
1015 */ 1016 full_page = (((uintptr_t)addr & PAGEOFFSET) == 1017 ((uintptr_t)offset & PAGEOFFSET)); 1018 1019 model = get_udatamodel(); 1020 1021 oldaddr = addr; 1022 addr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); 1023 if (len) { 1024 spgcnt_t availm, npages; 1025 int preread; 1026 uint_t mflag = MAP_PRIVATE | MAP_FIXED; 1027 1028 if (model == DATAMODEL_ILP32) { 1029 mflag |= _MAP_LOW32; 1030 } 1031 /* We may need to map in extra bytes */ 1032 oldlen = len; 1033 len += ((size_t)oldaddr & PAGEOFFSET); 1034 1035 if (full_page) { 1036 offset = (off_t)((uintptr_t)offset & PAGEMASK); 1037 if ((prot & (PROT_WRITE | PROT_EXEC)) == PROT_EXEC) { 1038 mflag |= MAP_TEXT; 1039 MOBJ_STAT_ADD(map_ptload_text); 1040 } else { 1041 mflag |= MAP_INITDATA; 1042 MOBJ_STAT_ADD(map_ptload_initdata); 1043 } 1044 1045 /* 1046 * maxprot is passed as PROT_ALL so that mdb can 1047 * write to this segment. 1048 */ 1049 if ((error = VOP_MAP(vp, (offset_t)offset, as, &addr, 1050 len, prot, PROT_ALL, mflag, fcred, NULL)) != 0) { 1051 return (error); 1052 } 1053 1054 /* 1055 * If the segment can fit and is relatively small, then 1056 * we prefault the entire segment in. This is based 1057 * on the model that says the best working set of a 1058 * small program is all of its pages. 1059 * We only do this if freemem will not drop below 1060 * lotsfree since we don't want to induce paging. 1061 */ 1062 npages = (spgcnt_t)btopr(len); 1063 availm = freemem - lotsfree; 1064 preread = (npages < availm && len < PGTHRESH) ? 1 : 0; 1065 1066 /* 1067 * If we aren't prefaulting the segment, 1068 * increment "deficit", if necessary to ensure 1069 * that pages will become available when this 1070 * process starts executing. 1071 */ 1072 if (preread == 0 && npages > availm && 1073 deficit < lotsfree) { 1074 deficit += MIN((pgcnt_t)(npages - availm), 1075 lotsfree - deficit); 1076 } 1077 1078 if (preread) { 1079 (void) as_faulta(as, addr, len); 1080 MOBJ_STAT_ADD(map_ptload_preread); 1081 } 1082 } else { 1083 /* 1084 * addr and offset were not aligned such that we could 1085 * use VOP_MAP, thus we need to as_map the memory we 1086 * need and then read the data in from disk. 1087 * This code path is a corner case which should never 1088 * be taken, but hand crafted binaries could trigger 1089 * this logic and it needs to work correctly. 1090 */ 1091 MOBJ_STAT_ADD(map_ptload_unaligned_text); 1092 as_rangelock(as); 1093 (void) as_unmap(as, addr, len); 1094 1095 /* 1096 * We use zfod_argsp because we need to be able to 1097 * write to the mapping and then we'll change the 1098 * protections later if they are incorrect. 1099 */ 1100 error = as_map(as, addr, len, segvn_create, zfod_argsp); 1101 as_rangeunlock(as); 1102 if (error) { 1103 MOBJ_STAT_ADD(map_ptload_unaligned_map_fail); 1104 return (error); 1105 } 1106 1107 /* Now read in the data from disk */ 1108 error = vn_rdwr(UIO_READ, vp, oldaddr, oldlen, offset, 1109 UIO_USERSPACE, 0, (rlim64_t)0, fcred, NULL); 1110 if (error) { 1111 MOBJ_STAT_ADD(map_ptload_unaligned_read_fail); 1112 return (error); 1113 } 1114 1115 /* 1116 * Now set protections. 1117 */ 1118 if (prot != PROT_ZFOD) { 1119 (void) as_setprot(as, addr, len, prot); 1120 } 1121 } 1122 } 1123 1124 if (zfodlen) { 1125 end = (size_t)addr + len; 1126 zfodbase = (caddr_t)P2ROUNDUP(end, PAGESIZE); 1127 zfoddiff = (uintptr_t)zfodbase - end; 1128 if (zfoddiff) { 1129 /* 1130 * Before we go to zero the remaining space on the last 1131 * page, make sure we have write permission. 
1132 * 1133 * We need to be careful how we zero-fill the last page 1134 * if the protection does not include PROT_WRITE. Using 1135 * as_setprot() can cause the VM segment code to call 1136 * segvn_vpage(), which must allocate a page struct for 1137 * each page in the segment. If we have a very large 1138 * segment, this may fail, so we check for that, even 1139 * though we ignore other return values from as_setprot. 1140 */ 1141 MOBJ_STAT_ADD(zfoddiff); 1142 if ((prot & PROT_WRITE) == 0) { 1143 if (as_setprot(as, (caddr_t)end, zfoddiff, 1144 prot | PROT_WRITE) == ENOMEM) 1145 return (ENOMEM); 1146 MOBJ_STAT_ADD(zfoddiff_nowrite); 1147 } 1148 if (on_fault(&ljb)) { 1149 no_fault(); 1150 if ((prot & PROT_WRITE) == 0) { 1151 (void) as_setprot(as, (caddr_t)end, 1152 zfoddiff, prot); 1153 } 1154 return (EFAULT); 1155 } 1156 uzero((void *)end, zfoddiff); 1157 no_fault(); 1158 1159 /* 1160 * Remove write protection to return to original state 1161 */ 1162 if ((prot & PROT_WRITE) == 0) { 1163 (void) as_setprot(as, (caddr_t)end, 1164 zfoddiff, prot); 1165 } 1166 } 1167 if (zfodlen > zfoddiff) { 1168 struct segvn_crargs crargs = 1169 SEGVN_ZFOD_ARGS(prot, PROT_ALL); 1170 1171 MOBJ_STAT_ADD(zfodextra); 1172 zfodlen -= zfoddiff; 1173 crargs.szc = AS_MAP_NO_LPOOB; 1174 1175 1176 as_rangelock(as); 1177 (void) as_unmap(as, (caddr_t)zfodbase, zfodlen); 1178 error = as_map(as, (caddr_t)zfodbase, 1179 zfodlen, segvn_create, &crargs); 1180 as_rangeunlock(as); 1181 if (error) { 1182 return (error); 1183 } 1184 } 1185 } 1186 return (0); 1187 } 1188 1189 /* 1190 * Map the ELF file represented by vp into the users address space. The 1191 * first mapping will start at start_addr and there will be num_elements 1192 * mappings. The mappings are described by the data in mrp which may be 1193 * modified upon returning from this function. 1194 * Returns 0 for success or errno for failure. 1195 */ 1196 static int 1197 mmapobj_map_elf(struct vnode *vp, caddr_t start_addr, mmapobj_result_t *mrp, 1198 int num_elements, cred_t *fcred, ushort_t e_type) 1199 { 1200 int i; 1201 int ret; 1202 caddr_t lo; 1203 caddr_t hi; 1204 struct as *as = curproc->p_as; 1205 1206 for (i = 0; i < num_elements; i++) { 1207 caddr_t addr; 1208 size_t p_memsz; 1209 size_t p_filesz; 1210 size_t zfodlen; 1211 offset_t p_offset; 1212 size_t dif; 1213 int prot; 1214 1215 /* Always need to adjust mr_addr */ 1216 addr = start_addr + (size_t)(mrp[i].mr_addr); 1217 mrp[i].mr_addr = 1218 (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); 1219 1220 /* Padding has already been mapped */ 1221 if (MR_GET_TYPE(mrp[i].mr_flags) == MR_PADDING) { 1222 continue; 1223 } 1224 1225 /* Can't execute code from "noexec" mounted filesystem. 
*/ 1226 if (((vp->v_vfsp->vfs_flag & VFS_NOEXEC) != 0) && 1227 ((mrp[i].mr_prot & PROT_EXEC) != 0)) { 1228 MOBJ_STAT_ADD(noexec_fs); 1229 return (EACCES); 1230 } 1231 1232 p_memsz = mrp[i].mr_msize; 1233 p_filesz = mrp[i].mr_fsize; 1234 zfodlen = p_memsz - p_filesz; 1235 p_offset = mrp[i].mr_offset; 1236 dif = (uintptr_t)(addr) & PAGEOFFSET; 1237 prot = mrp[i].mr_prot | PROT_USER; 1238 ret = mmapobj_map_ptload(vp, addr, p_filesz, zfodlen, 1239 p_offset, prot, fcred); 1240 if (ret != 0) { 1241 MOBJ_STAT_ADD(ptload_failed); 1242 mmapobj_unmap(mrp, i, num_elements, e_type); 1243 return (ret); 1244 } 1245 1246 /* Need to cleanup mrp to reflect the actual values used */ 1247 mrp[i].mr_msize += dif; 1248 mrp[i].mr_offset = (size_t)addr & PAGEOFFSET; 1249 } 1250 1251 /* Also need to unmap any holes created above */ 1252 if (num_elements == 1) { 1253 MOBJ_STAT_ADD(map_elf_no_holes); 1254 return (0); 1255 } 1256 if (e_type == ET_EXEC) { 1257 return (0); 1258 } 1259 1260 as_rangelock(as); 1261 lo = start_addr; 1262 hi = mrp[0].mr_addr; 1263 1264 /* Remove holes made by the rest of the segments */ 1265 for (i = 0; i < num_elements - 1; i++) { 1266 lo = (caddr_t)P2ROUNDUP((size_t)(mrp[i].mr_addr) + 1267 mrp[i].mr_msize, PAGESIZE); 1268 hi = mrp[i + 1].mr_addr; 1269 if (lo < hi) { 1270 /* 1271 * If as_unmap fails we just use up a bit of extra 1272 * space 1273 */ 1274 (void) as_unmap(as, (caddr_t)lo, 1275 (size_t)hi - (size_t)lo); 1276 MOBJ_STAT_ADD(unmap_hole); 1277 } 1278 } 1279 as_rangeunlock(as); 1280 1281 return (0); 1282 } 1283 1284 /* Ugly hack to get STRUCT_* macros to work below */ 1285 struct myphdr { 1286 Phdr x; /* native version */ 1287 }; 1288 1289 struct myphdr32 { 1290 Elf32_Phdr x; 1291 }; 1292 1293 /* 1294 * Calculate and return the number of loadable segments in the ELF Phdr 1295 * represented by phdrbase as well as the len of the total mapping and 1296 * the max alignment that is needed for a given segment. On success, 1297 * 0 is returned, and *len, *loadable and *align have been filled out. 1298 * On failure, errno will be returned, which in this case is ENOTSUP 1299 * if we were passed an ELF file with overlapping segments. 1300 */ 1301 static int 1302 calc_loadable(Ehdr *ehdrp, caddr_t phdrbase, int nphdrs, size_t *len, 1303 int *loadable, size_t *align) 1304 { 1305 int i; 1306 int hsize; 1307 model_t model; 1308 ushort_t e_type = ehdrp->e_type; /* same offset 32 and 64 bit */ 1309 uint_t p_type; 1310 offset_t p_offset; 1311 size_t p_memsz; 1312 size_t p_align; 1313 caddr_t vaddr; 1314 int num_segs = 0; 1315 caddr_t start_addr = NULL; 1316 caddr_t p_end = NULL; 1317 size_t max_align = 0; 1318 size_t min_align = PAGESIZE; /* needed for vmem_xalloc */ 1319 STRUCT_HANDLE(myphdr, mph); 1320 #if defined(__sparc) 1321 extern int vac_size; 1322 1323 /* 1324 * Want to prevent aliasing by making the start address at least be 1325 * aligned to vac_size. 1326 */ 1327 min_align = MAX(PAGESIZE, vac_size); 1328 #endif 1329 1330 model = get_udatamodel(); 1331 STRUCT_SET_HANDLE(mph, model, (struct myphdr *)phdrbase); 1332 1333 /* hsize alignment should have been checked before calling this func */ 1334 if (model == DATAMODEL_LP64) { 1335 hsize = ehdrp->e_phentsize; 1336 if (hsize & 7) { 1337 return (ENOTSUP); 1338 } 1339 } else { 1340 ASSERT(model == DATAMODEL_ILP32); 1341 hsize = ((Elf32_Ehdr *)ehdrp)->e_phentsize; 1342 if (hsize & 3) { 1343 return (ENOTSUP); 1344 } 1345 } 1346 1347 /* 1348 * Determine the span of all loadable segments and calculate the 1349 * number of loadable segments. 
1350 */ 1351 for (i = 0; i < nphdrs; i++) { 1352 p_type = STRUCT_FGET(mph, x.p_type); 1353 if (p_type == PT_LOAD || p_type == PT_SUNWBSS) { 1354 vaddr = (caddr_t)(uintptr_t)STRUCT_FGET(mph, x.p_vaddr); 1355 p_memsz = STRUCT_FGET(mph, x.p_memsz); 1356 1357 /* 1358 * Skip this header if it requests no memory to be 1359 * mapped. 1360 */ 1361 if (p_memsz == 0) { 1362 STRUCT_SET_HANDLE(mph, model, 1363 (struct myphdr *)((size_t)STRUCT_BUF(mph) + 1364 hsize)); 1365 MOBJ_STAT_ADD(nomem_header); 1366 continue; 1367 } 1368 if (num_segs++ == 0) { 1369 /* 1370 * The p_vaddr of the first PT_LOAD segment 1371 * must either be NULL or within the first 1372 * page in order to be interpreted. 1373 * Otherwise, its an invalid file. 1374 */ 1375 if (e_type == ET_DYN && 1376 ((caddr_t)((uintptr_t)vaddr & 1377 (uintptr_t)PAGEMASK) != NULL)) { 1378 MOBJ_STAT_ADD(inval_header); 1379 return (ENOTSUP); 1380 } 1381 start_addr = vaddr; 1382 /* 1383 * For the first segment, we need to map from 1384 * the beginning of the file, so we will 1385 * adjust the size of the mapping to include 1386 * this memory. 1387 */ 1388 p_offset = STRUCT_FGET(mph, x.p_offset); 1389 } else { 1390 p_offset = 0; 1391 } 1392 /* 1393 * Check to make sure that this mapping wouldn't 1394 * overlap a previous mapping. 1395 */ 1396 if (vaddr < p_end) { 1397 MOBJ_STAT_ADD(overlap_header); 1398 return (ENOTSUP); 1399 } 1400 1401 p_end = vaddr + p_memsz + p_offset; 1402 p_end = (caddr_t)P2ROUNDUP((size_t)p_end, PAGESIZE); 1403 1404 p_align = STRUCT_FGET(mph, x.p_align); 1405 if (p_align > 1 && p_align > max_align) { 1406 max_align = p_align; 1407 if (max_align < min_align) { 1408 max_align = min_align; 1409 MOBJ_STAT_ADD(min_align); 1410 } 1411 } 1412 } 1413 STRUCT_SET_HANDLE(mph, model, 1414 (struct myphdr *)((size_t)STRUCT_BUF(mph) + hsize)); 1415 } 1416 1417 /* 1418 * The alignment should be a power of 2, if it isn't we forgive it 1419 * and round up. On overflow, we'll set the alignment to max_align 1420 * rounded down to the nearest power of 2. 1421 */ 1422 if (max_align > 0 && !ISP2(max_align)) { 1423 MOBJ_STAT_ADD(np2_align); 1424 *align = 2 * (1L << (highbit(max_align) - 1)); 1425 if (*align < max_align || 1426 (*align > UINT_MAX && model == DATAMODEL_ILP32)) { 1427 MOBJ_STAT_ADD(np2_align_overflow); 1428 *align = 1L << (highbit(max_align) - 1); 1429 } 1430 } else { 1431 *align = max_align; 1432 } 1433 1434 ASSERT(*align >= PAGESIZE || *align == 0); 1435 1436 *loadable = num_segs; 1437 *len = p_end - start_addr; 1438 return (0); 1439 } 1440 1441 /* 1442 * Check the address space to see if the virtual addresses to be used are 1443 * available. If they are not, return errno for failure. On success, 0 1444 * will be returned, and the virtual addresses for each mmapobj_result_t 1445 * will be reserved. Note that a reservation could have earlier been made 1446 * for a given segment via a /dev/null mapping. If that is the case, then 1447 * we can use that VA space for our mappings. 1448 * Note: this function will only be used for ET_EXEC binaries. 
1449 */ 1450 int 1451 check_exec_addrs(int loadable, mmapobj_result_t *mrp, caddr_t start_addr) 1452 { 1453 int i; 1454 struct as *as = curproc->p_as; 1455 struct segvn_crargs crargs = SEGVN_ZFOD_ARGS(PROT_ZFOD, PROT_ALL); 1456 int ret; 1457 caddr_t myaddr; 1458 size_t mylen; 1459 struct seg *seg; 1460 1461 /* No need to reserve swap space now since it will be reserved later */ 1462 crargs.flags |= MAP_NORESERVE; 1463 as_rangelock(as); 1464 for (i = 0; i < loadable; i++) { 1465 1466 myaddr = start_addr + (size_t)mrp[i].mr_addr; 1467 mylen = mrp[i].mr_msize; 1468 1469 /* See if there is a hole in the as for this range */ 1470 if (as_gap(as, mylen, &myaddr, &mylen, 0, NULL) == 0) { 1471 ASSERT(myaddr == start_addr + (size_t)mrp[i].mr_addr); 1472 ASSERT(mylen == mrp[i].mr_msize); 1473 1474 #ifdef DEBUG 1475 if (MR_GET_TYPE(mrp[i].mr_flags) == MR_PADDING) { 1476 MOBJ_STAT_ADD(exec_padding); 1477 } 1478 #endif 1479 ret = as_map(as, myaddr, mylen, segvn_create, &crargs); 1480 if (ret) { 1481 as_rangeunlock(as); 1482 mmapobj_unmap_exec(mrp, i, start_addr); 1483 return (ret); 1484 } 1485 } else { 1486 /* 1487 * There is a mapping that exists in the range 1488 * so check to see if it was a "reservation" 1489 * from /dev/null. The mapping is from 1490 * /dev/null if the mapping comes from 1491 * segdev and the type is neither MAP_SHARED 1492 * nor MAP_PRIVATE. 1493 */ 1494 AS_LOCK_ENTER(as, RW_READER); 1495 seg = as_findseg(as, myaddr, 0); 1496 MOBJ_STAT_ADD(exec_addr_mapped); 1497 if (seg && seg->s_ops == &segdev_ops && 1498 ((SEGOP_GETTYPE(seg, myaddr) & 1499 (MAP_SHARED | MAP_PRIVATE)) == 0) && 1500 myaddr >= seg->s_base && 1501 myaddr + mylen <= 1502 seg->s_base + seg->s_size) { 1503 MOBJ_STAT_ADD(exec_addr_devnull); 1504 AS_LOCK_EXIT(as); 1505 (void) as_unmap(as, myaddr, mylen); 1506 ret = as_map(as, myaddr, mylen, segvn_create, 1507 &crargs); 1508 mrp[i].mr_flags |= MR_RESV; 1509 if (ret) { 1510 as_rangeunlock(as); 1511 /* Need to remap what we unmapped */ 1512 mmapobj_unmap_exec(mrp, i + 1, 1513 start_addr); 1514 return (ret); 1515 } 1516 } else { 1517 AS_LOCK_EXIT(as); 1518 as_rangeunlock(as); 1519 mmapobj_unmap_exec(mrp, i, start_addr); 1520 MOBJ_STAT_ADD(exec_addr_in_use); 1521 return (EADDRINUSE); 1522 } 1523 } 1524 } 1525 as_rangeunlock(as); 1526 return (0); 1527 } 1528 1529 /* 1530 * Walk through the ELF program headers and extract all useful information 1531 * for PT_LOAD and PT_SUNWBSS segments into mrp. 1532 * Return 0 on success or error on failure. 1533 */ 1534 static int 1535 process_phdrs(Ehdr *ehdrp, caddr_t phdrbase, int nphdrs, mmapobj_result_t *mrp, 1536 vnode_t *vp, uint_t *num_mapped, size_t padding, cred_t *fcred) 1537 { 1538 int i; 1539 caddr_t start_addr = NULL; 1540 caddr_t vaddr; 1541 size_t len = 0; 1542 size_t lib_len = 0; 1543 int ret; 1544 int prot; 1545 struct lib_va *lvp = NULL; 1546 vattr_t vattr; 1547 struct as *as = curproc->p_as; 1548 int error; 1549 int loadable = 0; 1550 int current = 0; 1551 int use_lib_va = 1; 1552 size_t align = 0; 1553 size_t add_pad = 0; 1554 int hdr_seen = 0; 1555 ushort_t e_type = ehdrp->e_type; /* same offset 32 and 64 bit */ 1556 uint_t p_type; 1557 offset_t p_offset; 1558 size_t p_memsz; 1559 size_t p_filesz; 1560 uint_t p_flags; 1561 int hsize; 1562 model_t model; 1563 STRUCT_HANDLE(myphdr, mph); 1564 1565 model = get_udatamodel(); 1566 STRUCT_SET_HANDLE(mph, model, (struct myphdr *)phdrbase); 1567 1568 /* 1569 * Need to make sure that hsize is aligned properly. 1570 * For 32bit processes, 4 byte alignment is required. 
1571 * For 64bit processes, 8 byte alignment is required. 1572 * If the alignment isn't correct, we need to return failure 1573 * since it could cause an alignment error panic while walking 1574 * the phdr array. 1575 */ 1576 if (model == DATAMODEL_LP64) { 1577 hsize = ehdrp->e_phentsize; 1578 if (hsize & 7) { 1579 MOBJ_STAT_ADD(phent_align64); 1580 return (ENOTSUP); 1581 } 1582 } else { 1583 ASSERT(model == DATAMODEL_ILP32); 1584 hsize = ((Elf32_Ehdr *)ehdrp)->e_phentsize; 1585 if (hsize & 3) { 1586 MOBJ_STAT_ADD(phent_align32); 1587 return (ENOTSUP); 1588 } 1589 } 1590 1591 if ((padding != 0) || secflag_enabled(curproc, PROC_SEC_ASLR)) { 1592 use_lib_va = 0; 1593 } 1594 if (e_type == ET_DYN) { 1595 vattr.va_mask = AT_FSID | AT_NODEID | AT_CTIME | AT_MTIME; 1596 error = VOP_GETATTR(vp, &vattr, 0, fcred, NULL); 1597 if (error) { 1598 return (error); 1599 } 1600 /* Check to see if we already have a description for this lib */ 1601 if (!secflag_enabled(curproc, PROC_SEC_ASLR)) 1602 lvp = lib_va_find(&vattr); 1603 1604 if (lvp != NULL) { 1605 MOBJ_STAT_ADD(lvp_found); 1606 if (use_lib_va) { 1607 start_addr = mmapobj_lookup_start_addr(lvp); 1608 if (start_addr == NULL) { 1609 lib_va_release(lvp); 1610 return (ENOMEM); 1611 } 1612 } 1613 1614 /* 1615 * loadable may be zero if the original allocator 1616 * of lvp hasn't finished setting it up but the rest 1617 * of the fields will be accurate. 1618 */ 1619 loadable = lvp->lv_num_segs; 1620 len = lvp->lv_len; 1621 align = lvp->lv_align; 1622 } 1623 } 1624 1625 /* 1626 * Determine the span of all loadable segments and calculate the 1627 * number of loadable segments, the total len spanned by the mappings 1628 * and the max alignment, if we didn't get them above. 1629 */ 1630 if (loadable == 0) { 1631 MOBJ_STAT_ADD(no_loadable_yet); 1632 ret = calc_loadable(ehdrp, phdrbase, nphdrs, &len, 1633 &loadable, &align); 1634 if (ret != 0) { 1635 /* 1636 * Since it'd be an invalid file, we shouldn't have 1637 * cached it previously. 1638 */ 1639 ASSERT(lvp == NULL); 1640 return (ret); 1641 } 1642 #ifdef DEBUG 1643 if (lvp) { 1644 ASSERT(len == lvp->lv_len); 1645 ASSERT(align == lvp->lv_align); 1646 } 1647 #endif 1648 } 1649 1650 /* Make sure there's something to map. */ 1651 if (len == 0 || loadable == 0) { 1652 /* 1653 * Since it'd be an invalid file, we shouldn't have 1654 * cached it previously. 1655 */ 1656 ASSERT(lvp == NULL); 1657 MOBJ_STAT_ADD(nothing_to_map); 1658 return (ENOTSUP); 1659 } 1660 1661 lib_len = len; 1662 if (padding != 0) { 1663 loadable += 2; 1664 } 1665 if (loadable > *num_mapped) { 1666 *num_mapped = loadable; 1667 /* cleanup previous reservation */ 1668 if (start_addr) { 1669 (void) as_unmap(as, start_addr, lib_len); 1670 } 1671 MOBJ_STAT_ADD(e2big); 1672 if (lvp) { 1673 lib_va_release(lvp); 1674 } 1675 return (E2BIG); 1676 } 1677 1678 /* 1679 * We now know the size of the object to map and now we need to 1680 * get the start address to map it at. It's possible we already 1681 * have it if we found all the info we need in the lib_va cache. 1682 */ 1683 if (e_type == ET_DYN && start_addr == NULL) { 1684 /* 1685 * Need to make sure padding does not throw off 1686 * required alignment. We can only specify an 1687 * alignment for the starting address to be mapped, 1688 * so we round padding up to the alignment and map 1689 * from there and then throw out the extra later. 
1690 */ 1691 if (padding != 0) { 1692 if (align > 1) { 1693 add_pad = P2ROUNDUP(padding, align); 1694 len += add_pad; 1695 MOBJ_STAT_ADD(dyn_pad_align); 1696 } else { 1697 MOBJ_STAT_ADD(dyn_pad_noalign); 1698 len += padding; /* at beginning */ 1699 } 1700 len += padding; /* at end of mapping */ 1701 } 1702 /* 1703 * At this point, if lvp is non-NULL, then above we 1704 * already found it in the cache but did not get 1705 * the start address since we were not going to use lib_va. 1706 * Since we know that lib_va will not be used, it's safe 1707 * to call mmapobj_alloc_start_addr and know that lvp 1708 * will not be modified. 1709 */ 1710 ASSERT(lvp ? use_lib_va == 0 : 1); 1711 start_addr = mmapobj_alloc_start_addr(&lvp, len, 1712 use_lib_va, 1713 secflag_enabled(curproc, PROC_SEC_ASLR), 1714 align, &vattr); 1715 if (start_addr == NULL) { 1716 if (lvp) { 1717 lib_va_release(lvp); 1718 } 1719 MOBJ_STAT_ADD(alloc_start_fail); 1720 return (ENOMEM); 1721 } 1722 /* 1723 * If we can't cache it, no need to hang on to it. 1724 * Setting lv_num_segs to non-zero will make that 1725 * field active and since there are too many segments 1726 * to cache, all future users will not try to use lv_mps. 1727 */ 1728 if (lvp != NULL && loadable > LIBVA_CACHED_SEGS && use_lib_va) { 1729 lvp->lv_num_segs = loadable; 1730 lib_va_release(lvp); 1731 lvp = NULL; 1732 MOBJ_STAT_ADD(lvp_nocache); 1733 } 1734 /* 1735 * Free the beginning of the mapping if the padding 1736 * was not aligned correctly. 1737 */ 1738 if (padding != 0 && add_pad != padding) { 1739 (void) as_unmap(as, start_addr, 1740 add_pad - padding); 1741 start_addr += (add_pad - padding); 1742 MOBJ_STAT_ADD(extra_padding); 1743 } 1744 } 1745 1746 /* 1747 * At this point, we have reserved the virtual address space 1748 * for our mappings. Now we need to start filling out the mrp 1749 * array to describe all of the individual mappings we are going 1750 * to return. 1751 * For ET_EXEC there has been no memory reservation since we are 1752 * using fixed addresses. While filling in the mrp array below, 1753 * we will have the first segment biased to start at addr 0 1754 * and the rest will be biased by this same amount. Thus if there 1755 * is padding, the first padding will start at addr 0, and the next 1756 * segment will start at the value of padding. 1757 */ 1758 1759 /* We'll fill out padding later, so start filling in mrp at index 1 */ 1760 if (padding != 0) { 1761 current = 1; 1762 } 1763 1764 /* If we have no more need for lvp let it go now */ 1765 if (lvp != NULL && use_lib_va == 0) { 1766 lib_va_release(lvp); 1767 MOBJ_STAT_ADD(lvp_not_needed); 1768 lvp = NULL; 1769 } 1770 1771 /* Now fill out the mrp structs from the program headers */ 1772 STRUCT_SET_HANDLE(mph, model, (struct myphdr *)phdrbase); 1773 for (i = 0; i < nphdrs; i++) { 1774 p_type = STRUCT_FGET(mph, x.p_type); 1775 if (p_type == PT_LOAD || p_type == PT_SUNWBSS) { 1776 vaddr = (caddr_t)(uintptr_t)STRUCT_FGET(mph, x.p_vaddr); 1777 p_memsz = STRUCT_FGET(mph, x.p_memsz); 1778 p_filesz = STRUCT_FGET(mph, x.p_filesz); 1779 p_offset = STRUCT_FGET(mph, x.p_offset); 1780 p_flags = STRUCT_FGET(mph, x.p_flags); 1781 1782 /* 1783 * Skip this header if it requests no memory to be 1784 * mapped. 
1785 */ 1786 if (p_memsz == 0) { 1787 STRUCT_SET_HANDLE(mph, model, 1788 (struct myphdr *)((size_t)STRUCT_BUF(mph) + 1789 hsize)); 1790 MOBJ_STAT_ADD(no_mem_map_sz); 1791 continue; 1792 } 1793 1794 prot = 0; 1795 if (p_flags & PF_R) 1796 prot |= PROT_READ; 1797 if (p_flags & PF_W) 1798 prot |= PROT_WRITE; 1799 if (p_flags & PF_X) 1800 prot |= PROT_EXEC; 1801 1802 ASSERT(current < loadable); 1803 mrp[current].mr_msize = p_memsz; 1804 mrp[current].mr_fsize = p_filesz; 1805 mrp[current].mr_offset = p_offset; 1806 mrp[current].mr_prot = prot; 1807 1808 if (hdr_seen == 0 && p_filesz != 0) { 1809 mrp[current].mr_flags = MR_HDR_ELF; 1810 /* 1811 * We modify mr_offset because we 1812 * need to map the ELF header as well, and if 1813 * we didn't then the header could be left out 1814 * of the mapping that we will create later. 1815 * Since we're removing the offset, we need to 1816 * account for that in the other fields as well 1817 * since we will be mapping the memory from 0 1818 * to p_offset. 1819 */ 1820 if (e_type == ET_DYN) { 1821 mrp[current].mr_offset = 0; 1822 mrp[current].mr_msize += p_offset; 1823 mrp[current].mr_fsize += p_offset; 1824 } else { 1825 ASSERT(e_type == ET_EXEC); 1826 /* 1827 * Save off the start addr which will be 1828 * our bias for the rest of the 1829 * ET_EXEC mappings. 1830 */ 1831 start_addr = vaddr - padding; 1832 } 1833 mrp[current].mr_addr = (caddr_t)padding; 1834 hdr_seen = 1; 1835 } else { 1836 if (e_type == ET_EXEC) { 1837 /* bias mr_addr */ 1838 mrp[current].mr_addr = 1839 vaddr - (size_t)start_addr; 1840 } else { 1841 mrp[current].mr_addr = vaddr + padding; 1842 } 1843 mrp[current].mr_flags = 0; 1844 } 1845 current++; 1846 } 1847 1848 /* Move to next phdr */ 1849 STRUCT_SET_HANDLE(mph, model, 1850 (struct myphdr *)((size_t)STRUCT_BUF(mph) + 1851 hsize)); 1852 } 1853 1854 /* Now fill out the padding segments */ 1855 if (padding != 0) { 1856 mrp[0].mr_addr = NULL; 1857 mrp[0].mr_msize = padding; 1858 mrp[0].mr_fsize = 0; 1859 mrp[0].mr_offset = 0; 1860 mrp[0].mr_prot = 0; 1861 mrp[0].mr_flags = MR_PADDING; 1862 1863 /* Setup padding for the last segment */ 1864 ASSERT(current == loadable - 1); 1865 mrp[current].mr_addr = (caddr_t)lib_len + padding; 1866 mrp[current].mr_msize = padding; 1867 mrp[current].mr_fsize = 0; 1868 mrp[current].mr_offset = 0; 1869 mrp[current].mr_prot = 0; 1870 mrp[current].mr_flags = MR_PADDING; 1871 } 1872 1873 /* 1874 * Need to make sure address ranges desired are not in use or 1875 * are previously allocated reservations from /dev/null. For 1876 * ET_DYN, we already made sure our address range was free. 1877 */ 1878 if (e_type == ET_EXEC) { 1879 ret = check_exec_addrs(loadable, mrp, start_addr); 1880 if (ret != 0) { 1881 ASSERT(lvp == NULL); 1882 MOBJ_STAT_ADD(check_exec_failed); 1883 return (ret); 1884 } 1885 } 1886 1887 /* Finish up our business with lvp. */ 1888 if (lvp) { 1889 ASSERT(e_type == ET_DYN); 1890 if (lvp->lv_num_segs == 0 && loadable <= LIBVA_CACHED_SEGS) { 1891 bcopy(mrp, lvp->lv_mps, 1892 loadable * sizeof (mmapobj_result_t)); 1893 membar_producer(); 1894 } 1895 /* 1896 * Setting lv_num_segs to a non-zero value indicates that 1897 * lv_mps is now valid and can be used by other threads. 1898 * So, the above stores need to finish before lv_num_segs 1899 * is updated. lv_mps is only valid if lv_num_segs is 1900 * greater than LIBVA_CACHED_SEGS. 
		 */
		lvp->lv_num_segs = loadable;
		lib_va_release(lvp);
		MOBJ_STAT_ADD(lvp_used);
	}

	/* Now that we have mrp completely filled out go map it */
	ret = mmapobj_map_elf(vp, start_addr, mrp, loadable, fcred, e_type);
	if (ret == 0) {
		*num_mapped = loadable;
	}

	return (ret);
}

/*
 * Take the ELF file passed in, and do the work of mapping it.
 * num_mapped in - # elements in user buffer
 * num_mapped out - # sections mapped and length of mrp array if
 *			no errors.
 */
static int
doelfwork(Ehdr *ehdrp, vnode_t *vp, mmapobj_result_t *mrp,
    uint_t *num_mapped, size_t padding, cred_t *fcred)
{
	int error;
	offset_t phoff;
	int nphdrs;
	unsigned char ei_class;
	unsigned short phentsize;
	ssize_t phsizep;
	caddr_t phbasep;
	int to_map;
	model_t model;

	ei_class = ehdrp->e_ident[EI_CLASS];
	model = get_udatamodel();
	if ((model == DATAMODEL_ILP32 && ei_class == ELFCLASS64) ||
	    (model == DATAMODEL_LP64 && ei_class == ELFCLASS32)) {
		MOBJ_STAT_ADD(wrong_model);
		return (ENOTSUP);
	}

	/* Can't execute code from "noexec" mounted filesystem. */
	if (ehdrp->e_type == ET_EXEC &&
	    (vp->v_vfsp->vfs_flag & VFS_NOEXEC) != 0) {
		MOBJ_STAT_ADD(noexec_fs);
		return (EACCES);
	}

	/*
	 * Relocatable and core files are mapped as a single flat file
	 * since no interpretation is done on them by mmapobj.
	 */
	if (ehdrp->e_type == ET_REL || ehdrp->e_type == ET_CORE) {
		to_map = padding ? 3 : 1;
		if (*num_mapped < to_map) {
			*num_mapped = to_map;
			MOBJ_STAT_ADD(e2big_et_rel);
			return (E2BIG);
		}
		error = mmapobj_map_flat(vp, mrp, padding, fcred);
		if (error == 0) {
			*num_mapped = to_map;
			mrp[padding ? 1 : 0].mr_flags = MR_HDR_ELF;
			MOBJ_STAT_ADD(et_rel_mapped);
		}
		return (error);
	}

	/* Check for an unknown ELF type */
	if (ehdrp->e_type != ET_EXEC && ehdrp->e_type != ET_DYN) {
		MOBJ_STAT_ADD(unknown_elf_type);
		return (ENOTSUP);
	}

	if (ei_class == ELFCLASS32) {
		Elf32_Ehdr *e32hdr = (Elf32_Ehdr *)ehdrp;
		ASSERT(model == DATAMODEL_ILP32);
		nphdrs = e32hdr->e_phnum;
		phentsize = e32hdr->e_phentsize;
		if (phentsize < sizeof (Elf32_Phdr)) {
			MOBJ_STAT_ADD(phent32_too_small);
			return (ENOTSUP);
		}
		phoff = e32hdr->e_phoff;
	} else if (ei_class == ELFCLASS64) {
		Elf64_Ehdr *e64hdr = (Elf64_Ehdr *)ehdrp;
		ASSERT(model == DATAMODEL_LP64);
		nphdrs = e64hdr->e_phnum;
		phentsize = e64hdr->e_phentsize;
		if (phentsize < sizeof (Elf64_Phdr)) {
			MOBJ_STAT_ADD(phent64_too_small);
			return (ENOTSUP);
		}
		phoff = e64hdr->e_phoff;
	} else {
		/* fallthrough case for an invalid ELF class */
		MOBJ_STAT_ADD(inval_elf_class);
		return (ENOTSUP);
	}

	/*
	 * nphdrs should only have this value for core files which are handled
	 * above as a single mapping.  If other file types ever use this
	 * sentinel, then we'll add the support needed to handle this here.
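	 * (PN_XNUM is the ELF sentinel value 0xffff; when e_phnum holds it,
	 * the real program header count is stored in the sh_info field of
	 * section header 0, which mmapobj does not read.)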
	 */
	if (nphdrs == PN_XNUM) {
		MOBJ_STAT_ADD(too_many_phdrs);
		return (ENOTSUP);
	}

	phsizep = nphdrs * phentsize;

	if (phsizep == 0) {
		MOBJ_STAT_ADD(no_phsize);
		return (ENOTSUP);
	}

	/* Make sure we only wait for memory if it's a reasonable request */
	if (phsizep > mmapobj_alloc_threshold) {
		MOBJ_STAT_ADD(phsize_large);
		if ((phbasep = kmem_alloc(phsizep, KM_NOSLEEP)) == NULL) {
			MOBJ_STAT_ADD(phsize_xtralarge);
			return (ENOMEM);
		}
	} else {
		phbasep = kmem_alloc(phsizep, KM_SLEEP);
	}

	if ((error = vn_rdwr(UIO_READ, vp, phbasep, phsizep,
	    (offset_t)phoff, UIO_SYSSPACE, 0, (rlim64_t)0,
	    fcred, NULL)) != 0) {
		kmem_free(phbasep, phsizep);
		return (error);
	}

	/* Now process the phdr's */
	error = process_phdrs(ehdrp, phbasep, nphdrs, mrp, vp, num_mapped,
	    padding, fcred);
	kmem_free(phbasep, phsizep);
	return (error);
}

/*
 * These are the two types of files that we can interpret and we want to read
 * in enough info to cover both types when looking at the initial header.
 */
#define	MAX_HEADER_SIZE	(MAX(sizeof (Ehdr), sizeof (struct exec)))

/*
 * Map vp passed in in an interpreted manner.  ELF and AOUT files will be
 * interpreted and mapped appropriately for execution.
 * num_mapped in - # elements in mrp
 * num_mapped out - # sections mapped and length of mrp array if
 *			no errors or E2BIG returned.
 *
 * Returns 0 on success, errno value on failure.
 */
static int
mmapobj_map_interpret(vnode_t *vp, mmapobj_result_t *mrp,
    uint_t *num_mapped, size_t padding, cred_t *fcred)
{
	int error = 0;
	vattr_t vattr;
	struct lib_va *lvp;
	caddr_t start_addr;
	model_t model;

	/*
	 * header has to be aligned to the native size of ulong_t in order
	 * to avoid an unaligned access when dereferencing the header as
	 * a ulong_t.  Thus we allocate our array on the stack of type
	 * ulong_t and then have header, which we dereference later as a
	 * char array, point at lheader.
	 */
	ulong_t lheader[(MAX_HEADER_SIZE / (sizeof (ulong_t))) + 1];
	caddr_t header = (caddr_t)&lheader;

	vattr.va_mask = AT_FSID | AT_NODEID | AT_CTIME | AT_MTIME | AT_SIZE;
	error = VOP_GETATTR(vp, &vattr, 0, fcred, NULL);
	if (error) {
		return (error);
	}

	/*
	 * Check lib_va to see if we already have a full description
	 * for this library.  This is the fast path and only used for
	 * ET_DYN ELF files (dynamic libraries).
	 */
	if (padding == 0 && !secflag_enabled(curproc, PROC_SEC_ASLR) &&
	    ((lvp = lib_va_find(&vattr)) != NULL)) {
		int num_segs;

		model = get_udatamodel();
		if ((model == DATAMODEL_ILP32 &&
		    lvp->lv_flags & LV_ELF64) ||
		    (model == DATAMODEL_LP64 &&
		    lvp->lv_flags & LV_ELF32)) {
			lib_va_release(lvp);
			MOBJ_STAT_ADD(fast_wrong_model);
			return (ENOTSUP);
		}
		num_segs = lvp->lv_num_segs;
		if (*num_mapped < num_segs) {
			*num_mapped = num_segs;
			lib_va_release(lvp);
			MOBJ_STAT_ADD(fast_e2big);
			return (E2BIG);
		}

		/*
		 * Check to see if we have all the mappable program headers
		 * cached.
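		 * (If lv_num_segs is greater than LIBVA_CACHED_SEGS, lv_mps
		 * was never filled in, so we fall through to the full
		 * interpretation path below.)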
		 */
		if (num_segs <= LIBVA_CACHED_SEGS && num_segs != 0) {
			MOBJ_STAT_ADD(fast);
			start_addr = mmapobj_lookup_start_addr(lvp);
			if (start_addr == NULL) {
				lib_va_release(lvp);
				return (ENOMEM);
			}

			bcopy(lvp->lv_mps, mrp,
			    num_segs * sizeof (mmapobj_result_t));

			error = mmapobj_map_elf(vp, start_addr, mrp,
			    num_segs, fcred, ET_DYN);

			lib_va_release(lvp);
			if (error == 0) {
				*num_mapped = num_segs;
				MOBJ_STAT_ADD(fast_success);
			}
			return (error);
		}
		MOBJ_STAT_ADD(fast_not_now);

		/* Release it for now since we'll look it up below */
		lib_va_release(lvp);
	}

	/*
	 * Time to see if this is a file we can interpret.  If it's smaller
	 * than MAX_HEADER_SIZE, then we can't interpret it.
	 */
	if (vattr.va_size < MAX_HEADER_SIZE) {
		MOBJ_STAT_ADD(small_file);
		return (ENOTSUP);
	}

	if ((error = vn_rdwr(UIO_READ, vp, header, MAX_HEADER_SIZE, 0,
	    UIO_SYSSPACE, 0, (rlim64_t)0, fcred, NULL)) != 0) {
		MOBJ_STAT_ADD(read_error);
		return (error);
	}

	/* Verify file type */
	if (header[EI_MAG0] == ELFMAG0 && header[EI_MAG1] == ELFMAG1 &&
	    header[EI_MAG2] == ELFMAG2 && header[EI_MAG3] == ELFMAG3) {
		return (doelfwork((Ehdr *)lheader, vp, mrp, num_mapped,
		    padding, fcred));
	}

	/* Unsupported type */
	MOBJ_STAT_ADD(unsupported);
	return (ENOTSUP);
}

/*
 * Given a vnode, map it as either a flat file or interpret it and map
 * it according to the rules of the file type.
 * *num_mapped will contain the number of elements in the mmapobj_result_t
 * array passed in.
 * If padding is non-zero, the mappings will be padded by that amount
 * rounded up to the nearest pagesize.
 * If the mapping is successful, *num_mapped will contain the number of
 * distinct mappings created, and mrp will point to the array of
 * mmapobj_result_t's which describe these mappings.
 *
 * On error, the appropriate errno value is returned.
 * A special error case is E2BIG, which is returned when there are more
 * than *num_mapped mappings to be created; *num_mapped will then be set
 * to the number of mappings needed.
 */
int
mmapobj(vnode_t *vp, uint_t flags, mmapobj_result_t *mrp,
    uint_t *num_mapped, size_t padding, cred_t *fcred)
{
	int to_map;
	int error = 0;

	ASSERT((padding & PAGEOFFSET) == 0);
	ASSERT((flags & ~MMOBJ_ALL_FLAGS) == 0);
	ASSERT(num_mapped != NULL);
	ASSERT((flags & MMOBJ_PADDING) ? padding != 0 : padding == 0);

	if ((flags & MMOBJ_INTERPRET) == 0) {
		to_map = padding ? 3 : 1;
		if (*num_mapped < to_map) {
			*num_mapped = to_map;
			MOBJ_STAT_ADD(flat_e2big);
			return (E2BIG);
		}
		error = mmapobj_map_flat(vp, mrp, padding, fcred);

		if (error) {
			return (error);
		}
		*num_mapped = to_map;
		return (0);
	}

	error = mmapobj_map_interpret(vp, mrp, num_mapped, padding, fcred);
	return (error);
}
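/*
 * Illustrative sketch (compiled out, not part of the implementation): one
 * way a kernel caller could drive mmapobj() above and honor the
 * E2BIG/num_mapped contract described in the block comment.  The function
 * name and the fixed initial array size are hypothetical.
 */
#if 0
static int
mmapobj_example_caller(vnode_t *vp, cred_t *cr)
{
	mmapobj_result_t mrp[16];
	uint_t num_mapped = 16;
	int error;

	/* Ask for full interpretation (ELF) rather than a flat mapping. */
	error = mmapobj(vp, MMOBJ_INTERPRET, mrp, &num_mapped, 0, cr);

	/*
	 * E2BIG means the object needs more than 16 mappings; num_mapped
	 * now holds the required count and the caller would retry with a
	 * larger array.  On success, num_mapped is the number of mrp
	 * entries that were filled in and mapped into the process.
	 */
	return (error);
}
#endif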