/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 * Copyright (c) 2016 by Delphix. All rights reserved.
 */

/*
 * Copyright (c) 2015, Joyent, Inc. All rights reserved.
 * Copyright 2017 James S Blachly, MD <james.blachly@gmail.com>
 */

/*
 * Memory special file
 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/user.h>
#include <sys/buf.h>
#include <sys/systm.h>
#include <sys/cred.h>
#include <sys/vm.h>
#include <sys/uio.h>
#include <sys/mman.h>
#include <sys/kmem.h>
#include <vm/seg.h>
#include <vm/page.h>
#include <sys/stat.h>
#include <sys/vmem.h>
#include <sys/memlist.h>
#include <sys/bootconf.h>

#include <vm/seg_vn.h>
#include <vm/seg_dev.h>
#include <vm/seg_kmem.h>
#include <vm/seg_kp.h>
#include <vm/seg_kpm.h>
#include <vm/hat.h>

#include <sys/conf.h>
#include <sys/mem.h>
#include <sys/types.h>
#include <sys/conf.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/errno.h>
#include <sys/modctl.h>
#include <sys/memlist.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/debug.h>
#include <sys/fm/protocol.h>

#if defined(__sparc)
extern int cpu_get_mem_name(uint64_t, uint64_t *, uint64_t, char *, int, int *);
extern int cpu_get_mem_info(uint64_t, uint64_t, uint64_t *, uint64_t *,
    uint64_t *, int *, int *, int *);
extern size_t cpu_get_name_bufsize(void);
extern int cpu_get_mem_sid(char *, char *, int, int *);
extern int cpu_get_mem_addr(char *, char *, uint64_t, uint64_t *);
#elif defined(__x86)
#include <sys/cpu_module.h>
#endif	/* __sparc */

/*
 * Turn a byte length into a pagecount.  The DDI btop takes a
 * 32-bit size on 32-bit machines; this handles 64-bit sizes for
 * large physical-memory 32-bit machines.
 */
#define	BTOP(x)	((pgcnt_t)((x) >> _pageshift))

static kmutex_t mm_lock;
static caddr_t mm_map;

static dev_info_t *mm_dip;	/* private copy of devinfo pointer */

static int mm_kmem_io_access;

static int mm_kstat_update(kstat_t *ksp, int rw);
static int mm_kstat_snapshot(kstat_t *ksp, void *buf, int rw);

static int mm_read_mem_name(intptr_t data, mem_name_t *mem_name);

#define	MM_KMEMLOG_NENTRIES	64

static int mm_kmemlogent;
static mm_logentry_t mm_kmemlog[MM_KMEMLOG_NENTRIES];
/*
 * On kmem/allmem writes, we log information that might be useful in the event
 * that a write is errant (that is, due to operator error) and induces a later
 * problem.  Note that (in particular) in the event of such operator-induced
 * corruption, a search over the kernel address space for the corrupted
 * address will yield the ring buffer entry that recorded the write.  And
 * should it seem baroque or otherwise unnecessary, yes, we need this kind of
 * auditing facility and yes, we learned that the hard way:  disturbingly,
 * there exist recommendations for "tuning" the system that involve writing to
 * kernel memory addresses via the kernel debugger, and -- as we discovered --
 * these can easily be applied incorrectly or unsafely, yielding an entirely
 * undebuggable "can't happen" kind of panic.
 */
static void
mm_logkmem(struct uio *uio)
{
	mm_logentry_t *ent;
	proc_t *p = curthread->t_procp;

	mutex_enter(&mm_lock);

	ent = &mm_kmemlog[mm_kmemlogent++];

	if (mm_kmemlogent == MM_KMEMLOG_NENTRIES)
		mm_kmemlogent = 0;

	ent->mle_vaddr = (uintptr_t)uio->uio_loffset;
	ent->mle_len = uio->uio_resid;
	gethrestime(&ent->mle_hrestime);
	ent->mle_hrtime = gethrtime();
	ent->mle_pid = p->p_pidp->pid_id;

	(void) strncpy(ent->mle_psargs,
	    p->p_user.u_psargs, sizeof (ent->mle_psargs));

	mutex_exit(&mm_lock);
}

/*ARGSUSED1*/
static int
mm_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
{
	int i;
	struct mem_minor {
		char *name;
		minor_t minor;
		int privonly;
		const char *rdpriv;
		const char *wrpriv;
		mode_t priv_mode;
	} mm[] = {
		{ "mem",	M_MEM,		0,	NULL,	"all",	0640 },
		{ "kmem",	M_KMEM,		0,	NULL,	"all",	0640 },
		{ "allkmem",	M_ALLKMEM,	0,	"all",	"all",	0600 },
		{ "null",	M_NULL,	PRIVONLY_DEV,	NULL,	NULL,	0666 },
		{ "zero",	M_ZERO,	PRIVONLY_DEV,	NULL,	NULL,	0666 },
		{ "full",	M_FULL,	PRIVONLY_DEV,	NULL,	NULL,	0666 },
	};
	kstat_t *ksp;

	mutex_init(&mm_lock, NULL, MUTEX_DEFAULT, NULL);
	mm_map = vmem_alloc(heap_arena, PAGESIZE, VM_SLEEP);

	for (i = 0; i < (sizeof (mm) / sizeof (mm[0])); i++) {
		if (ddi_create_priv_minor_node(devi, mm[i].name, S_IFCHR,
		    mm[i].minor, DDI_PSEUDO, mm[i].privonly,
		    mm[i].rdpriv, mm[i].wrpriv, mm[i].priv_mode) ==
		    DDI_FAILURE) {
			ddi_remove_minor_node(devi, NULL);
			return (DDI_FAILURE);
		}
	}

	mm_dip = devi;

	ksp = kstat_create("mm", 0, "phys_installed", "misc",
	    KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VAR_SIZE | KSTAT_FLAG_VIRTUAL);
	if (ksp != NULL) {
		ksp->ks_update = mm_kstat_update;
		ksp->ks_snapshot = mm_kstat_snapshot;
		ksp->ks_lock = &mm_lock; /* XXX - not really needed */
		kstat_install(ksp);
	}

	mm_kmem_io_access = ddi_getprop(DDI_DEV_T_ANY, devi, DDI_PROP_DONTPASS,
	    "kmem_io_access", 0);

	return (DDI_SUCCESS);
}

/*ARGSUSED*/
static int
mm_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
{
	register int error;

	switch (infocmd) {
	case DDI_INFO_DEVT2DEVINFO:
		*result = (void *)mm_dip;
		error = DDI_SUCCESS;
		break;
	case DDI_INFO_DEVT2INSTANCE:
		*result = (void *)0;
		error = DDI_SUCCESS;
		break;
	default:
		error = DDI_FAILURE;
	}
	return (error);
}

/*ARGSUSED1*/
static int
mmopen(dev_t *devp, int flag, int typ, struct cred *cred)
{
	switch (getminor(*devp)) {
	case M_NULL:
	case M_ZERO:
	case M_FULL:
	case M_MEM:
	case M_KMEM:
	case M_ALLKMEM:
		/* standard devices */
		break;

	default:
		/* Unsupported or unknown type */
		return (EINVAL);
	}
	/* must be character device */
	if (typ != OTYP_CHR)
		return (EINVAL);
	return (0);
}

struct pollhead mm_pollhd;

/*ARGSUSED*/
static int
mmchpoll(dev_t dev, short events, int anyyet, short *reventsp,
    struct pollhead **phpp)
{
	switch (getminor(dev)) {
	case M_NULL:
	case M_ZERO:
	case M_FULL:
	case M_MEM:
	case M_KMEM:
	case M_ALLKMEM:
		*reventsp = events & (POLLIN | POLLOUT | POLLPRI | POLLRDNORM |
		    POLLWRNORM | POLLRDBAND | POLLWRBAND);
		/*
		 * A non-NULL pollhead pointer should be returned in case
		 * the user polls for 0 events.
		 */
		*phpp = !anyyet && !*reventsp ?
		    &mm_pollhd : (struct pollhead *)NULL;
		return (0);
	default:
		/* no other devices currently support polling */
		return (ENXIO);
	}
}

static int
mmpropop(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op, int flags,
    char *name, caddr_t valuep, int *lengthp)
{
	/*
	 * Implement zero size to reduce overhead (avoid two failing
	 * property lookups per stat).
	 */
	return (ddi_prop_op_size(dev, dip, prop_op,
	    flags, name, valuep, lengthp, 0));
}

static int
mmio(struct uio *uio, enum uio_rw rw, pfn_t pfn, off_t pageoff, int allowio,
    page_t *pp)
{
	int error = 0;
	int devload = 0;
	int is_memory = pf_is_memory(pfn);
	size_t nbytes = MIN((size_t)(PAGESIZE - pageoff),
	    (size_t)uio->uio_iov->iov_len);
	caddr_t va = NULL;

	mutex_enter(&mm_lock);

	if (is_memory && kpm_enable) {
		if (pp)
			va = hat_kpm_mapin(pp, NULL);
		else
			va = hat_kpm_mapin_pfn(pfn);
	}

	if (va == NULL) {
		hat_devload(kas.a_hat, mm_map, PAGESIZE, pfn,
		    (uint_t)(rw == UIO_READ ? PROT_READ : PROT_READ|PROT_WRITE),
		    HAT_LOAD_NOCONSIST|HAT_LOAD_LOCK);
		va = mm_map;
		devload = 1;
	}

	if (!is_memory) {
		if (allowio) {
			size_t c = uio->uio_iov->iov_len;

			if (ddi_peekpokeio(NULL, uio, rw,
			    (caddr_t)(uintptr_t)uio->uio_loffset, c,
			    sizeof (int32_t)) != DDI_SUCCESS)
				error = EFAULT;
		} else
			error = EIO;
	} else
		error = uiomove(va + pageoff, nbytes, rw, uio);

	if (devload)
		hat_unload(kas.a_hat, mm_map, PAGESIZE, HAT_UNLOAD_UNLOCK);
	else if (pp)
		hat_kpm_mapout(pp, NULL, va);
	else
		hat_kpm_mapout_pfn(pfn);

	mutex_exit(&mm_lock);
	return (error);
}

static int
mmpagelock(struct as *as, caddr_t va)
{
	struct seg *seg;
	int i;

	AS_LOCK_ENTER(as, RW_READER);
	seg = as_segat(as, va);
	i = (seg != NULL)? SEGOP_CAPABLE(seg, S_CAPABILITY_NOMINFLT) : 0;
	AS_LOCK_EXIT(as);

	return (i);
}

#ifdef __sparc

#define	NEED_LOCK_KVADDR(kva)	mmpagelock(&kas, kva)

#else	/* __i386, __amd64 */

#define	NEED_LOCK_KVADDR(va)	0

#endif	/* __sparc */

/*ARGSUSED3*/
static int
mmrw(dev_t dev, struct uio *uio, enum uio_rw rw, cred_t *cred)
{
	pfn_t v;
	struct iovec *iov;
	int error = 0;
	size_t c;
	ssize_t oresid = uio->uio_resid;
	minor_t minor = getminor(dev);

	while (uio->uio_resid > 0 && error == 0) {
		iov = uio->uio_iov;
		if (iov->iov_len == 0) {
			uio->uio_iov++;
			uio->uio_iovcnt--;
			if (uio->uio_iovcnt < 0)
				panic("mmrw");
			continue;
		}
		switch (minor) {

		case M_MEM:
			memlist_read_lock();
			if (!address_in_memlist(phys_install,
			    (uint64_t)uio->uio_loffset, 1)) {
				memlist_read_unlock();
				error = EFAULT;
				break;
			}
			memlist_read_unlock();

			v = BTOP((u_offset_t)uio->uio_loffset);
			error = mmio(uio, rw, v,
			    uio->uio_loffset & PAGEOFFSET, 0, NULL);
			break;

		case M_KMEM:
		case M_ALLKMEM:
		{
			page_t **ppp = NULL;
			caddr_t vaddr = (caddr_t)uio->uio_offset;
			int try_lock = NEED_LOCK_KVADDR(vaddr);
			int locked = 0;

			if ((error = plat_mem_do_mmio(uio, rw)) != ENOTSUP)
				break;

			if (rw == UIO_WRITE)
				mm_logkmem(uio);

			/*
			 * If vaddr does not map a valid page, as_pagelock()
			 * will return failure. Hence we can't check the
			 * return value and return EFAULT here as we'd like.
			 * seg_kp and seg_kpm do not properly support
			 * as_pagelock() for this context so we avoid it
			 * using the try_lock set check above. Some day when
			 * the kernel page locking gets redesigned all this
			 * muck can be cleaned up.
			 */
			if (try_lock)
				locked = (as_pagelock(&kas, &ppp, vaddr,
				    PAGESIZE, S_WRITE) == 0);

			v = hat_getpfnum(kas.a_hat,
			    (caddr_t)(uintptr_t)uio->uio_loffset);
			if (v == PFN_INVALID) {
				if (locked)
					as_pageunlock(&kas, ppp, vaddr,
					    PAGESIZE, S_WRITE);
				error = EFAULT;
				break;
			}

			error = mmio(uio, rw, v, uio->uio_loffset & PAGEOFFSET,
			    minor == M_ALLKMEM || mm_kmem_io_access,
			    (locked && ppp) ? *ppp : NULL);
			if (locked)
				as_pageunlock(&kas, ppp, vaddr, PAGESIZE,
				    S_WRITE);
		}

			break;

		case M_FULL:
			if (rw == UIO_WRITE) {
				error = ENOSPC;
				break;
			}
			/* else it's a read, fall through to zero case */
			/*FALLTHROUGH*/

		case M_ZERO:
			if (rw == UIO_READ) {
				label_t ljb;

				if (on_fault(&ljb)) {
					no_fault();
					error = EFAULT;
					break;
				}
				uzero(iov->iov_base, iov->iov_len);
				no_fault();
				uio->uio_resid -= iov->iov_len;
				uio->uio_loffset += iov->iov_len;
				break;
			}
			/* else it's a write, fall through to NULL case */
			/*FALLTHROUGH*/

		case M_NULL:
			if (rw == UIO_READ)
				return (0);
			c = iov->iov_len;
			iov->iov_base += c;
			iov->iov_len -= c;
			uio->uio_loffset += c;
			uio->uio_resid -= c;
			break;

		}
	}
	return (uio->uio_resid == oresid ? error : 0);
}

static int
mmread(dev_t dev, struct uio *uio, cred_t *cred)
{
	return (mmrw(dev, uio, UIO_READ, cred));
}

static int
mmwrite(dev_t dev, struct uio *uio, cred_t *cred)
{
	return (mmrw(dev, uio, UIO_WRITE, cred));
}
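/*
 * Illustrative (non-authoritative) userland sketch of the MEM_VTOP ioctl
 * handled below.  libkvm's kvm_physaddr() is the real consumer, but a
 * privileged program could translate one of its own virtual addresses to a
 * PFN roughly as follows.  The mem_vtop_t layout and the MEM_VTOP command
 * are assumed to come from <sys/mem.h>; per mmioctl(), the request must be
 * issued on the kmem minor (/dev/kmem).
 *
 *	#include <sys/mem.h>
 *	#include <fcntl.h>
 *
 *	mem_vtop_t vtop;
 *	int fd = open("/dev/kmem", O_RDONLY);
 *
 *	vtop.m_as = NULL;		-- NULL means the caller's own as
 *	vtop.m_va = some_mapped_address;	-- hypothetical VA
 *	if (ioctl(fd, MEM_VTOP, &vtop) == 0)
 *		printf("pfn = %lu\n", (ulong_t)vtop.m_pfn);
 */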
/*
 * Private ioctl for libkvm to support kvm_physaddr().
 * Given an address space and a VA, compute the PA.
 */
static int
mmioctl_vtop(intptr_t data)
{
#ifdef _SYSCALL32
	mem_vtop32_t vtop32;
#endif
	mem_vtop_t mem_vtop;
	proc_t *p;
	pfn_t pfn = (pfn_t)PFN_INVALID;
	pid_t pid = 0;
	struct as *as;
	struct seg *seg;

	if (get_udatamodel() == DATAMODEL_NATIVE) {
		if (copyin((void *)data, &mem_vtop, sizeof (mem_vtop_t)))
			return (EFAULT);
	}
#ifdef _SYSCALL32
	else {
		if (copyin((void *)data, &vtop32, sizeof (mem_vtop32_t)))
			return (EFAULT);
		mem_vtop.m_as = (struct as *)(uintptr_t)vtop32.m_as;
		mem_vtop.m_va = (void *)(uintptr_t)vtop32.m_va;

		if (mem_vtop.m_as != NULL)
			return (EINVAL);
	}
#endif

	if (mem_vtop.m_as == &kas) {
		pfn = hat_getpfnum(kas.a_hat, mem_vtop.m_va);
	} else {
		if (mem_vtop.m_as == NULL) {
			/*
			 * Assume the calling process's address space if the
			 * caller didn't specify one.
			 */
			p = curthread->t_procp;
			if (p == NULL)
				return (EIO);
			mem_vtop.m_as = p->p_as;
		}

		mutex_enter(&pidlock);
		for (p = practive; p != NULL; p = p->p_next) {
			if (p->p_as == mem_vtop.m_as) {
				pid = p->p_pid;
				break;
			}
		}
		mutex_exit(&pidlock);
		if (p == NULL)
			return (EIO);
		p = sprlock(pid);
		if (p == NULL)
			return (EIO);
		as = p->p_as;
		if (as == mem_vtop.m_as) {
			mutex_exit(&p->p_lock);
			AS_LOCK_ENTER(as, RW_READER);
			for (seg = AS_SEGFIRST(as); seg != NULL;
			    seg = AS_SEGNEXT(as, seg))
				if ((uintptr_t)mem_vtop.m_va -
				    (uintptr_t)seg->s_base < seg->s_size)
					break;
			if (seg != NULL)
				pfn = hat_getpfnum(as->a_hat, mem_vtop.m_va);
			AS_LOCK_EXIT(as);
			mutex_enter(&p->p_lock);
		}
		sprunlock(p);
	}
	mem_vtop.m_pfn = pfn;
	if (pfn == PFN_INVALID)
		return (EIO);

	if (get_udatamodel() == DATAMODEL_NATIVE) {
		if (copyout(&mem_vtop, (void *)data, sizeof (mem_vtop_t)))
			return (EFAULT);
	}
#ifdef _SYSCALL32
	else {
		vtop32.m_pfn = mem_vtop.m_pfn;
		if (copyout(&vtop32, (void *)data, sizeof (mem_vtop32_t)))
			return (EFAULT);
	}
#endif

	return (0);
}

/*
 * Given a PA, execute the given page retire command on it.
 */
static int
mmioctl_page_retire(int cmd, intptr_t data)
{
	extern int page_retire_test(void);
	uint64_t pa;

	if (copyin((void *)data, &pa, sizeof (uint64_t))) {
		return (EFAULT);
	}

	switch (cmd) {
	case MEM_PAGE_ISRETIRED:
		return (page_retire_check(pa, NULL));

	case MEM_PAGE_UNRETIRE:
		return (page_unretire(pa));

	case MEM_PAGE_RETIRE:
		return (page_retire(pa, PR_FMA));

	case MEM_PAGE_RETIRE_MCE:
		return (page_retire(pa, PR_MCE));

	case MEM_PAGE_RETIRE_UE:
		return (page_retire(pa, PR_UE));

	case MEM_PAGE_GETERRORS:
	{
		uint64_t page_errors;
		int rc = page_retire_check(pa, &page_errors);
		if (copyout(&page_errors, (void *)data,
		    sizeof (uint64_t))) {
			return (EFAULT);
		}
		return (rc);
	}

	case MEM_PAGE_RETIRE_TEST:
		return (page_retire_test());

	}

	return (EINVAL);
}
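/*
 * Illustrative (non-authoritative) sketch of how the page retire commands
 * above are driven from userland; in practice the FMA retire agent is the
 * caller.  Per mmioctl(), these commands must be issued on the mem minor
 * (/dev/mem), and the argument is simply a physical address copied in as a
 * uint64_t.  The MEM_PAGE_* command values are assumed to come from
 * <sys/mem.h>.
 *
 *	uint64_t pa = 0x123456000ULL;	-- hypothetical physical address
 *	int fd = open("/dev/mem", O_RDONLY);
 *
 *	if (ioctl(fd, MEM_PAGE_RETIRE, &pa) == 0)
 *		-- the page was retired (or queued for retirement)
 *	if (ioctl(fd, MEM_PAGE_ISRETIRED, &pa) == 0)
 *		-- the page is now retired
 */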
#ifdef __sparc
/*
 * Given a syndrome, syndrome type, and address return the
 * associated memory name in the provided data buffer.
 */
static int
mmioctl_get_mem_name(intptr_t data)
{
	mem_name_t mem_name;
	void *buf;
	size_t bufsize;
	int len, err;

	if ((bufsize = cpu_get_name_bufsize()) == 0)
		return (ENOTSUP);

	if ((err = mm_read_mem_name(data, &mem_name)) < 0)
		return (err);

	buf = kmem_alloc(bufsize, KM_SLEEP);

	/*
	 * Call into cpu specific code to do the lookup.
	 */
	if ((err = cpu_get_mem_name(mem_name.m_synd, mem_name.m_type,
	    mem_name.m_addr, buf, bufsize, &len)) != 0) {
		kmem_free(buf, bufsize);
		return (err);
	}

	if (len >= mem_name.m_namelen) {
		kmem_free(buf, bufsize);
		return (ENOSPC);
	}

	if (copyoutstr(buf, (char *)mem_name.m_name,
	    mem_name.m_namelen, NULL) != 0) {
		kmem_free(buf, bufsize);
		return (EFAULT);
	}

	kmem_free(buf, bufsize);
	return (0);
}

/*
 * Given a syndrome and address return information about the associated memory.
 */
static int
mmioctl_get_mem_info(intptr_t data)
{
	mem_info_t mem_info;
	int err;

	if (copyin((void *)data, &mem_info, sizeof (mem_info_t)))
		return (EFAULT);

	if ((err = cpu_get_mem_info(mem_info.m_synd, mem_info.m_addr,
	    &mem_info.m_mem_size, &mem_info.m_seg_size, &mem_info.m_bank_size,
	    &mem_info.m_segments, &mem_info.m_banks, &mem_info.m_mcid)) != 0)
		return (err);

	if (copyout(&mem_info, (void *)data, sizeof (mem_info_t)) != 0)
		return (EFAULT);

	return (0);
}

/*
 * Given a memory name, return its associated serial id
 */
static int
mmioctl_get_mem_sid(intptr_t data)
{
	mem_name_t mem_name;
	void *buf;
	void *name;
	size_t name_len;
	size_t bufsize;
	int len, err;

	if ((bufsize = cpu_get_name_bufsize()) == 0)
		return (ENOTSUP);

	if ((err = mm_read_mem_name(data, &mem_name)) < 0)
		return (err);

	buf = kmem_alloc(bufsize, KM_SLEEP);

	if (mem_name.m_namelen > 1024)
		mem_name.m_namelen = 1024; /* cap at 1024 bytes */

	name = kmem_alloc(mem_name.m_namelen, KM_SLEEP);

	if ((err = copyinstr((char *)mem_name.m_name, (char *)name,
	    mem_name.m_namelen, &name_len)) != 0) {
		kmem_free(buf, bufsize);
		kmem_free(name, mem_name.m_namelen);
		return (err);
	}

	/*
	 * Call into cpu specific code to do the lookup.
	 */
	if ((err = cpu_get_mem_sid(name, buf, bufsize, &len)) != 0) {
		kmem_free(buf, bufsize);
		kmem_free(name, mem_name.m_namelen);
		return (err);
	}

	if (len > mem_name.m_sidlen) {
		kmem_free(buf, bufsize);
		kmem_free(name, mem_name.m_namelen);
		return (ENAMETOOLONG);
	}

	if (copyoutstr(buf, (char *)mem_name.m_sid,
	    mem_name.m_sidlen, NULL) != 0) {
		kmem_free(buf, bufsize);
		kmem_free(name, mem_name.m_namelen);
		return (EFAULT);
	}

	kmem_free(buf, bufsize);
	kmem_free(name, mem_name.m_namelen);
	return (0);
}
#endif	/* __sparc */
/*
 * Private ioctls for
 *	libkvm to support kvm_physaddr().
 *	FMA support for page_retire() and memory attribute information.
 */
/*ARGSUSED*/
static int
mmioctl(dev_t dev, int cmd, intptr_t data, int flag, cred_t *cred, int *rvalp)
{
	if ((cmd == MEM_VTOP && getminor(dev) != M_KMEM) ||
	    (cmd != MEM_VTOP && getminor(dev) != M_MEM))
		return (ENXIO);

	switch (cmd) {
	case MEM_VTOP:
		return (mmioctl_vtop(data));

	case MEM_PAGE_RETIRE:
	case MEM_PAGE_ISRETIRED:
	case MEM_PAGE_UNRETIRE:
	case MEM_PAGE_RETIRE_MCE:
	case MEM_PAGE_RETIRE_UE:
	case MEM_PAGE_GETERRORS:
	case MEM_PAGE_RETIRE_TEST:
		return (mmioctl_page_retire(cmd, data));

#ifdef __sparc
	case MEM_NAME:
		return (mmioctl_get_mem_name(data));

	case MEM_INFO:
		return (mmioctl_get_mem_info(data));

	case MEM_SID:
		return (mmioctl_get_mem_sid(data));
#else
	case MEM_NAME:
	case MEM_INFO:
	case MEM_SID:
		return (ENOTSUP);
#endif	/* __sparc */
	}
	return (ENXIO);
}

/*ARGSUSED2*/
static int
mmmmap(dev_t dev, off_t off, int prot)
{
	pfn_t pf;
	struct memlist *pmem;
	minor_t minor = getminor(dev);

	switch (minor) {
	case M_MEM:
		pf = btop(off);
		memlist_read_lock();
		for (pmem = phys_install; pmem != NULL; pmem = pmem->ml_next) {
			if (pf >= BTOP(pmem->ml_address) &&
			    pf < BTOP(pmem->ml_address + pmem->ml_size)) {
				memlist_read_unlock();
				return (impl_obmem_pfnum(pf));
			}
		}
		memlist_read_unlock();
		break;

	case M_KMEM:
	case M_ALLKMEM:
		/* no longer supported with KPR */
		return (-1);

	case M_FULL:
	case M_ZERO:
		/*
		 * We shouldn't be mmap'ing to /dev/zero here as
		 * mmsegmap() should have already converted
		 * a mapping request for this device to a mapping
		 * using seg_vn for anonymous memory.
		 */
		break;

	}
	return (-1);
}

/*
 * This function is called when a memory device is mmap'ed.
 * Set up the mapping to the correct device driver.
 */
static int
mmsegmap(dev_t dev, off_t off, struct as *as, caddr_t *addrp, off_t len,
    uint_t prot, uint_t maxprot, uint_t flags, struct cred *cred)
{
	struct segvn_crargs vn_a;
	struct segdev_crargs dev_a;
	int error;
	minor_t minor;
	off_t i;

	minor = getminor(dev);

	as_rangelock(as);
	/*
	 * No need to worry about vac alignment on /dev/zero
	 * since this is a "clone" object that doesn't yet exist.
	 */
	error = choose_addr(as, addrp, len, off,
	    (minor == M_MEM) || (minor == M_KMEM), flags);
	if (error != 0) {
		as_rangeunlock(as);
		return (error);
	}

	switch (minor) {
	case M_MEM:
		/* /dev/mem cannot be mmap'ed with MAP_PRIVATE */
		if ((flags & MAP_TYPE) != MAP_SHARED) {
			as_rangeunlock(as);
			return (EINVAL);
		}

		/*
		 * Check to ensure that the entire range is
		 * legal and we are not trying to map in
		 * more than the device will let us.
		 */
		for (i = 0; i < len; i += PAGESIZE) {
			if (mmmmap(dev, off + i, maxprot) == -1) {
				as_rangeunlock(as);
				return (ENXIO);
			}
		}

		/*
		 * Use seg_dev segment driver for /dev/mem mapping.
		 */
		dev_a.mapfunc = mmmmap;
		dev_a.dev = dev;
		dev_a.offset = off;
		dev_a.type = (flags & MAP_TYPE);
		dev_a.prot = (uchar_t)prot;
		dev_a.maxprot = (uchar_t)maxprot;
		dev_a.hat_attr = 0;

		/*
		 * Make /dev/mem mappings non-consistent since we can't
		 * alias pages that don't have page structs behind them,
		 * such as kernel stack pages.  If someone mmap()s a kernel
		 * stack page and if we give them a tte with cv, a line from
		 * that page can get into both pages of the spitfire d$.
		 * But snoop from another processor will only invalidate
		 * the first page.  This later caused kernel (xc_attention)
		 * to go into an infinite loop at pil 13 and no interrupts
		 * could come in.  See 1203630.
		 *
		 */
		dev_a.hat_flags = HAT_LOAD_NOCONSIST;
		dev_a.devmap_data = NULL;

		error = as_map(as, *addrp, len, segdev_create, &dev_a);
		break;

	case M_ZERO:
		/*
		 * Use seg_vn segment driver for /dev/zero mapping.
		 * Passing in a NULL amp gives us the "cloning" effect.
		 */
		vn_a.vp = NULL;
		vn_a.offset = 0;
		vn_a.type = (flags & MAP_TYPE);
		vn_a.prot = prot;
		vn_a.maxprot = maxprot;
		vn_a.flags = flags & ~MAP_TYPE;
		vn_a.cred = cred;
		vn_a.amp = NULL;
		vn_a.szc = 0;
		vn_a.lgrp_mem_policy_flags = 0;
		error = as_map(as, *addrp, len, segvn_create, &vn_a);
		break;

	case M_KMEM:
	case M_ALLKMEM:
		/* No longer supported with KPR. */
		error = ENXIO;
		break;

	case M_NULL:
		/*
		 * Use seg_dev segment driver for /dev/null mapping.
		 */
		dev_a.mapfunc = mmmmap;
		dev_a.dev = dev;
		dev_a.offset = off;
		dev_a.type = 0;		/* neither PRIVATE nor SHARED */
		dev_a.prot = dev_a.maxprot = (uchar_t)PROT_NONE;
		dev_a.hat_attr = 0;
		dev_a.hat_flags = 0;
		error = as_map(as, *addrp, len, segdev_create, &dev_a);
		break;

	default:
		error = ENXIO;
	}

	as_rangeunlock(as);
	return (error);
}

static struct cb_ops mm_cb_ops = {
	mmopen,			/* open */
	nulldev,		/* close */
	nodev,			/* strategy */
	nodev,			/* print */
	nodev,			/* dump */
	mmread,			/* read */
	mmwrite,		/* write */
	mmioctl,		/* ioctl */
	nodev,			/* devmap */
	mmmmap,			/* mmap */
	mmsegmap,		/* segmap */
	mmchpoll,		/* poll */
	mmpropop,		/* prop_op */
	0,			/* streamtab */
	D_NEW | D_MP | D_64BIT | D_U64BIT
};

static struct dev_ops mm_ops = {
	DEVO_REV,		/* devo_rev, */
	0,			/* refcnt */
	mm_info,		/* get_dev_info */
	nulldev,		/* identify */
	nulldev,		/* probe */
	mm_attach,		/* attach */
	nodev,			/* detach */
	nodev,			/* reset */
	&mm_cb_ops,		/* driver operations */
	(struct bus_ops *)0,	/* bus operations */
	NULL,			/* power */
	ddi_quiesce_not_needed,	/* quiesce */
};

static struct modldrv modldrv = {
	&mod_driverops, "memory driver", &mm_ops,
};

static struct modlinkage modlinkage = {
	MODREV_1, &modldrv, NULL
};

int
_init(void)
{
	return (mod_install(&modlinkage));
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}

int
_fini(void)
{
	return (mod_remove(&modlinkage));
}

static int
mm_kstat_update(kstat_t *ksp, int rw)
{
	struct memlist *pmem;
	uint_t count;

	if (rw == KSTAT_WRITE)
		return (EACCES);

	count = 0;
	memlist_read_lock();
	for (pmem = phys_install; pmem != NULL; pmem = pmem->ml_next) {
		count++;
	}
	memlist_read_unlock();

	ksp->ks_ndata = count;
	ksp->ks_data_size = count * 2 * sizeof (uint64_t);

	return (0);
}
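/*
 * Illustrative (non-authoritative) sketch of consuming the raw
 * "mm:0:phys_installed" kstat registered in mm_attach() from userland with
 * libkstat.  mm_kstat_update() above sizes the snapshot; mm_kstat_snapshot()
 * below fills it with one { address, size } pair of uint64_t values per
 * phys_install memlist entry.
 *
 *	kstat_ctl_t *kc = kstat_open();
 *	kstat_t *ksp = kstat_lookup(kc, "mm", 0, "phys_installed");
 *
 *	if (ksp != NULL && kstat_read(kc, ksp, NULL) != -1) {
 *		uint64_t *data = ksp->ks_data;
 *		uint_t i;
 *
 *		for (i = 0; i < ksp->ks_ndata; i++)
 *			printf("0x%llx 0x%llx\n",
 *			    (u_longlong_t)data[2 * i],
 *			    (u_longlong_t)data[2 * i + 1]);
 *	}
 */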
static int
mm_kstat_snapshot(kstat_t *ksp, void *buf, int rw)
{
	struct memlist *pmem;
	struct memunit {
		uint64_t address;
		uint64_t size;
	} *kspmem;

	if (rw == KSTAT_WRITE)
		return (EACCES);

	ksp->ks_snaptime = gethrtime();

	kspmem = (struct memunit *)buf;
	memlist_read_lock();
	for (pmem = phys_install; pmem != NULL;
	    pmem = pmem->ml_next, kspmem++) {
		if ((caddr_t)kspmem >= (caddr_t)buf + ksp->ks_data_size)
			break;
		kspmem->address = pmem->ml_address;
		kspmem->size = pmem->ml_size;
	}
	memlist_read_unlock();

	return (0);
}

/*
 * Read a mem_name_t from user-space and store it in the mem_name_t
 * pointed to by the mem_name argument.
 */
static int
mm_read_mem_name(intptr_t data, mem_name_t *mem_name)
{
	if (get_udatamodel() == DATAMODEL_NATIVE) {
		if (copyin((void *)data, mem_name, sizeof (mem_name_t)))
			return (EFAULT);
	}
#ifdef _SYSCALL32
	else {
		mem_name32_t mem_name32;

		if (copyin((void *)data, &mem_name32, sizeof (mem_name32_t)))
			return (EFAULT);
		mem_name->m_addr = mem_name32.m_addr;
		mem_name->m_synd = mem_name32.m_synd;
		mem_name->m_type[0] = mem_name32.m_type[0];
		mem_name->m_type[1] = mem_name32.m_type[1];
		mem_name->m_name = (caddr_t)(uintptr_t)mem_name32.m_name;
		mem_name->m_namelen = (size_t)mem_name32.m_namelen;
		mem_name->m_sid = (caddr_t)(uintptr_t)mem_name32.m_sid;
		mem_name->m_sidlen = (size_t)mem_name32.m_sidlen;
	}
#endif	/* _SYSCALL32 */

	return (0);
}