/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 * Copyright (c) 2016 by Delphix. All rights reserved.
 */

/*
 * Copyright 2017 Joyent, Inc.
 * Copyright 2017 James S Blachly, MD <james.blachly@gmail.com>
 */

/*
 * Memory special file
 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/user.h>
#include <sys/buf.h>
#include <sys/systm.h>
#include <sys/cred.h>
#include <sys/vm.h>
#include <sys/uio.h>
#include <sys/mman.h>
#include <sys/kmem.h>
#include <vm/seg.h>
#include <vm/page.h>
#include <sys/stat.h>
#include <sys/vmem.h>
#include <sys/memlist.h>
#include <sys/bootconf.h>

#include <vm/seg_vn.h>
#include <vm/seg_dev.h>
#include <vm/seg_kmem.h>
#include <vm/seg_kp.h>
#include <vm/seg_kpm.h>
#include <vm/hat.h>

#include <sys/conf.h>
#include <sys/mem.h>
#include <sys/types.h>
#include <sys/conf.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/errno.h>
#include <sys/modctl.h>
#include <sys/memlist.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/debug.h>
#include <sys/fm/protocol.h>

#if defined(__sparc)
extern int cpu_get_mem_name(uint64_t, uint64_t *, uint64_t, char *, int, int *);
extern int cpu_get_mem_info(uint64_t, uint64_t, uint64_t *, uint64_t *,
    uint64_t *, int *, int *, int *);
extern size_t cpu_get_name_bufsize(void);
extern int cpu_get_mem_sid(char *, char *, int, int *);
extern int cpu_get_mem_addr(char *, char *, uint64_t, uint64_t *);
#elif defined(__x86)
#include <sys/cpu_module.h>
#endif	/* __sparc */

/*
 * Turn a byte length into a pagecount.  The DDI btop takes a
 * 32-bit size on 32-bit machines, this handles 64-bit sizes for
 * large physical-memory 32-bit machines.
 */
#define	BTOP(x)	((pgcnt_t)((x) >> _pageshift))

/* Serializes use of mm_map and the kmem write log below. */
static kmutex_t mm_lock;
/* One-page kernel VA window used by mmio() when kpm mapping is unavailable. */
static caddr_t mm_map;

static dev_info_t *mm_dip;	/* private copy of devinfo pointer */

/* Nonzero if the "kmem_io_access" property allows kmem I/O to device space. */
static int mm_kmem_io_access;

static int mm_kstat_update(kstat_t *ksp, int rw);
static int mm_kstat_snapshot(kstat_t *ksp, void *buf, int rw);

static int mm_read_mem_name(intptr_t data, mem_name_t *mem_name);

#define	MM_KMEMLOG_NENTRIES	64

/* Ring buffer of recent kmem/allkmem writes; mm_kmemlogent is the next slot. */
static int mm_kmemlogent;
static mm_logentry_t mm_kmemlog[MM_KMEMLOG_NENTRIES];

/*
 * On kmem/allmem writes, we log information that might be useful in the event
 * that a write is errant (that is, due to operator error) and induces a later
 * problem.  Note that (in particular) in the event of such operator-induced
 * corruption, a search over the kernel address space for the corrupted
 * address will yield the ring buffer entry that recorded the write.  And
 * should it seem baroque or otherwise unnecessary, yes, we need this kind of
 * auditing facility and yes, we learned that the hard way: disturbingly,
 * there exist recommendations for "tuning" the system that involve writing to
 * kernel memory addresses via the kernel debugger, and -- as we discovered --
 * these can easily be applied incorrectly or unsafely, yielding an entirely
 * undebuggable "can't happen" kind of panic.
 */
static void
mm_logkmem(struct uio *uio)
{
	mm_logentry_t *ent;
	proc_t *p = curthread->t_procp;

	mutex_enter(&mm_lock);

	/* Claim the next slot, wrapping the ring buffer at the end. */
	ent = &mm_kmemlog[mm_kmemlogent++];

	if (mm_kmemlogent == MM_KMEMLOG_NENTRIES)
		mm_kmemlogent = 0;

	/* Record where/how much, when, and on whose behalf. */
	ent->mle_vaddr = (uintptr_t)uio->uio_loffset;
	ent->mle_len = uio->uio_resid;
	gethrestime(&ent->mle_hrestime);
	ent->mle_hrtime = gethrtime();
	ent->mle_pid = p->p_pidp->pid_id;

	(void) strncpy(ent->mle_psargs,
	    p->p_user.u_psargs, sizeof (ent->mle_psargs));

	mutex_exit(&mm_lock);
}

/*
 * Attach: create the minor nodes for each memory device, reserve the
 * one-page mapping window, and install the phys_installed kstat.
 */
/*ARGSUSED1*/
static int
mm_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
{
	int i;
	struct mem_minor {
		char *name;
		minor_t minor;
		int privonly;
		const char *rdpriv;
		const char *wrpriv;
		mode_t priv_mode;
	} mm[] = {
		{ "mem",	M_MEM,		0,		NULL,	"all",	0640 },
		{ "kmem",	M_KMEM,		0,		NULL,	"all",	0640 },
		{ "allkmem",	M_ALLKMEM,	0,		"all",	"all",	0600 },
		{ "null",	M_NULL,		PRIVONLY_DEV,	NULL,	NULL,	0666 },
		{ "zero",	M_ZERO,		PRIVONLY_DEV,	NULL,	NULL,	0666 },
		{ "full",	M_FULL,		PRIVONLY_DEV,	NULL,	NULL,	0666 },
	};
	kstat_t *ksp;

	mutex_init(&mm_lock, NULL, MUTEX_DEFAULT, NULL);
	mm_map = vmem_alloc(heap_arena, PAGESIZE, VM_SLEEP);

	for (i = 0; i < (sizeof (mm) / sizeof (mm[0])); i++) {
		if (ddi_create_priv_minor_node(devi, mm[i].name, S_IFCHR,
		    mm[i].minor, DDI_PSEUDO, mm[i].privonly,
		    mm[i].rdpriv, mm[i].wrpriv, mm[i].priv_mode) ==
		    DDI_FAILURE) {
			/* Tear down any nodes created so far and fail. */
			ddi_remove_minor_node(devi, NULL);
			return (DDI_FAILURE);
		}
	}

	mm_dip = devi;

	ksp = kstat_create("mm", 0, "phys_installed", "misc",
	    KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VAR_SIZE | KSTAT_FLAG_VIRTUAL);
	if (ksp != NULL) {
		ksp->ks_update = mm_kstat_update;
		ksp->ks_snapshot = mm_kstat_snapshot;
		ksp->ks_lock = &mm_lock; /* XXX - not really needed */
		kstat_install(ksp);
	}

	mm_kmem_io_access = ddi_getprop(DDI_DEV_T_ANY, devi, DDI_PROP_DONTPASS,
	    "kmem_io_access", 0);

	return (DDI_SUCCESS);
}

/* Standard getinfo(9E): map dev_t queries back to our devinfo/instance. */
/*ARGSUSED*/
static int
mm_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
{
	register int error;

	switch (infocmd) {
	case DDI_INFO_DEVT2DEVINFO:
		*result = (void *)mm_dip;
		error = DDI_SUCCESS;
		break;
	case DDI_INFO_DEVT2INSTANCE:
		*result = (void *)0;
		error = DDI_SUCCESS;
		break;
	default:
		error = DDI_FAILURE;
	}
	return (error);
}

/*
 * Open: accept only the known minors, and only as a character device.
 * Access control itself is handled by the privileged minor nodes.
 */
/*ARGSUSED1*/
static int
mmopen(dev_t *devp, int flag, int typ, struct cred *cred)
{
	switch (getminor(*devp)) {
	case M_NULL:
	case M_ZERO:
	case M_FULL:
	case M_MEM:
	case M_KMEM:
	case M_ALLKMEM:
		/* standard devices */
		break;

	default:
		/* Unsupported or unknown type */
		return (EINVAL);
	}
	/* must be character device */
	if (typ != OTYP_CHR)
		return (EINVAL);
	return (0);
}

struct pollhead mm_pollhd;

/*
 * Poll: all supported minors are always "ready" for every requested
 * event; they can never block, so just reflect the events back.
 */
/*ARGSUSED*/
static int
mmchpoll(dev_t dev, short events, int anyyet, short *reventsp,
    struct pollhead **phpp)
{
	switch (getminor(dev)) {
	case M_NULL:
	case M_ZERO:
	case M_FULL:
	case M_MEM:
	case M_KMEM:
	case M_ALLKMEM:
		*reventsp = events & (POLLIN | POLLOUT | POLLPRI | POLLRDNORM |
		    POLLWRNORM | POLLRDBAND | POLLWRBAND);
		/*
		 * A non NULL pollhead pointer should be returned in case
		 * user polls for 0 events or is doing an edge-triggerd poll.
		 */
		if ((!*reventsp && !anyyet) || (events & POLLET)) {
			*phpp = &mm_pollhd;
		}
		return (0);
	default:
		/* no other devices currently support polling */
		return (ENXIO);
	}
}

static int
mmpropop(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op, int flags,
    char *name, caddr_t valuep, int *lengthp)
{
	/*
	 * implement zero size to reduce overhead (avoid two failing
	 * property lookups per stat).
	 */
	return (ddi_prop_op_size(dev, dip, prop_op,
	    flags, name, valuep, lengthp, 0));
}

/*
 * Transfer at most one page of data to or from the physical page 'pfn',
 * starting 'pageoff' bytes into it.  The page is mapped via kpm when
 * possible (using 'pp' if the caller locked one), otherwise it is loaded
 * into the shared mm_map window under mm_lock.  Non-memory (device) pfns
 * are accessed with ddi_peekpokeio() and only when 'allowio' permits.
 * Returns 0 or an errno (EFAULT/EIO).
 */
static int
mmio(struct uio *uio, enum uio_rw rw, pfn_t pfn, off_t pageoff, int allowio,
    page_t *pp)
{
	int error = 0;
	int devload = 0;
	int is_memory = pf_is_memory(pfn);
	size_t nbytes = MIN((size_t)(PAGESIZE - pageoff),
	    (size_t)uio->uio_iov->iov_len);
	caddr_t va = NULL;

	mutex_enter(&mm_lock);

	if (is_memory && kpm_enable) {
		if (pp)
			va = hat_kpm_mapin(pp, NULL);
		else
			va = hat_kpm_mapin_pfn(pfn);
	}

	if (va == NULL) {
		/* No kpm mapping; load the pfn into our private window. */
		hat_devload(kas.a_hat, mm_map, PAGESIZE, pfn,
		    (uint_t)(rw == UIO_READ ? PROT_READ : PROT_READ|PROT_WRITE),
		    HAT_LOAD_NOCONSIST|HAT_LOAD_LOCK);
		va = mm_map;
		devload = 1;
	}

	if (!is_memory) {
		if (allowio) {
			size_t c = uio->uio_iov->iov_len;

			if (ddi_peekpokeio(NULL, uio, rw,
			    (caddr_t)(uintptr_t)uio->uio_loffset, c,
			    sizeof (int32_t)) != DDI_SUCCESS)
				error = EFAULT;
		} else
			error = EIO;
	} else
		error = uiomove(va + pageoff, nbytes, rw, uio);

	/* Undo whichever mapping we established above. */
	if (devload)
		hat_unload(kas.a_hat, mm_map, PAGESIZE, HAT_UNLOAD_UNLOCK);
	else if (pp)
		hat_kpm_mapout(pp, NULL, va);
	else
		hat_kpm_mapout_pfn(pfn);

	mutex_exit(&mm_lock);
	return (error);
}

/*
 * Return nonzero if the segment containing 'va' in 'as' reports the
 * S_CAPABILITY_NOMINFLT capability (i.e. the address needs locking
 * before access); zero if there is no segment or no such capability.
 */
static int
mmpagelock(struct as *as, caddr_t va)
{
	struct seg *seg;
	int i;

	AS_LOCK_ENTER(as, RW_READER);
	seg = as_segat(as, va);
	i = (seg != NULL)? SEGOP_CAPABLE(seg, S_CAPABILITY_NOMINFLT) : 0;
	AS_LOCK_EXIT(as);

	return (i);
}

#ifdef	__sparc

#define	NEED_LOCK_KVADDR(kva)	mmpagelock(&kas, kva)

#else	/* __i386, __amd64 */

#define	NEED_LOCK_KVADDR(va)	0

#endif	/* __sparc */

/*
 * Common read/write engine for all memory minors.  Iterates over the
 * uio's iovecs and dispatches each chunk according to the minor:
 * /dev/mem validates against phys_install; /dev/kmem and /dev/allkmem
 * translate the kernel VA to a pfn (logging writes); /dev/full fails
 * writes with ENOSPC and reads as /dev/zero; /dev/zero reads supply
 * zeroes; /dev/null sinks writes and returns EOF on read.
 */
/*ARGSUSED3*/
static int
mmrw(dev_t dev, struct uio *uio, enum uio_rw rw, cred_t *cred)
{
	pfn_t v;
	struct iovec *iov;
	int error = 0;
	size_t c;
	ssize_t oresid = uio->uio_resid;
	minor_t minor = getminor(dev);

	while (uio->uio_resid > 0 && error == 0) {
		iov = uio->uio_iov;
		if (iov->iov_len == 0) {
			uio->uio_iov++;
			uio->uio_iovcnt--;
			if (uio->uio_iovcnt < 0)
				panic("mmrw");
			continue;
		}
		switch (minor) {

		case M_MEM:
			memlist_read_lock();
			if (!address_in_memlist(phys_install,
			    (uint64_t)uio->uio_loffset, 1)) {
				memlist_read_unlock();
				error = EFAULT;
				break;
			}
			memlist_read_unlock();

			v = BTOP((u_offset_t)uio->uio_loffset);
			error = mmio(uio, rw, v,
			    uio->uio_loffset & PAGEOFFSET, 0, NULL);
			break;

		case M_KMEM:
		case M_ALLKMEM:
			{
			page_t **ppp = NULL;
			caddr_t vaddr = (caddr_t)uio->uio_offset;
			int try_lock = NEED_LOCK_KVADDR(vaddr);
			int locked = 0;

			/* Give the platform a chance to handle this I/O. */
			if ((error = plat_mem_do_mmio(uio, rw)) != ENOTSUP)
				break;

			if (rw == UIO_WRITE)
				mm_logkmem(uio);

			/*
			 * If vaddr does not map a valid page, as_pagelock()
			 * will return failure. Hence we can't check the
			 * return value and return EFAULT here as we'd like.
			 * seg_kp and seg_kpm do not properly support
			 * as_pagelock() for this context so we avoid it
			 * using the try_lock set check above.  Some day when
			 * the kernel page locking gets redesigned all this
			 * muck can be cleaned up.
			 */
			if (try_lock)
				locked = (as_pagelock(&kas, &ppp, vaddr,
				    PAGESIZE, S_WRITE) == 0);

			v = hat_getpfnum(kas.a_hat,
			    (caddr_t)(uintptr_t)uio->uio_loffset);
			if (v == PFN_INVALID) {
				if (locked)
					as_pageunlock(&kas, ppp, vaddr,
					    PAGESIZE, S_WRITE);
				error = EFAULT;
				break;
			}

			error = mmio(uio, rw, v, uio->uio_loffset & PAGEOFFSET,
			    minor == M_ALLKMEM || mm_kmem_io_access,
			    (locked && ppp) ? *ppp : NULL);
			if (locked)
				as_pageunlock(&kas, ppp, vaddr, PAGESIZE,
				    S_WRITE);
			}

			break;

		case M_FULL:
			if (rw == UIO_WRITE) {
				error = ENOSPC;
				break;
			}
			/* else it's a read, fall through to zero case */
			/*FALLTHROUGH*/

		case M_ZERO:
			if (rw == UIO_READ) {
				label_t ljb;

				if (on_fault(&ljb)) {
					no_fault();
					error = EFAULT;
					break;
				}
				uzero(iov->iov_base, iov->iov_len);
				no_fault();
				uio->uio_resid -= iov->iov_len;
				uio->uio_loffset += iov->iov_len;
				break;
			}
			/* else it's a write, fall through to NULL case */
			/*FALLTHROUGH*/

		case M_NULL:
			if (rw == UIO_READ)
				return (0);
			c = iov->iov_len;
			iov->iov_base += c;
			iov->iov_len -= c;
			uio->uio_loffset += c;
			uio->uio_resid -= c;
			break;

		}
	}
	/* If nothing at all transferred, report the error; else success. */
	return (uio->uio_resid == oresid ? error : 0);
}

static int
mmread(dev_t dev, struct uio *uio, cred_t *cred)
{
	return (mmrw(dev, uio, UIO_READ, cred));
}

static int
mmwrite(dev_t dev, struct uio *uio, cred_t *cred)
{
	return (mmrw(dev, uio, UIO_WRITE, cred));
}

/*
 * Private ioctl for libkvm to support kvm_physaddr().
 * Given an address space and a VA, compute the PA.
 */
static int
mmioctl_vtop(intptr_t data)
{
#ifdef _SYSCALL32
	mem_vtop32_t vtop32;
#endif
	mem_vtop_t mem_vtop;
	proc_t *p;
	pfn_t pfn = (pfn_t)PFN_INVALID;
	pid_t pid = 0;
	struct as *as;
	struct seg *seg;

	if (get_udatamodel() == DATAMODEL_NATIVE) {
		if (copyin((void *)data, &mem_vtop, sizeof (mem_vtop_t)))
			return (EFAULT);
	}
#ifdef _SYSCALL32
	else {
		if (copyin((void *)data, &vtop32, sizeof (mem_vtop32_t)))
			return (EFAULT);
		mem_vtop.m_as = (struct as *)(uintptr_t)vtop32.m_as;
		mem_vtop.m_va = (void *)(uintptr_t)vtop32.m_va;

		/* 32-bit callers may not name a specific address space. */
		if (mem_vtop.m_as != NULL)
			return (EINVAL);
	}
#endif

	if (mem_vtop.m_as == &kas) {
		pfn = hat_getpfnum(kas.a_hat, mem_vtop.m_va);
	} else {
		if (mem_vtop.m_as == NULL) {
			/*
			 * Assume the calling process's address space if the
			 * caller didn't specify one.
			 */
			p = curthread->t_procp;
			if (p == NULL)
				return (EIO);
			mem_vtop.m_as = p->p_as;
		}

		/* Find a live process owning this as, then sprlock() it. */
		mutex_enter(&pidlock);
		for (p = practive; p != NULL; p = p->p_next) {
			if (p->p_as == mem_vtop.m_as) {
				pid = p->p_pid;
				break;
			}
		}
		mutex_exit(&pidlock);
		if (p == NULL)
			return (EIO);
		p = sprlock(pid);
		if (p == NULL)
			return (EIO);
		as = p->p_as;
		/* Guard against the as having changed while unlocked. */
		if (as == mem_vtop.m_as) {
			mutex_exit(&p->p_lock);
			AS_LOCK_ENTER(as, RW_READER);
			for (seg = AS_SEGFIRST(as); seg != NULL;
			    seg = AS_SEGNEXT(as, seg))
				if ((uintptr_t)mem_vtop.m_va -
				    (uintptr_t)seg->s_base < seg->s_size)
					break;
			if (seg != NULL)
				pfn = hat_getpfnum(as->a_hat, mem_vtop.m_va);
			AS_LOCK_EXIT(as);
			mutex_enter(&p->p_lock);
		}
		sprunlock(p);
	}
	mem_vtop.m_pfn = pfn;
	if (pfn == PFN_INVALID)
		return (EIO);

	if (get_udatamodel() == DATAMODEL_NATIVE) {
		if (copyout(&mem_vtop, (void *)data, sizeof (mem_vtop_t)))
			return (EFAULT);
	}
#ifdef _SYSCALL32
	else {
		/*
		 * NOTE(review): the pfn is narrowed to the 32-bit
		 * structure's m_pfn field width here — presumably
		 * acceptable for 32-bit callers; confirm against libkvm.
		 */
		vtop32.m_pfn = mem_vtop.m_pfn;
		if (copyout(&vtop32, (void *)data, sizeof (mem_vtop32_t)))
			return (EFAULT);
	}
#endif

	return (0);
}

/*
 * Given a PA, execute the given page retire command on it.
 */
static int
mmioctl_page_retire(int cmd, intptr_t data)
{
	extern int page_retire_test(void);
	uint64_t pa;

	if (copyin((void *)data, &pa, sizeof (uint64_t))) {
		return (EFAULT);
	}

	switch (cmd) {
	case MEM_PAGE_ISRETIRED:
		return (page_retire_check(pa, NULL));

	case MEM_PAGE_UNRETIRE:
		return (page_unretire(pa));

	case MEM_PAGE_RETIRE:
		return (page_retire(pa, PR_FMA));

	case MEM_PAGE_RETIRE_MCE:
		return (page_retire(pa, PR_MCE));

	case MEM_PAGE_RETIRE_UE:
		return (page_retire(pa, PR_UE));

	case MEM_PAGE_GETERRORS:
		{
			uint64_t page_errors;
			int rc = page_retire_check(pa, &page_errors);
			if (copyout(&page_errors, (void *)data,
			    sizeof (uint64_t))) {
				return (EFAULT);
			}
			return (rc);
		}

	case MEM_PAGE_RETIRE_TEST:
		return (page_retire_test());

	}

	return (EINVAL);
}

#ifdef __sparc
/*
 * Given a syndrome, syndrome type, and address return the
 * associated memory name in the provided data buffer.
 */
static int
mmioctl_get_mem_name(intptr_t data)
{
	mem_name_t mem_name;
	void *buf;
	size_t bufsize;
	int len, err;

	if ((bufsize = cpu_get_name_bufsize()) == 0)
		return (ENOTSUP);

	if ((err = mm_read_mem_name(data, &mem_name)) < 0)
		return (err);

	buf = kmem_alloc(bufsize, KM_SLEEP);

	/*
	 * Call into cpu specific code to do the lookup.
	 */
	if ((err = cpu_get_mem_name(mem_name.m_synd, mem_name.m_type,
	    mem_name.m_addr, buf, bufsize, &len)) != 0) {
		kmem_free(buf, bufsize);
		return (err);
	}

	if (len >= mem_name.m_namelen) {
		kmem_free(buf, bufsize);
		return (ENOSPC);
	}

	if (copyoutstr(buf, (char *)mem_name.m_name,
	    mem_name.m_namelen, NULL) != 0) {
		kmem_free(buf, bufsize);
		return (EFAULT);
	}

	kmem_free(buf, bufsize);
	return (0);
}

/*
 * Given a syndrome and address return information about the associated memory.
 */
static int
mmioctl_get_mem_info(intptr_t data)
{
	mem_info_t mem_info;
	int err;

	if (copyin((void *)data, &mem_info, sizeof (mem_info_t)))
		return (EFAULT);

	if ((err = cpu_get_mem_info(mem_info.m_synd, mem_info.m_addr,
	    &mem_info.m_mem_size, &mem_info.m_seg_size, &mem_info.m_bank_size,
	    &mem_info.m_segments, &mem_info.m_banks, &mem_info.m_mcid)) != 0)
		return (err);

	if (copyout(&mem_info, (void *)data, sizeof (mem_info_t)) != 0)
		return (EFAULT);

	return (0);
}

/*
 * Given a memory name, return its associated serial id
 */
static int
mmioctl_get_mem_sid(intptr_t data)
{
	mem_name_t mem_name;
	void *buf;
	void *name;
	size_t name_len;
	size_t bufsize;
	int len, err;

	if ((bufsize = cpu_get_name_bufsize()) == 0)
		return (ENOTSUP);

	if ((err = mm_read_mem_name(data, &mem_name)) < 0)
		return (err);

	buf = kmem_alloc(bufsize, KM_SLEEP);

	if (mem_name.m_namelen > 1024)
		mem_name.m_namelen = 1024; /* cap at 1024 bytes */

	name = kmem_alloc(mem_name.m_namelen, KM_SLEEP);

	if ((err = copyinstr((char *)mem_name.m_name, (char *)name,
	    mem_name.m_namelen, &name_len)) != 0) {
		kmem_free(buf, bufsize);
		kmem_free(name, mem_name.m_namelen);
		return (err);
	}

	/*
	 * Call into cpu specific code to do the lookup.
	 */
	if ((err = cpu_get_mem_sid(name, buf, bufsize, &len)) != 0) {
		kmem_free(buf, bufsize);
		kmem_free(name, mem_name.m_namelen);
		return (err);
	}

	if (len > mem_name.m_sidlen) {
		kmem_free(buf, bufsize);
		kmem_free(name, mem_name.m_namelen);
		return (ENAMETOOLONG);
	}

	if (copyoutstr(buf, (char *)mem_name.m_sid,
	    mem_name.m_sidlen, NULL) != 0) {
		kmem_free(buf, bufsize);
		kmem_free(name, mem_name.m_namelen);
		return (EFAULT);
	}

	kmem_free(buf, bufsize);
	kmem_free(name, mem_name.m_namelen);
	return (0);
}
#endif	/* __sparc */

/*
 * Private ioctls for
 *	libkvm to support kvm_physaddr().
 *	FMA support for page_retire() and memory attribute information.
 */
/*ARGSUSED*/
static int
mmioctl(dev_t dev, int cmd, intptr_t data, int flag, cred_t *cred, int *rvalp)
{
	/* MEM_VTOP is only valid on /dev/kmem; the rest only on /dev/mem. */
	if ((cmd == MEM_VTOP && getminor(dev) != M_KMEM) ||
	    (cmd != MEM_VTOP && getminor(dev) != M_MEM))
		return (ENXIO);

	switch (cmd) {
	case MEM_VTOP:
		return (mmioctl_vtop(data));

	case MEM_PAGE_RETIRE:
	case MEM_PAGE_ISRETIRED:
	case MEM_PAGE_UNRETIRE:
	case MEM_PAGE_RETIRE_MCE:
	case MEM_PAGE_RETIRE_UE:
	case MEM_PAGE_GETERRORS:
	case MEM_PAGE_RETIRE_TEST:
		return (mmioctl_page_retire(cmd, data));

#ifdef __sparc
	case MEM_NAME:
		return (mmioctl_get_mem_name(data));

	case MEM_INFO:
		return (mmioctl_get_mem_info(data));

	case MEM_SID:
		return (mmioctl_get_mem_sid(data));
#else
	case MEM_NAME:
	case MEM_INFO:
	case MEM_SID:
		return (ENOTSUP);
#endif	/* __sparc */
	}
	return (ENXIO);
}

/*
 * mmap(9E) entry point: validate a single page offset for /dev/mem and
 * return its mapping cookie; all other minors cannot be mapped this way.
 */
/*ARGSUSED2*/
static int
mmmmap(dev_t dev, off_t off, int prot)
{
	pfn_t pf;
	struct memlist *pmem;
	minor_t minor = getminor(dev);

	switch (minor) {
	case M_MEM:
		pf = btop(off);
		memlist_read_lock();
		for (pmem = phys_install; pmem != NULL; pmem = pmem->ml_next) {
			if (pf >= BTOP(pmem->ml_address) &&
			    pf < BTOP(pmem->ml_address + pmem->ml_size)) {
				memlist_read_unlock();
				return (impl_obmem_pfnum(pf));
			}
		}
		memlist_read_unlock();
		break;

	case M_KMEM:
	case M_ALLKMEM:
		/* no longer supported with KPR */
		return (-1);

	case M_FULL:
	case M_ZERO:
		/*
		 * We shouldn't be mmap'ing to /dev/zero here as
		 * mmsegmap() should have already converted
		 * a mapping request for this device to a mapping
		 * using seg_vn for anonymous memory.
		 */
		break;

	}
	return (-1);
}

/*
 * This function is called when a memory device is mmap'ed.
 * Set up the mapping to the correct device driver.
 */
static int
mmsegmap(dev_t dev, off_t off, struct as *as, caddr_t *addrp, off_t len,
    uint_t prot, uint_t maxprot, uint_t flags, struct cred *cred)
{
	struct segvn_crargs vn_a;
	struct segdev_crargs dev_a;
	int error;
	minor_t minor;
	off_t i;

	minor = getminor(dev);

	as_rangelock(as);
	/*
	 * No need to worry about vac alignment on /dev/zero
	 * since this is a "clone" object that doesn't yet exist.
	 */
	error = choose_addr(as, addrp, len, off,
	    (minor == M_MEM) || (minor == M_KMEM), flags);
	if (error != 0) {
		as_rangeunlock(as);
		return (error);
	}

	switch (minor) {
	case M_MEM:
		/* /dev/mem cannot be mmap'ed with MAP_PRIVATE */
		if ((flags & MAP_TYPE) != MAP_SHARED) {
			as_rangeunlock(as);
			return (EINVAL);
		}

		/*
		 * Check to ensure that the entire range is
		 * legal and we are not trying to map in
		 * more than the device will let us.
		 */
		for (i = 0; i < len; i += PAGESIZE) {
			if (mmmmap(dev, off + i, maxprot) == -1) {
				as_rangeunlock(as);
				return (ENXIO);
			}
		}

		/*
		 * Use seg_dev segment driver for /dev/mem mapping.
		 */
		dev_a.mapfunc = mmmmap;
		dev_a.dev = dev;
		dev_a.offset = off;
		dev_a.type = (flags & MAP_TYPE);
		dev_a.prot = (uchar_t)prot;
		dev_a.maxprot = (uchar_t)maxprot;
		dev_a.hat_attr = 0;

		/*
		 * Make /dev/mem mappings non-consistent since we can't
		 * alias pages that don't have page structs behind them,
		 * such as kernel stack pages. If someone mmap()s a kernel
		 * stack page and if we give them a tte with cv, a line from
		 * that page can get into both pages of the spitfire d$.
		 * But snoop from another processor will only invalidate
		 * the first page. This later caused kernel (xc_attention)
		 * to go into an infinite loop at pil 13 and no interrupts
		 * could come in. See 1203630.
		 *
		 */
		dev_a.hat_flags = HAT_LOAD_NOCONSIST;
		dev_a.devmap_data = NULL;

		error = as_map(as, *addrp, len, segdev_create, &dev_a);
		break;

	case M_ZERO:
		/*
		 * Use seg_vn segment driver for /dev/zero mapping.
		 * Passing in a NULL amp gives us the "cloning" effect.
		 */
		vn_a.vp = NULL;
		vn_a.offset = 0;
		vn_a.type = (flags & MAP_TYPE);
		vn_a.prot = prot;
		vn_a.maxprot = maxprot;
		vn_a.flags = flags & ~MAP_TYPE;
		vn_a.cred = cred;
		vn_a.amp = NULL;
		vn_a.szc = 0;
		vn_a.lgrp_mem_policy_flags = 0;
		error = as_map(as, *addrp, len, segvn_create, &vn_a);
		break;

	case M_KMEM:
	case M_ALLKMEM:
		/* No longer supported with KPR. */
		error = ENXIO;
		break;

	case M_NULL:
		/*
		 * Use seg_dev segment driver for /dev/null mapping.
		 */
		dev_a.mapfunc = mmmmap;
		dev_a.dev = dev;
		dev_a.offset = off;
		dev_a.type = 0;		/* neither PRIVATE nor SHARED */
		dev_a.prot = dev_a.maxprot = (uchar_t)PROT_NONE;
		dev_a.hat_attr = 0;
		dev_a.hat_flags = 0;
		error = as_map(as, *addrp, len, segdev_create, &dev_a);
		break;

	default:
		error = ENXIO;
	}

	as_rangeunlock(as);
	return (error);
}

static struct cb_ops mm_cb_ops = {
	mmopen,			/* open */
	nulldev,		/* close */
	nodev,			/* strategy */
	nodev,			/* print */
	nodev,			/* dump */
	mmread,			/* read */
	mmwrite,		/* write */
	mmioctl,		/* ioctl */
	nodev,			/* devmap */
	mmmmap,			/* mmap */
	mmsegmap,		/* segmap */
	mmchpoll,		/* poll */
	mmpropop,		/* prop_op */
	0,			/* streamtab */
	D_NEW | D_MP | D_64BIT | D_U64BIT
};

static struct dev_ops mm_ops = {
	DEVO_REV,		/* devo_rev, */
	0,			/* refcnt */
	mm_info,		/* get_dev_info */
	nulldev,		/* identify */
	nulldev,		/* probe */
	mm_attach,		/* attach */
	nodev,			/* detach */
	nodev,			/* reset */
	&mm_cb_ops,		/* driver operations */
	(struct bus_ops *)0,	/* bus operations */
	NULL,			/* power */
	ddi_quiesce_not_needed,	/* quiesce */
};

static struct modldrv modldrv = {
	&mod_driverops, "memory driver", &mm_ops,
};

static struct modlinkage modlinkage = {
	MODREV_1, &modldrv, NULL
};

int
_init(void)
{
	return (mod_install(&modlinkage));
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}

int
_fini(void)
{
	return (mod_remove(&modlinkage));
}

/*
 * Kstat update: size the phys_installed kstat as one (address, size)
 * uint64_t pair per phys_install memlist entry.
 */
static int
mm_kstat_update(kstat_t *ksp, int rw)
{
	struct memlist *pmem;
	uint_t count;

	if (rw == KSTAT_WRITE)
		return (EACCES);

	count = 0;
	memlist_read_lock();
	for (pmem = phys_install; pmem != NULL; pmem = pmem->ml_next) {
		count++;
	}
	memlist_read_unlock();

	ksp->ks_ndata = count;
	ksp->ks_data_size = count * 2 * sizeof (uint64_t);

	return (0);
}

/*
 * Kstat snapshot: copy the phys_install memlist entries into the
 * caller's buffer, stopping at the size computed by mm_kstat_update().
 */
static int
mm_kstat_snapshot(kstat_t *ksp, void *buf, int rw)
{
	struct memlist *pmem;
	struct memunit {
		uint64_t address;
		uint64_t size;
	} *kspmem;

	if (rw == KSTAT_WRITE)
		return (EACCES);

	ksp->ks_snaptime = gethrtime();

	kspmem = (struct memunit *)buf;
	memlist_read_lock();
	for (pmem = phys_install; pmem != NULL;
	    pmem = pmem->ml_next, kspmem++) {
		/* Don't overrun the buffer if the memlist grew meanwhile. */
		if ((caddr_t)kspmem >= (caddr_t)buf + ksp->ks_data_size)
			break;
		kspmem->address = pmem->ml_address;
		kspmem->size = pmem->ml_size;
	}
	memlist_read_unlock();

	return (0);
}

/*
 * Read a mem_name_t from user-space and store it in the mem_name_t
 * pointed to by the mem_name argument.
 */
static int
mm_read_mem_name(intptr_t data, mem_name_t *mem_name)
{
	if (get_udatamodel() == DATAMODEL_NATIVE) {
		if (copyin((void *)data, mem_name, sizeof (mem_name_t)))
			return (EFAULT);
	}
#ifdef	_SYSCALL32
	else {
		/* Widen the 32-bit caller's structure field by field. */
		mem_name32_t mem_name32;

		if (copyin((void *)data, &mem_name32, sizeof (mem_name32_t)))
			return (EFAULT);
		mem_name->m_addr = mem_name32.m_addr;
		mem_name->m_synd = mem_name32.m_synd;
		mem_name->m_type[0] = mem_name32.m_type[0];
		mem_name->m_type[1] = mem_name32.m_type[1];
		mem_name->m_name = (caddr_t)(uintptr_t)mem_name32.m_name;
		mem_name->m_namelen = (size_t)mem_name32.m_namelen;
		mem_name->m_sid = (caddr_t)(uintptr_t)mem_name32.m_sid;
		mem_name->m_sidlen = (size_t)mem_name32.m_sidlen;
	}
#endif	/* _SYSCALL32 */

	return (0);
}