/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * Memory special file
 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/user.h>
#include <sys/buf.h>
#include <sys/systm.h>
#include <sys/cred.h>
#include <sys/vm.h>
#include <sys/uio.h>
#include <sys/mman.h>
#include <sys/kmem.h>
#include <vm/seg.h>
#include <vm/page.h>
#include <sys/stat.h>
#include <sys/vmem.h>
#include <sys/memlist.h>
#include <sys/bootconf.h>

#include <vm/seg_vn.h>
#include <vm/seg_dev.h>
#include <vm/seg_kmem.h>
#include <vm/seg_kp.h>
#include <vm/seg_kpm.h>
#include <vm/hat.h>

#include <sys/conf.h>
#include <sys/mem.h>
#include <sys/errno.h>
#include <sys/modctl.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/debug.h>
#include <sys/fm/protocol.h>

#if defined(__sparc)
extern int cpu_get_mem_name(uint64_t, uint64_t *, uint64_t, char *, int,
    int *);
extern int cpu_get_mem_info(uint64_t, uint64_t, uint64_t *, uint64_t *,
    uint64_t *, int *, int *, int *);
extern size_t cpu_get_name_bufsize(void);
extern int cpu_get_mem_sid(char *, char *, int, int *);
extern int cpu_get_mem_addr(char *, char *, uint64_t, uint64_t *);
#elif defined(__x86)
#include <sys/cpu_module.h>
#endif	/* __sparc */

/*
 * Turn a byte length into a pagecount.  The DDI btop takes a
 * 32-bit size on 32-bit machines; this handles 64-bit sizes for
 * large physical-memory 32-bit machines.
 */
#define	BTOP(x)	((pgcnt_t)((x) >> _pageshift))
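/*
 * For illustration only (a minimal sketch, not used by the driver): with
 * 4K pages (_pageshift == 12) on an ILP32 kernel, btop() would truncate
 * a byte count above 4 GB to 32 bits before shifting, whereas BTOP()
 * shifts the full 64-bit value:
 *
 *	u_offset_t len = 0x180000000ULL;  (a hypothetical 6 GB range)
 *	pgcnt_t npages = BTOP(len);	  (0x180000 pages, no truncation)
 */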
static kmutex_t mm_lock;
static caddr_t mm_map;

static dev_info_t *mm_dip;	/* private copy of devinfo pointer */

static int mm_kmem_io_access;

static int mm_kstat_update(kstat_t *ksp, int rw);
static int mm_kstat_snapshot(kstat_t *ksp, void *buf, int rw);

static int mm_read_mem_name(intptr_t data, mem_name_t *mem_name);
static int mm_read_mem_page(intptr_t data, mem_page_t *mpage);
static int mm_get_mem_fmri(mem_page_t *mpage, nvlist_t **nvl);
static int mm_get_paddr(nvlist_t *nvl, uint64_t *paddr);

/*ARGSUSED1*/
static int
mm_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
{
	int i;
	struct mem_minor {
		char *name;
		minor_t minor;
		int privonly;
		const char *rdpriv;
		const char *wrpriv;
		mode_t priv_mode;
	} mm[] = {
		{ "mem",	M_MEM,		0,	NULL,	"all",	0640 },
		{ "kmem",	M_KMEM,		0,	NULL,	"all",	0640 },
		{ "allkmem",	M_ALLKMEM,	0,	"all",	"all",	0600 },
		{ "null",	M_NULL,	PRIVONLY_DEV,	NULL,	NULL,	0666 },
		{ "zero",	M_ZERO,	PRIVONLY_DEV,	NULL,	NULL,	0666 },
	};
	kstat_t *ksp;

	mutex_init(&mm_lock, NULL, MUTEX_DEFAULT, NULL);
	mm_map = vmem_alloc(heap_arena, PAGESIZE, VM_SLEEP);

	for (i = 0; i < (sizeof (mm) / sizeof (mm[0])); i++) {
		if (ddi_create_priv_minor_node(devi, mm[i].name, S_IFCHR,
		    mm[i].minor, DDI_PSEUDO, mm[i].privonly,
		    mm[i].rdpriv, mm[i].wrpriv, mm[i].priv_mode) ==
		    DDI_FAILURE) {
			ddi_remove_minor_node(devi, NULL);
			return (DDI_FAILURE);
		}
	}

	mm_dip = devi;

	ksp = kstat_create("mm", 0, "phys_installed", "misc",
	    KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VAR_SIZE | KSTAT_FLAG_VIRTUAL);
	if (ksp != NULL) {
		ksp->ks_update = mm_kstat_update;
		ksp->ks_snapshot = mm_kstat_snapshot;
		ksp->ks_lock = &mm_lock;	/* XXX - not really needed */
		kstat_install(ksp);
	}

	mm_kmem_io_access = ddi_getprop(DDI_DEV_T_ANY, devi, DDI_PROP_DONTPASS,
	    "kmem_io_access", 0);

	return (DDI_SUCCESS);
}

/*ARGSUSED*/
static int
mm_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
{
	int error;

	switch (infocmd) {
	case DDI_INFO_DEVT2DEVINFO:
		*result = (void *)mm_dip;
		error = DDI_SUCCESS;
		break;
	case DDI_INFO_DEVT2INSTANCE:
		*result = (void *)0;
		error = DDI_SUCCESS;
		break;
	default:
		error = DDI_FAILURE;
	}
	return (error);
}

/*ARGSUSED1*/
static int
mmopen(dev_t *devp, int flag, int typ, struct cred *cred)
{
	switch (getminor(*devp)) {
	case M_NULL:
	case M_ZERO:
	case M_MEM:
	case M_KMEM:
	case M_ALLKMEM:
		/* standard devices */
		break;

	default:
		/* Unsupported or unknown type */
		return (EINVAL);
	}
	return (0);
}

struct pollhead mm_pollhd;

/*ARGSUSED*/
static int
mmchpoll(dev_t dev, short events, int anyyet, short *reventsp,
    struct pollhead **phpp)
{
	switch (getminor(dev)) {
	case M_NULL:
	case M_ZERO:
	case M_MEM:
	case M_KMEM:
	case M_ALLKMEM:
		*reventsp = events & (POLLIN | POLLOUT | POLLPRI | POLLRDNORM |
		    POLLWRNORM | POLLRDBAND | POLLWRBAND);
		/*
		 * A non-NULL pollhead pointer should be returned in case
		 * the user polls for 0 events.
		 */
		*phpp = !anyyet && !*reventsp ?
		    &mm_pollhd : (struct pollhead *)NULL;
		return (0);
	default:
		/* no other devices currently support polling */
		return (ENXIO);
	}
}
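/*
 * For illustration, a minimal userland sketch of the poll semantics
 * above: these devices are always ready, so mmchpoll() simply echoes
 * back the requested events from the supported set.
 *
 *	#include <poll.h>
 *	#include <fcntl.h>
 *
 *	struct pollfd pfd;
 *	pfd.fd = open("/dev/zero", O_RDWR);
 *	pfd.events = POLLIN | POLLOUT;
 *	(void) poll(&pfd, 1, 0);   (returns 1; POLLIN and POLLOUT are set)
 */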
static int
mmpropop(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op, int flags,
    char *name, caddr_t valuep, int *lengthp)
{
	/*
	 * Implement a zero property size to reduce overhead (this avoids
	 * two failing property lookups per stat()).
	 */
	return (ddi_prop_op_size(dev, dip, prop_op,
	    flags, name, valuep, lengthp, 0));
}

static int
mmio(struct uio *uio, enum uio_rw rw, pfn_t pfn, off_t pageoff, int allowio)
{
	int error = 0;
	size_t nbytes = MIN((size_t)(PAGESIZE - pageoff),
	    (size_t)uio->uio_iov->iov_len);

	mutex_enter(&mm_lock);
	hat_devload(kas.a_hat, mm_map, PAGESIZE, pfn,
	    (uint_t)(rw == UIO_READ ? PROT_READ : PROT_READ | PROT_WRITE),
	    HAT_LOAD_NOCONSIST | HAT_LOAD_LOCK);

	if (!pf_is_memory(pfn)) {
		if (allowio) {
			size_t c = uio->uio_iov->iov_len;

			if (ddi_peekpokeio(NULL, uio, rw,
			    (caddr_t)(uintptr_t)uio->uio_loffset, c,
			    sizeof (int32_t)) != DDI_SUCCESS)
				error = EFAULT;
		} else
			error = EIO;
	} else
		error = uiomove(&mm_map[pageoff], nbytes, rw, uio);

	hat_unload(kas.a_hat, mm_map, PAGESIZE, HAT_UNLOAD_UNLOCK);
	mutex_exit(&mm_lock);
	return (error);
}

/*
 * Some platforms have permanently-mapped areas without PFNs, so we
 * check for those specially here.
 */
static int
mmplatio(struct uio *uio, enum uio_rw rw)
{
	uintptr_t pageaddr = (uintptr_t)uio->uio_loffset & PAGEMASK;
	off_t pageoff = uio->uio_loffset & PAGEOFFSET;
	size_t nbytes = MIN((size_t)(PAGESIZE - pageoff),
	    (size_t)uio->uio_iov->iov_len);

	if (!plat_mem_valid_page(pageaddr, rw))
		return (ENOTSUP);

	return (uiomove((void *)(pageaddr + pageoff), nbytes, rw, uio));
}
#ifdef __sparc

static int
mmpagelock(struct as *as, caddr_t va)
{
	struct seg *seg;
	int i;

	AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
	seg = as_segat(as, va);
	i = (seg != NULL)? SEGOP_CAPABLE(seg, S_CAPABILITY_NOMINFLT) : 0;
	AS_LOCK_EXIT(as, &as->a_lock);

	return (i);
}

#define	NEED_LOCK_KVADDR(kva)	mmpagelock(&kas, kva)

#else	/* __i386, __amd64 */

#define	NEED_LOCK_KVADDR(va)	0

#endif	/* __sparc */

/*ARGSUSED3*/
static int
mmrw(dev_t dev, struct uio *uio, enum uio_rw rw, cred_t *cred)
{
	pfn_t v;
	struct iovec *iov;
	int error = 0;
	size_t c;
	ssize_t oresid = uio->uio_resid;
	minor_t minor = getminor(dev);

	while (uio->uio_resid > 0 && error == 0) {
		iov = uio->uio_iov;
		if (iov->iov_len == 0) {
			uio->uio_iov++;
			uio->uio_iovcnt--;
			if (uio->uio_iovcnt < 0)
				panic("mmrw");
			continue;
		}
		switch (minor) {

		case M_MEM:
			memlist_read_lock();
			if (!address_in_memlist(phys_install,
			    (uint64_t)uio->uio_loffset, 1)) {
				memlist_read_unlock();
				error = EFAULT;
				break;
			}
			memlist_read_unlock();

			v = BTOP((u_offset_t)uio->uio_loffset);
			error = mmio(uio, rw, v,
			    uio->uio_loffset & PAGEOFFSET, 0);
			break;

		case M_KMEM:
		case M_ALLKMEM:
		{
			page_t **ppp;
			caddr_t vaddr = (caddr_t)uio->uio_offset;
			int try_lock = NEED_LOCK_KVADDR(vaddr);
			int locked = 0;

			if ((error = mmplatio(uio, rw)) != ENOTSUP)
				break;

			/*
			 * If vaddr does not map a valid page, as_pagelock()
			 * will return failure. Hence we can't check the
			 * return value and return EFAULT here as we'd like.
			 * seg_kp and seg_kpm do not properly support
			 * as_pagelock() for this context so we avoid it
			 * using the try_lock set check above. Some day when
			 * the kernel page locking gets redesigned all this
			 * muck can be cleaned up.
			 */
			if (try_lock)
				locked = (as_pagelock(&kas, &ppp, vaddr,
				    PAGESIZE, S_WRITE) == 0);

			v = hat_getpfnum(kas.a_hat,
			    (caddr_t)(uintptr_t)uio->uio_loffset);
			if (v == PFN_INVALID) {
				if (locked)
					as_pageunlock(&kas, ppp, vaddr,
					    PAGESIZE, S_WRITE);
				error = EFAULT;
				break;
			}

			error = mmio(uio, rw, v, uio->uio_loffset & PAGEOFFSET,
			    minor == M_ALLKMEM || mm_kmem_io_access);
			if (locked)
				as_pageunlock(&kas, ppp, vaddr, PAGESIZE,
				    S_WRITE);
		}

			break;

		case M_ZERO:
			if (rw == UIO_READ) {
				label_t ljb;

				if (on_fault(&ljb)) {
					no_fault();
					error = EFAULT;
					break;
				}
				uzero(iov->iov_base, iov->iov_len);
				no_fault();
				uio->uio_resid -= iov->iov_len;
				uio->uio_loffset += iov->iov_len;
				break;
			}
			/* else it's a write, fall through to NULL case */
			/*FALLTHROUGH*/

		case M_NULL:
			if (rw == UIO_READ)
				return (0);
			c = iov->iov_len;
			iov->iov_base += c;
			iov->iov_len -= c;
			uio->uio_loffset += c;
			uio->uio_resid -= c;
			break;

		}
	}
	return (uio->uio_resid == oresid ? error : 0);
}

static int
mmread(dev_t dev, struct uio *uio, cred_t *cred)
{
	return (mmrw(dev, uio, UIO_READ, cred));
}

static int
mmwrite(dev_t dev, struct uio *uio, cred_t *cred)
{
	return (mmrw(dev, uio, UIO_WRITE, cred));
}
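/*
 * For illustration, a minimal userland sketch of the M_MEM read path
 * above (requires sufficient privilege; 0x10000 is a hypothetical
 * physical address, and the read fails with EFAULT if it is not within
 * phys_install):
 *
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *
 *	uint64_t buf;
 *	int fd = open("/dev/mem", O_RDONLY);
 *	(void) pread(fd, &buf, sizeof (buf), 0x10000L);
 */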
/*
 * Private ioctl for libkvm to support kvm_physaddr().
 * Given an address space and a VA, compute the PA.
 */
static int
mmioctl_vtop(intptr_t data)
{
#ifdef _SYSCALL32
	mem_vtop32_t vtop32;
#endif
	mem_vtop_t mem_vtop;
	proc_t *p;
	pfn_t pfn = (pfn_t)PFN_INVALID;
	pid_t pid = 0;
	struct as *as;
	struct seg *seg;

	if (get_udatamodel() == DATAMODEL_NATIVE) {
		if (copyin((void *)data, &mem_vtop, sizeof (mem_vtop_t)))
			return (EFAULT);
	}
#ifdef _SYSCALL32
	else {
		if (copyin((void *)data, &vtop32, sizeof (mem_vtop32_t)))
			return (EFAULT);
		mem_vtop.m_as = (struct as *)(uintptr_t)vtop32.m_as;
		mem_vtop.m_va = (void *)(uintptr_t)vtop32.m_va;

		if (mem_vtop.m_as != NULL)
			return (EINVAL);
	}
#endif

	if (mem_vtop.m_as == &kas) {
		pfn = hat_getpfnum(kas.a_hat, mem_vtop.m_va);
	} else {
		if (mem_vtop.m_as == NULL) {
			/*
			 * Assume the calling process's address space if the
			 * caller didn't specify one.
			 */
			p = curthread->t_procp;
			if (p == NULL)
				return (EIO);
			mem_vtop.m_as = p->p_as;
		}

		mutex_enter(&pidlock);
		for (p = practive; p != NULL; p = p->p_next) {
			if (p->p_as == mem_vtop.m_as) {
				pid = p->p_pid;
				break;
			}
		}
		mutex_exit(&pidlock);
		if (p == NULL)
			return (EIO);
		p = sprlock(pid);
		if (p == NULL)
			return (EIO);
		as = p->p_as;
		if (as == mem_vtop.m_as) {
			mutex_exit(&p->p_lock);
			AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
			for (seg = AS_SEGFIRST(as); seg != NULL;
			    seg = AS_SEGNEXT(as, seg))
				if ((uintptr_t)mem_vtop.m_va -
				    (uintptr_t)seg->s_base < seg->s_size)
					break;
			if (seg != NULL)
				pfn = hat_getpfnum(as->a_hat, mem_vtop.m_va);
			AS_LOCK_EXIT(as, &as->a_lock);
			mutex_enter(&p->p_lock);
		}
		sprunlock(p);
	}
	mem_vtop.m_pfn = pfn;
	if (pfn == PFN_INVALID)
		return (EIO);

	if (get_udatamodel() == DATAMODEL_NATIVE) {
		if (copyout(&mem_vtop, (void *)data, sizeof (mem_vtop_t)))
			return (EFAULT);
	}
#ifdef _SYSCALL32
	else {
		vtop32.m_pfn = mem_vtop.m_pfn;
		if (copyout(&vtop32, (void *)data, sizeof (mem_vtop32_t)))
			return (EFAULT);
	}
#endif

	return (0);
}
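/*
 * For illustration, a sketch of how a privileged consumer such as libkvm
 * might use MEM_VTOP; mmioctl() below accepts this ioctl only on
 * /dev/kmem, and a NULL m_as means "the calling process's address
 * space" (the virtual address here is hypothetical):
 *
 *	#include <sys/mem.h>
 *	#include <fcntl.h>
 *
 *	mem_vtop_t vtop;
 *	int fd = open("/dev/kmem", O_RDONLY);
 *	vtop.m_as = NULL;
 *	vtop.m_va = (void *)0x10000;
 *	if (ioctl(fd, MEM_VTOP, &vtop) == 0)
 *		... vtop.m_pfn now holds the backing PFN ...
 */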
/*
 * Given a PA, execute the given page retire command on it.
 */
static int
mmioctl_page_retire(int cmd, intptr_t data)
{
	extern int page_retire_test(void);
	uint64_t pa;

	if (copyin((void *)data, &pa, sizeof (uint64_t))) {
		return (EFAULT);
	}

	switch (cmd) {
	case MEM_PAGE_ISRETIRED:
		return (page_retire_check(pa, NULL));

	case MEM_PAGE_UNRETIRE:
		return (page_unretire(pa));

	case MEM_PAGE_RETIRE:
		return (page_retire(pa, PR_FMA));

	case MEM_PAGE_RETIRE_MCE:
		return (page_retire(pa, PR_MCE));

	case MEM_PAGE_RETIRE_UE:
		return (page_retire(pa, PR_UE));

	case MEM_PAGE_GETERRORS:
		{
			uint64_t page_errors;
			int rc = page_retire_check(pa, &page_errors);
			if (copyout(&page_errors, (void *)data,
			    sizeof (uint64_t))) {
				return (EFAULT);
			}
			return (rc);
		}

	case MEM_PAGE_RETIRE_TEST:
		return (page_retire_test());

	}

	return (EINVAL);
}

/*
 * Given a mem-scheme FMRI for a page, execute the given page retire
 * command on it.
 */
static int
mmioctl_page_fmri_retire(int cmd, intptr_t data)
{
	mem_page_t mpage;
	uint64_t pa;
	nvlist_t *nvl;
	int err;

	if ((err = mm_read_mem_page(data, &mpage)) < 0)
		return (err);

	if ((err = mm_get_mem_fmri(&mpage, &nvl)) != 0)
		return (err);

	if ((err = mm_get_paddr(nvl, &pa)) != 0) {
		nvlist_free(nvl);
		return (err);
	}

	nvlist_free(nvl);

	switch (cmd) {
	case MEM_PAGE_FMRI_ISRETIRED:
		return (page_retire_check(pa, NULL));

	case MEM_PAGE_FMRI_RETIRE:
		return (page_retire(pa, PR_FMA));
	}

	return (EINVAL);
}
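/*
 * For illustration, a sketch of FMA-style page retirement from userland;
 * mmioctl() below accepts these ioctls only on /dev/mem, and pa is a
 * hypothetical page-aligned physical address. Roughly speaking, a return
 * of 0 from MEM_PAGE_ISRETIRED means the page is already retired:
 *
 *	uint64_t pa = 0x12345000ULL;
 *	int fd = open("/dev/mem", O_RDWR);
 *	(void) ioctl(fd, MEM_PAGE_RETIRE, &pa);
 *	if (ioctl(fd, MEM_PAGE_ISRETIRED, &pa) == 0)
 *		... the page has been retired ...
 */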
#ifdef __sparc
/*
 * Given a syndrome, syndrome type, and address, return the
 * associated memory name in the provided data buffer.
 */
static int
mmioctl_get_mem_name(intptr_t data)
{
	mem_name_t mem_name;
	void *buf;
	size_t bufsize;
	int len, err;

	if ((bufsize = cpu_get_name_bufsize()) == 0)
		return (ENOTSUP);

	if ((err = mm_read_mem_name(data, &mem_name)) < 0)
		return (err);

	buf = kmem_alloc(bufsize, KM_SLEEP);

	/*
	 * Call into CPU-specific code to do the lookup.
	 */
	if ((err = cpu_get_mem_name(mem_name.m_synd, mem_name.m_type,
	    mem_name.m_addr, buf, bufsize, &len)) != 0) {
		kmem_free(buf, bufsize);
		return (err);
	}

	if (len >= mem_name.m_namelen) {
		kmem_free(buf, bufsize);
		return (ENAMETOOLONG);
	}

	if (copyoutstr(buf, (char *)mem_name.m_name,
	    mem_name.m_namelen, NULL) != 0) {
		kmem_free(buf, bufsize);
		return (EFAULT);
	}

	kmem_free(buf, bufsize);
	return (0);
}

/*
 * Given a syndrome and address, return information about the associated
 * memory.
 */
static int
mmioctl_get_mem_info(intptr_t data)
{
	mem_info_t mem_info;
	int err;

	if (copyin((void *)data, &mem_info, sizeof (mem_info_t)))
		return (EFAULT);

	if ((err = cpu_get_mem_info(mem_info.m_synd, mem_info.m_addr,
	    &mem_info.m_mem_size, &mem_info.m_seg_size, &mem_info.m_bank_size,
	    &mem_info.m_segments, &mem_info.m_banks, &mem_info.m_mcid)) != 0)
		return (err);

	if (copyout(&mem_info, (void *)data, sizeof (mem_info_t)) != 0)
		return (EFAULT);

	return (0);
}

/*
 * Given a memory name, return its associated serial id.
 */
static int
mmioctl_get_mem_sid(intptr_t data)
{
	mem_name_t mem_name;
	void *buf;
	void *name;
	size_t name_len;
	size_t bufsize;
	int len, err;

	if ((bufsize = cpu_get_name_bufsize()) == 0)
		return (ENOTSUP);

	if ((err = mm_read_mem_name(data, &mem_name)) < 0)
		return (err);

	buf = kmem_alloc(bufsize, KM_SLEEP);

	if (mem_name.m_namelen > 1024)
		mem_name.m_namelen = 1024;	/* cap at 1024 bytes */

	name = kmem_alloc(mem_name.m_namelen, KM_SLEEP);

	if ((err = copyinstr((char *)mem_name.m_name, (char *)name,
	    mem_name.m_namelen, &name_len)) != 0) {
		kmem_free(buf, bufsize);
		kmem_free(name, mem_name.m_namelen);
		return (err);
	}

	/*
	 * Call into CPU-specific code to do the lookup.
	 */
	if ((err = cpu_get_mem_sid(name, buf, bufsize, &len)) != 0) {
		kmem_free(buf, bufsize);
		kmem_free(name, mem_name.m_namelen);
		return (err);
	}

	if (len > mem_name.m_sidlen) {
		kmem_free(buf, bufsize);
		kmem_free(name, mem_name.m_namelen);
		return (ENAMETOOLONG);
	}

	if (copyoutstr(buf, (char *)mem_name.m_sid,
	    mem_name.m_sidlen, NULL) != 0) {
		kmem_free(buf, bufsize);
		kmem_free(name, mem_name.m_namelen);
		return (EFAULT);
	}

	kmem_free(buf, bufsize);
	kmem_free(name, mem_name.m_namelen);
	return (0);
}
#endif	/* __sparc */

/*
 * Private ioctls for
 *	libkvm to support kvm_physaddr().
 *	FMA support for page_retire() and memory attribute information.
 */
/*ARGSUSED*/
static int
mmioctl(dev_t dev, int cmd, intptr_t data, int flag, cred_t *cred, int *rvalp)
{
	if ((cmd == MEM_VTOP && getminor(dev) != M_KMEM) ||
	    (cmd != MEM_VTOP && getminor(dev) != M_MEM))
		return (ENXIO);

	switch (cmd) {
	case MEM_VTOP:
		return (mmioctl_vtop(data));

	case MEM_PAGE_RETIRE:
	case MEM_PAGE_ISRETIRED:
	case MEM_PAGE_UNRETIRE:
	case MEM_PAGE_RETIRE_MCE:
	case MEM_PAGE_RETIRE_UE:
	case MEM_PAGE_GETERRORS:
	case MEM_PAGE_RETIRE_TEST:
		return (mmioctl_page_retire(cmd, data));

	case MEM_PAGE_FMRI_RETIRE:
	case MEM_PAGE_FMRI_ISRETIRED:
		return (mmioctl_page_fmri_retire(cmd, data));

#ifdef __sparc
	case MEM_NAME:
		return (mmioctl_get_mem_name(data));

	case MEM_INFO:
		return (mmioctl_get_mem_info(data));

	case MEM_SID:
		return (mmioctl_get_mem_sid(data));
#else
	case MEM_NAME:
	case MEM_INFO:
	case MEM_SID:
		return (ENOTSUP);
#endif	/* __sparc */
	}
	return (ENXIO);
}

/*ARGSUSED2*/
static int
mmmmap(dev_t dev, off_t off, int prot)
{
	pfn_t pf;
	struct memlist *pmem;
	minor_t minor = getminor(dev);

	switch (minor) {
	case M_MEM:
		pf = btop(off);
		memlist_read_lock();
		for (pmem = phys_install; pmem != NULL; pmem = pmem->next) {
			if (pf >= BTOP(pmem->address) &&
			    pf < BTOP(pmem->address + pmem->size)) {
				memlist_read_unlock();
				return (impl_obmem_pfnum(pf));
			}
		}
		memlist_read_unlock();
		break;

	case M_KMEM:
	case M_ALLKMEM:
		/* no longer supported with KPR */
		return (-1);

	case M_ZERO:
		/*
		 * We shouldn't be mmap'ing to /dev/zero here as
		 * mmsegmap() should have already converted
		 * a mapping request for this device to a mapping
		 * using seg_vn for anonymous memory.
		 */
		break;

	}
	return (-1);
}
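/*
 * For illustration, a minimal userland sketch of mapping physical memory
 * through mmmmap() above and mmsegmap() below; MAP_SHARED is required
 * for /dev/mem, and the offset (a hypothetical physical address) must
 * fall within phys_install or the mapping fails with ENXIO:
 *
 *	#include <sys/mman.h>
 *	#include <fcntl.h>
 *
 *	int fd = open("/dev/mem", O_RDWR);
 *	void *p = mmap(NULL, PAGESIZE, PROT_READ, MAP_SHARED, fd, 0x10000L);
 */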
/*
 * This function is called when a memory device is mmap'ed.
 * Set up the mapping to the correct device driver.
 */
static int
mmsegmap(dev_t dev, off_t off, struct as *as, caddr_t *addrp, off_t len,
    uint_t prot, uint_t maxprot, uint_t flags, struct cred *cred)
{
	struct segvn_crargs vn_a;
	struct segdev_crargs dev_a;
	int error;
	minor_t minor;
	off_t i;

	minor = getminor(dev);

	as_rangelock(as);
	if ((flags & MAP_FIXED) == 0) {
		/*
		 * No need to worry about vac alignment on /dev/zero
		 * since this is a "clone" object that doesn't yet exist.
		 */
		map_addr(addrp, len, (offset_t)off,
		    (minor == M_MEM) || (minor == M_KMEM), flags);

		if (*addrp == NULL) {
			as_rangeunlock(as);
			return (ENOMEM);
		}
	} else {
		/*
		 * User-specified address; blow away any previous mappings.
		 */
		(void) as_unmap(as, *addrp, len);
	}

	switch (minor) {
	case M_MEM:
		/* /dev/mem cannot be mmap'ed with MAP_PRIVATE */
		if ((flags & MAP_TYPE) != MAP_SHARED) {
			as_rangeunlock(as);
			return (EINVAL);
		}

		/*
		 * Check to ensure that the entire range is
		 * legal and that we are not trying to map in
		 * more than the device will let us.
		 */
		for (i = 0; i < len; i += PAGESIZE) {
			if (mmmmap(dev, off + i, maxprot) == -1) {
				as_rangeunlock(as);
				return (ENXIO);
			}
		}

		/*
		 * Use the seg_dev segment driver for the /dev/mem mapping.
		 */
		dev_a.mapfunc = mmmmap;
		dev_a.dev = dev;
		dev_a.offset = off;
		dev_a.type = (flags & MAP_TYPE);
		dev_a.prot = (uchar_t)prot;
		dev_a.maxprot = (uchar_t)maxprot;
		dev_a.hat_attr = 0;

		/*
		 * Make /dev/mem mappings non-consistent since we can't
		 * alias pages that don't have page structs behind them,
		 * such as kernel stack pages. If someone mmap()s a kernel
		 * stack page and if we give them a tte with cv, a line from
		 * that page can get into both pages of the spitfire d$.
		 * But a snoop from another processor will only invalidate
		 * the first page. This once caused the kernel
		 * (xc_attention) to go into an infinite loop at pil 13
		 * with no interrupts able to come in. See 1203630.
		 */
		dev_a.hat_flags = HAT_LOAD_NOCONSIST;
		dev_a.devmap_data = NULL;

		error = as_map(as, *addrp, len, segdev_create, &dev_a);
		break;

	case M_ZERO:
		/*
		 * Use the seg_vn segment driver for the /dev/zero mapping.
		 * Passing in a NULL amp gives us the "cloning" effect.
		 */
		vn_a.vp = NULL;
		vn_a.offset = 0;
		vn_a.type = (flags & MAP_TYPE);
		vn_a.prot = prot;
		vn_a.maxprot = maxprot;
		vn_a.flags = flags & ~MAP_TYPE;
		vn_a.cred = cred;
		vn_a.amp = NULL;
		vn_a.szc = 0;
		vn_a.lgrp_mem_policy_flags = 0;
		error = as_map(as, *addrp, len, segvn_create, &vn_a);
		break;

	case M_KMEM:
	case M_ALLKMEM:
		/* No longer supported with KPR. */
		error = ENXIO;
		break;

	case M_NULL:
		/*
		 * Use the seg_dev segment driver for the /dev/null mapping.
		 */
		dev_a.mapfunc = mmmmap;
		dev_a.dev = dev;
		dev_a.offset = off;
		dev_a.type = 0;		/* neither PRIVATE nor SHARED */
		dev_a.prot = dev_a.maxprot = (uchar_t)PROT_NONE;
		dev_a.hat_attr = 0;
		dev_a.hat_flags = 0;
		error = as_map(as, *addrp, len, segdev_create, &dev_a);
		break;

	default:
		error = ENXIO;
	}

	as_rangeunlock(as);
	return (error);
}
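/*
 * For illustration: mapping /dev/zero is the traditional way to obtain
 * anonymous memory, and the segvn "cloning" setup above means each
 * mapping is zero-fill-on-demand anonymous memory rather than a window
 * onto the device.
 *
 *	#include <sys/mman.h>
 *	#include <fcntl.h>
 *
 *	int fd = open("/dev/zero", O_RDWR);
 *	char *p = mmap(NULL, 8192, PROT_READ | PROT_WRITE, MAP_PRIVATE,
 *	    fd, 0);
 *	p[0] = 1;   (faults in a private anonymous zero page)
 */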
static struct cb_ops mm_cb_ops = {
	mmopen,			/* open */
	nulldev,		/* close */
	nodev,			/* strategy */
	nodev,			/* print */
	nodev,			/* dump */
	mmread,			/* read */
	mmwrite,		/* write */
	mmioctl,		/* ioctl */
	nodev,			/* devmap */
	mmmmap,			/* mmap */
	mmsegmap,		/* segmap */
	mmchpoll,		/* poll */
	mmpropop,		/* prop_op */
	0,			/* streamtab */
	D_NEW | D_MP | D_64BIT | D_U64BIT
};

static struct dev_ops mm_ops = {
	DEVO_REV,		/* devo_rev */
	0,			/* refcnt */
	mm_info,		/* get_dev_info */
	nulldev,		/* identify */
	nulldev,		/* probe */
	mm_attach,		/* attach */
	nodev,			/* detach */
	nodev,			/* reset */
	&mm_cb_ops,		/* driver operations */
	(struct bus_ops *)0	/* bus operations */
};

static struct modldrv modldrv = {
	&mod_driverops, "memory driver %I%", &mm_ops,
};

static struct modlinkage modlinkage = {
	MODREV_1, &modldrv, NULL
};

int
_init(void)
{
	return (mod_install(&modlinkage));
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}

int
_fini(void)
{
	return (mod_remove(&modlinkage));
}

static int
mm_kstat_update(kstat_t *ksp, int rw)
{
	struct memlist *pmem;
	uint_t count;

	if (rw == KSTAT_WRITE)
		return (EACCES);

	count = 0;
	memlist_read_lock();
	for (pmem = phys_install; pmem != NULL; pmem = pmem->next) {
		count++;
	}
	memlist_read_unlock();

	ksp->ks_ndata = count;
	ksp->ks_data_size = count * 2 * sizeof (uint64_t);

	return (0);
}

static int
mm_kstat_snapshot(kstat_t *ksp, void *buf, int rw)
{
	struct memlist *pmem;
	struct memunit {
		uint64_t address;
		uint64_t size;
	} *kspmem;

	if (rw == KSTAT_WRITE)
		return (EACCES);

	ksp->ks_snaptime = gethrtime();

	kspmem = (struct memunit *)buf;
	memlist_read_lock();
	for (pmem = phys_install; pmem != NULL;
	    pmem = pmem->next, kspmem++) {
		if ((caddr_t)kspmem >= (caddr_t)buf + ksp->ks_data_size)
			break;
		kspmem->address = pmem->address;
		kspmem->size = pmem->size;
	}
	memlist_read_unlock();

	return (0);
}
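/*
 * For illustration, a minimal libkstat sketch of reading the
 * "phys_installed" kstat published by mm_attach(); per the snapshot
 * routine above, each entry is an (address, size) pair describing one
 * phys_install range:
 *
 *	#include <kstat.h>
 *
 *	kstat_ctl_t *kc = kstat_open();
 *	kstat_t *ksp = kstat_lookup(kc, "mm", 0, "phys_installed");
 *	if (ksp != NULL && kstat_read(kc, ksp, NULL) != -1) {
 *		uint64_t *mem = ksp->ks_data;
 *		for (uint_t i = 0; i < ksp->ks_ndata; i++, mem += 2)
 *			(void) printf("%llx..%llx\n",
 *			    (u_longlong_t)mem[0],
 *			    (u_longlong_t)(mem[0] + mem[1]));
 *	}
 */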
/*
 * Read a mem_name_t from user-space and store it in the mem_name_t
 * pointed to by the mem_name argument.
 */
static int
mm_read_mem_name(intptr_t data, mem_name_t *mem_name)
{
	if (get_udatamodel() == DATAMODEL_NATIVE) {
		if (copyin((void *)data, mem_name, sizeof (mem_name_t)))
			return (EFAULT);
	}
#ifdef _SYSCALL32
	else {
		mem_name32_t mem_name32;

		if (copyin((void *)data, &mem_name32, sizeof (mem_name32_t)))
			return (EFAULT);
		mem_name->m_addr = mem_name32.m_addr;
		mem_name->m_synd = mem_name32.m_synd;
		mem_name->m_type[0] = mem_name32.m_type[0];
		mem_name->m_type[1] = mem_name32.m_type[1];
		mem_name->m_name = (caddr_t)(uintptr_t)mem_name32.m_name;
		mem_name->m_namelen = (size_t)mem_name32.m_namelen;
		mem_name->m_sid = (caddr_t)(uintptr_t)mem_name32.m_sid;
		mem_name->m_sidlen = (size_t)mem_name32.m_sidlen;
	}
#endif	/* _SYSCALL32 */

	return (0);
}

/*
 * Read a mem_page_t from user-space and store it in the mem_page_t
 * pointed to by the mpage argument.
 */
static int
mm_read_mem_page(intptr_t data, mem_page_t *mpage)
{
	if (get_udatamodel() == DATAMODEL_NATIVE) {
		if (copyin((void *)data, mpage, sizeof (mem_page_t)) != 0)
			return (EFAULT);
	}
#ifdef _SYSCALL32
	else {
		mem_page32_t mpage32;

		if (copyin((void *)data, &mpage32, sizeof (mem_page32_t)) != 0)
			return (EFAULT);

		mpage->m_fmri = (caddr_t)(uintptr_t)mpage32.m_fmri;
		mpage->m_fmrisz = mpage32.m_fmrisz;
	}
#endif	/* _SYSCALL32 */

	return (0);
}

/*
 * Expand an FMRI from a mem_page_t.
 */
static int
mm_get_mem_fmri(mem_page_t *mpage, nvlist_t **nvl)
{
	char *buf;
	int err;

	if (mpage->m_fmri == NULL || mpage->m_fmrisz > MEM_FMRI_MAX_BUFSIZE)
		return (EINVAL);

	buf = kmem_alloc(mpage->m_fmrisz, KM_SLEEP);
	if (copyin(mpage->m_fmri, buf, mpage->m_fmrisz) != 0) {
		kmem_free(buf, mpage->m_fmrisz);
		return (EFAULT);
	}

	err = nvlist_unpack(buf, mpage->m_fmrisz, nvl, KM_SLEEP);
	kmem_free(buf, mpage->m_fmrisz);

	return (err);
}

static int
mm_get_paddr(nvlist_t *nvl, uint64_t *paddr)
{
	uint8_t version;
	uint64_t pa;
	char *scheme;
#ifdef __sparc
	uint64_t offset;
	char *unum;
	char **serids;
	uint_t nserids;
	int err;
#endif

	/* Verify the FMRI scheme name and version number. */
	if ((nvlist_lookup_string(nvl, FM_FMRI_SCHEME, &scheme) != 0) ||
	    (strcmp(scheme, FM_FMRI_SCHEME_MEM) != 0) ||
	    (nvlist_lookup_uint8(nvl, FM_VERSION, &version) != 0) ||
	    version > FM_MEM_SCHEME_VERSION) {
		return (EINVAL);
	}

	/*
	 * There are two ways a physical address can be obtained from a mem
	 * scheme FMRI.  One way is to use the "offset" and "serial"
	 * members, if they are present, together with the "unum" member to
	 * calculate a physical address.  This is the preferred way since
	 * it is independent of possible changes to the programming of
	 * underlying hardware registers that may change the physical address.
	 * If the "offset" member is not present, then the address is
	 * retrieved from the "physaddr" member.
	 */
#if defined(__sparc)
	if (nvlist_lookup_uint64(nvl, FM_FMRI_MEM_OFFSET, &offset) != 0) {
		if (nvlist_lookup_uint64(nvl, FM_FMRI_MEM_PHYSADDR, &pa) !=
		    0) {
			return (EINVAL);
		}
	} else if (nvlist_lookup_string(nvl, FM_FMRI_MEM_UNUM, &unum) != 0 ||
	    nvlist_lookup_string_array(nvl, FM_FMRI_MEM_SERIAL_ID, &serids,
	    &nserids) != 0) {
		return (EINVAL);
	} else {
		err = cpu_get_mem_addr(unum, serids[0], offset, &pa);
		if (err != 0) {
			if (err == ENOTSUP) {
				/* Fall back to physaddr */
				if (nvlist_lookup_uint64(nvl,
				    FM_FMRI_MEM_PHYSADDR, &pa) != 0)
					return (EINVAL);
			} else
				return (err);
		}
	}
#elif defined(__x86)
	if (cmi_mc_unumtopa(NULL, nvl, &pa) == 0)
		return (EINVAL);
#else
#error "port me"
#endif	/* __sparc */

	*paddr = pa;
	return (0);
}
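/*
 * For illustration, a userland sketch of the minimal mem-scheme FMRI
 * that mm_get_paddr() accepts on the "physaddr" fallback path (the
 * address is hypothetical); a MEM_PAGE_FMRI_* caller would nvlist_pack()
 * such a list and pass the buffer and its size in the m_fmri and
 * m_fmrisz members of a mem_page_t:
 *
 *	#include <libnvpair.h>
 *	#include <sys/fm/protocol.h>
 *
 *	nvlist_t *nvl;
 *	char *buf = NULL;
 *	size_t sz = 0;
 *	(void) nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0);
 *	(void) nvlist_add_string(nvl, FM_FMRI_SCHEME, FM_FMRI_SCHEME_MEM);
 *	(void) nvlist_add_uint8(nvl, FM_VERSION, FM_MEM_SCHEME_VERSION);
 *	(void) nvlist_add_uint64(nvl, FM_FMRI_MEM_PHYSADDR, 0x12345000ULL);
 *	(void) nvlist_pack(nvl, &buf, &sz, NV_ENCODE_NATIVE, 0);
 */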