/* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License, Version 1.0 only * (the "License"). You may not use this file except in compliance * with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2005 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #pragma ident "%Z%%M% %I% %E% SMI" /* * Memory special file */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef __sparc extern int cpu_get_mem_name(uint64_t, uint64_t *, uint64_t, char *, int, int *); extern int cpu_get_mem_info(uint64_t, uint64_t, uint64_t *, uint64_t *, uint64_t *, int *, int *, int *); extern size_t cpu_get_name_bufsize(void); #endif /* * Turn a byte length into a pagecount. The DDI btop takes a * 32-bit size on 32-bit machines, this handles 64-bit sizes for * large physical-memory 32-bit machines. */ #define BTOP(x) ((pgcnt_t)((x) >> _pageshift)) static kmutex_t mm_lock; static caddr_t mm_map; static dev_info_t *mm_dip; /* private copy of devinfo pointer */ static int mm_kmem_io_access; static int mm_kstat_update(kstat_t *ksp, int rw); static int mm_kstat_snapshot(kstat_t *ksp, void *buf, int rw); /*ARGSUSED1*/ static int mm_attach(dev_info_t *devi, ddi_attach_cmd_t cmd) { int i; struct mem_minor { char *name; minor_t minor; int privonly; const char *rdpriv; const char *wrpriv; mode_t priv_mode; } mm[] = { { "mem", M_MEM, 0, NULL, "all", 0640 }, { "kmem", M_KMEM, 0, NULL, "all", 0640 }, { "allkmem", M_ALLKMEM, 0, "all", "all", 0600 }, { "null", M_NULL, PRIVONLY_DEV, NULL, NULL, 0666 }, { "zero", M_ZERO, PRIVONLY_DEV, NULL, NULL, 0666 }, }; kstat_t *ksp; mutex_init(&mm_lock, NULL, MUTEX_DEFAULT, NULL); mm_map = vmem_alloc(heap_arena, PAGESIZE, VM_SLEEP); for (i = 0; i < (sizeof (mm) / sizeof (mm[0])); i++) { if (ddi_create_priv_minor_node(devi, mm[i].name, S_IFCHR, mm[i].minor, DDI_PSEUDO, mm[i].privonly, mm[i].rdpriv, mm[i].wrpriv, mm[i].priv_mode) == DDI_FAILURE) { ddi_remove_minor_node(devi, NULL); return (DDI_FAILURE); } } mm_dip = devi; ksp = kstat_create("mm", 0, "phys_installed", "misc", KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VAR_SIZE | KSTAT_FLAG_VIRTUAL); if (ksp != NULL) { ksp->ks_update = mm_kstat_update; ksp->ks_snapshot = mm_kstat_snapshot; ksp->ks_lock = &mm_lock; /* XXX - not really needed */ kstat_install(ksp); } mm_kmem_io_access = ddi_getprop(DDI_DEV_T_ANY, devi, DDI_PROP_DONTPASS, "kmem_io_access", 0); return (DDI_SUCCESS); } /*ARGSUSED*/ static int mm_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result) { register int error; switch (infocmd) { case DDI_INFO_DEVT2DEVINFO: *result = (void *)mm_dip; error = DDI_SUCCESS; break; case DDI_INFO_DEVT2INSTANCE: *result = (void *)0; error = DDI_SUCCESS; break; default: error = DDI_FAILURE; } return (error); } /*ARGSUSED1*/ static int mmopen(dev_t *devp, int flag, int typ, struct cred *cred) { switch (getminor(*devp)) { case M_NULL: case M_ZERO: case M_MEM: case M_KMEM: case M_ALLKMEM: /* standard devices */ break; default: /* Unsupported or unknown type */ return (EINVAL); } return (0); } struct pollhead mm_pollhd; /*ARGSUSED*/ static int mmchpoll(dev_t dev, short events, int anyyet, short *reventsp, struct pollhead **phpp) { switch (getminor(dev)) { case M_NULL: case M_ZERO: case M_MEM: case M_KMEM: case M_ALLKMEM: *reventsp = events & (POLLIN | POLLOUT | POLLPRI | POLLRDNORM | POLLWRNORM | POLLRDBAND | POLLWRBAND); /* * A non NULL pollhead pointer should be returned in case * user polls for 0 events. */ *phpp = !anyyet && !*reventsp ? &mm_pollhd : (struct pollhead *)NULL; return (0); default: /* no other devices currently support polling */ return (ENXIO); } } static int mmpropop(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op, int flags, char *name, caddr_t valuep, int *lengthp) { /* * implement zero size to reduce overhead (avoid two failing * property lookups per stat). */ return (ddi_prop_op_size(dev, dip, prop_op, flags, name, valuep, lengthp, 0)); } static int mmio(struct uio *uio, enum uio_rw rw, pfn_t pfn, off_t pageoff, int allowio) { int error = 0; size_t nbytes = MIN((size_t)(PAGESIZE - pageoff), (size_t)uio->uio_iov->iov_len); mutex_enter(&mm_lock); hat_devload(kas.a_hat, mm_map, PAGESIZE, pfn, (uint_t)(rw == UIO_READ ? PROT_READ : PROT_READ | PROT_WRITE), HAT_LOAD_NOCONSIST | HAT_LOAD_LOCK); if (!pf_is_memory(pfn)) { if (allowio) { size_t c = uio->uio_iov->iov_len; if (ddi_peekpokeio(NULL, uio, rw, (caddr_t)(uintptr_t)uio->uio_loffset, c, sizeof (int32_t)) != DDI_SUCCESS) error = EFAULT; } else error = EIO; } else error = uiomove(&mm_map[pageoff], nbytes, rw, uio); hat_unload(kas.a_hat, mm_map, PAGESIZE, HAT_UNLOAD_UNLOCK); mutex_exit(&mm_lock); return (error); } #ifdef __sparc #define IS_KPM_VA(va) \ (kpm_enable && (va) >= segkpm->s_base && \ (va) < (segkpm->s_base + segkpm->s_size)) #define IS_KP_VA(va) \ ((va) >= segkp->s_base && (va) < segkp->s_base + segkp->s_size) #define NEED_LOCK_KVADDR(va) (!IS_KPM_VA(va) && !IS_KP_VA(va)) #else /* __i386, __amd64 */ #define NEED_LOCK_KVADDR(va) 0 #endif /* __sparc */ /*ARGSUSED3*/ static int mmrw(dev_t dev, struct uio *uio, enum uio_rw rw, cred_t *cred) { pfn_t v; struct iovec *iov; int error = 0; size_t c; ssize_t oresid = uio->uio_resid; minor_t minor = getminor(dev); while (uio->uio_resid > 0 && error == 0) { iov = uio->uio_iov; if (iov->iov_len == 0) { uio->uio_iov++; uio->uio_iovcnt--; if (uio->uio_iovcnt < 0) panic("mmrw"); continue; } switch (minor) { case M_MEM: memlist_read_lock(); if (!address_in_memlist(phys_install, (uint64_t)uio->uio_loffset, 1)) { memlist_read_unlock(); error = EFAULT; break; } memlist_read_unlock(); v = BTOP((u_offset_t)uio->uio_loffset); error = mmio(uio, rw, v, uio->uio_loffset & PAGEOFFSET, 0); break; case M_KMEM: case M_ALLKMEM: { page_t **ppp; caddr_t vaddr = (caddr_t)uio->uio_offset; int try_lock = NEED_LOCK_KVADDR(vaddr); int locked = 0; /* * If vaddr does not map a valid page, as_pagelock() * will return failure. Hence we can't check the * return value and return EFAULT here as we'd like. * seg_kp and seg_kpm do not properly support * as_pagelock() for this context so we avoid it * using the try_lock set check above. Some day when * the kernel page locking gets redesigned all this * muck can be cleaned up. */ if (try_lock) locked = (as_pagelock(&kas, &ppp, vaddr, PAGESIZE, S_WRITE) == 0); v = hat_getpfnum(kas.a_hat, (caddr_t)uio->uio_loffset); if (v == PFN_INVALID) { if (locked) as_pageunlock(&kas, ppp, vaddr, PAGESIZE, S_WRITE); error = EFAULT; break; } error = mmio(uio, rw, v, uio->uio_loffset & PAGEOFFSET, minor == M_ALLKMEM || mm_kmem_io_access); if (locked) as_pageunlock(&kas, ppp, vaddr, PAGESIZE, S_WRITE); } break; case M_ZERO: if (rw == UIO_READ) { label_t ljb; if (on_fault(&ljb)) { no_fault(); error = EFAULT; break; } uzero(iov->iov_base, iov->iov_len); no_fault(); uio->uio_resid -= iov->iov_len; uio->uio_loffset += iov->iov_len; break; } /* else it's a write, fall through to NULL case */ /*FALLTHROUGH*/ case M_NULL: if (rw == UIO_READ) return (0); c = iov->iov_len; iov->iov_base += c; iov->iov_len -= c; uio->uio_loffset += c; uio->uio_resid -= c; break; } } return (uio->uio_resid == oresid ? error : 0); } static int mmread(dev_t dev, struct uio *uio, cred_t *cred) { return (mmrw(dev, uio, UIO_READ, cred)); } static int mmwrite(dev_t dev, struct uio *uio, cred_t *cred) { return (mmrw(dev, uio, UIO_WRITE, cred)); } /* * Private ioctl for libkvm to support kvm_physaddr(). * Given an address space and a VA, compute the PA. */ static int mmioctl_vtop(intptr_t data) { mem_vtop_t mem_vtop; proc_t *p; pfn_t pfn = (pfn_t)PFN_INVALID; pid_t pid = 0; struct as *as; struct seg *seg; if (copyin((void *)data, &mem_vtop, sizeof (mem_vtop_t))) return (EFAULT); if (mem_vtop.m_as == &kas) { pfn = hat_getpfnum(kas.a_hat, mem_vtop.m_va); } else if (mem_vtop.m_as == NULL) { return (EIO); } else { mutex_enter(&pidlock); for (p = practive; p != NULL; p = p->p_next) { if (p->p_as == mem_vtop.m_as) { pid = p->p_pid; break; } } mutex_exit(&pidlock); if (p == NULL) return (EIO); p = sprlock(pid); if (p == NULL) return (EIO); as = p->p_as; if (as == mem_vtop.m_as) { mutex_exit(&p->p_lock); AS_LOCK_ENTER(as, &as->a_lock, RW_READER); for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) if ((uintptr_t)mem_vtop.m_va - (uintptr_t)seg->s_base < seg->s_size) break; if (seg != NULL) pfn = hat_getpfnum(as->a_hat, mem_vtop.m_va); AS_LOCK_EXIT(as, &as->a_lock); mutex_enter(&p->p_lock); } sprunlock(p); } mem_vtop.m_pfn = pfn; if (pfn == PFN_INVALID) return (EIO); if (copyout(&mem_vtop, (void *)data, sizeof (mem_vtop_t))) return (EFAULT); return (0); } /* * Given a PA, retire that page or check whether it has already been retired. */ static int mmioctl_page_retire(int cmd, intptr_t data) { uint64_t pa; pfn_t pfn; page_t *pp; if (copyin((void *)data, &pa, sizeof (uint64_t))) return (EFAULT); pfn = pa >> MMU_PAGESHIFT; if (!pf_is_memory(pfn) || (pp = page_numtopp_nolock(pfn)) == NULL) return (EINVAL); /* * If we're checking, see if the page is retired; if not, confirm that * its status is at least set to be failing. If neither, return EIO. */ if (cmd == MEM_PAGE_ISRETIRED) { if (page_isretired(pp)) return (0); if (!page_isfailing(pp)) return (EIO); return (EAGAIN); } /* * Try to retire the page. If the retire fails, it will be scheduled to * occur when the page is freed. If this page is out of circulation * already, or is in the process of being retired, we fail. */ if (page_isretired(pp) || page_isfailing(pp)) return (EIO); page_settoxic(pp, PAGE_IS_FAULTY); return (page_retire(pp, PAGE_IS_FAILING) ? EAGAIN : 0); } #ifdef __sparc /* * Given a syndrome, syndrome type, and address return the * associated memory name in the provided data buffer. */ static int mmioctl_get_mem_name(intptr_t data) { mem_name_t mem_name; #ifdef _SYSCALL32 mem_name32_t mem_name32; #endif void *buf; size_t bufsize; int len, err; if ((bufsize = cpu_get_name_bufsize()) == 0) return (ENOTSUP); if (get_udatamodel() == DATAMODEL_NATIVE) { if (copyin((void *)data, &mem_name, sizeof (mem_name_t))) return (EFAULT); } #ifdef _SYSCALL32 else { if (copyin((void *)data, &mem_name32, sizeof (mem_name32_t))) return (EFAULT); mem_name.m_addr = mem_name32.m_addr; mem_name.m_synd = mem_name32.m_synd; mem_name.m_type[0] = mem_name32.m_type[0]; mem_name.m_type[1] = mem_name32.m_type[1]; mem_name.m_name = (caddr_t)mem_name32.m_name; mem_name.m_namelen = (size_t)mem_name32.m_namelen; } #endif /* _SYSCALL32 */ buf = kmem_alloc(bufsize, KM_SLEEP); /* * Call into cpu specific code to do the lookup. */ if ((err = cpu_get_mem_name(mem_name.m_synd, mem_name.m_type, mem_name.m_addr, buf, bufsize, &len)) != 0) { kmem_free(buf, bufsize); return (err); } if (len >= mem_name.m_namelen) { kmem_free(buf, bufsize); return (ENAMETOOLONG); } if (copyoutstr(buf, (char *)mem_name.m_name, mem_name.m_namelen, NULL) != 0) { kmem_free(buf, bufsize); return (EFAULT); } kmem_free(buf, bufsize); return (0); } /* * Given a syndrome and address return information about the associated memory. */ static int mmioctl_get_mem_info(intptr_t data) { mem_info_t mem_info; int err; if (copyin((void *)data, &mem_info, sizeof (mem_info_t))) return (EFAULT); if ((err = cpu_get_mem_info(mem_info.m_synd, mem_info.m_addr, &mem_info.m_mem_size, &mem_info.m_seg_size, &mem_info.m_bank_size, &mem_info.m_segments, &mem_info.m_banks, &mem_info.m_mcid)) != 0) return (err); if (copyout(&mem_info, (void *)data, sizeof (mem_info_t)) != 0) return (EFAULT); return (0); } #endif /* __sparc */ /* * Private ioctls for * libkvm to support kvm_physaddr(). * FMA support for page_retire() and memory attribute information. */ /*ARGSUSED*/ static int mmioctl(dev_t dev, int cmd, intptr_t data, int flag, cred_t *cred, int *rvalp) { switch (cmd) { case MEM_VTOP: if (getminor(dev) != M_KMEM) return (ENXIO); return (mmioctl_vtop(data)); case MEM_PAGE_RETIRE: case MEM_PAGE_ISRETIRED: if (getminor(dev) != M_MEM) return (ENXIO); return (mmioctl_page_retire(cmd, data)); case MEM_NAME: if (getminor(dev) != M_MEM) return (ENXIO); #ifdef __sparc return (mmioctl_get_mem_name(data)); #else return (ENOTSUP); #endif case MEM_INFO: if (getminor(dev) != M_MEM) return (ENXIO); #ifdef __sparc return (mmioctl_get_mem_info(data)); #else return (ENOTSUP); #endif } return (ENXIO); } /*ARGSUSED2*/ static int mmmmap(dev_t dev, off_t off, int prot) { pfn_t pf; struct memlist *pmem; minor_t minor = getminor(dev); switch (minor) { case M_MEM: pf = btop(off); memlist_read_lock(); for (pmem = phys_install; pmem != NULL; pmem = pmem->next) { if (pf >= BTOP(pmem->address) && pf < BTOP(pmem->address + pmem->size)) { memlist_read_unlock(); return (impl_obmem_pfnum(pf)); } } memlist_read_unlock(); break; case M_KMEM: case M_ALLKMEM: /* no longer supported with KPR */ return (-1); case M_ZERO: /* * We shouldn't be mmap'ing to /dev/zero here as * mmsegmap() should have already converted * a mapping request for this device to a mapping * using seg_vn for anonymous memory. */ break; } return (-1); } /* * This function is called when a memory device is mmap'ed. * Set up the mapping to the correct device driver. */ static int mmsegmap(dev_t dev, off_t off, struct as *as, caddr_t *addrp, off_t len, uint_t prot, uint_t maxprot, uint_t flags, struct cred *cred) { struct segvn_crargs vn_a; struct segdev_crargs dev_a; int error; minor_t minor; off_t i; minor = getminor(dev); as_rangelock(as); if ((flags & MAP_FIXED) == 0) { /* * No need to worry about vac alignment on /dev/zero * since this is a "clone" object that doesn't yet exist. */ map_addr(addrp, len, (offset_t)off, (minor == M_MEM) || (minor == M_KMEM), flags); if (*addrp == NULL) { as_rangeunlock(as); return (ENOMEM); } } else { /* * User specified address - * Blow away any previous mappings. */ (void) as_unmap(as, *addrp, len); } switch (minor) { case M_MEM: /* /dev/mem cannot be mmap'ed with MAP_PRIVATE */ if ((flags & MAP_TYPE) != MAP_SHARED) { as_rangeunlock(as); return (EINVAL); } /* * Check to ensure that the entire range is * legal and we are not trying to map in * more than the device will let us. */ for (i = 0; i < len; i += PAGESIZE) { if (mmmmap(dev, off + i, maxprot) == -1) { as_rangeunlock(as); return (ENXIO); } } /* * Use seg_dev segment driver for /dev/mem mapping. */ dev_a.mapfunc = mmmmap; dev_a.dev = dev; dev_a.offset = off; dev_a.type = (flags & MAP_TYPE); dev_a.prot = (uchar_t)prot; dev_a.maxprot = (uchar_t)maxprot; dev_a.hat_attr = 0; /* * Make /dev/mem mappings non-consistent since we can't * alias pages that don't have page structs behind them, * such as kernel stack pages. If someone mmap()s a kernel * stack page and if we give him a tte with cv, a line from * that page can get into both pages of the spitfire d$. * But snoop from another processor will only invalidate * the first page. This later caused kernel (xc_attention) * to go into an infinite loop at pil 13 and no interrupts * could come in. See 1203630. * */ dev_a.hat_flags = HAT_LOAD_NOCONSIST; dev_a.devmap_data = NULL; error = as_map(as, *addrp, len, segdev_create, &dev_a); break; case M_ZERO: /* * Use seg_vn segment driver for /dev/zero mapping. * Passing in a NULL amp gives us the "cloning" effect. */ vn_a.vp = NULL; vn_a.offset = 0; vn_a.type = (flags & MAP_TYPE); vn_a.prot = prot; vn_a.maxprot = maxprot; vn_a.flags = flags & ~MAP_TYPE; vn_a.cred = cred; vn_a.amp = NULL; vn_a.szc = 0; vn_a.lgrp_mem_policy_flags = 0; error = as_map(as, *addrp, len, segvn_create, &vn_a); break; case M_KMEM: case M_ALLKMEM: /* No longer supported with KPR. */ error = ENXIO; break; case M_NULL: /* * Use seg_dev segment driver for /dev/null mapping. */ dev_a.mapfunc = mmmmap; dev_a.dev = dev; dev_a.offset = off; dev_a.type = 0; /* neither PRIVATE nor SHARED */ dev_a.prot = dev_a.maxprot = (uchar_t)PROT_NONE; dev_a.hat_attr = 0; dev_a.hat_flags = 0; error = as_map(as, *addrp, len, segdev_create, &dev_a); break; default: error = ENXIO; } as_rangeunlock(as); return (error); } static struct cb_ops mm_cb_ops = { mmopen, /* open */ nulldev, /* close */ nodev, /* strategy */ nodev, /* print */ nodev, /* dump */ mmread, /* read */ mmwrite, /* write */ mmioctl, /* ioctl */ nodev, /* devmap */ mmmmap, /* mmap */ mmsegmap, /* segmap */ mmchpoll, /* poll */ mmpropop, /* prop_op */ 0, /* streamtab */ D_NEW | D_MP | D_64BIT | D_U64BIT }; static struct dev_ops mm_ops = { DEVO_REV, /* devo_rev, */ 0, /* refcnt */ mm_info, /* get_dev_info */ nulldev, /* identify */ nulldev, /* probe */ mm_attach, /* attach */ nodev, /* detach */ nodev, /* reset */ &mm_cb_ops, /* driver operations */ (struct bus_ops *)0 /* bus operations */ }; static struct modldrv modldrv = { &mod_driverops, "memory driver %I%", &mm_ops, }; static struct modlinkage modlinkage = { MODREV_1, &modldrv, NULL }; int _init(void) { return (mod_install(&modlinkage)); } int _info(struct modinfo *modinfop) { return (mod_info(&modlinkage, modinfop)); } int _fini(void) { return (mod_remove(&modlinkage)); } static int mm_kstat_update(kstat_t *ksp, int rw) { struct memlist *pmem; uint_t count; if (rw == KSTAT_WRITE) return (EACCES); count = 0; memlist_read_lock(); for (pmem = phys_install; pmem != NULL; pmem = pmem->next) { count++; } memlist_read_unlock(); ksp->ks_ndata = count; ksp->ks_data_size = count * 2 * sizeof (uint64_t); return (0); } static int mm_kstat_snapshot(kstat_t *ksp, void *buf, int rw) { struct memlist *pmem; struct memunit { uint64_t address; uint64_t size; } *kspmem; if (rw == KSTAT_WRITE) return (EACCES); ksp->ks_snaptime = gethrtime(); kspmem = (struct memunit *)buf; memlist_read_lock(); for (pmem = phys_install; pmem != NULL; pmem = pmem->next, kspmem++) { if ((caddr_t)kspmem >= (caddr_t)buf + ksp->ks_data_size) break; kspmem->address = pmem->address; kspmem->size = pmem->size; } memlist_read_unlock(); return (0); }