xref: /illumos-gate/usr/src/uts/common/io/mem.c (revision 62c8caf3fac65817982e780c1efa988846153bf0)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  * Memory special file
28  */
29 
30 #include <sys/types.h>
31 #include <sys/param.h>
32 #include <sys/user.h>
33 #include <sys/buf.h>
34 #include <sys/systm.h>
35 #include <sys/cred.h>
36 #include <sys/vm.h>
37 #include <sys/uio.h>
38 #include <sys/mman.h>
39 #include <sys/kmem.h>
40 #include <vm/seg.h>
41 #include <vm/page.h>
42 #include <sys/stat.h>
43 #include <sys/vmem.h>
44 #include <sys/memlist.h>
45 #include <sys/bootconf.h>
46 
47 #include <vm/seg_vn.h>
48 #include <vm/seg_dev.h>
49 #include <vm/seg_kmem.h>
50 #include <vm/seg_kp.h>
51 #include <vm/seg_kpm.h>
52 #include <vm/hat.h>
53 
54 #include <sys/conf.h>
55 #include <sys/mem.h>
56 #include <sys/types.h>
57 #include <sys/conf.h>
58 #include <sys/param.h>
59 #include <sys/systm.h>
60 #include <sys/errno.h>
61 #include <sys/modctl.h>
62 #include <sys/memlist.h>
63 #include <sys/ddi.h>
64 #include <sys/sunddi.h>
65 #include <sys/debug.h>
66 #include <sys/fm/protocol.h>
67 
68 #if defined(__sparc)
69 extern int cpu_get_mem_name(uint64_t, uint64_t *, uint64_t, char *, int, int *);
70 extern int cpu_get_mem_info(uint64_t, uint64_t, uint64_t *, uint64_t *,
71     uint64_t *, int *, int *, int *);
72 extern size_t cpu_get_name_bufsize(void);
73 extern int cpu_get_mem_sid(char *, char *, int, int *);
74 extern int cpu_get_mem_addr(char *, char *, uint64_t, uint64_t *);
75 #elif defined(__x86)
76 #include <sys/cpu_module.h>
77 #endif	/* __sparc */
78 
79 /*
80  * Turn a byte length into a pagecount.  The DDI btop takes a
81  * 32-bit size on 32-bit machines, this handles 64-bit sizes for
82  * large physical-memory 32-bit machines.
83  */
84 #define	BTOP(x)	((pgcnt_t)((x) >> _pageshift))
85 
86 static kmutex_t mm_lock;
87 static caddr_t mm_map;
88 
89 static dev_info_t *mm_dip;	/* private copy of devinfo pointer */
90 
91 static int mm_kmem_io_access;
92 
93 static int mm_kstat_update(kstat_t *ksp, int rw);
94 static int mm_kstat_snapshot(kstat_t *ksp, void *buf, int rw);
95 
96 static int mm_read_mem_name(intptr_t data, mem_name_t *mem_name);
97 
98 /*ARGSUSED1*/
99 static int
100 mm_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
101 {
102 	int i;
103 	struct mem_minor {
104 		char *name;
105 		minor_t minor;
106 		int privonly;
107 		const char *rdpriv;
108 		const char *wrpriv;
109 		mode_t priv_mode;
110 	} mm[] = {
111 		{ "mem",	M_MEM,		0,	NULL,	"all",	0640 },
112 		{ "kmem",	M_KMEM,		0,	NULL,	"all",	0640 },
113 		{ "allkmem",	M_ALLKMEM,	0,	"all",	"all",	0600 },
114 		{ "null",	M_NULL,	PRIVONLY_DEV,	NULL,	NULL,	0666 },
115 		{ "zero",	M_ZERO, PRIVONLY_DEV,	NULL,	NULL,	0666 },
116 	};
117 	kstat_t *ksp;
118 
119 	mutex_init(&mm_lock, NULL, MUTEX_DEFAULT, NULL);
120 	mm_map = vmem_alloc(heap_arena, PAGESIZE, VM_SLEEP);
121 
122 	for (i = 0; i < (sizeof (mm) / sizeof (mm[0])); i++) {
123 		if (ddi_create_priv_minor_node(devi, mm[i].name, S_IFCHR,
124 		    mm[i].minor, DDI_PSEUDO, mm[i].privonly,
125 		    mm[i].rdpriv, mm[i].wrpriv, mm[i].priv_mode) ==
126 		    DDI_FAILURE) {
127 			ddi_remove_minor_node(devi, NULL);
128 			return (DDI_FAILURE);
129 		}
130 	}
131 
132 	mm_dip = devi;
133 
134 	ksp = kstat_create("mm", 0, "phys_installed", "misc",
135 	    KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VAR_SIZE | KSTAT_FLAG_VIRTUAL);
136 	if (ksp != NULL) {
137 		ksp->ks_update = mm_kstat_update;
138 		ksp->ks_snapshot = mm_kstat_snapshot;
139 		ksp->ks_lock = &mm_lock; /* XXX - not really needed */
140 		kstat_install(ksp);
141 	}
142 
143 	mm_kmem_io_access = ddi_getprop(DDI_DEV_T_ANY, devi, DDI_PROP_DONTPASS,
144 	    "kmem_io_access", 0);
145 
146 	return (DDI_SUCCESS);
147 }
148 
149 /*ARGSUSED*/
150 static int
151 mm_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
152 {
153 	register int error;
154 
155 	switch (infocmd) {
156 	case DDI_INFO_DEVT2DEVINFO:
157 		*result = (void *)mm_dip;
158 		error = DDI_SUCCESS;
159 		break;
160 	case DDI_INFO_DEVT2INSTANCE:
161 		*result = (void *)0;
162 		error = DDI_SUCCESS;
163 		break;
164 	default:
165 		error = DDI_FAILURE;
166 	}
167 	return (error);
168 }
169 
170 /*ARGSUSED1*/
171 static int
172 mmopen(dev_t *devp, int flag, int typ, struct cred *cred)
173 {
174 	switch (getminor(*devp)) {
175 	case M_NULL:
176 	case M_ZERO:
177 	case M_MEM:
178 	case M_KMEM:
179 	case M_ALLKMEM:
180 		/* standard devices */
181 		break;
182 
183 	default:
184 		/* Unsupported or unknown type */
185 		return (EINVAL);
186 	}
187 	/* must be character device */
188 	if (typ != OTYP_CHR)
189 		return (EINVAL);
190 	return (0);
191 }
192 
193 struct pollhead	mm_pollhd;
194 
195 /*ARGSUSED*/
196 static int
197 mmchpoll(dev_t dev, short events, int anyyet, short *reventsp,
198     struct pollhead **phpp)
199 {
200 	switch (getminor(dev)) {
201 	case M_NULL:
202 	case M_ZERO:
203 	case M_MEM:
204 	case M_KMEM:
205 	case M_ALLKMEM:
206 		*reventsp = events & (POLLIN | POLLOUT | POLLPRI | POLLRDNORM |
207 		    POLLWRNORM | POLLRDBAND | POLLWRBAND);
208 		/*
209 		 * A non NULL pollhead pointer should be returned in case
210 		 * user polls for 0 events.
211 		 */
212 		*phpp = !anyyet && !*reventsp ?
213 		    &mm_pollhd : (struct pollhead *)NULL;
214 		return (0);
215 	default:
216 		/* no other devices currently support polling */
217 		return (ENXIO);
218 	}
219 }
220 
221 static int
222 mmpropop(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op, int flags,
223     char *name, caddr_t valuep, int *lengthp)
224 {
225 	/*
226 	 * implement zero size to reduce overhead (avoid two failing
227 	 * property lookups per stat).
228 	 */
229 	return (ddi_prop_op_size(dev, dip, prop_op,
230 	    flags, name, valuep, lengthp, 0));
231 }
232 
233 static int
234 mmio(struct uio *uio, enum uio_rw rw, pfn_t pfn, off_t pageoff, int allowio)
235 {
236 	int error = 0;
237 	size_t nbytes = MIN((size_t)(PAGESIZE - pageoff),
238 	    (size_t)uio->uio_iov->iov_len);
239 
240 	mutex_enter(&mm_lock);
241 	hat_devload(kas.a_hat, mm_map, PAGESIZE, pfn,
242 	    (uint_t)(rw == UIO_READ ? PROT_READ : PROT_READ | PROT_WRITE),
243 	    HAT_LOAD_NOCONSIST | HAT_LOAD_LOCK);
244 
245 	if (!pf_is_memory(pfn)) {
246 		if (allowio) {
247 			size_t c = uio->uio_iov->iov_len;
248 
249 			if (ddi_peekpokeio(NULL, uio, rw,
250 			    (caddr_t)(uintptr_t)uio->uio_loffset, c,
251 			    sizeof (int32_t)) != DDI_SUCCESS)
252 				error = EFAULT;
253 		} else
254 			error = EIO;
255 	} else
256 		error = uiomove(&mm_map[pageoff], nbytes, rw, uio);
257 
258 	hat_unload(kas.a_hat, mm_map, PAGESIZE, HAT_UNLOAD_UNLOCK);
259 	mutex_exit(&mm_lock);
260 	return (error);
261 }
262 
263 static int
264 mmpagelock(struct as *as, caddr_t va)
265 {
266 	struct seg *seg;
267 	int i;
268 
269 	AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
270 	seg = as_segat(as, va);
271 	i = (seg != NULL)? SEGOP_CAPABLE(seg, S_CAPABILITY_NOMINFLT) : 0;
272 	AS_LOCK_EXIT(as, &as->a_lock);
273 
274 	return (i);
275 }
276 
/*
 * NEED_LOCK_KVADDR(va): non-zero when the kernel page at `va' should be
 * locked with as_pagelock() before it is accessed.  On sparc this asks
 * mmpagelock(); on the other platforms it is always 0.
 */
#if defined(__sparc)

#define	NEED_LOCK_KVADDR(va)	mmpagelock(&kas, va)

#else	/* __i386, __amd64 */

#define	NEED_LOCK_KVADDR(va)	0

#endif	/* __sparc */
286 
287 /*ARGSUSED3*/
288 static int
289 mmrw(dev_t dev, struct uio *uio, enum uio_rw rw, cred_t *cred)
290 {
291 	pfn_t v;
292 	struct iovec *iov;
293 	int error = 0;
294 	size_t c;
295 	ssize_t oresid = uio->uio_resid;
296 	minor_t minor = getminor(dev);
297 
298 	while (uio->uio_resid > 0 && error == 0) {
299 		iov = uio->uio_iov;
300 		if (iov->iov_len == 0) {
301 			uio->uio_iov++;
302 			uio->uio_iovcnt--;
303 			if (uio->uio_iovcnt < 0)
304 				panic("mmrw");
305 			continue;
306 		}
307 		switch (minor) {
308 
309 		case M_MEM:
310 			memlist_read_lock();
311 			if (!address_in_memlist(phys_install,
312 			    (uint64_t)uio->uio_loffset, 1)) {
313 				memlist_read_unlock();
314 				error = EFAULT;
315 				break;
316 			}
317 			memlist_read_unlock();
318 
319 			v = BTOP((u_offset_t)uio->uio_loffset);
320 			error = mmio(uio, rw, v,
321 			    uio->uio_loffset & PAGEOFFSET, 0);
322 			break;
323 
324 		case M_KMEM:
325 		case M_ALLKMEM:
326 			{
327 			page_t **ppp;
328 			caddr_t vaddr = (caddr_t)uio->uio_offset;
329 			int try_lock = NEED_LOCK_KVADDR(vaddr);
330 			int locked = 0;
331 
332 			if ((error = plat_mem_do_mmio(uio, rw)) != ENOTSUP)
333 				break;
334 
335 			/*
336 			 * If vaddr does not map a valid page, as_pagelock()
337 			 * will return failure. Hence we can't check the
338 			 * return value and return EFAULT here as we'd like.
339 			 * seg_kp and seg_kpm do not properly support
340 			 * as_pagelock() for this context so we avoid it
341 			 * using the try_lock set check above.  Some day when
342 			 * the kernel page locking gets redesigned all this
343 			 * muck can be cleaned up.
344 			 */
345 			if (try_lock)
346 				locked = (as_pagelock(&kas, &ppp, vaddr,
347 				    PAGESIZE, S_WRITE) == 0);
348 
349 			v = hat_getpfnum(kas.a_hat,
350 			    (caddr_t)(uintptr_t)uio->uio_loffset);
351 			if (v == PFN_INVALID) {
352 				if (locked)
353 					as_pageunlock(&kas, ppp, vaddr,
354 					    PAGESIZE, S_WRITE);
355 				error = EFAULT;
356 				break;
357 			}
358 
359 			error = mmio(uio, rw, v, uio->uio_loffset & PAGEOFFSET,
360 			    minor == M_ALLKMEM || mm_kmem_io_access);
361 			if (locked)
362 				as_pageunlock(&kas, ppp, vaddr, PAGESIZE,
363 				    S_WRITE);
364 			}
365 
366 			break;
367 
368 		case M_ZERO:
369 			if (rw == UIO_READ) {
370 				label_t ljb;
371 
372 				if (on_fault(&ljb)) {
373 					no_fault();
374 					error = EFAULT;
375 					break;
376 				}
377 				uzero(iov->iov_base, iov->iov_len);
378 				no_fault();
379 				uio->uio_resid -= iov->iov_len;
380 				uio->uio_loffset += iov->iov_len;
381 				break;
382 			}
383 			/* else it's a write, fall through to NULL case */
384 			/*FALLTHROUGH*/
385 
386 		case M_NULL:
387 			if (rw == UIO_READ)
388 				return (0);
389 			c = iov->iov_len;
390 			iov->iov_base += c;
391 			iov->iov_len -= c;
392 			uio->uio_loffset += c;
393 			uio->uio_resid -= c;
394 			break;
395 
396 		}
397 	}
398 	return (uio->uio_resid == oresid ? error : 0);
399 }
400 
401 static int
402 mmread(dev_t dev, struct uio *uio, cred_t *cred)
403 {
404 	return (mmrw(dev, uio, UIO_READ, cred));
405 }
406 
407 static int
408 mmwrite(dev_t dev, struct uio *uio, cred_t *cred)
409 {
410 	return (mmrw(dev, uio, UIO_WRITE, cred));
411 }
412 
413 /*
414  * Private ioctl for libkvm to support kvm_physaddr().
415  * Given an address space and a VA, compute the PA.
416  */
417 static int
418 mmioctl_vtop(intptr_t data)
419 {
420 #ifdef _SYSCALL32
421 	mem_vtop32_t vtop32;
422 #endif
423 	mem_vtop_t mem_vtop;
424 	proc_t *p;
425 	pfn_t pfn = (pfn_t)PFN_INVALID;
426 	pid_t pid = 0;
427 	struct as *as;
428 	struct seg *seg;
429 
430 	if (get_udatamodel() == DATAMODEL_NATIVE) {
431 		if (copyin((void *)data, &mem_vtop, sizeof (mem_vtop_t)))
432 			return (EFAULT);
433 	}
434 #ifdef _SYSCALL32
435 	else {
436 		if (copyin((void *)data, &vtop32, sizeof (mem_vtop32_t)))
437 			return (EFAULT);
438 		mem_vtop.m_as = (struct as *)(uintptr_t)vtop32.m_as;
439 		mem_vtop.m_va = (void *)(uintptr_t)vtop32.m_va;
440 
441 		if (mem_vtop.m_as != NULL)
442 			return (EINVAL);
443 	}
444 #endif
445 
446 	if (mem_vtop.m_as == &kas) {
447 		pfn = hat_getpfnum(kas.a_hat, mem_vtop.m_va);
448 	} else {
449 		if (mem_vtop.m_as == NULL) {
450 			/*
451 			 * Assume the calling process's address space if the
452 			 * caller didn't specify one.
453 			 */
454 			p = curthread->t_procp;
455 			if (p == NULL)
456 				return (EIO);
457 			mem_vtop.m_as = p->p_as;
458 		}
459 
460 		mutex_enter(&pidlock);
461 		for (p = practive; p != NULL; p = p->p_next) {
462 			if (p->p_as == mem_vtop.m_as) {
463 				pid = p->p_pid;
464 				break;
465 			}
466 		}
467 		mutex_exit(&pidlock);
468 		if (p == NULL)
469 			return (EIO);
470 		p = sprlock(pid);
471 		if (p == NULL)
472 			return (EIO);
473 		as = p->p_as;
474 		if (as == mem_vtop.m_as) {
475 			mutex_exit(&p->p_lock);
476 			AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
477 			for (seg = AS_SEGFIRST(as); seg != NULL;
478 			    seg = AS_SEGNEXT(as, seg))
479 				if ((uintptr_t)mem_vtop.m_va -
480 				    (uintptr_t)seg->s_base < seg->s_size)
481 					break;
482 			if (seg != NULL)
483 				pfn = hat_getpfnum(as->a_hat, mem_vtop.m_va);
484 			AS_LOCK_EXIT(as, &as->a_lock);
485 			mutex_enter(&p->p_lock);
486 		}
487 		sprunlock(p);
488 	}
489 	mem_vtop.m_pfn = pfn;
490 	if (pfn == PFN_INVALID)
491 		return (EIO);
492 
493 	if (get_udatamodel() == DATAMODEL_NATIVE) {
494 		if (copyout(&mem_vtop, (void *)data, sizeof (mem_vtop_t)))
495 			return (EFAULT);
496 	}
497 #ifdef _SYSCALL32
498 	else {
499 		vtop32.m_pfn = mem_vtop.m_pfn;
500 		if (copyout(&vtop32, (void *)data, sizeof (mem_vtop32_t)))
501 			return (EFAULT);
502 	}
503 #endif
504 
505 	return (0);
506 }
507 
508 /*
509  * Given a PA, execute the given page retire command on it.
510  */
511 static int
512 mmioctl_page_retire(int cmd, intptr_t data)
513 {
514 	extern int page_retire_test(void);
515 	uint64_t pa;
516 
517 	if (copyin((void *)data, &pa, sizeof (uint64_t))) {
518 		return (EFAULT);
519 	}
520 
521 	switch (cmd) {
522 	case MEM_PAGE_ISRETIRED:
523 		return (page_retire_check(pa, NULL));
524 
525 	case MEM_PAGE_UNRETIRE:
526 		return (page_unretire(pa));
527 
528 	case MEM_PAGE_RETIRE:
529 		return (page_retire(pa, PR_FMA));
530 
531 	case MEM_PAGE_RETIRE_MCE:
532 		return (page_retire(pa, PR_MCE));
533 
534 	case MEM_PAGE_RETIRE_UE:
535 		return (page_retire(pa, PR_UE));
536 
537 	case MEM_PAGE_GETERRORS:
538 		{
539 			uint64_t page_errors;
540 			int rc = page_retire_check(pa, &page_errors);
541 			if (copyout(&page_errors, (void *)data,
542 			    sizeof (uint64_t))) {
543 				return (EFAULT);
544 			}
545 			return (rc);
546 		}
547 
548 	case MEM_PAGE_RETIRE_TEST:
549 		return (page_retire_test());
550 
551 	}
552 
553 	return (EINVAL);
554 }
555 
556 #ifdef __sparc
557 /*
558  * Given a syndrome, syndrome type, and address return the
559  * associated memory name in the provided data buffer.
560  */
561 static int
562 mmioctl_get_mem_name(intptr_t data)
563 {
564 	mem_name_t mem_name;
565 	void *buf;
566 	size_t bufsize;
567 	int len, err;
568 
569 	if ((bufsize = cpu_get_name_bufsize()) == 0)
570 		return (ENOTSUP);
571 
572 	if ((err = mm_read_mem_name(data, &mem_name)) < 0)
573 		return (err);
574 
575 	buf = kmem_alloc(bufsize, KM_SLEEP);
576 
577 	/*
578 	 * Call into cpu specific code to do the lookup.
579 	 */
580 	if ((err = cpu_get_mem_name(mem_name.m_synd, mem_name.m_type,
581 	    mem_name.m_addr, buf, bufsize, &len)) != 0) {
582 		kmem_free(buf, bufsize);
583 		return (err);
584 	}
585 
586 	if (len >= mem_name.m_namelen) {
587 		kmem_free(buf, bufsize);
588 		return (ENOSPC);
589 	}
590 
591 	if (copyoutstr(buf, (char *)mem_name.m_name,
592 	    mem_name.m_namelen, NULL) != 0) {
593 		kmem_free(buf, bufsize);
594 		return (EFAULT);
595 	}
596 
597 	kmem_free(buf, bufsize);
598 	return (0);
599 }
600 
601 /*
602  * Given a syndrome and address return information about the associated memory.
603  */
604 static int
605 mmioctl_get_mem_info(intptr_t data)
606 {
607 	mem_info_t mem_info;
608 	int err;
609 
610 	if (copyin((void *)data, &mem_info, sizeof (mem_info_t)))
611 		return (EFAULT);
612 
613 	if ((err = cpu_get_mem_info(mem_info.m_synd, mem_info.m_addr,
614 	    &mem_info.m_mem_size, &mem_info.m_seg_size, &mem_info.m_bank_size,
615 	    &mem_info.m_segments, &mem_info.m_banks, &mem_info.m_mcid)) != 0)
616 		return (err);
617 
618 	if (copyout(&mem_info, (void *)data, sizeof (mem_info_t)) != 0)
619 		return (EFAULT);
620 
621 	return (0);
622 }
623 
624 /*
625  * Given a memory name, return its associated serial id
626  */
627 static int
628 mmioctl_get_mem_sid(intptr_t data)
629 {
630 	mem_name_t mem_name;
631 	void *buf;
632 	void *name;
633 	size_t	name_len;
634 	size_t bufsize;
635 	int len, err;
636 
637 	if ((bufsize = cpu_get_name_bufsize()) == 0)
638 		return (ENOTSUP);
639 
640 	if ((err = mm_read_mem_name(data, &mem_name)) < 0)
641 		return (err);
642 
643 	buf = kmem_alloc(bufsize, KM_SLEEP);
644 
645 	if (mem_name.m_namelen > 1024)
646 		mem_name.m_namelen = 1024; /* cap at 1024 bytes */
647 
648 	name = kmem_alloc(mem_name.m_namelen, KM_SLEEP);
649 
650 	if ((err = copyinstr((char *)mem_name.m_name, (char *)name,
651 	    mem_name.m_namelen, &name_len)) != 0) {
652 		kmem_free(buf, bufsize);
653 		kmem_free(name, mem_name.m_namelen);
654 		return (err);
655 	}
656 
657 	/*
658 	 * Call into cpu specific code to do the lookup.
659 	 */
660 	if ((err = cpu_get_mem_sid(name, buf, bufsize, &len)) != 0) {
661 		kmem_free(buf, bufsize);
662 		kmem_free(name, mem_name.m_namelen);
663 		return (err);
664 	}
665 
666 	if (len > mem_name.m_sidlen) {
667 		kmem_free(buf, bufsize);
668 		kmem_free(name, mem_name.m_namelen);
669 		return (ENAMETOOLONG);
670 	}
671 
672 	if (copyoutstr(buf, (char *)mem_name.m_sid,
673 	    mem_name.m_sidlen, NULL) != 0) {
674 		kmem_free(buf, bufsize);
675 		kmem_free(name, mem_name.m_namelen);
676 		return (EFAULT);
677 	}
678 
679 	kmem_free(buf, bufsize);
680 	kmem_free(name, mem_name.m_namelen);
681 	return (0);
682 }
683 #endif	/* __sparc */
684 
685 /*
686  * Private ioctls for
687  *	libkvm to support kvm_physaddr().
688  *	FMA support for page_retire() and memory attribute information.
689  */
690 /*ARGSUSED*/
691 static int
692 mmioctl(dev_t dev, int cmd, intptr_t data, int flag, cred_t *cred, int *rvalp)
693 {
694 	if ((cmd == MEM_VTOP && getminor(dev) != M_KMEM) ||
695 	    (cmd != MEM_VTOP && getminor(dev) != M_MEM))
696 		return (ENXIO);
697 
698 	switch (cmd) {
699 	case MEM_VTOP:
700 		return (mmioctl_vtop(data));
701 
702 	case MEM_PAGE_RETIRE:
703 	case MEM_PAGE_ISRETIRED:
704 	case MEM_PAGE_UNRETIRE:
705 	case MEM_PAGE_RETIRE_MCE:
706 	case MEM_PAGE_RETIRE_UE:
707 	case MEM_PAGE_GETERRORS:
708 	case MEM_PAGE_RETIRE_TEST:
709 		return (mmioctl_page_retire(cmd, data));
710 
711 #ifdef __sparc
712 	case MEM_NAME:
713 		return (mmioctl_get_mem_name(data));
714 
715 	case MEM_INFO:
716 		return (mmioctl_get_mem_info(data));
717 
718 	case MEM_SID:
719 		return (mmioctl_get_mem_sid(data));
720 #else
721 	case MEM_NAME:
722 	case MEM_INFO:
723 	case MEM_SID:
724 		return (ENOTSUP);
725 #endif	/* __sparc */
726 	}
727 	return (ENXIO);
728 }
729 
730 /*ARGSUSED2*/
731 static int
732 mmmmap(dev_t dev, off_t off, int prot)
733 {
734 	pfn_t pf;
735 	struct memlist *pmem;
736 	minor_t minor = getminor(dev);
737 
738 	switch (minor) {
739 	case M_MEM:
740 		pf = btop(off);
741 		memlist_read_lock();
742 		for (pmem = phys_install; pmem != NULL; pmem = pmem->next) {
743 			if (pf >= BTOP(pmem->address) &&
744 			    pf < BTOP(pmem->address + pmem->size)) {
745 				memlist_read_unlock();
746 				return (impl_obmem_pfnum(pf));
747 			}
748 		}
749 		memlist_read_unlock();
750 		break;
751 
752 	case M_KMEM:
753 	case M_ALLKMEM:
754 		/* no longer supported with KPR */
755 		return (-1);
756 
757 	case M_ZERO:
758 		/*
759 		 * We shouldn't be mmap'ing to /dev/zero here as
760 		 * mmsegmap() should have already converted
761 		 * a mapping request for this device to a mapping
762 		 * using seg_vn for anonymous memory.
763 		 */
764 		break;
765 
766 	}
767 	return (-1);
768 }
769 
770 /*
771  * This function is called when a memory device is mmap'ed.
772  * Set up the mapping to the correct device driver.
773  */
774 static int
775 mmsegmap(dev_t dev, off_t off, struct as *as, caddr_t *addrp, off_t len,
776     uint_t prot, uint_t maxprot, uint_t flags, struct cred *cred)
777 {
778 	struct segvn_crargs vn_a;
779 	struct segdev_crargs dev_a;
780 	int error;
781 	minor_t minor;
782 	off_t i;
783 
784 	minor = getminor(dev);
785 
786 	as_rangelock(as);
787 	/*
788 	 * No need to worry about vac alignment on /dev/zero
789 	 * since this is a "clone" object that doesn't yet exist.
790 	 */
791 	error = choose_addr(as, addrp, len, off,
792 	    (minor == M_MEM) || (minor == M_KMEM), flags);
793 	if (error != 0) {
794 		as_rangeunlock(as);
795 		return (error);
796 	}
797 
798 	switch (minor) {
799 	case M_MEM:
800 		/* /dev/mem cannot be mmap'ed with MAP_PRIVATE */
801 		if ((flags & MAP_TYPE) != MAP_SHARED) {
802 			as_rangeunlock(as);
803 			return (EINVAL);
804 		}
805 
806 		/*
807 		 * Check to ensure that the entire range is
808 		 * legal and we are not trying to map in
809 		 * more than the device will let us.
810 		 */
811 		for (i = 0; i < len; i += PAGESIZE) {
812 			if (mmmmap(dev, off + i, maxprot) == -1) {
813 				as_rangeunlock(as);
814 				return (ENXIO);
815 			}
816 		}
817 
818 		/*
819 		 * Use seg_dev segment driver for /dev/mem mapping.
820 		 */
821 		dev_a.mapfunc = mmmmap;
822 		dev_a.dev = dev;
823 		dev_a.offset = off;
824 		dev_a.type = (flags & MAP_TYPE);
825 		dev_a.prot = (uchar_t)prot;
826 		dev_a.maxprot = (uchar_t)maxprot;
827 		dev_a.hat_attr = 0;
828 
829 		/*
830 		 * Make /dev/mem mappings non-consistent since we can't
831 		 * alias pages that don't have page structs behind them,
832 		 * such as kernel stack pages. If someone mmap()s a kernel
833 		 * stack page and if we give him a tte with cv, a line from
834 		 * that page can get into both pages of the spitfire d$.
835 		 * But snoop from another processor will only invalidate
836 		 * the first page. This later caused kernel (xc_attention)
837 		 * to go into an infinite loop at pil 13 and no interrupts
838 		 * could come in. See 1203630.
839 		 *
840 		 */
841 		dev_a.hat_flags = HAT_LOAD_NOCONSIST;
842 		dev_a.devmap_data = NULL;
843 
844 		error = as_map(as, *addrp, len, segdev_create, &dev_a);
845 		break;
846 
847 	case M_ZERO:
848 		/*
849 		 * Use seg_vn segment driver for /dev/zero mapping.
850 		 * Passing in a NULL amp gives us the "cloning" effect.
851 		 */
852 		vn_a.vp = NULL;
853 		vn_a.offset = 0;
854 		vn_a.type = (flags & MAP_TYPE);
855 		vn_a.prot = prot;
856 		vn_a.maxprot = maxprot;
857 		vn_a.flags = flags & ~MAP_TYPE;
858 		vn_a.cred = cred;
859 		vn_a.amp = NULL;
860 		vn_a.szc = 0;
861 		vn_a.lgrp_mem_policy_flags = 0;
862 		error = as_map(as, *addrp, len, segvn_create, &vn_a);
863 		break;
864 
865 	case M_KMEM:
866 	case M_ALLKMEM:
867 		/* No longer supported with KPR. */
868 		error = ENXIO;
869 		break;
870 
871 	case M_NULL:
872 		/*
873 		 * Use seg_dev segment driver for /dev/null mapping.
874 		 */
875 		dev_a.mapfunc = mmmmap;
876 		dev_a.dev = dev;
877 		dev_a.offset = off;
878 		dev_a.type = 0;		/* neither PRIVATE nor SHARED */
879 		dev_a.prot = dev_a.maxprot = (uchar_t)PROT_NONE;
880 		dev_a.hat_attr = 0;
881 		dev_a.hat_flags = 0;
882 		error = as_map(as, *addrp, len, segdev_create, &dev_a);
883 		break;
884 
885 	default:
886 		error = ENXIO;
887 	}
888 
889 	as_rangeunlock(as);
890 	return (error);
891 }
892 
893 static struct cb_ops mm_cb_ops = {
894 	mmopen,			/* open */
895 	nulldev,		/* close */
896 	nodev,			/* strategy */
897 	nodev,			/* print */
898 	nodev,			/* dump */
899 	mmread,			/* read */
900 	mmwrite,		/* write */
901 	mmioctl,		/* ioctl */
902 	nodev,			/* devmap */
903 	mmmmap,			/* mmap */
904 	mmsegmap,		/* segmap */
905 	mmchpoll,		/* poll */
906 	mmpropop,		/* prop_op */
907 	0,			/* streamtab  */
908 	D_NEW | D_MP | D_64BIT | D_U64BIT
909 };
910 
911 static struct dev_ops mm_ops = {
912 	DEVO_REV,		/* devo_rev, */
913 	0,			/* refcnt  */
914 	mm_info,		/* get_dev_info */
915 	nulldev,		/* identify */
916 	nulldev,		/* probe */
917 	mm_attach,		/* attach */
918 	nodev,			/* detach */
919 	nodev,			/* reset */
920 	&mm_cb_ops,		/* driver operations */
921 	(struct bus_ops *)0	/* bus operations */
922 };
923 
924 static struct modldrv modldrv = {
925 	&mod_driverops, "memory driver", &mm_ops,
926 };
927 
928 static struct modlinkage modlinkage = {
929 	MODREV_1, &modldrv, NULL
930 };
931 
932 int
933 _init(void)
934 {
935 	return (mod_install(&modlinkage));
936 }
937 
938 int
939 _info(struct modinfo *modinfop)
940 {
941 	return (mod_info(&modlinkage, modinfop));
942 }
943 
944 int
945 _fini(void)
946 {
947 	return (mod_remove(&modlinkage));
948 }
949 
950 static int
951 mm_kstat_update(kstat_t *ksp, int rw)
952 {
953 	struct memlist *pmem;
954 	uint_t count;
955 
956 	if (rw == KSTAT_WRITE)
957 		return (EACCES);
958 
959 	count = 0;
960 	memlist_read_lock();
961 	for (pmem = phys_install; pmem != NULL; pmem = pmem->next) {
962 		count++;
963 	}
964 	memlist_read_unlock();
965 
966 	ksp->ks_ndata = count;
967 	ksp->ks_data_size = count * 2 * sizeof (uint64_t);
968 
969 	return (0);
970 }
971 
972 static int
973 mm_kstat_snapshot(kstat_t *ksp, void *buf, int rw)
974 {
975 	struct memlist *pmem;
976 	struct memunit {
977 		uint64_t address;
978 		uint64_t size;
979 	} *kspmem;
980 
981 	if (rw == KSTAT_WRITE)
982 		return (EACCES);
983 
984 	ksp->ks_snaptime = gethrtime();
985 
986 	kspmem = (struct memunit *)buf;
987 	memlist_read_lock();
988 	for (pmem = phys_install; pmem != NULL; pmem = pmem->next, kspmem++) {
989 		if ((caddr_t)kspmem >= (caddr_t)buf + ksp->ks_data_size)
990 			break;
991 		kspmem->address = pmem->address;
992 		kspmem->size = pmem->size;
993 	}
994 	memlist_read_unlock();
995 
996 	return (0);
997 }
998 
999 /*
1000  * Read a mem_name_t from user-space and store it in the mem_name_t
1001  * pointed to by the mem_name argument.
1002  */
1003 static int
1004 mm_read_mem_name(intptr_t data, mem_name_t *mem_name)
1005 {
1006 	if (get_udatamodel() == DATAMODEL_NATIVE) {
1007 		if (copyin((void *)data, mem_name, sizeof (mem_name_t)))
1008 			return (EFAULT);
1009 	}
1010 #ifdef	_SYSCALL32
1011 	else {
1012 		mem_name32_t mem_name32;
1013 
1014 		if (copyin((void *)data, &mem_name32, sizeof (mem_name32_t)))
1015 			return (EFAULT);
1016 		mem_name->m_addr = mem_name32.m_addr;
1017 		mem_name->m_synd = mem_name32.m_synd;
1018 		mem_name->m_type[0] = mem_name32.m_type[0];
1019 		mem_name->m_type[1] = mem_name32.m_type[1];
1020 		mem_name->m_name = (caddr_t)(uintptr_t)mem_name32.m_name;
1021 		mem_name->m_namelen = (size_t)mem_name32.m_namelen;
1022 		mem_name->m_sid = (caddr_t)(uintptr_t)mem_name32.m_sid;
1023 		mem_name->m_sidlen = (size_t)mem_name32.m_sidlen;
1024 	}
1025 #endif	/* _SYSCALL32 */
1026 
1027 	return (0);
1028 }
1029