xref: /illumos-gate/usr/src/uts/common/io/mem.c (revision 500b1e787b108592a37e3d54dc9b5e676de5386d)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  * Memory special file
28  */
29 
30 #include <sys/types.h>
31 #include <sys/param.h>
32 #include <sys/user.h>
33 #include <sys/buf.h>
34 #include <sys/systm.h>
35 #include <sys/cred.h>
36 #include <sys/vm.h>
37 #include <sys/uio.h>
38 #include <sys/mman.h>
39 #include <sys/kmem.h>
40 #include <vm/seg.h>
41 #include <vm/page.h>
42 #include <sys/stat.h>
43 #include <sys/vmem.h>
44 #include <sys/memlist.h>
45 #include <sys/bootconf.h>
46 
47 #include <vm/seg_vn.h>
48 #include <vm/seg_dev.h>
49 #include <vm/seg_kmem.h>
50 #include <vm/seg_kp.h>
51 #include <vm/seg_kpm.h>
52 #include <vm/hat.h>
53 
54 #include <sys/conf.h>
55 #include <sys/mem.h>
56 #include <sys/types.h>
57 #include <sys/conf.h>
58 #include <sys/param.h>
59 #include <sys/systm.h>
60 #include <sys/errno.h>
61 #include <sys/modctl.h>
62 #include <sys/memlist.h>
63 #include <sys/ddi.h>
64 #include <sys/sunddi.h>
65 #include <sys/debug.h>
66 #include <sys/fm/protocol.h>
67 
#if defined(__sparc)
/*
 * Platform (cpu module) routines that are defined outside this file.
 * cpu_get_name_bufsize(), cpu_get_mem_name(), cpu_get_mem_info() and
 * cpu_get_mem_sid() are called by the sparc-only MEM_NAME, MEM_INFO and
 * MEM_SID ioctl handlers in this file.
 */
extern int cpu_get_mem_name(uint64_t, uint64_t *, uint64_t, char *, int, int *);
extern int cpu_get_mem_info(uint64_t, uint64_t, uint64_t *, uint64_t *,
    uint64_t *, int *, int *, int *);
extern size_t cpu_get_name_bufsize(void);
extern int cpu_get_mem_sid(char *, char *, int, int *);
extern int cpu_get_mem_addr(char *, char *, uint64_t, uint64_t *);
#elif defined(__x86)
#include <sys/cpu_module.h>
#endif	/* __sparc */
78 
/*
 * Turn a byte length into a page count.  The DDI btop() takes a
 * 32-bit size on 32-bit machines; this macro simply shifts its argument,
 * so it also handles the 64-bit sizes needed on 32-bit machines with
 * large physical memory.
 */
#define	BTOP(x)	((pgcnt_t)((x) >> _pageshift))
85 
/*
 * mm_lock is held by mmio() for the whole of each transfer and is also
 * installed as the ks_lock of the "phys_installed" kstat.
 * mm_map is one page of kernel virtual address space (allocated from
 * heap_arena in mm_attach()) at which mmio() temporarily maps the target
 * page frame.
 */
static kmutex_t mm_lock;
static caddr_t mm_map;

static dev_info_t *mm_dip;	/* private copy of devinfo pointer */

/*
 * Value of the "kmem_io_access" driver property (default 0).  When nonzero,
 * /dev/kmem is allowed to access non-memory (I/O) page frames, as
 * /dev/allkmem always is.
 */
static int mm_kmem_io_access;

/* Callbacks for the "phys_installed" kstat, defined later in this file. */
static int mm_kstat_update(kstat_t *ksp, int rw);
static int mm_kstat_snapshot(kstat_t *ksp, void *buf, int rw);

/* Copies a mem_name_t in from user space, defined later in this file. */
static int mm_read_mem_name(intptr_t data, mem_name_t *mem_name);
97 
98 /*ARGSUSED1*/
99 static int
100 mm_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
101 {
102 	int i;
103 	struct mem_minor {
104 		char *name;
105 		minor_t minor;
106 		int privonly;
107 		const char *rdpriv;
108 		const char *wrpriv;
109 		mode_t priv_mode;
110 	} mm[] = {
111 		{ "mem",	M_MEM,		0,	NULL,	"all",	0640 },
112 		{ "kmem",	M_KMEM,		0,	NULL,	"all",	0640 },
113 		{ "allkmem",	M_ALLKMEM,	0,	"all",	"all",	0600 },
114 		{ "null",	M_NULL,	PRIVONLY_DEV,	NULL,	NULL,	0666 },
115 		{ "zero",	M_ZERO, PRIVONLY_DEV,	NULL,	NULL,	0666 },
116 	};
117 	kstat_t *ksp;
118 
119 	mutex_init(&mm_lock, NULL, MUTEX_DEFAULT, NULL);
120 	mm_map = vmem_alloc(heap_arena, PAGESIZE, VM_SLEEP);
121 
122 	for (i = 0; i < (sizeof (mm) / sizeof (mm[0])); i++) {
123 		if (ddi_create_priv_minor_node(devi, mm[i].name, S_IFCHR,
124 		    mm[i].minor, DDI_PSEUDO, mm[i].privonly,
125 		    mm[i].rdpriv, mm[i].wrpriv, mm[i].priv_mode) ==
126 		    DDI_FAILURE) {
127 			ddi_remove_minor_node(devi, NULL);
128 			return (DDI_FAILURE);
129 		}
130 	}
131 
132 	mm_dip = devi;
133 
134 	ksp = kstat_create("mm", 0, "phys_installed", "misc",
135 	    KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VAR_SIZE | KSTAT_FLAG_VIRTUAL);
136 	if (ksp != NULL) {
137 		ksp->ks_update = mm_kstat_update;
138 		ksp->ks_snapshot = mm_kstat_snapshot;
139 		ksp->ks_lock = &mm_lock; /* XXX - not really needed */
140 		kstat_install(ksp);
141 	}
142 
143 	mm_kmem_io_access = ddi_getprop(DDI_DEV_T_ANY, devi, DDI_PROP_DONTPASS,
144 	    "kmem_io_access", 0);
145 
146 	return (DDI_SUCCESS);
147 }
148 
149 /*ARGSUSED*/
150 static int
151 mm_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
152 {
153 	register int error;
154 
155 	switch (infocmd) {
156 	case DDI_INFO_DEVT2DEVINFO:
157 		*result = (void *)mm_dip;
158 		error = DDI_SUCCESS;
159 		break;
160 	case DDI_INFO_DEVT2INSTANCE:
161 		*result = (void *)0;
162 		error = DDI_SUCCESS;
163 		break;
164 	default:
165 		error = DDI_FAILURE;
166 	}
167 	return (error);
168 }
169 
170 /*ARGSUSED1*/
171 static int
172 mmopen(dev_t *devp, int flag, int typ, struct cred *cred)
173 {
174 	switch (getminor(*devp)) {
175 	case M_NULL:
176 	case M_ZERO:
177 	case M_MEM:
178 	case M_KMEM:
179 	case M_ALLKMEM:
180 		/* standard devices */
181 		break;
182 
183 	default:
184 		/* Unsupported or unknown type */
185 		return (EINVAL);
186 	}
187 	/* must be character device */
188 	if (typ != OTYP_CHR)
189 		return (EINVAL);
190 	return (0);
191 }
192 
193 struct pollhead	mm_pollhd;
194 
195 /*ARGSUSED*/
196 static int
197 mmchpoll(dev_t dev, short events, int anyyet, short *reventsp,
198     struct pollhead **phpp)
199 {
200 	switch (getminor(dev)) {
201 	case M_NULL:
202 	case M_ZERO:
203 	case M_MEM:
204 	case M_KMEM:
205 	case M_ALLKMEM:
206 		*reventsp = events & (POLLIN | POLLOUT | POLLPRI | POLLRDNORM |
207 		    POLLWRNORM | POLLRDBAND | POLLWRBAND);
208 		/*
209 		 * A non NULL pollhead pointer should be returned in case
210 		 * user polls for 0 events.
211 		 */
212 		*phpp = !anyyet && !*reventsp ?
213 		    &mm_pollhd : (struct pollhead *)NULL;
214 		return (0);
215 	default:
216 		/* no other devices currently support polling */
217 		return (ENXIO);
218 	}
219 }
220 
221 static int
222 mmpropop(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op, int flags,
223     char *name, caddr_t valuep, int *lengthp)
224 {
225 	/*
226 	 * implement zero size to reduce overhead (avoid two failing
227 	 * property lookups per stat).
228 	 */
229 	return (ddi_prop_op_size(dev, dip, prop_op,
230 	    flags, name, valuep, lengthp, 0));
231 }
232 
/*
 * Transfer data between the caller's uio and a single page frame.
 *
 *	uio	- describes the user buffer; only the current iovec is used
 *	rw	- UIO_READ or UIO_WRITE
 *	pfn	- page frame to access
 *	pageoff	- byte offset within that page
 *	allowio	- nonzero if non-memory (I/O) page frames may be accessed
 *	pp	- page_t for pfn if the caller has it, else NULL
 *
 * At most the remainder of the page (PAGESIZE - pageoff) or the current
 * iovec length, whichever is smaller, is moved for memory pages.
 * mm_lock is held for the whole operation.
 *
 * Returns 0 on success, EIO if pfn is not memory and allowio is zero,
 * EFAULT if the I/O-space access fails, or the error from uiomove().
 */
static int
mmio(struct uio *uio, enum uio_rw rw, pfn_t pfn, off_t pageoff, int allowio,
    page_t *pp)
{
	int error = 0;
	int devload = 0;
	int is_memory = pf_is_memory(pfn);
	size_t nbytes = MIN((size_t)(PAGESIZE - pageoff),
	    (size_t)uio->uio_iov->iov_len);
	caddr_t va = NULL;

	mutex_enter(&mm_lock);

	/* Prefer a kpm mapping when the frame is memory and kpm is enabled. */
	if (is_memory && kpm_enable) {
		if (pp)
			va = hat_kpm_mapin(pp, NULL);
		else
			va = hat_kpm_mapin_pfn(pfn);
	}

	/*
	 * No kpm mapping was obtained (not memory, kpm disabled, or the
	 * mapin returned NULL): load a translation for the frame at mm_map
	 * instead, read-only for reads.
	 */
	if (va == NULL) {
		hat_devload(kas.a_hat, mm_map, PAGESIZE, pfn,
		    (uint_t)(rw == UIO_READ ? PROT_READ : PROT_READ|PROT_WRITE),
		    HAT_LOAD_NOCONSIST|HAT_LOAD_LOCK);
		va = mm_map;
		devload = 1;
	}

	if (!is_memory) {
		if (allowio) {
			/*
			 * I/O space: the access goes through
			 * ddi_peekpokeio() on the address in uio_loffset
			 * for the full iovec length, in 32-bit units.
			 */
			size_t c = uio->uio_iov->iov_len;

			if (ddi_peekpokeio(NULL, uio, rw,
			    (caddr_t)(uintptr_t)uio->uio_loffset, c,
			    sizeof (int32_t)) != DDI_SUCCESS)
				error = EFAULT;
		} else
			error = EIO;
	} else
		error = uiomove(va + pageoff, nbytes, rw, uio);

	/* Tear down whichever kind of mapping was set up above. */
	if (devload)
		hat_unload(kas.a_hat, mm_map, PAGESIZE, HAT_UNLOAD_UNLOCK);
	else if (pp)
		hat_kpm_mapout(pp, NULL, va);
	else
		hat_kpm_mapout_pfn(pfn);

	mutex_exit(&mm_lock);
	return (error);
}
284 
285 static int
286 mmpagelock(struct as *as, caddr_t va)
287 {
288 	struct seg *seg;
289 	int i;
290 
291 	AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
292 	seg = as_segat(as, va);
293 	i = (seg != NULL)? SEGOP_CAPABLE(seg, S_CAPABILITY_NOMINFLT) : 0;
294 	AS_LOCK_EXIT(as, &as->a_lock);
295 
296 	return (i);
297 }
298 
/*
 * NEED_LOCK_KVADDR(va) tells mmrw() whether it should try as_pagelock()
 * on a kernel virtual address before accessing it.  On sparc this is
 * decided per address by mmpagelock(); on other platforms it is always 0.
 */
#ifdef	__sparc

#define	NEED_LOCK_KVADDR(kva)	mmpagelock(&kas, kva)

#else	/* __i386, __amd64 */

#define	NEED_LOCK_KVADDR(va)	0

#endif	/* __sparc */
308 
309 /*ARGSUSED3*/
310 static int
311 mmrw(dev_t dev, struct uio *uio, enum uio_rw rw, cred_t *cred)
312 {
313 	pfn_t v;
314 	struct iovec *iov;
315 	int error = 0;
316 	size_t c;
317 	ssize_t oresid = uio->uio_resid;
318 	minor_t minor = getminor(dev);
319 
320 	while (uio->uio_resid > 0 && error == 0) {
321 		iov = uio->uio_iov;
322 		if (iov->iov_len == 0) {
323 			uio->uio_iov++;
324 			uio->uio_iovcnt--;
325 			if (uio->uio_iovcnt < 0)
326 				panic("mmrw");
327 			continue;
328 		}
329 		switch (minor) {
330 
331 		case M_MEM:
332 			memlist_read_lock();
333 			if (!address_in_memlist(phys_install,
334 			    (uint64_t)uio->uio_loffset, 1)) {
335 				memlist_read_unlock();
336 				error = EFAULT;
337 				break;
338 			}
339 			memlist_read_unlock();
340 
341 			v = BTOP((u_offset_t)uio->uio_loffset);
342 			error = mmio(uio, rw, v,
343 			    uio->uio_loffset & PAGEOFFSET, 0, NULL);
344 			break;
345 
346 		case M_KMEM:
347 		case M_ALLKMEM:
348 			{
349 			page_t **ppp = NULL;
350 			caddr_t vaddr = (caddr_t)uio->uio_offset;
351 			int try_lock = NEED_LOCK_KVADDR(vaddr);
352 			int locked = 0;
353 
354 			if ((error = plat_mem_do_mmio(uio, rw)) != ENOTSUP)
355 				break;
356 
357 			/*
358 			 * If vaddr does not map a valid page, as_pagelock()
359 			 * will return failure. Hence we can't check the
360 			 * return value and return EFAULT here as we'd like.
361 			 * seg_kp and seg_kpm do not properly support
362 			 * as_pagelock() for this context so we avoid it
363 			 * using the try_lock set check above.  Some day when
364 			 * the kernel page locking gets redesigned all this
365 			 * muck can be cleaned up.
366 			 */
367 			if (try_lock)
368 				locked = (as_pagelock(&kas, &ppp, vaddr,
369 				    PAGESIZE, S_WRITE) == 0);
370 
371 			v = hat_getpfnum(kas.a_hat,
372 			    (caddr_t)(uintptr_t)uio->uio_loffset);
373 			if (v == PFN_INVALID) {
374 				if (locked)
375 					as_pageunlock(&kas, ppp, vaddr,
376 					    PAGESIZE, S_WRITE);
377 				error = EFAULT;
378 				break;
379 			}
380 
381 			error = mmio(uio, rw, v, uio->uio_loffset & PAGEOFFSET,
382 			    minor == M_ALLKMEM || mm_kmem_io_access,
383 			    (locked && ppp) ? *ppp : NULL);
384 			if (locked)
385 				as_pageunlock(&kas, ppp, vaddr, PAGESIZE,
386 				    S_WRITE);
387 			}
388 
389 			break;
390 
391 		case M_ZERO:
392 			if (rw == UIO_READ) {
393 				label_t ljb;
394 
395 				if (on_fault(&ljb)) {
396 					no_fault();
397 					error = EFAULT;
398 					break;
399 				}
400 				uzero(iov->iov_base, iov->iov_len);
401 				no_fault();
402 				uio->uio_resid -= iov->iov_len;
403 				uio->uio_loffset += iov->iov_len;
404 				break;
405 			}
406 			/* else it's a write, fall through to NULL case */
407 			/*FALLTHROUGH*/
408 
409 		case M_NULL:
410 			if (rw == UIO_READ)
411 				return (0);
412 			c = iov->iov_len;
413 			iov->iov_base += c;
414 			iov->iov_len -= c;
415 			uio->uio_loffset += c;
416 			uio->uio_resid -= c;
417 			break;
418 
419 		}
420 	}
421 	return (uio->uio_resid == oresid ? error : 0);
422 }
423 
424 static int
425 mmread(dev_t dev, struct uio *uio, cred_t *cred)
426 {
427 	return (mmrw(dev, uio, UIO_READ, cred));
428 }
429 
430 static int
431 mmwrite(dev_t dev, struct uio *uio, cred_t *cred)
432 {
433 	return (mmrw(dev, uio, UIO_WRITE, cred));
434 }
435 
/*
 * Private ioctl for libkvm to support kvm_physaddr().
 * Given an address space and a VA, compute the PA.
 *
 * 'data' is the user address of a mem_vtop_t (or a mem_vtop32_t for a
 * non-native caller).  On success the structure is copied back out with
 * m_pfn filled in.
 *
 * Returns 0 on success, EFAULT if the copyin or copyout fails, EINVAL if
 * a non-native caller supplies a non-NULL address space, and EIO if the
 * address space cannot be matched to a process or the VA has no valid
 * translation.
 */
static int
mmioctl_vtop(intptr_t data)
{
#ifdef _SYSCALL32
	mem_vtop32_t vtop32;
#endif
	mem_vtop_t mem_vtop;
	proc_t *p;
	pfn_t pfn = (pfn_t)PFN_INVALID;
	pid_t pid = 0;
	struct as *as;
	struct seg *seg;

	if (get_udatamodel() == DATAMODEL_NATIVE) {
		if (copyin((void *)data, &mem_vtop, sizeof (mem_vtop_t)))
			return (EFAULT);
	}
#ifdef _SYSCALL32
	else {
		if (copyin((void *)data, &vtop32, sizeof (mem_vtop32_t)))
			return (EFAULT);
		mem_vtop.m_as = (struct as *)(uintptr_t)vtop32.m_as;
		mem_vtop.m_va = (void *)(uintptr_t)vtop32.m_va;

		/* Non-native callers may only query their own process. */
		if (mem_vtop.m_as != NULL)
			return (EINVAL);
	}
#endif

	if (mem_vtop.m_as == &kas) {
		pfn = hat_getpfnum(kas.a_hat, mem_vtop.m_va);
	} else {
		if (mem_vtop.m_as == NULL) {
			/*
			 * Assume the calling process's address space if the
			 * caller didn't specify one.
			 */
			p = curthread->t_procp;
			if (p == NULL)
				return (EIO);
			mem_vtop.m_as = p->p_as;
		}

		/*
		 * Find the active process that owns the requested address
		 * space and remember its pid.
		 */
		mutex_enter(&pidlock);
		for (p = practive; p != NULL; p = p->p_next) {
			if (p->p_as == mem_vtop.m_as) {
				pid = p->p_pid;
				break;
			}
		}
		mutex_exit(&pidlock);
		if (p == NULL)
			return (EIO);
		/*
		 * pidlock has been dropped, so look the process up again
		 * by pid and re-check that it still has the same address
		 * space before using it.
		 */
		p = sprlock(pid);
		if (p == NULL)
			return (EIO);
		as = p->p_as;
		if (as == mem_vtop.m_as) {
			/* p_lock is dropped while the AS lock is held. */
			mutex_exit(&p->p_lock);
			AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
			/*
			 * Find a segment containing the VA.  The unsigned
			 * subtraction makes the single comparison true only
			 * for s_base <= va < s_base + s_size.
			 */
			for (seg = AS_SEGFIRST(as); seg != NULL;
			    seg = AS_SEGNEXT(as, seg))
				if ((uintptr_t)mem_vtop.m_va -
				    (uintptr_t)seg->s_base < seg->s_size)
					break;
			if (seg != NULL)
				pfn = hat_getpfnum(as->a_hat, mem_vtop.m_va);
			AS_LOCK_EXIT(as, &as->a_lock);
			mutex_enter(&p->p_lock);
		}
		sprunlock(p);
	}
	mem_vtop.m_pfn = pfn;
	if (pfn == PFN_INVALID)
		return (EIO);

	if (get_udatamodel() == DATAMODEL_NATIVE) {
		if (copyout(&mem_vtop, (void *)data, sizeof (mem_vtop_t)))
			return (EFAULT);
	}
#ifdef _SYSCALL32
	else {
		vtop32.m_pfn = mem_vtop.m_pfn;
		if (copyout(&vtop32, (void *)data, sizeof (mem_vtop32_t)))
			return (EFAULT);
	}
#endif

	return (0);
}
530 
531 /*
532  * Given a PA, execute the given page retire command on it.
533  */
534 static int
535 mmioctl_page_retire(int cmd, intptr_t data)
536 {
537 	extern int page_retire_test(void);
538 	uint64_t pa;
539 
540 	if (copyin((void *)data, &pa, sizeof (uint64_t))) {
541 		return (EFAULT);
542 	}
543 
544 	switch (cmd) {
545 	case MEM_PAGE_ISRETIRED:
546 		return (page_retire_check(pa, NULL));
547 
548 	case MEM_PAGE_UNRETIRE:
549 		return (page_unretire(pa));
550 
551 	case MEM_PAGE_RETIRE:
552 		return (page_retire(pa, PR_FMA));
553 
554 	case MEM_PAGE_RETIRE_MCE:
555 		return (page_retire(pa, PR_MCE));
556 
557 	case MEM_PAGE_RETIRE_UE:
558 		return (page_retire(pa, PR_UE));
559 
560 	case MEM_PAGE_GETERRORS:
561 		{
562 			uint64_t page_errors;
563 			int rc = page_retire_check(pa, &page_errors);
564 			if (copyout(&page_errors, (void *)data,
565 			    sizeof (uint64_t))) {
566 				return (EFAULT);
567 			}
568 			return (rc);
569 		}
570 
571 	case MEM_PAGE_RETIRE_TEST:
572 		return (page_retire_test());
573 
574 	}
575 
576 	return (EINVAL);
577 }
578 
579 #ifdef __sparc
580 /*
581  * Given a syndrome, syndrome type, and address return the
582  * associated memory name in the provided data buffer.
583  */
584 static int
585 mmioctl_get_mem_name(intptr_t data)
586 {
587 	mem_name_t mem_name;
588 	void *buf;
589 	size_t bufsize;
590 	int len, err;
591 
592 	if ((bufsize = cpu_get_name_bufsize()) == 0)
593 		return (ENOTSUP);
594 
595 	if ((err = mm_read_mem_name(data, &mem_name)) < 0)
596 		return (err);
597 
598 	buf = kmem_alloc(bufsize, KM_SLEEP);
599 
600 	/*
601 	 * Call into cpu specific code to do the lookup.
602 	 */
603 	if ((err = cpu_get_mem_name(mem_name.m_synd, mem_name.m_type,
604 	    mem_name.m_addr, buf, bufsize, &len)) != 0) {
605 		kmem_free(buf, bufsize);
606 		return (err);
607 	}
608 
609 	if (len >= mem_name.m_namelen) {
610 		kmem_free(buf, bufsize);
611 		return (ENOSPC);
612 	}
613 
614 	if (copyoutstr(buf, (char *)mem_name.m_name,
615 	    mem_name.m_namelen, NULL) != 0) {
616 		kmem_free(buf, bufsize);
617 		return (EFAULT);
618 	}
619 
620 	kmem_free(buf, bufsize);
621 	return (0);
622 }
623 
624 /*
625  * Given a syndrome and address return information about the associated memory.
626  */
627 static int
628 mmioctl_get_mem_info(intptr_t data)
629 {
630 	mem_info_t mem_info;
631 	int err;
632 
633 	if (copyin((void *)data, &mem_info, sizeof (mem_info_t)))
634 		return (EFAULT);
635 
636 	if ((err = cpu_get_mem_info(mem_info.m_synd, mem_info.m_addr,
637 	    &mem_info.m_mem_size, &mem_info.m_seg_size, &mem_info.m_bank_size,
638 	    &mem_info.m_segments, &mem_info.m_banks, &mem_info.m_mcid)) != 0)
639 		return (err);
640 
641 	if (copyout(&mem_info, (void *)data, sizeof (mem_info_t)) != 0)
642 		return (EFAULT);
643 
644 	return (0);
645 }
646 
647 /*
648  * Given a memory name, return its associated serial id
649  */
650 static int
651 mmioctl_get_mem_sid(intptr_t data)
652 {
653 	mem_name_t mem_name;
654 	void *buf;
655 	void *name;
656 	size_t	name_len;
657 	size_t bufsize;
658 	int len, err;
659 
660 	if ((bufsize = cpu_get_name_bufsize()) == 0)
661 		return (ENOTSUP);
662 
663 	if ((err = mm_read_mem_name(data, &mem_name)) < 0)
664 		return (err);
665 
666 	buf = kmem_alloc(bufsize, KM_SLEEP);
667 
668 	if (mem_name.m_namelen > 1024)
669 		mem_name.m_namelen = 1024; /* cap at 1024 bytes */
670 
671 	name = kmem_alloc(mem_name.m_namelen, KM_SLEEP);
672 
673 	if ((err = copyinstr((char *)mem_name.m_name, (char *)name,
674 	    mem_name.m_namelen, &name_len)) != 0) {
675 		kmem_free(buf, bufsize);
676 		kmem_free(name, mem_name.m_namelen);
677 		return (err);
678 	}
679 
680 	/*
681 	 * Call into cpu specific code to do the lookup.
682 	 */
683 	if ((err = cpu_get_mem_sid(name, buf, bufsize, &len)) != 0) {
684 		kmem_free(buf, bufsize);
685 		kmem_free(name, mem_name.m_namelen);
686 		return (err);
687 	}
688 
689 	if (len > mem_name.m_sidlen) {
690 		kmem_free(buf, bufsize);
691 		kmem_free(name, mem_name.m_namelen);
692 		return (ENAMETOOLONG);
693 	}
694 
695 	if (copyoutstr(buf, (char *)mem_name.m_sid,
696 	    mem_name.m_sidlen, NULL) != 0) {
697 		kmem_free(buf, bufsize);
698 		kmem_free(name, mem_name.m_namelen);
699 		return (EFAULT);
700 	}
701 
702 	kmem_free(buf, bufsize);
703 	kmem_free(name, mem_name.m_namelen);
704 	return (0);
705 }
706 #endif	/* __sparc */
707 
708 /*
709  * Private ioctls for
710  *	libkvm to support kvm_physaddr().
711  *	FMA support for page_retire() and memory attribute information.
712  */
713 /*ARGSUSED*/
714 static int
715 mmioctl(dev_t dev, int cmd, intptr_t data, int flag, cred_t *cred, int *rvalp)
716 {
717 	if ((cmd == MEM_VTOP && getminor(dev) != M_KMEM) ||
718 	    (cmd != MEM_VTOP && getminor(dev) != M_MEM))
719 		return (ENXIO);
720 
721 	switch (cmd) {
722 	case MEM_VTOP:
723 		return (mmioctl_vtop(data));
724 
725 	case MEM_PAGE_RETIRE:
726 	case MEM_PAGE_ISRETIRED:
727 	case MEM_PAGE_UNRETIRE:
728 	case MEM_PAGE_RETIRE_MCE:
729 	case MEM_PAGE_RETIRE_UE:
730 	case MEM_PAGE_GETERRORS:
731 	case MEM_PAGE_RETIRE_TEST:
732 		return (mmioctl_page_retire(cmd, data));
733 
734 #ifdef __sparc
735 	case MEM_NAME:
736 		return (mmioctl_get_mem_name(data));
737 
738 	case MEM_INFO:
739 		return (mmioctl_get_mem_info(data));
740 
741 	case MEM_SID:
742 		return (mmioctl_get_mem_sid(data));
743 #else
744 	case MEM_NAME:
745 	case MEM_INFO:
746 	case MEM_SID:
747 		return (ENOTSUP);
748 #endif	/* __sparc */
749 	}
750 	return (ENXIO);
751 }
752 
753 /*ARGSUSED2*/
754 static int
755 mmmmap(dev_t dev, off_t off, int prot)
756 {
757 	pfn_t pf;
758 	struct memlist *pmem;
759 	minor_t minor = getminor(dev);
760 
761 	switch (minor) {
762 	case M_MEM:
763 		pf = btop(off);
764 		memlist_read_lock();
765 		for (pmem = phys_install; pmem != NULL; pmem = pmem->ml_next) {
766 			if (pf >= BTOP(pmem->ml_address) &&
767 			    pf < BTOP(pmem->ml_address + pmem->ml_size)) {
768 				memlist_read_unlock();
769 				return (impl_obmem_pfnum(pf));
770 			}
771 		}
772 		memlist_read_unlock();
773 		break;
774 
775 	case M_KMEM:
776 	case M_ALLKMEM:
777 		/* no longer supported with KPR */
778 		return (-1);
779 
780 	case M_ZERO:
781 		/*
782 		 * We shouldn't be mmap'ing to /dev/zero here as
783 		 * mmsegmap() should have already converted
784 		 * a mapping request for this device to a mapping
785 		 * using seg_vn for anonymous memory.
786 		 */
787 		break;
788 
789 	}
790 	return (-1);
791 }
792 
793 /*
794  * This function is called when a memory device is mmap'ed.
795  * Set up the mapping to the correct device driver.
796  */
797 static int
798 mmsegmap(dev_t dev, off_t off, struct as *as, caddr_t *addrp, off_t len,
799     uint_t prot, uint_t maxprot, uint_t flags, struct cred *cred)
800 {
801 	struct segvn_crargs vn_a;
802 	struct segdev_crargs dev_a;
803 	int error;
804 	minor_t minor;
805 	off_t i;
806 
807 	minor = getminor(dev);
808 
809 	as_rangelock(as);
810 	/*
811 	 * No need to worry about vac alignment on /dev/zero
812 	 * since this is a "clone" object that doesn't yet exist.
813 	 */
814 	error = choose_addr(as, addrp, len, off,
815 	    (minor == M_MEM) || (minor == M_KMEM), flags);
816 	if (error != 0) {
817 		as_rangeunlock(as);
818 		return (error);
819 	}
820 
821 	switch (minor) {
822 	case M_MEM:
823 		/* /dev/mem cannot be mmap'ed with MAP_PRIVATE */
824 		if ((flags & MAP_TYPE) != MAP_SHARED) {
825 			as_rangeunlock(as);
826 			return (EINVAL);
827 		}
828 
829 		/*
830 		 * Check to ensure that the entire range is
831 		 * legal and we are not trying to map in
832 		 * more than the device will let us.
833 		 */
834 		for (i = 0; i < len; i += PAGESIZE) {
835 			if (mmmmap(dev, off + i, maxprot) == -1) {
836 				as_rangeunlock(as);
837 				return (ENXIO);
838 			}
839 		}
840 
841 		/*
842 		 * Use seg_dev segment driver for /dev/mem mapping.
843 		 */
844 		dev_a.mapfunc = mmmmap;
845 		dev_a.dev = dev;
846 		dev_a.offset = off;
847 		dev_a.type = (flags & MAP_TYPE);
848 		dev_a.prot = (uchar_t)prot;
849 		dev_a.maxprot = (uchar_t)maxprot;
850 		dev_a.hat_attr = 0;
851 
852 		/*
853 		 * Make /dev/mem mappings non-consistent since we can't
854 		 * alias pages that don't have page structs behind them,
855 		 * such as kernel stack pages. If someone mmap()s a kernel
856 		 * stack page and if we give him a tte with cv, a line from
857 		 * that page can get into both pages of the spitfire d$.
858 		 * But snoop from another processor will only invalidate
859 		 * the first page. This later caused kernel (xc_attention)
860 		 * to go into an infinite loop at pil 13 and no interrupts
861 		 * could come in. See 1203630.
862 		 *
863 		 */
864 		dev_a.hat_flags = HAT_LOAD_NOCONSIST;
865 		dev_a.devmap_data = NULL;
866 
867 		error = as_map(as, *addrp, len, segdev_create, &dev_a);
868 		break;
869 
870 	case M_ZERO:
871 		/*
872 		 * Use seg_vn segment driver for /dev/zero mapping.
873 		 * Passing in a NULL amp gives us the "cloning" effect.
874 		 */
875 		vn_a.vp = NULL;
876 		vn_a.offset = 0;
877 		vn_a.type = (flags & MAP_TYPE);
878 		vn_a.prot = prot;
879 		vn_a.maxprot = maxprot;
880 		vn_a.flags = flags & ~MAP_TYPE;
881 		vn_a.cred = cred;
882 		vn_a.amp = NULL;
883 		vn_a.szc = 0;
884 		vn_a.lgrp_mem_policy_flags = 0;
885 		error = as_map(as, *addrp, len, segvn_create, &vn_a);
886 		break;
887 
888 	case M_KMEM:
889 	case M_ALLKMEM:
890 		/* No longer supported with KPR. */
891 		error = ENXIO;
892 		break;
893 
894 	case M_NULL:
895 		/*
896 		 * Use seg_dev segment driver for /dev/null mapping.
897 		 */
898 		dev_a.mapfunc = mmmmap;
899 		dev_a.dev = dev;
900 		dev_a.offset = off;
901 		dev_a.type = 0;		/* neither PRIVATE nor SHARED */
902 		dev_a.prot = dev_a.maxprot = (uchar_t)PROT_NONE;
903 		dev_a.hat_attr = 0;
904 		dev_a.hat_flags = 0;
905 		error = as_map(as, *addrp, len, segdev_create, &dev_a);
906 		break;
907 
908 	default:
909 		error = ENXIO;
910 	}
911 
912 	as_rangeunlock(as);
913 	return (error);
914 }
915 
/*
 * Character/block entry points for the memory driver.  Entry points the
 * driver does not implement are filled with nodev; close is a no-op
 * (nulldev).
 */
static struct cb_ops mm_cb_ops = {
	mmopen,			/* open */
	nulldev,		/* close */
	nodev,			/* strategy */
	nodev,			/* print */
	nodev,			/* dump */
	mmread,			/* read */
	mmwrite,		/* write */
	mmioctl,		/* ioctl */
	nodev,			/* devmap */
	mmmmap,			/* mmap */
	mmsegmap,		/* segmap */
	mmchpoll,		/* poll */
	mmpropop,		/* prop_op */
	0,			/* streamtab  */
	D_NEW | D_MP | D_64BIT | D_U64BIT
};
933 
/*
 * Device operations for the memory driver.  Only getinfo and attach are
 * implemented here; detach and reset are nodev.
 */
static struct dev_ops mm_ops = {
	DEVO_REV,		/* devo_rev, */
	0,			/* refcnt  */
	mm_info,		/* get_dev_info */
	nulldev,		/* identify */
	nulldev,		/* probe */
	mm_attach,		/* attach */
	nodev,			/* detach */
	nodev,			/* reset */
	&mm_cb_ops,		/* driver operations */
	(struct bus_ops *)0,	/* bus operations */
	NULL,			/* power */
	ddi_quiesce_not_needed,		/* quiesce */
};
948 
/*
 * Module linkage: a single driver module ("memory driver") that uses
 * mm_ops.
 */
static struct modldrv modldrv = {
	&mod_driverops, "memory driver", &mm_ops,
};

static struct modlinkage modlinkage = {
	MODREV_1, &modldrv, NULL
};
956 
957 int
958 _init(void)
959 {
960 	return (mod_install(&modlinkage));
961 }
962 
963 int
964 _info(struct modinfo *modinfop)
965 {
966 	return (mod_info(&modlinkage, modinfop));
967 }
968 
969 int
970 _fini(void)
971 {
972 	return (mod_remove(&modlinkage));
973 }
974 
975 static int
976 mm_kstat_update(kstat_t *ksp, int rw)
977 {
978 	struct memlist *pmem;
979 	uint_t count;
980 
981 	if (rw == KSTAT_WRITE)
982 		return (EACCES);
983 
984 	count = 0;
985 	memlist_read_lock();
986 	for (pmem = phys_install; pmem != NULL; pmem = pmem->ml_next) {
987 		count++;
988 	}
989 	memlist_read_unlock();
990 
991 	ksp->ks_ndata = count;
992 	ksp->ks_data_size = count * 2 * sizeof (uint64_t);
993 
994 	return (0);
995 }
996 
997 static int
998 mm_kstat_snapshot(kstat_t *ksp, void *buf, int rw)
999 {
1000 	struct memlist *pmem;
1001 	struct memunit {
1002 		uint64_t address;
1003 		uint64_t size;
1004 	} *kspmem;
1005 
1006 	if (rw == KSTAT_WRITE)
1007 		return (EACCES);
1008 
1009 	ksp->ks_snaptime = gethrtime();
1010 
1011 	kspmem = (struct memunit *)buf;
1012 	memlist_read_lock();
1013 	for (pmem = phys_install; pmem != NULL;
1014 	    pmem = pmem->ml_next, kspmem++) {
1015 		if ((caddr_t)kspmem >= (caddr_t)buf + ksp->ks_data_size)
1016 			break;
1017 		kspmem->address = pmem->ml_address;
1018 		kspmem->size = pmem->ml_size;
1019 	}
1020 	memlist_read_unlock();
1021 
1022 	return (0);
1023 }
1024 
1025 /*
1026  * Read a mem_name_t from user-space and store it in the mem_name_t
1027  * pointed to by the mem_name argument.
1028  */
1029 static int
1030 mm_read_mem_name(intptr_t data, mem_name_t *mem_name)
1031 {
1032 	if (get_udatamodel() == DATAMODEL_NATIVE) {
1033 		if (copyin((void *)data, mem_name, sizeof (mem_name_t)))
1034 			return (EFAULT);
1035 	}
1036 #ifdef	_SYSCALL32
1037 	else {
1038 		mem_name32_t mem_name32;
1039 
1040 		if (copyin((void *)data, &mem_name32, sizeof (mem_name32_t)))
1041 			return (EFAULT);
1042 		mem_name->m_addr = mem_name32.m_addr;
1043 		mem_name->m_synd = mem_name32.m_synd;
1044 		mem_name->m_type[0] = mem_name32.m_type[0];
1045 		mem_name->m_type[1] = mem_name32.m_type[1];
1046 		mem_name->m_name = (caddr_t)(uintptr_t)mem_name32.m_name;
1047 		mem_name->m_namelen = (size_t)mem_name32.m_namelen;
1048 		mem_name->m_sid = (caddr_t)(uintptr_t)mem_name32.m_sid;
1049 		mem_name->m_sidlen = (size_t)mem_name32.m_sidlen;
1050 	}
1051 #endif	/* _SYSCALL32 */
1052 
1053 	return (0);
1054 }
1055