xref: /titanic_51/usr/src/uts/common/io/physmem.c (revision bde3d612a7c090234c60e6e4578821237a5db135)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 
27 #include <sys/types.h>
28 #include <sys/modctl.h>
29 #include <sys/conf.h>
30 #include <sys/ddi.h>
31 #include <sys/sunddi.h>
32 #include <sys/devops.h>
33 #include <sys/stat.h>
34 #include <sys/file.h>
35 #include <sys/cred.h>
36 #include <sys/policy.h>
37 #include <sys/errno.h>
38 #include <vm/seg_dev.h>
39 #include <vm/seg_vn.h>
40 #include <vm/page.h>
41 #include <sys/fs/swapnode.h>
42 #include <sys/sysmacros.h>
43 #include <sys/fcntl.h>
44 #include <sys/vmsystm.h>
45 #include <sys/physmem.h>
46 #include <sys/vfs_opreg.h>
47 
48 static dev_info_t		*physmem_dip = NULL;
49 
50 /*
51  * Linked list element hanging off physmem_proc_hash below, which holds all
52  * the information for a given segment which has been setup for this process.
53  * This is a simple linked list as we are assuming that for a given process
54  * the setup ioctl will only be called a handful of times.  If this assumption
55  * changes in the future, a quicker to traverse data structure should be used.
56  */
57 struct physmem_hash {
58 	struct physmem_hash *ph_next;
59 	uint64_t ph_base_pa;
60 	caddr_t ph_base_va;
61 	size_t ph_seg_len;
62 	struct vnode *ph_vnode;
63 };
64 
/*
 * Per-process record.  Every process that has set up mappings through the
 * driver owns one of these; it anchors that process's list of segments and
 * is itself chained into a bucket of the pph[] table.
 */
struct physmem_proc_hash {
	struct proc			*pph_proc;	/* owning process */
	struct physmem_hash		*pph_hash;	/* its segment list */
	struct physmem_proc_hash	*pph_next;	/* next in bucket */
};
74 
75 
/*
 * Bucket count of the per-process table.  PHYSMEM_HASH() selects a bucket by
 * masking with (PPH_SIZE - 1), so this must stay a power of two.
 */
#define	PPH_SIZE	8
struct physmem_proc_hash *pph[PPH_SIZE];
79 
80 /*
81  * Lock which protects the pph hash above.  To add an element (either a new
82  * process or a new segment) the WRITE lock must be held.  To traverse the
83  * list, only a READ lock is needed.
84  */
85 krwlock_t pph_rwlock;
86 
/*
 * Map a process pointer to a pph[] bucket index: drop the low 8 bits of the
 * address and mask with (PPH_SIZE - 1).  The argument is parenthesized so
 * that an expression such as (p + n) is hashed as a whole rather than having
 * the cast bind to its first operand only.
 */
#define	PHYSMEM_HASH(procp)	\
	((int)((((uintptr_t)(procp)) >> 8) & (PPH_SIZE - 1)))
88 
89 /*
90  * Need to keep a reference count of how many processes have the driver
91  * open to prevent it from disappearing.
92  */
93 uint64_t physmem_vnodecnt;
94 kmutex_t physmem_mutex;		/* protects phsymem_vnodecnt */
95 
96 static int physmem_getpage(struct vnode *vp, offset_t off, size_t len,
97     uint_t *protp, page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
98     enum seg_rw rw, struct cred *cr, caller_context_t *ct);
99 
100 static int physmem_addmap(struct vnode *vp, offset_t off, struct as *as,
101     caddr_t addr, size_t len, uchar_t prot, uchar_t maxprot, uint_t flags,
102     struct cred *cred, caller_context_t *ct);
103 
104 static int physmem_delmap(struct vnode *vp, offset_t off, struct as *as,
105     caddr_t addr, size_t len, uint_t prot, uint_t maxprot, uint_t flags,
106     struct cred *cred, caller_context_t *ct);
107 
108 static void physmem_inactive(vnode_t *vp, cred_t *crp, caller_context_t *ct);
109 
110 const fs_operation_def_t physmem_vnodeops_template[] = {
111 	VOPNAME_GETPAGE,	{ .vop_getpage = physmem_getpage },
112 	VOPNAME_ADDMAP,		{ .vop_addmap = physmem_addmap },
113 	VOPNAME_DELMAP,		{ .vop_delmap = physmem_delmap },
114 	VOPNAME_INACTIVE,	{ .vop_inactive = physmem_inactive },
115 	NULL,			NULL
116 };
117 
118 vnodeops_t *physmem_vnodeops = NULL;
119 
120 /*
121  * Removes the current process from the hash if the process has no more
122  * physmem segments active.
123  */
124 void
125 physmem_remove_hash_proc()
126 {
127 	int index;
128 	struct physmem_proc_hash **walker;
129 	struct physmem_proc_hash *victim = NULL;
130 
131 	index = PHYSMEM_HASH(curproc);
132 	rw_enter(&pph_rwlock, RW_WRITER);
133 	walker = &pph[index];
134 	while (*walker != NULL) {
135 		if ((*walker)->pph_proc == curproc &&
136 		    (*walker)->pph_hash == NULL) {
137 			victim = *walker;
138 			*walker = victim->pph_next;
139 			break;
140 		}
141 		walker = &((*walker)->pph_next);
142 	}
143 	rw_exit(&pph_rwlock);
144 	if (victim != NULL)
145 		kmem_free(victim, sizeof (struct physmem_proc_hash));
146 }
147 
148 /*
149  * Add a new entry to the hash for the given process to cache the
150  * address ranges that it is working on.  If this is the first hash
151  * item to be added for this process, we will create the head pointer
152  * for this process.
153  * Returns 0 on success, ERANGE when the physical address is already in the
154  * hash.
155  */
156 int
157 physmem_add_hash(struct physmem_hash *php)
158 {
159 	int index;
160 	struct physmem_proc_hash *iterator;
161 	struct physmem_proc_hash *newp = NULL;
162 	struct physmem_hash *temp;
163 	int ret = 0;
164 
165 	index = PHYSMEM_HASH(curproc);
166 
167 insert:
168 	rw_enter(&pph_rwlock, RW_WRITER);
169 	iterator = pph[index];
170 	while (iterator != NULL) {
171 		if (iterator->pph_proc == curproc) {
172 			/*
173 			 * check to make sure a single process does not try to
174 			 * map the same region twice.
175 			 */
176 			for (temp = iterator->pph_hash; temp != NULL;
177 			    temp = temp->ph_next) {
178 				if ((php->ph_base_pa >= temp->ph_base_pa &&
179 				    php->ph_base_pa < temp->ph_base_pa +
180 				    temp->ph_seg_len) ||
181 				    (temp->ph_base_pa >= php->ph_base_pa &&
182 				    temp->ph_base_pa < php->ph_base_pa +
183 				    php->ph_seg_len)) {
184 					ret = ERANGE;
185 					break;
186 				}
187 			}
188 			if (ret == 0) {
189 				php->ph_next = iterator->pph_hash;
190 				iterator->pph_hash = php;
191 			}
192 			rw_exit(&pph_rwlock);
193 			/* Need to check for two threads in sync */
194 			if (newp != NULL)
195 				kmem_free(newp, sizeof (*newp));
196 			return (ret);
197 		}
198 		iterator = iterator->pph_next;
199 	}
200 
201 	if (newp != NULL) {
202 		newp->pph_proc = curproc;
203 		newp->pph_next = pph[index];
204 		newp->pph_hash = php;
205 		php->ph_next = NULL;
206 		pph[index] = newp;
207 		rw_exit(&pph_rwlock);
208 		return (0);
209 	}
210 
211 	rw_exit(&pph_rwlock);
212 	/* Dropped the lock so we could use KM_SLEEP */
213 	newp = kmem_zalloc(sizeof (struct physmem_proc_hash), KM_SLEEP);
214 	goto insert;
215 }
216 
217 /*
218  * Will return the pointer to the physmem_hash struct if the setup routine
219  * has previously been called for this memory.
220  * Returns NULL on failure.
221  */
222 struct physmem_hash *
223 physmem_get_hash(uint64_t req_paddr, size_t len, proc_t *procp)
224 {
225 	int index;
226 	struct physmem_proc_hash *proc_hp;
227 	struct physmem_hash *php;
228 
229 	ASSERT(rw_lock_held(&pph_rwlock));
230 
231 	index = PHYSMEM_HASH(procp);
232 	proc_hp = pph[index];
233 	while (proc_hp != NULL) {
234 		if (proc_hp->pph_proc == procp) {
235 			php = proc_hp->pph_hash;
236 			while (php != NULL) {
237 				if ((req_paddr >= php->ph_base_pa) &&
238 				    (req_paddr + len <=
239 				    php->ph_base_pa + php->ph_seg_len)) {
240 					return (php);
241 				}
242 				php = php->ph_next;
243 			}
244 		}
245 		proc_hp = proc_hp->pph_next;
246 	}
247 	return (NULL);
248 }
249 
250 int
251 physmem_validate_cookie(uint64_t p_cookie)
252 {
253 	int index;
254 	struct physmem_proc_hash *proc_hp;
255 	struct physmem_hash *php;
256 
257 	ASSERT(rw_lock_held(&pph_rwlock));
258 
259 	index = PHYSMEM_HASH(curproc);
260 	proc_hp = pph[index];
261 	while (proc_hp != NULL) {
262 		if (proc_hp->pph_proc == curproc) {
263 			php = proc_hp->pph_hash;
264 			while (php != NULL) {
265 				if ((uint64_t)(uintptr_t)php == p_cookie) {
266 					return (1);
267 				}
268 				php = php->ph_next;
269 			}
270 		}
271 		proc_hp = proc_hp->pph_next;
272 	}
273 	return (0);
274 }
275 
276 /*
277  * Remove the given vnode from the pph hash.  If it exists in the hash the
278  * process still has to be around as the vnode is obviously still around and
279  * since it's a physmem vnode, it must be in the hash.
280  * If it is not in the hash that must mean that the setup ioctl failed.
281  * Return 0 in this instance, 1 if it is in the hash.
282  */
283 int
284 physmem_remove_vnode_hash(vnode_t *vp)
285 {
286 	int index;
287 	struct physmem_proc_hash *proc_hp;
288 	struct physmem_hash **phpp;
289 	struct physmem_hash *victim;
290 
291 	index = PHYSMEM_HASH(curproc);
292 	/* synchronize with the map routine */
293 	rw_enter(&pph_rwlock, RW_WRITER);
294 	proc_hp = pph[index];
295 	while (proc_hp != NULL) {
296 		if (proc_hp->pph_proc == curproc) {
297 			phpp = &proc_hp->pph_hash;
298 			while (*phpp != NULL) {
299 				if ((*phpp)->ph_vnode == vp) {
300 					victim = *phpp;
301 					*phpp = victim->ph_next;
302 
303 					rw_exit(&pph_rwlock);
304 					kmem_free(victim, sizeof (*victim));
305 					return (1);
306 				}
307 				phpp = &(*phpp)->ph_next;
308 			}
309 		}
310 		proc_hp = proc_hp->pph_next;
311 	}
312 	rw_exit(&pph_rwlock);
313 
314 	/* not found */
315 	return (0);
316 }
317 
318 int
319 physmem_setup_vnops()
320 {
321 	int error;
322 	char *name = "physmem";
323 	if (physmem_vnodeops != NULL)
324 		cmn_err(CE_PANIC, "physmem vnodeops already set\n");
325 	error = vn_make_ops(name, physmem_vnodeops_template, &physmem_vnodeops);
326 	if (error != 0) {
327 		cmn_err(CE_WARN, "physmem_setup_vnops: bad vnode ops template");
328 	}
329 	return (error);
330 }
331 
332 /*
333  * The guts of the PHYSMEM_SETUP ioctl.
334  * Create a segment in the address space with the specified parameters.
335  * If pspp->user_va is NULL, as_gap will be used to find an appropriate VA.
336  * We do not do bounds checking on the requested physical addresses, if they
337  * do not exist in the system, they will not be mappable.
338  * Returns 0 on success with the following error codes on failure:
339  *	ENOMEM - The VA range requested was already mapped if pspp->user_va is
340  *		non-NULL or the system was unable to find enough VA space for
341  *		the desired length if user_va was NULL>
342  *	EINVAL - The requested PA, VA, or length was not PAGESIZE aligned.
343  */
344 int
345 physmem_setup_addrs(struct physmem_setup_param *pspp)
346 {
347 	struct as *as = curproc->p_as;
348 	struct segvn_crargs vn_a;
349 	int ret = 0;
350 	uint64_t base_pa;
351 	size_t len;
352 	caddr_t uvaddr;
353 	struct vnode *vp;
354 	struct physmem_hash *php;
355 
356 	ASSERT(pspp != NULL);
357 	base_pa = pspp->req_paddr;
358 	len = pspp->len;
359 	uvaddr = (caddr_t)(uintptr_t)pspp->user_va;
360 
361 	/* Sanity checking */
362 	if (!IS_P2ALIGNED(base_pa, PAGESIZE))
363 		return (EINVAL);
364 	if (!IS_P2ALIGNED(len, PAGESIZE))
365 		return (EINVAL);
366 	if (uvaddr != NULL && !IS_P2ALIGNED(uvaddr, PAGESIZE))
367 		return (EINVAL);
368 
369 	php = kmem_zalloc(sizeof (struct physmem_hash), KM_SLEEP);
370 
371 	/* Need to bump vnode count so that the driver can not be unloaded */
372 	mutex_enter(&physmem_mutex);
373 	physmem_vnodecnt++;
374 	mutex_exit(&physmem_mutex);
375 
376 	vp = vn_alloc(KM_SLEEP);
377 	ASSERT(vp != NULL);	/* SLEEP can't return NULL */
378 	vn_setops(vp, physmem_vnodeops);
379 
380 	php->ph_vnode = vp;
381 
382 	vn_a.vp = vp;
383 	vn_a.offset = (u_offset_t)base_pa;
384 	vn_a.type = MAP_SHARED;
385 	vn_a.prot = PROT_ALL;
386 	vn_a.maxprot = PROT_ALL;
387 	vn_a.flags = 0;
388 	vn_a.cred = NULL;
389 	vn_a.amp = NULL;
390 	vn_a.szc = 0;
391 	vn_a.lgrp_mem_policy_flags = 0;
392 
393 	as_rangelock(as);
394 	if (uvaddr != NULL) {
395 		if (as_gap(as, len, &uvaddr, &len, AH_LO, NULL) == -1) {
396 			ret = ENOMEM;
397 fail:
398 			as_rangeunlock(as);
399 			vn_free(vp);
400 			kmem_free(php, sizeof (*php));
401 			mutex_enter(&physmem_mutex);
402 			physmem_vnodecnt--;
403 			mutex_exit(&physmem_mutex);
404 			return (ret);
405 		}
406 	} else {
407 		/* We pick the address for the user */
408 		map_addr(&uvaddr, len, 0, 1, 0);
409 		if (uvaddr == NULL) {
410 			ret = ENOMEM;
411 			goto fail;
412 		}
413 	}
414 	ret = as_map(as, uvaddr, len, segvn_create, &vn_a);
415 
416 	if (ret == 0) {
417 		as_rangeunlock(as);
418 		php->ph_base_pa = base_pa;
419 		php->ph_base_va = uvaddr;
420 		php->ph_seg_len = len;
421 		pspp->user_va = (uint64_t)(uintptr_t)uvaddr;
422 		pspp->cookie = (uint64_t)(uintptr_t)php;
423 		ret = physmem_add_hash(php);
424 		if (ret == 0)
425 			return (0);
426 
427 		/* Note that the call to as_unmap will free the vnode */
428 		(void) as_unmap(as, uvaddr, len);
429 		kmem_free(php, sizeof (*php));
430 		return (ret);
431 	}
432 
433 	goto fail;
434 	/*NOTREACHED*/
435 }
436 
437 /*
438  * The guts of the PHYSMEM_MAP ioctl.
439  * Map the given PA to the appropriate VA if PHYSMEM_SETUP ioctl has already
440  * been called for this PA range.
441  * Returns 0 on success with the following error codes on failure:
442  *	EPERM - The requested page is long term locked, and thus repeated
443  *		requests to allocate this page will likely fail.
444  *	EAGAIN - The requested page could not be allocated, but it is believed
445  *		that future attempts could succeed.
446  *	ENOMEM - There was not enough free memory in the system to safely
447  *		map the requested page.
448  *	EINVAL - The requested paddr was not PAGESIZE aligned or the
449  *		PHYSMEM_SETUP ioctl was not called for this page.
450  *	ENOENT - The requested page was iniside the kernel cage, and the
451  *		PHYSMEM_CAGE flag was not set.
452  *	EBUSY - The requested page is retired and the PHYSMEM_RETIRE flag
453  *		was not set.
454  */
455 static int
456 physmem_map_addrs(struct physmem_map_param *pmpp)
457 {
458 	caddr_t uvaddr;
459 	page_t *pp;
460 	uint64_t req_paddr;
461 	struct vnode *vp;
462 	int ret = 0;
463 	struct physmem_hash *php;
464 	uint_t flags = 0;
465 
466 	ASSERT(pmpp != NULL);
467 	req_paddr = pmpp->req_paddr;
468 
469 	if (!IS_P2ALIGNED(req_paddr, PAGESIZE))
470 		return (EINVAL);
471 	/* Find the vnode for this map request */
472 	rw_enter(&pph_rwlock, RW_READER);
473 	php = physmem_get_hash(req_paddr, PAGESIZE, curproc);
474 	if (php == NULL) {
475 		rw_exit(&pph_rwlock);
476 		return (EINVAL);
477 	}
478 	vp = php->ph_vnode;
479 	uvaddr = php->ph_base_va + (req_paddr - php->ph_base_pa);
480 	rw_exit(&pph_rwlock);
481 
482 	pp = page_numtopp_nolock(btop((size_t)req_paddr));
483 	if (pp == NULL) {
484 		pmpp->ret_va = NULL;
485 		return (EPERM);
486 	}
487 
488 	/*
489 	 * Check to see if page already mapped correctly.  This can happen
490 	 * when we failed to capture a page previously and it was captured
491 	 * asynchronously for us.  Return success in this case.
492 	 */
493 	if (pp->p_vnode == vp) {
494 		ASSERT(pp->p_offset == (u_offset_t)req_paddr);
495 		pmpp->ret_va = (uint64_t)(uintptr_t)uvaddr;
496 		return (0);
497 	}
498 
499 	/*
500 	 * physmem should be responsible for checking for cage
501 	 * and prom pages.
502 	 */
503 	if (pmpp->flags & PHYSMEM_CAGE)
504 		flags = CAPTURE_GET_CAGE;
505 	if (pmpp->flags & PHYSMEM_RETIRED)
506 		flags |= CAPTURE_GET_RETIRED;
507 
508 	ret = page_trycapture(pp, 0, flags | CAPTURE_PHYSMEM, curproc);
509 
510 	if (ret != 0) {
511 		pmpp->ret_va = NULL;
512 		return (ret);
513 	} else {
514 		pmpp->ret_va = (uint64_t)(uintptr_t)uvaddr;
515 		return (0);
516 	}
517 }
518 
519 /*
520  * Map the given page into the process's address space if possible.
521  * We actually only hash the page in on the correct vnode as the page
522  * will be mapped via segvn_pagefault.
523  * returns 0 on success
524  * returns 1 if there is no need to map this page anymore (process exited)
525  * returns -1 if we failed to map the page.
526  */
527 int
528 map_page_proc(page_t *pp, void *arg, uint_t flags)
529 {
530 	struct vnode *vp;
531 	proc_t *procp = (proc_t *)arg;
532 	int ret;
533 	u_offset_t paddr = (u_offset_t)ptob(pp->p_pagenum);
534 	struct physmem_hash *php;
535 
536 	ASSERT(pp != NULL);
537 
538 	/*
539 	 * Check against availrmem to make sure that we're not low on memory.
540 	 * We check again here as ASYNC requests do not do this check elsewhere.
541 	 * We return 1 as we don't want the page to have the PR_CAPTURE bit
542 	 * set or be on the page capture hash.
543 	 */
544 	if (swapfs_minfree > availrmem + 1) {
545 		page_free(pp, 1);
546 		return (1);
547 	}
548 
549 	/*
550 	 * If this is an asynchronous request for the current process,
551 	 * we can not map the page as it's possible that we are also in the
552 	 * process of unmapping the page which could result in a deadlock
553 	 * with the as lock.
554 	 */
555 	if ((flags & CAPTURE_ASYNC) && (curproc == procp)) {
556 		page_free(pp, 1);
557 		return (-1);
558 	}
559 
560 	/* only return zeroed out pages */
561 	pagezero(pp, 0, PAGESIZE);
562 
563 	rw_enter(&pph_rwlock, RW_READER);
564 	php = physmem_get_hash(paddr, PAGESIZE, procp);
565 	if (php == NULL) {
566 		rw_exit(&pph_rwlock);
567 		/*
568 		 * Free the page as there is no longer a valid outstanding
569 		 * request for this page.
570 		 */
571 		page_free(pp, 1);
572 		return (1);
573 	}
574 
575 	vp = php->ph_vnode;
576 
577 	/*
578 	 * We need to protect against a possible deadlock here where we own
579 	 * the vnode page hash mutex and want to acquire it again as there
580 	 * are locations in the code, where we unlock a page while holding
581 	 * the mutex which can lead to the page being captured and eventually
582 	 * end up here.
583 	 */
584 	if (mutex_owned(page_vnode_mutex(vp))) {
585 		rw_exit(&pph_rwlock);
586 		page_free(pp, 1);
587 		return (-1);
588 	}
589 
590 	ret = page_hashin(pp, vp, paddr, NULL);
591 	rw_exit(&pph_rwlock);
592 	if (ret == 0) {
593 		page_free(pp, 1);
594 		return (-1);
595 	}
596 
597 	page_downgrade(pp);
598 
599 	mutex_enter(&freemem_lock);
600 	availrmem--;
601 	mutex_exit(&freemem_lock);
602 
603 	return (0);
604 }
605 
606 /*
607  * The guts of the PHYSMEM_DESTROY ioctl.
608  * The cookie passed in will provide all of the information needed to
609  * free up the address space and physical memory associated with the
610  * corresponding PHSYMEM_SETUP ioctl.
611  * Returns 0 on success with the following error codes on failure:
612  *	EINVAL - The cookie supplied is not valid.
613  */
614 int
615 physmem_destroy_addrs(uint64_t p_cookie)
616 {
617 	struct as *as = curproc->p_as;
618 	size_t len;
619 	caddr_t uvaddr;
620 
621 	rw_enter(&pph_rwlock, RW_READER);
622 	if (physmem_validate_cookie(p_cookie) == 0) {
623 		rw_exit(&pph_rwlock);
624 		return (EINVAL);
625 	}
626 
627 	len = ((struct physmem_hash *)(uintptr_t)p_cookie)->ph_seg_len;
628 	uvaddr = ((struct physmem_hash *)(uintptr_t)p_cookie)->ph_base_va;
629 	rw_exit(&pph_rwlock);
630 
631 	(void) as_unmap(as, uvaddr, len);
632 
633 	return (0);
634 }
635 
636 /*
637  * If the page has been hashed into the physmem vnode, then just look it up
638  * and return it via pl, otherwise return ENOMEM as the map ioctl has not
639  * succeeded on the given page.
640  */
641 /*ARGSUSED*/
642 static int
643 physmem_getpage(struct vnode *vp, offset_t off, size_t len, uint_t *protp,
644     page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr, enum seg_rw rw,
645     struct cred *cr, caller_context_t *ct)
646 {
647 	page_t *pp;
648 
649 	ASSERT(len == PAGESIZE);
650 	ASSERT(AS_READ_HELD(seg->s_as, &seg->s_as->a_lock));
651 
652 	/*
653 	 * If the page is in the hash, then we successfully claimed this
654 	 * page earlier, so return it to the caller.
655 	 */
656 	pp = page_lookup(vp, off, SE_SHARED);
657 	if (pp != NULL) {
658 		pl[0] = pp;
659 		pl[1] = NULL;
660 		*protp = PROT_ALL;
661 		return (0);
662 	}
663 	return (ENOMEM);
664 }
665 
666 /*
667  * We can not allow a process mapping /dev/physmem pages to fork as there can
668  * only be a single mapping to a /dev/physmem page at a given time.  Thus, the
669  * return of EINVAL when we are not working on our own address space.
670  * Otherwise we return zero as this function is required for normal operation.
671  */
672 /*ARGSUSED*/
673 static int
674 physmem_addmap(struct vnode *vp, offset_t off, struct as *as,
675     caddr_t addr, size_t len, uchar_t prot, uchar_t maxprot, uint_t flags,
676     struct cred *cred, caller_context_t *ct)
677 {
678 	if (curproc->p_as != as) {
679 		return (EINVAL);
680 	}
681 	return (0);
682 }
683 
684 /* Will always get called for removing a whole segment. */
685 /*ARGSUSED*/
686 static int
687 physmem_delmap(struct vnode *vp, offset_t off, struct as *as,
688     caddr_t addr, size_t len, uint_t prot, uint_t maxprot, uint_t flags,
689     struct cred *cred, caller_context_t *ct)
690 {
691 	/*
692 	 * Release our hold on the vnode so that the final VN_RELE will
693 	 * call physmem_inactive to clean things up.
694 	 */
695 	VN_RELE(vp);
696 
697 	return (0);
698 }
699 
700 /*
701  * Clean up all the pages belonging to this vnode and then free it.
702  */
703 /*ARGSUSED*/
704 static void
705 physmem_inactive(vnode_t *vp, cred_t *crp, caller_context_t *ct)
706 {
707 	page_t *pp;
708 
709 	/*
710 	 * Remove the vnode from the hash now, to prevent asynchronous
711 	 * attempts to map into this vnode.  This avoids a deadlock
712 	 * where two threads try to get into this logic at the same
713 	 * time and try to map the pages they are destroying into the
714 	 * other's address space.
715 	 * If it's not in the hash, just free it.
716 	 */
717 	if (physmem_remove_vnode_hash(vp) == 0) {
718 		ASSERT(vp->v_pages == NULL);
719 		vn_free(vp);
720 		physmem_remove_hash_proc();
721 		mutex_enter(&physmem_mutex);
722 		physmem_vnodecnt--;
723 		mutex_exit(&physmem_mutex);
724 		return;
725 	}
726 
727 	/*
728 	 * At this point in time, no other logic can be adding or removing
729 	 * pages from the vnode, otherwise the v_pages list could be inaccurate.
730 	 */
731 
732 	while ((pp = vp->v_pages) != NULL) {
733 		page_t *rpp;
734 		if (page_tryupgrade(pp)) {
735 			/*
736 			 * set lckcnt for page_destroy to do availrmem
737 			 * accounting
738 			 */
739 			pp->p_lckcnt = 1;
740 			page_destroy(pp, 0);
741 		} else {
742 			/* failure to lock should be transient */
743 			rpp = page_lookup(vp, ptob(pp->p_pagenum), SE_SHARED);
744 			if (rpp != pp) {
745 				page_unlock(rpp);
746 				continue;
747 			}
748 			page_unlock(pp);
749 		}
750 	}
751 	vn_free(vp);
752 	physmem_remove_hash_proc();
753 	mutex_enter(&physmem_mutex);
754 	physmem_vnodecnt--;
755 	mutex_exit(&physmem_mutex);
756 }
757 
758 /*ARGSUSED*/
759 static int
760 physmem_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
761     int *rvalp)
762 {
763 	int ret;
764 
765 	switch (cmd) {
766 	case PHYSMEM_SETUP:
767 		{
768 			struct physmem_setup_param psp;
769 			if (ddi_copyin((void *)arg, &psp,
770 			    sizeof (struct physmem_setup_param), 0))
771 				return (EFAULT);
772 			ret = physmem_setup_addrs(&psp);
773 			if (ddi_copyout(&psp, (void *)arg, sizeof (psp), 0))
774 				return (EFAULT);
775 		}
776 		break;
777 	case PHYSMEM_MAP:
778 		{
779 			struct physmem_map_param pmp;
780 			if (ddi_copyin((void *)arg, &pmp,
781 			    sizeof (struct physmem_map_param), 0))
782 				return (EFAULT);
783 			ret = physmem_map_addrs(&pmp);
784 			if (ddi_copyout(&pmp, (void *)arg, sizeof (pmp), 0))
785 				return (EFAULT);
786 		}
787 		break;
788 	case PHYSMEM_DESTROY:
789 		{
790 			uint64_t cookie;
791 			if (ddi_copyin((void *)arg, &cookie,
792 			    sizeof (uint64_t), 0))
793 				return (EFAULT);
794 			ret = physmem_destroy_addrs(cookie);
795 		}
796 		break;
797 	default:
798 		return (ENOTSUP);
799 	}
800 	return (ret);
801 }
802 
803 /*ARGSUSED*/
804 static int
805 physmem_open(dev_t *devp, int flag, int otyp, cred_t *credp)
806 {
807 	int ret;
808 	static int msg_printed = 0;
809 
810 	if ((flag & (FWRITE | FREAD)) != (FWRITE | FREAD)) {
811 		return (EINVAL);
812 	}
813 
814 	/* need to make sure we have the right privileges */
815 	if ((ret = secpolicy_resource(credp)) != 0)
816 		return (ret);
817 	if ((ret = secpolicy_lock_memory(credp)) != 0)
818 		return (ret);
819 
820 	if (msg_printed == 0) {
821 		cmn_err(CE_NOTE, "!driver has been opened. This driver may "
822 		    "take out long term locks on pages which may impact "
823 		    "dynamic reconfiguration events");
824 		msg_printed = 1;
825 	}
826 
827 	return (0);
828 }
829 
830 /*ARGSUSED*/
831 static int
832 physmem_close(dev_t dev, int flag, int otyp, cred_t *credp)
833 {
834 	return (0);
835 }
836 
837 /*ARGSUSED*/
838 static int
839 physmem_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd,
840     void *arg, void **resultp)
841 {
842 	switch (infocmd) {
843 	case DDI_INFO_DEVT2DEVINFO:
844 		*resultp = physmem_dip;
845 		return (DDI_SUCCESS);
846 
847 	case DDI_INFO_DEVT2INSTANCE:
848 		*resultp = (void *)(ulong_t)getminor((dev_t)arg);
849 		return (DDI_SUCCESS);
850 
851 	default:
852 		return (DDI_FAILURE);
853 	}
854 }
855 
856 static int
857 physmem_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
858 {
859 	int i;
860 
861 	if (cmd == DDI_RESUME) {
862 		return (DDI_SUCCESS);
863 	}
864 
865 	if (cmd != DDI_ATTACH)
866 		return (DDI_FAILURE);
867 
868 	if (ddi_create_minor_node(dip, ddi_get_name(dip), S_IFCHR,
869 	    ddi_get_instance(dip), DDI_PSEUDO, 0) != DDI_SUCCESS)
870 		return (DDI_FAILURE);
871 
872 	physmem_dip = dip;
873 
874 	/* Initialize driver specific data */
875 	if (physmem_setup_vnops()) {
876 		ddi_remove_minor_node(dip, ddi_get_name(dip));
877 		return (DDI_FAILURE);
878 	}
879 
880 	for (i = 0; i < PPH_SIZE; i++)
881 		pph[i] = NULL;
882 
883 	page_capture_register_callback(PC_PHYSMEM, 10000,
884 	    map_page_proc);
885 
886 	return (DDI_SUCCESS);
887 }
888 
889 static int
890 physmem_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
891 {
892 	int ret = DDI_SUCCESS;
893 
894 	if (cmd == DDI_SUSPEND) {
895 		return (DDI_SUCCESS);
896 	}
897 
898 	if (cmd != DDI_DETACH)
899 		return (DDI_FAILURE);
900 
901 	ASSERT(physmem_dip == dip);
902 
903 	mutex_enter(&physmem_mutex);
904 	if (physmem_vnodecnt == 0) {
905 		if (physmem_vnodeops != NULL) {
906 			vn_freevnodeops(physmem_vnodeops);
907 			physmem_vnodeops = NULL;
908 			page_capture_unregister_callback(PC_PHYSMEM);
909 		}
910 	} else {
911 		ret = EBUSY;
912 	}
913 	mutex_exit(&physmem_mutex);
914 	if (ret == DDI_SUCCESS)
915 		ddi_remove_minor_node(dip, ddi_get_name(dip));
916 	return (ret);
917 }
918 
919 static struct cb_ops physmem_cb_ops = {
920 	physmem_open,	/* open */
921 	physmem_close,	/* close */
922 	nodev,		/* strategy */
923 	nodev,		/* print */
924 	nodev,		/* dump */
925 	nodev,		/* read */
926 	nodev,		/* write */
927 	physmem_ioctl,	/* ioctl */
928 	nodev,		/* devmap */
929 	nodev,		/* mmap */
930 	nodev,		/* segmap */
931 	nochpoll,	/* chpoll */
932 	ddi_prop_op,	/* prop_op */
933 	NULL,		/* cb_str */
934 	D_NEW | D_MP | D_DEVMAP,
935 	CB_REV,
936 	NULL,
937 	NULL
938 };
939 
940 static struct dev_ops physmem_ops = {
941 	DEVO_REV,
942 	0,
943 	physmem_getinfo,
944 	nulldev,
945 	nulldev,
946 	physmem_attach,
947 	physmem_detach,
948 	nodev,
949 	&physmem_cb_ops,
950 	NULL,
951 	NULL,
952 	ddi_quiesce_not_needed,		/* quiesce */
953 };
954 
955 static struct modldrv modldrv = {
956 	&mod_driverops,
957 	"physmem driver",
958 	&physmem_ops
959 };
960 
961 static struct modlinkage modlinkage = {
962 	MODREV_1,
963 	&modldrv,
964 	NULL
965 };
966 
967 int
968 _init(void)
969 {
970 	return (mod_install(&modlinkage));
971 }
972 
973 int
974 _info(struct modinfo *modinfop)
975 {
976 	return (mod_info(&modlinkage, modinfop));
977 }
978 
979 int
980 _fini(void)
981 {
982 	return (mod_remove(&modlinkage));
983 }
984