1 /*
2 * This file and its contents are supplied under the terms of the
3 * Common Development and Distribution License ("CDDL"), version 1.0.
4 * You may only use this file in accordance with the terms of version
5 * 1.0 of the CDDL.
6 *
7 * A full copy of the text of the CDDL should have accompanied this
8 * source. A copy of the CDDL is also available via the Internet at
9 * http://www.illumos.org/license/CDDL.
10 */
11
12 /*
13 * Copyright 2018 Joyent, Inc.
14 * Copyright 2021 Oxide Computer Company
15 */
16
/*
 * segvmm - Virtual-Machine-Memory segment
 *
 * The vmm segment driver was designed for mapping regions of kernel memory
 * allocated to an HVM instance into userspace for manipulation there.  It
 * draws direct lineage from the umap segment driver, but is meant for larger
 * mappings with fewer restrictions.
 *
 * seg*k*vmm, in contrast, has mappings for every VMM into kas.  We use its
 * mappings here only to find the relevant PFNs in segvmm_fault_in().
 */
28
29
30 #include <sys/types.h>
31 #include <sys/param.h>
32 #include <sys/errno.h>
33 #include <sys/cred.h>
34 #include <sys/kmem.h>
35 #include <sys/lgrp.h>
36 #include <sys/mman.h>
37
38 #include <vm/hat.h>
39 #include <vm/hat_pte.h>
40 #include <vm/htable.h>
41 #include <vm/as.h>
42 #include <vm/seg.h>
43 #include <vm/seg_kmem.h>
44
45 #include <sys/seg_vmm.h>
46
/*
 * Per-segment private data, hung off seg->s_data.  Exactly one of svmd_vmo
 * and svmd_vmc is set (enforced by the VERIFY in segvmm_create()).
 */
typedef struct segvmm_data {
	/* Taken as writer by segvmm_fault(), as reader by the prot queries */
	krwlock_t	svmd_lock;
	/* Backing vm_object for direct mappings; the segment holds a ref */
	vm_object_t	*svmd_vmo;
	/* vm client for vmspace mappings; destroyed in segvmm_free() */
	vm_client_t	*svmd_vmc;
	/* Page-aligned offset in the backing store that maps to s_base */
	uintptr_t	svmd_off;
	/* Fixed protection for the segment, stored without PROT_USER */
	uchar_t		svmd_prot;
	/* Pages currently accounted to F_SOFTLOCK faults */
	size_t		svmd_softlockcnt;
} segvmm_data_t;
55
56
57 static int segvmm_dup(struct seg *, struct seg *);
58 static int segvmm_unmap(struct seg *, caddr_t, size_t);
59 static void segvmm_free(struct seg *);
60 static faultcode_t segvmm_fault(struct hat *, struct seg *, caddr_t, size_t,
61 enum fault_type, enum seg_rw);
62 static faultcode_t segvmm_faulta(struct seg *, caddr_t);
63 static int segvmm_setprot(struct seg *, caddr_t, size_t, uint_t);
64 static int segvmm_checkprot(struct seg *, caddr_t, size_t, uint_t);
65 static int segvmm_sync(struct seg *, caddr_t, size_t, int, uint_t);
66 static size_t segvmm_incore(struct seg *, caddr_t, size_t, char *);
67 static int segvmm_lockop(struct seg *, caddr_t, size_t, int, int, ulong_t *,
68 size_t);
69 static int segvmm_getprot(struct seg *, caddr_t, size_t, uint_t *);
70 static u_offset_t segvmm_getoffset(struct seg *, caddr_t);
71 static int segvmm_gettype(struct seg *, caddr_t);
72 static int segvmm_getvp(struct seg *, caddr_t, struct vnode **);
73 static int segvmm_advise(struct seg *, caddr_t, size_t, uint_t);
74 static void segvmm_dump(struct seg *);
75 static int segvmm_pagelock(struct seg *, caddr_t, size_t, struct page ***,
76 enum lock_type, enum seg_rw);
77 static int segvmm_setpagesize(struct seg *, caddr_t, size_t, uint_t);
78 static int segvmm_getmemid(struct seg *, caddr_t, memid_t *);
79 static int segvmm_capable(struct seg *, segcapability_t);
80
/*
 * Segment operations vector installed on every segvmm segment by
 * segvmm_create() (and copied to the child by segvmm_dup()).
 */
static struct seg_ops segvmm_ops = {
	.dup		= segvmm_dup,
	.unmap		= segvmm_unmap,
	.free		= segvmm_free,
	.fault		= segvmm_fault,
	.faulta		= segvmm_faulta,
	.setprot	= segvmm_setprot,
	.checkprot	= segvmm_checkprot,
	.kluster	= NULL,		/* no handler provided */
	.swapout	= NULL,		/* no handler provided */
	.sync		= segvmm_sync,
	.incore		= segvmm_incore,
	.lockop		= segvmm_lockop,
	.getprot	= segvmm_getprot,
	.getoffset	= segvmm_getoffset,
	.gettype	= segvmm_gettype,
	.getvp		= segvmm_getvp,
	.advise		= segvmm_advise,
	.dump		= segvmm_dump,
	.pagelock	= segvmm_pagelock,
	.setpagesize	= segvmm_setpagesize,
	.getmemid	= segvmm_getmemid,
	.getpolicy	= NULL,		/* no handler provided */
	.capable	= segvmm_capable,
	.inherit	= seg_inherit_notsup
};
107
108 /*
109 * Unload a region from the HAT for A/D tracking.
110 */
111 static void
segvmm_invalidate(void * arg,uintptr_t gpa,size_t sz)112 segvmm_invalidate(void *arg, uintptr_t gpa, size_t sz)
113 {
114 struct seg *seg = arg;
115 segvmm_data_t *svmd = seg->s_data;
116
117 /*
118 * Invalidations are only necessary (and configured) for vmspace
119 * mappings. Direct vm_object mappings are not involved.
120 */
121 ASSERT3P(svmd->svmd_vmo, ==, NULL);
122
123 /*
124 * The region being invalidated may overlap with all, some, or none of
125 * this segment. We are only concerned about that overlap.
126 */
127 const uintptr_t start = MAX(gpa, svmd->svmd_off);
128 const uintptr_t end = MIN(gpa + sz, svmd->svmd_off + seg->s_size);
129 if (start >= end) {
130 return;
131 }
132 ASSERT(start >= svmd->svmd_off && end <= svmd->svmd_off + seg->s_size);
133 ASSERT(start >= gpa && end <= gpa + sz);
134 const caddr_t unload_va = seg->s_base + (start - svmd->svmd_off);
135 const size_t unload_sz = (end - start);
136 ASSERT3U(unload_sz, <=, seg->s_size);
137
138 hat_unload(seg->s_as->a_hat, unload_va, unload_sz, HAT_UNLOAD);
139 }
140
141 /*
142 * Create a VMM-memory-backed segment.
143 */
144 int
segvmm_create(struct seg ** segpp,void * argsp)145 segvmm_create(struct seg **segpp, void *argsp)
146 {
147 struct seg *seg = *segpp;
148 segvmm_crargs_t *cra = argsp;
149 segvmm_data_t *data;
150
151 VERIFY((cra->vmo == NULL && cra->vmc != NULL) ||
152 (cra->vmo != NULL && cra->vmc == NULL));
153 VERIFY(cra->prot & PROT_USER);
154 VERIFY0(cra->offset & PAGEOFFSET);
155
156 data = kmem_zalloc(sizeof (*data), KM_SLEEP);
157 rw_init(&data->svmd_lock, NULL, RW_DEFAULT, NULL);
158 data->svmd_off = cra->offset;
159 data->svmd_prot = cra->prot & ~PROT_USER;
160
161 seg->s_ops = &segvmm_ops;
162 seg->s_data = data;
163
164 if (cra->vmo != NULL) {
165 data->svmd_vmo = cra->vmo;
166 /* Grab a hold on the VM object for the lifetime of segment */
167 vm_object_reference(data->svmd_vmo);
168 } else {
169 int err;
170
171 data->svmd_vmc = cra->vmc;
172 err = vmc_set_inval_cb(data->svmd_vmc, segvmm_invalidate, seg);
173 if (err != 0) {
174 seg->s_ops = NULL;
175 seg->s_data = NULL;
176 kmem_free(data, sizeof (*data));
177 return (err);
178 }
179 }
180 return (0);
181 }
182
183 static int
segvmm_dup(struct seg * seg,struct seg * newseg)184 segvmm_dup(struct seg *seg, struct seg *newseg)
185 {
186 segvmm_data_t *svmd = seg->s_data;
187 segvmm_data_t *newsvmd;
188
189 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as));
190
191 newsvmd = kmem_zalloc(sizeof (segvmm_data_t), KM_SLEEP);
192 rw_init(&newsvmd->svmd_lock, NULL, RW_DEFAULT, NULL);
193 newsvmd->svmd_off = svmd->svmd_off;
194 newsvmd->svmd_prot = svmd->svmd_prot;
195
196 newseg->s_ops = seg->s_ops;
197 newseg->s_data = newsvmd;
198
199 if (svmd->svmd_vmo != NULL) {
200 /* Grab another hold for the duplicate segment */
201 vm_object_reference(svmd->svmd_vmo);
202 newsvmd->svmd_vmo = svmd->svmd_vmo;
203 } else {
204 int err;
205
206 newsvmd->svmd_vmc = vmc_clone(svmd->svmd_vmc);
207 /*
208 * The cloned client does not inherit the invalidation
209 * configuration, so attempt to set it here for the new segment.
210 */
211 err = vmc_set_inval_cb(newsvmd->svmd_vmc, segvmm_invalidate,
212 newseg);
213 if (err != 0) {
214 newseg->s_ops = NULL;
215 newseg->s_data = NULL;
216 kmem_free(newsvmd, sizeof (*newsvmd));
217 return (err);
218 }
219 }
220
221 return (0);
222 }
223
224 static int
segvmm_unmap(struct seg * seg,caddr_t addr,size_t len)225 segvmm_unmap(struct seg *seg, caddr_t addr, size_t len)
226 {
227 segvmm_data_t *svmd = seg->s_data;
228
229 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as));
230
231 /* Only allow unmap of entire segment */
232 if (addr != seg->s_base || len != seg->s_size) {
233 return (EINVAL);
234 }
235 if (svmd->svmd_softlockcnt != 0) {
236 return (EAGAIN);
237 }
238
239 /* Unconditionally unload the entire segment range. */
240 hat_unload(seg->s_as->a_hat, addr, len, HAT_UNLOAD_UNMAP);
241
242 seg_free(seg);
243 return (0);
244 }
245
246 static void
segvmm_free(struct seg * seg)247 segvmm_free(struct seg *seg)
248 {
249 segvmm_data_t *svmd = seg->s_data;
250
251 ASSERT(svmd != NULL);
252
253 if (svmd->svmd_vmo != NULL) {
254 /* Release the VM object hold this segment possessed */
255 vm_object_release(svmd->svmd_vmo);
256 svmd->svmd_vmo = NULL;
257 } else {
258 vmc_destroy(svmd->svmd_vmc);
259 svmd->svmd_vmc = NULL;
260 }
261 rw_destroy(&svmd->svmd_lock);
262 VERIFY(svmd->svmd_softlockcnt == 0);
263 kmem_free(svmd, sizeof (*svmd));
264 seg->s_data = NULL;
265 }
266
267 static int
segvmm_fault_obj(struct hat * hat,struct seg * seg,uintptr_t va,size_t len)268 segvmm_fault_obj(struct hat *hat, struct seg *seg, uintptr_t va, size_t len)
269 {
270 segvmm_data_t *svmd = seg->s_data;
271 const uintptr_t end = va + len;
272 const int prot = svmd->svmd_prot;
273 const int uprot = prot | PROT_USER;
274 vm_object_t *vmo = svmd->svmd_vmo;
275
276 ASSERT(vmo != NULL);
277
278 va &= PAGEMASK;
279 uintptr_t off = va - (uintptr_t)seg->s_base + svmd->svmd_off;
280 do {
281 pfn_t pfn;
282
283 pfn = vm_object_pfn(vmo, off);
284 if (pfn == PFN_INVALID) {
285 return (FC_NOMAP);
286 }
287
288 /* Ignore any large-page possibilities for now */
289 hat_devload(hat, (caddr_t)va, PAGESIZE, pfn, uprot, HAT_LOAD);
290 va += PAGESIZE;
291 off += PAGESIZE;
292 } while (va < end);
293
294 return (0);
295 }
296
297 static int
segvmm_fault_space(struct hat * hat,struct seg * seg,uintptr_t va,size_t len)298 segvmm_fault_space(struct hat *hat, struct seg *seg, uintptr_t va, size_t len)
299 {
300 segvmm_data_t *svmd = seg->s_data;
301 const uintptr_t end = va + len;
302 const int prot = svmd->svmd_prot;
303 const int uprot = prot | PROT_USER;
304 vm_client_t *vmc = svmd->svmd_vmc;
305
306 ASSERT(vmc != NULL);
307
308 va &= PAGEMASK;
309 uintptr_t off = va - (uintptr_t)seg->s_base + svmd->svmd_off;
310
311 do {
312 vm_page_t *vmp;
313 pfn_t pfn;
314
315 vmp = vmc_hold(vmc, off, prot);
316 if (vmp == NULL) {
317 return (FC_NOMAP);
318 }
319
320 pfn = vmp_get_pfn(vmp);
321 ASSERT3U(pfn, !=, PFN_INVALID);
322
323 /* Ignore any large-page possibilities for now */
324 hat_devload(hat, (caddr_t)va, PAGESIZE, pfn, uprot, HAT_LOAD);
325
326 if (vmp_release(vmp)) {
327 /*
328 * Region was unmapped from vmspace while we were
329 * loading it into this AS. Communicate it as if it
330 * were a fault.
331 */
332 hat_unload(hat, (caddr_t)va, PAGESIZE, HAT_UNLOAD);
333 return (FC_NOMAP);
334 }
335
336 va += PAGESIZE;
337 off += PAGESIZE;
338 } while (va < end);
339
340 return (0);
341 }
342
343 /* ARGSUSED */
344 static faultcode_t
segvmm_fault(struct hat * hat,struct seg * seg,caddr_t addr,size_t len,enum fault_type type,enum seg_rw rw)345 segvmm_fault(struct hat *hat, struct seg *seg, caddr_t addr, size_t len,
346 enum fault_type type, enum seg_rw rw)
347 {
348 segvmm_data_t *svmd = seg->s_data;
349 int err = 0;
350
351 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
352
353 if (type == F_PROT) {
354 /*
355 * Since protection on the segment is fixed, there is nothing
356 * to do but report an error for protection faults.
357 */
358 return (FC_PROT);
359 } else if (type == F_SOFTUNLOCK) {
360 size_t plen = btop(len);
361
362 rw_enter(&svmd->svmd_lock, RW_WRITER);
363 VERIFY(svmd->svmd_softlockcnt >= plen);
364 svmd->svmd_softlockcnt -= plen;
365 rw_exit(&svmd->svmd_lock);
366 return (0);
367 }
368
369 VERIFY(type == F_INVAL || type == F_SOFTLOCK);
370 rw_enter(&svmd->svmd_lock, RW_WRITER);
371
372 if (svmd->svmd_vmo != NULL) {
373 err = segvmm_fault_obj(hat, seg, (uintptr_t)addr, len);
374 } else {
375 err = segvmm_fault_space(hat, seg, (uintptr_t)addr, len);
376 }
377 if (type == F_SOFTLOCK && err == 0) {
378 size_t nval = svmd->svmd_softlockcnt + btop(len);
379
380 if (svmd->svmd_softlockcnt >= nval) {
381 rw_exit(&svmd->svmd_lock);
382 return (FC_MAKE_ERR(EOVERFLOW));
383 }
384 svmd->svmd_softlockcnt = nval;
385 }
386
387 rw_exit(&svmd->svmd_lock);
388 return (err);
389 }
390
391 /* ARGSUSED */
392 static faultcode_t
segvmm_faulta(struct seg * seg,caddr_t addr)393 segvmm_faulta(struct seg *seg, caddr_t addr)
394 {
395 /* Do nothing since asynch pagefault should not load translation. */
396 return (0);
397 }
398
399 /* ARGSUSED */
400 static int
segvmm_setprot(struct seg * seg,caddr_t addr,size_t len,uint_t prot)401 segvmm_setprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot)
402 {
403 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
404
405 /* The seg_vmm driver does not yet allow protection to be changed. */
406 return (EACCES);
407 }
408
409 /* ARGSUSED */
410 static int
segvmm_checkprot(struct seg * seg,caddr_t addr,size_t len,uint_t prot)411 segvmm_checkprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot)
412 {
413 segvmm_data_t *svmd = seg->s_data;
414 int error = 0;
415
416 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
417
418 rw_enter(&svmd->svmd_lock, RW_READER);
419 if ((svmd->svmd_prot & prot) != prot) {
420 error = EACCES;
421 }
422 rw_exit(&svmd->svmd_lock);
423 return (error);
424 }
425
426 /* ARGSUSED */
427 static int
segvmm_sync(struct seg * seg,caddr_t addr,size_t len,int attr,uint_t flags)428 segvmm_sync(struct seg *seg, caddr_t addr, size_t len, int attr, uint_t flags)
429 {
430 /* Always succeed since there are no backing store to sync */
431 return (0);
432 }
433
434 /* ARGSUSED */
435 static size_t
segvmm_incore(struct seg * seg,caddr_t addr,size_t len,char * vec)436 segvmm_incore(struct seg *seg, caddr_t addr, size_t len, char *vec)
437 {
438 size_t sz = 0;
439
440 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
441
442 len = (len + PAGEOFFSET) & PAGEMASK;
443 while (len > 0) {
444 *vec = 1;
445 sz += PAGESIZE;
446 vec++;
447 len -= PAGESIZE;
448 }
449 return (sz);
450 }
451
452 /* ARGSUSED */
453 static int
segvmm_lockop(struct seg * seg,caddr_t addr,size_t len,int attr,int op,ulong_t * lockmap,size_t pos)454 segvmm_lockop(struct seg *seg, caddr_t addr, size_t len, int attr, int op,
455 ulong_t *lockmap, size_t pos)
456 {
457 /* Report success since kernel pages are always in memory. */
458 return (0);
459 }
460
461 static int
segvmm_getprot(struct seg * seg,caddr_t addr,size_t len,uint_t * protv)462 segvmm_getprot(struct seg *seg, caddr_t addr, size_t len, uint_t *protv)
463 {
464 segvmm_data_t *svmd = seg->s_data;
465 size_t pgno;
466 uint_t prot;
467
468 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
469
470 rw_enter(&svmd->svmd_lock, RW_READER);
471 prot = svmd->svmd_prot;
472 rw_exit(&svmd->svmd_lock);
473
474 /*
475 * Reporting protection is simple since it is not tracked per-page.
476 */
477 pgno = seg_page(seg, addr + len) - seg_page(seg, addr) + 1;
478 while (pgno > 0) {
479 protv[--pgno] = prot;
480 }
481 return (0);
482 }
483
484 /* ARGSUSED */
485 static u_offset_t
segvmm_getoffset(struct seg * seg,caddr_t addr)486 segvmm_getoffset(struct seg *seg, caddr_t addr)
487 {
488 /*
489 * To avoid leaking information about the layout of the kernel address
490 * space, always report '0' as the offset.
491 */
492 return (0);
493 }
494
495 /* ARGSUSED */
496 static int
segvmm_gettype(struct seg * seg,caddr_t addr)497 segvmm_gettype(struct seg *seg, caddr_t addr)
498 {
499 /*
500 * Since already-existing vmm reservoir pages are being mapped into
501 * userspace, always report the segment type as shared.
502 */
503 return (MAP_SHARED);
504 }
505
506 /* ARGSUSED */
507 static int
segvmm_getvp(struct seg * seg,caddr_t addr,struct vnode ** vpp)508 segvmm_getvp(struct seg *seg, caddr_t addr, struct vnode **vpp)
509 {
510 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
511
512 *vpp = NULL;
513 return (0);
514 }
515
516 /* ARGSUSED */
517 static int
segvmm_advise(struct seg * seg,caddr_t addr,size_t len,uint_t behav)518 segvmm_advise(struct seg *seg, caddr_t addr, size_t len, uint_t behav)
519 {
520 if (behav == MADV_PURGE) {
521 /* Purge does not make sense for this mapping */
522 return (EINVAL);
523 }
524 /* Indicate success for everything else. */
525 return (0);
526 }
527
/*
 * seg_ops dump entry point.  The segment only exposes kernel data to
 * userspace, so there is nothing additional to dump.
 */
/* ARGSUSED */
static void
segvmm_dump(struct seg *seg)
{
}
537
538 /* ARGSUSED */
539 static int
segvmm_pagelock(struct seg * seg,caddr_t addr,size_t len,struct page *** ppp,enum lock_type type,enum seg_rw rw)540 segvmm_pagelock(struct seg *seg, caddr_t addr, size_t len, struct page ***ppp,
541 enum lock_type type, enum seg_rw rw)
542 {
543 return (ENOTSUP);
544 }
545
546 /* ARGSUSED */
547 static int
segvmm_setpagesize(struct seg * seg,caddr_t addr,size_t len,uint_t szc)548 segvmm_setpagesize(struct seg *seg, caddr_t addr, size_t len, uint_t szc)
549 {
550 return (ENOTSUP);
551 }
552
553 static int
segvmm_getmemid(struct seg * seg,caddr_t addr,memid_t * memidp)554 segvmm_getmemid(struct seg *seg, caddr_t addr, memid_t *memidp)
555 {
556 segvmm_data_t *svmd = seg->s_data;
557
558 memidp->val[0] = (uintptr_t)svmd->svmd_vmo;
559 memidp->val[1] = (uintptr_t)(addr - seg->s_base) + svmd->svmd_off;
560 return (0);
561 }
562
563 /* ARGSUSED */
564 static int
segvmm_capable(struct seg * seg,segcapability_t capability)565 segvmm_capable(struct seg *seg, segcapability_t capability)
566 {
567 /* no special capablities */
568 return (0);
569 }
570