1 /*
2 * Copyright (c) 2004 Poul-Henning Kamp
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 *
26 * $FreeBSD: head/sys/kern/subr_unit.c 255057 2013-08-30 07:37:45Z kib $
27 */
28 /*
29 * This file and its contents are supplied under the terms of the
30 * Common Development and Distribution License ("CDDL"), version 1.0.
31 * You may only use this file in accordance with the terms of version
32 * 1.0 of the CDDL.
33 *
34 * A full copy of the text of the CDDL should have accompanied this
35 * source. A copy of the CDDL is also available via the Internet at
36 * http://www.illumos.org/license/CDDL.
37 */
38 /* This file is dual-licensed; see usr/src/contrib/bhyve/LICENSE */
39
40 /*
41 * Copyright 2014 Pluribus Networks Inc.
42 * Copyright 2019 Joyent, Inc.
43 * Copyright 2020 Oxide Computer Company
44 */
45
46 #include <sys/types.h>
47 #include <sys/archsystm.h>
48 #include <sys/cpuset.h>
49 #include <sys/fp.h>
50 #include <sys/kmem.h>
51 #include <sys/queue.h>
52 #include <sys/spl.h>
53 #include <sys/systm.h>
54 #include <sys/ddidmareq.h>
55 #include <sys/id_space.h>
56 #include <sys/psm_defs.h>
57 #include <sys/smp_impldefs.h>
58 #include <sys/modhash.h>
59 #include <sys/hma.h>
60
61 #include <sys/x86_archext.h>
62
63 #include <machine/cpufunc.h>
64 #include <machine/md_var.h>
65 #include <machine/specialreg.h>
66 #include <machine/vmm.h>
67 #include <machine/vmparam.h>
68 #include <sys/vmm_impl.h>
69 #include <sys/kernel.h>
70
71 #include <vm/as.h>
72 #include <vm/seg_kmem.h>
73
74
75 static void vmm_tsc_init(void);
76
77 SET_DECLARE(sysinit_set, struct sysinit);
78
79 void
sysinit(void)80 sysinit(void)
81 {
82 struct sysinit **si;
83
84 SET_FOREACH(si, sysinit_set)
85 (*si)->func((*si)->data);
86 }
87
88 void
invalidate_cache_all(void)89 invalidate_cache_all(void)
90 {
91 cpuset_t cpuset;
92
93 kpreempt_disable();
94 cpuset_all_but(&cpuset, CPU->cpu_id);
95 xc_call((xc_arg_t)NULL, (xc_arg_t)NULL, (xc_arg_t)NULL,
96 CPUSET2BV(cpuset), (xc_func_t)invalidate_cache);
97 invalidate_cache();
98 kpreempt_enable();
99 }
100
101 vm_paddr_t
vtophys(void * va)102 vtophys(void *va)
103 {
104 pfn_t pfn;
105
106 /*
107 * Since hat_getpfnum() may block on an htable mutex, this is not at
108 * all safe to run from a critical_enter/kpreempt_disable context.
109 * The FreeBSD analog does not have the same locking constraints, so
110 * close attention must be paid wherever this is called.
111 */
112 ASSERT(curthread->t_preempt == 0);
113
114 pfn = hat_getpfnum(kas.a_hat, (caddr_t)va);
115 ASSERT(pfn != PFN_INVALID);
116 return (pfn << PAGE_SHIFT) | ((uintptr_t)va & PAGE_MASK);
117 }
118
119 int
cpusetobj_ffs(const cpuset_t * set)120 cpusetobj_ffs(const cpuset_t *set)
121 {
122 uint_t large, small;
123
124 /*
125 * Rather than reaching into the cpuset_t ourselves, leave that task to
126 * cpuset_bounds(). The simplicity is worth the extra wasted work to
127 * find the upper bound.
128 */
129 cpuset_bounds(set, &small, &large);
130
131 if (small == CPUSET_NOTINSET) {
132 /* The FreeBSD version returns 0 if it find nothing */
133 return (0);
134 }
135
136 ASSERT3U(small, <=, INT_MAX);
137
138 /* Least significant bit index starts at 1 for valid results */
139 return (small + 1);
140 }
141
142 struct vmm_ptp_item {
143 void *vpi_vaddr;
144 };
145 static kmutex_t vmm_ptp_lock;
146
147 static mod_hash_t *vmm_ptp_hash;
148 uint_t vmm_ptp_hash_nchains = 16381;
149 uint_t vmm_ptp_hash_size = PAGESIZE;
150
151 static void
vmm_ptp_hash_valdtor(mod_hash_val_t val)152 vmm_ptp_hash_valdtor(mod_hash_val_t val)
153 {
154 struct vmm_ptp_item *i = (struct vmm_ptp_item *)val;
155
156 kmem_free(i->vpi_vaddr, PAGE_SIZE);
157 kmem_free(i, sizeof (*i));
158 }
159
160 static void
vmm_ptp_init(void)161 vmm_ptp_init(void)
162 {
163 vmm_ptp_hash = mod_hash_create_ptrhash("vmm_ptp_hash",
164 vmm_ptp_hash_nchains, vmm_ptp_hash_valdtor, vmm_ptp_hash_size);
165
166 VERIFY(vmm_ptp_hash != NULL);
167 }
168
169 static uint_t
vmm_ptp_check(mod_hash_key_t key,mod_hash_val_t * val,void * unused)170 vmm_ptp_check(mod_hash_key_t key, mod_hash_val_t *val, void *unused)
171 {
172 struct vmm_ptp_item *i = (struct vmm_ptp_item *)val;
173
174 cmn_err(CE_PANIC, "!vmm_ptp_check: hash not empty: %p", i->vpi_vaddr);
175
176 return (MH_WALK_TERMINATE);
177 }
178
179 static void
vmm_ptp_cleanup(void)180 vmm_ptp_cleanup(void)
181 {
182 mod_hash_walk(vmm_ptp_hash, vmm_ptp_check, NULL);
183 mod_hash_destroy_ptrhash(vmm_ptp_hash);
184 }
185
186 /*
187 * The logic in VT-d uses both kernel-virtual and direct-mapped addresses when
188 * freeing PTP pages. Until the consuming code is improved to better track the
189 * pages it allocates, we keep the kernel-virtual addresses to those pages in a
190 * hash table for when they are freed.
191 */
192 void *
vmm_ptp_alloc(void)193 vmm_ptp_alloc(void)
194 {
195 void *p;
196 struct vmm_ptp_item *i;
197
198 p = kmem_zalloc(PAGE_SIZE, KM_SLEEP);
199 i = kmem_alloc(sizeof (struct vmm_ptp_item), KM_SLEEP);
200 i->vpi_vaddr = p;
201
202 mutex_enter(&vmm_ptp_lock);
203 VERIFY(mod_hash_insert(vmm_ptp_hash,
204 (mod_hash_key_t)PHYS_TO_DMAP(vtophys(p)), (mod_hash_val_t)i) == 0);
205 mutex_exit(&vmm_ptp_lock);
206
207 return (p);
208 }
209
210 void
vmm_ptp_free(void * addr)211 vmm_ptp_free(void *addr)
212 {
213 mutex_enter(&vmm_ptp_lock);
214 VERIFY(mod_hash_destroy(vmm_ptp_hash,
215 (mod_hash_key_t)PHYS_TO_DMAP(vtophys(addr))) == 0);
216 mutex_exit(&vmm_ptp_lock);
217 }
218
219 /* Reach into i86pc/os/ddi_impl.c for these */
220 extern void *contig_alloc(size_t, ddi_dma_attr_t *, uintptr_t, int);
221 extern void contig_free(void *, size_t);
222
223 void *
vmm_contig_alloc(size_t size)224 vmm_contig_alloc(size_t size)
225 {
226 ddi_dma_attr_t attr = {
227 /* Using fastboot_dma_attr as a guide... */
228 .dma_attr_version = DMA_ATTR_V0,
229 .dma_attr_addr_lo = 0,
230 .dma_attr_addr_hi = ~0UL,
231 .dma_attr_count_max = 0x00000000FFFFFFFFULL,
232 .dma_attr_align = PAGE_SIZE,
233 .dma_attr_burstsizes = 1,
234 .dma_attr_minxfer = 1,
235 .dma_attr_maxxfer = 0x00000000FFFFFFFFULL,
236 .dma_attr_seg = 0x00000000FFFFFFFFULL, /* any */
237 .dma_attr_sgllen = 1,
238 .dma_attr_granular = PAGE_SIZE,
239 .dma_attr_flags = 0,
240 };
241 void *res;
242
243 res = contig_alloc(size, &attr, PAGE_SIZE, 1);
244 if (res != NULL) {
245 bzero(res, size);
246 }
247
248 return (res);
249 }
250
251 void
vmm_contig_free(void * addr,size_t size)252 vmm_contig_free(void *addr, size_t size)
253 {
254 contig_free(addr, size);
255 }
256
/*
 * Enter a critical section; implemented here by disabling kernel preemption.
 */
void
critical_enter(void)
{
	kpreempt_disable();
}
262
/*
 * Leave a critical section; implemented here by re-enabling kernel
 * preemption.
 */
void
critical_exit(void)
{
	kpreempt_enable();
}
268
269
270 static void
vmm_glue_callout_handler(void * arg)271 vmm_glue_callout_handler(void *arg)
272 {
273 struct callout *c = arg;
274
275 if (callout_active(c)) {
276 /*
277 * Record the handler fire time so that callout_pending() is
278 * able to detect if the callout becomes rescheduled during the
279 * course of the handler.
280 */
281 c->c_fired = gethrtime();
282 (c->c_func)(c->c_arg);
283 }
284 }
285
286 void
vmm_glue_callout_init(struct callout * c,int mpsafe)287 vmm_glue_callout_init(struct callout *c, int mpsafe)
288 {
289 cyc_handler_t hdlr;
290 cyc_time_t when;
291
292 hdlr.cyh_level = CY_LOW_LEVEL;
293 hdlr.cyh_func = vmm_glue_callout_handler;
294 hdlr.cyh_arg = c;
295 when.cyt_when = CY_INFINITY;
296 when.cyt_interval = CY_INFINITY;
297 bzero(c, sizeof (*c));
298
299 mutex_enter(&cpu_lock);
300 c->c_cyc_id = cyclic_add(&hdlr, &when);
301 mutex_exit(&cpu_lock);
302 }
303
304 void
callout_reset_hrtime(struct callout * c,hrtime_t target,void (* func)(void *),void * arg,int flags)305 callout_reset_hrtime(struct callout *c, hrtime_t target, void (*func)(void *),
306 void *arg, int flags)
307 {
308 ASSERT(c->c_cyc_id != CYCLIC_NONE);
309
310 if ((flags & C_ABSOLUTE) == 0) {
311 target += gethrtime();
312 }
313
314 c->c_func = func;
315 c->c_arg = arg;
316 c->c_target = target;
317 (void) cyclic_reprogram(c->c_cyc_id, target);
318 }
319
320 void
vmm_glue_callout_stop(struct callout * c)321 vmm_glue_callout_stop(struct callout *c)
322 {
323 ASSERT(c->c_cyc_id != CYCLIC_NONE);
324
325 c->c_target = 0;
326 (void) cyclic_reprogram(c->c_cyc_id, CY_INFINITY);
327 }
328
329 void
vmm_glue_callout_drain(struct callout * c)330 vmm_glue_callout_drain(struct callout *c)
331 {
332 ASSERT(c->c_cyc_id != CYCLIC_NONE);
333
334 c->c_target = 0;
335 mutex_enter(&cpu_lock);
336 cyclic_remove(c->c_cyc_id);
337 c->c_cyc_id = CYCLIC_NONE;
338 mutex_exit(&cpu_lock);
339 }
340
341 void
vmm_glue_callout_localize(struct callout * c)342 vmm_glue_callout_localize(struct callout *c)
343 {
344 mutex_enter(&cpu_lock);
345 cyclic_move_here(c->c_cyc_id);
346 mutex_exit(&cpu_lock);
347 }
348
349 /*
350 * Given an interval (in ns) and a frequency (in hz), calculate the number of
351 * "ticks" at that frequency which cover the interval.
352 */
353 uint64_t
hrt_freq_count(hrtime_t interval,uint32_t freq)354 hrt_freq_count(hrtime_t interval, uint32_t freq)
355 {
356 ASSERT3S(interval, >=, 0);
357 const uint64_t sec = interval / NANOSEC;
358 const uint64_t nsec = interval % NANOSEC;
359
360 return ((sec * freq) + ((nsec * freq) / NANOSEC));
361 }
362
363 /*
364 * Given a frequency (in hz) and number of "ticks", calculate the interval
365 * (in ns) which would be covered by those ticks.
366 */
367 hrtime_t
hrt_freq_interval(uint32_t freq,uint64_t count)368 hrt_freq_interval(uint32_t freq, uint64_t count)
369 {
370 const uint64_t sec = count / freq;
371 const uint64_t frac = count % freq;
372
373 return ((NANOSEC * sec) + ((frac * NANOSEC) / freq));
374 }
375
376
377 uint_t cpu_high; /* Highest arg to CPUID */
378 uint_t cpu_exthigh; /* Highest arg to extended CPUID */
379 uint_t cpu_id; /* Stepping ID */
380 char cpu_vendor[20]; /* CPU Origin code */
381
382 static void
vmm_cpuid_init(void)383 vmm_cpuid_init(void)
384 {
385 uint_t regs[4];
386
387 do_cpuid(0, regs);
388 cpu_high = regs[0];
389 ((uint_t *)&cpu_vendor)[0] = regs[1];
390 ((uint_t *)&cpu_vendor)[1] = regs[3];
391 ((uint_t *)&cpu_vendor)[2] = regs[2];
392 cpu_vendor[12] = '\0';
393
394 do_cpuid(1, regs);
395 cpu_id = regs[0];
396
397 do_cpuid(0x80000000, regs);
398 cpu_exthigh = regs[0];
399 }
400
/*
 * Bring up the glue layer: the PTP tracking hash, the cached CPUID data and
 * the TSC state, in that order.
 */
void
vmm_sol_glue_init(void)
{
	vmm_ptp_init();
	vmm_cpuid_init();
	vmm_tsc_init();
}
408
/*
 * Tear down the glue layer.  Only the PTP tracking hash has anything to
 * release.
 */
void
vmm_sol_glue_cleanup(void)
{
	vmm_ptp_cleanup();
}
414
415
416 /* From FreeBSD's sys/kern/subr_clock.c */
417
418 /*-
419 * Copyright (c) 1988 University of Utah.
420 * Copyright (c) 1982, 1990, 1993
421 * The Regents of the University of California. All rights reserved.
422 *
423 * This code is derived from software contributed to Berkeley by
424 * the Systems Programming Group of the University of Utah Computer
425 * Science Department.
426 *
427 * Redistribution and use in source and binary forms, with or without
428 * modification, are permitted provided that the following conditions
429 * are met:
430 * 1. Redistributions of source code must retain the above copyright
431 * notice, this list of conditions and the following disclaimer.
432 * 2. Redistributions in binary form must reproduce the above copyright
433 * notice, this list of conditions and the following disclaimer in the
434 * documentation and/or other materials provided with the distribution.
435 * 4. Neither the name of the University nor the names of its contributors
436 * may be used to endorse or promote products derived from this software
437 * without specific prior written permission.
438 *
439 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
440 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
441 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
442 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
443 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
444 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
445 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
446 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
447 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
448 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
449 * SUCH DAMAGE.
450 *
451 * from: Utah $Hdr: clock.c 1.18 91/01/21$
452 * from: @(#)clock.c 8.2 (Berkeley) 1/12/94
453 * from: NetBSD: clock_subr.c,v 1.6 2001/07/07 17:04:02 thorpej Exp
454 * and
455 * from: src/sys/i386/isa/clock.c,v 1.176 2001/09/04
456 */
457
458 #include <sys/clock.h>
459
460 /*
461 * Generic routines to convert between a POSIX date
462 * (seconds since 1/1/1970) and yr/mo/day/hr/min/sec
463 * Derived from NetBSD arch/hp300/hp300/clock.c
464 */
465
#define	FEBRUARY	2
/* Number of days in year 'y' */
#define	days_in_year(y)		(leapyear(y) ? 366 : 365)
/*
 * Number of days in month 'm' (1-12) of year 'y'.  The month argument is
 * parenthesized everywhere it is used so that an expression argument is
 * compared against FEBRUARY as a whole.  Note that 'm' is evaluated twice, so
 * it must be free of side effects.
 */
#define	days_in_month(y, m) \
	(month_days[(m) - 1] + ((m) == FEBRUARY ? leapyear(y) : 0))
/* Day of week. Days are counted from 1/1/1970, which was a Thursday */
#define	day_of_week(days)	(((days) + 4) % 7)

/* Days per month in a non-leap year, indexed by month - 1 */
static const int month_days[12] = {
	31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31
};
476
477
/*
 * Gregorian leap year test, returning 1 for a leap year and 0 otherwise.
 * It gives the same answer as the usual expression
 *	((year % 4) == 0 && (year % 100) != 0) || ((year % 400) == 0)
 * but bails out as early as possible, so most years never reach the modulo
 * operations.
 */
static int
leapyear(int year)
{
	/* Not a multiple of four: never a leap year. */
	if ((year & 3) != 0)
		return (0);

	/* A multiple of four which is not a century year: always one. */
	if ((year % 100) != 0)
		return (1);

	/* Century years qualify only when divisible by 400. */
	return (((year % 400) == 0) ? 1 : 0);
}
501
502 int
clock_ct_to_ts(struct clocktime * ct,struct timespec * ts)503 clock_ct_to_ts(struct clocktime *ct, struct timespec *ts)
504 {
505 int i, year, days;
506
507 year = ct->year;
508
509 #ifdef __FreeBSD__
510 if (ct_debug) {
511 printf("ct_to_ts(");
512 print_ct(ct);
513 printf(")");
514 }
515 #endif
516
517 /* Sanity checks. */
518 if (ct->mon < 1 || ct->mon > 12 || ct->day < 1 ||
519 ct->day > days_in_month(year, ct->mon) ||
520 ct->hour > 23 || ct->min > 59 || ct->sec > 59 ||
521 (sizeof (time_t) == 4 && year > 2037)) { /* time_t overflow */
522 #ifdef __FreeBSD__
523 if (ct_debug)
524 printf(" = EINVAL\n");
525 #endif
526 return (EINVAL);
527 }
528
529 /*
530 * Compute days since start of time
531 * First from years, then from months.
532 */
533 days = 0;
534 for (i = POSIX_BASE_YEAR; i < year; i++)
535 days += days_in_year(i);
536
537 /* Months */
538 for (i = 1; i < ct->mon; i++)
539 days += days_in_month(year, i);
540 days += (ct->day - 1);
541
542 ts->tv_sec = (((time_t)days * 24 + ct->hour) * 60 + ct->min) * 60 +
543 ct->sec;
544 ts->tv_nsec = ct->nsec;
545
546 #ifdef __FreeBSD__
547 if (ct_debug)
548 printf(" = %ld.%09ld\n", (long)ts->tv_sec, (long)ts->tv_nsec);
549 #endif
550 return (0);
551 }
552
553 void
clock_ts_to_ct(struct timespec * ts,struct clocktime * ct)554 clock_ts_to_ct(struct timespec *ts, struct clocktime *ct)
555 {
556 int i, year, days;
557 time_t rsec; /* remainder seconds */
558 time_t secs;
559
560 secs = ts->tv_sec;
561 days = secs / SECDAY;
562 rsec = secs % SECDAY;
563
564 ct->dow = day_of_week(days);
565
566 /* Subtract out whole years, counting them in i. */
567 for (year = POSIX_BASE_YEAR; days >= days_in_year(year); year++)
568 days -= days_in_year(year);
569 ct->year = year;
570
571 /* Subtract out whole months, counting them in i. */
572 for (i = 1; days >= days_in_month(year, i); i++)
573 days -= days_in_month(year, i);
574 ct->mon = i;
575
576 /* Days are what is left over (+1) from all that. */
577 ct->day = days + 1;
578
579 /* Hours, minutes, seconds are easy */
580 ct->hour = rsec / 3600;
581 rsec = rsec % 3600;
582 ct->min = rsec / 60;
583 rsec = rsec % 60;
584 ct->sec = rsec;
585 ct->nsec = ts->tv_nsec;
586 #ifdef __FreeBSD__
587 if (ct_debug) {
588 printf("ts_to_ct(%ld.%09ld) = ",
589 (long)ts->tv_sec, (long)ts->tv_nsec);
590 print_ct(ct);
591 printf("\n");
592 }
593 #endif
594 }
595
596 /* Do the host CPU TSCs require offsets be applied for proper sync? */
597 static bool vmm_host_tsc_offset;
598
599 static void
vmm_tsc_init(void)600 vmm_tsc_init(void)
601 {
602 /*
603 * The timestamp logic will decide if a delta need be applied to the
604 * unscaled hrtime reading (effectively rdtsc), but we do require it be
605 * backed by the TSC itself.
606 */
607 extern hrtime_t (*gethrtimeunscaledf)(void);
608 extern hrtime_t tsc_gethrtimeunscaled(void);
609 extern hrtime_t tsc_gethrtimeunscaled_delta(void);
610
611 VERIFY(*gethrtimeunscaledf == tsc_gethrtimeunscaled ||
612 *gethrtimeunscaledf == tsc_gethrtimeunscaled_delta);
613
614 /*
615 * If a delta is being applied to the TSC on a per-host-CPU basis,
616 * expose that delta via vmm_host_tsc_delta().
617 */
618 vmm_host_tsc_offset =
619 (*gethrtimeunscaledf == tsc_gethrtimeunscaled_delta);
620
621 }
622
623 /* Equivalent to the FreeBSD rdtsc(), but with any necessary per-cpu offset */
624 uint64_t
rdtsc_offset(void)625 rdtsc_offset(void)
626 {
627 return ((uint64_t)gethrtimeunscaledf());
628 }
629
630 /*
631 * The delta (if any) which needs to be applied to the TSC of this host CPU to
632 * bring it in sync with the others.
633 */
634 uint64_t
vmm_host_tsc_delta(void)635 vmm_host_tsc_delta(void)
636 {
637 if (vmm_host_tsc_offset) {
638 extern hrtime_t tsc_gethrtime_tick_delta(void);
639 return (tsc_gethrtime_tick_delta());
640 } else {
641 return (0);
642 }
643 }
644