/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright 2019 Joyent, Inc.
 * Copyright 2020 Oxide Computer Company
 */

#include <sys/asm_linkage.h>
#include <sys/segments.h>
#include <sys/time_impl.h>
#include <sys/tsc.h>
#include <cp_offsets.h>

#define	GETCPU_GDT_OFFSET	SEL_GDT(GDT_CPUID, SEL_UPL)

	.file	"cp_subr.s"

/*
 * These are cloned from TSC and time-related code in the kernel.  They should
 * be kept in sync if the source values change.
 * See: uts/i86pc/os/timestamp.c
 */
#define	NSEC_SHIFT	5
#define	ADJ_SHIFT	4
#define	NANOSEC		0x3b9aca00

/*
 * For __cp_tsc_read calls which incur looping retries due to CPU migration,
 * this represents the maximum number of tries before bailing out.
 */
#define	TSC_READ_MAXLOOP	0x4

/*
 * hrtime_t
 * __cp_tsc_read(comm_page_t *cp)
 *
 * Stack usage: 0 bytes
 */
	ENTRY_NP(__cp_tsc_read)
	movl	CP_TSC_TYPE(%rdi), %esi
	movl	CP_TSC_NCPU(%rdi), %r8d

	cmpl	$TSC_TSCP, %esi
	jne	2f
	rdtscp
	/*
	 * When the TSC is read, the low 32 bits are placed in %eax while the
	 * high 32 bits are placed in %edx.  They are shifted and ORed together
	 * to obtain the full 64-bit value.
	 */
	shlq	$0x20, %rdx
	orq	%rdx, %rax

	/*
	 * A zeroed cp_tsc_ncpu (currently held in r8d) indicates that no
	 * per-CPU TSC offsets are required.
	 */
	testl	%r8d, %r8d
	jnz	1f
	ret

1:
	/*
	 * A non-zero cp_tsc_ncpu indicates the array length of
	 * cp_tsc_sync_tick_delta containing per-CPU offsets which are applied
	 * to TSC readings.  The CPU ID furnished by the IA32_TSC_AUX register
	 * via rdtscp (placed in rcx) is used to look up an offset value in
	 * that array and apply it to the TSC value.
	 */
	leaq	CP_TSC_SYNC_TICK_DELTA(%rdi), %r9
	movq	(%r9, %rcx, 8), %rdx
	addq	%rdx, %rax
	ret

2:
	/*
	 * TSC reading without RDTSCP
	 *
	 * Check if handling for per-CPU TSC offsets is required.  If not,
	 * immediately skip to the appropriate steps to perform an rdtsc.
	 *
	 * If per-CPU offsets are present, the TSC reading process is more
	 * complicated.  Without rdtscp, there is no way to simultaneously read
	 * the TSC and query the current CPU.  In order to "catch" migrations
	 * during execution, the CPU ID is queried before and after rdtsc.  The
	 * execution is repeated if results differ, subject to a loop limit.
	 */
	xorq	%r9, %r9
	testl	%r8d, %r8d
	jz	3f

	/*
	 * Load the address of the per-CPU offset array, since it is needed.
	 * The attempted loop count is kept in r8.
	 */
	leaq	CP_TSC_SYNC_TICK_DELTA(%rdi), %r9
	xorl	%r8d, %r8d

	/* Query the CPU ID and stash it in r10 for later comparison */
	movl	$GETCPU_GDT_OFFSET, %edx
	lsl	%edx, %edx
	movl	%edx, %r10d

3:
	/*
	 * TSC_RDTSC_MFENCE was used in the past for AMD chips, but has been
	 * supplanted by TSC_RDTSC_LFENCE, which works on Intel and AMD (when
	 * lfence can be confirmed as serializing).
	 */

4:
	cmpl	$TSC_RDTSC_LFENCE, %esi
	jne	5f
	lfence
	rdtsc
	jmp	7f

5:
	cmpl	$TSC_RDTSC_CPUID, %esi
	jne	6f
	/*
	 * Since the amd64 ABI dictates that %rbx is callee-saved, it must be
	 * preserved here.  Its contents will be overwritten when cpuid is used
	 * as a serializing instruction.
	 */
	movq	%rbx, %r11
	xorl	%eax, %eax
	cpuid
	rdtsc
	movq	%r11, %rbx
	jmp	7f

6:
	/*
	 * Other protections should have prevented this function from being
	 * called in the first place.  Since callers must handle a failure from
	 * CPU migration looping, yield the same result as a bail-out: 0
	 */
	xorl	%eax, %eax
	ret

7:
	shlq	$0x20, %rdx
	orq	%rdx, %rax

	/*
	 * With the TSC reading in hand, check if any per-CPU offset handling
	 * is required.  The address of the array of deltas (r9) will not have
	 * been populated if offset handling is unnecessary.
	 */
	testq	%r9, %r9
	jnz	8f
	ret

8:
	movl	$GETCPU_GDT_OFFSET, %edx
	lsl	%edx, %edx
	cmpl	%edx, %r10d
	jne	9f
	movq	(%r9, %rdx, 8), %rdx
	addq	%rdx, %rax
	ret

9:
	/*
	 * It appears that a migration has occurred between the first CPU ID
	 * query and now.  Check if the loop limit has been broken and retry if
	 * that's not the case.
	 */
	cmpl	$TSC_READ_MAXLOOP, %r8d
	jge	10f
	incl	%r8d
	movl	%edx, %r10d
	jmp	3b

10:
	/* Loop limit was reached. Return bail-out value of 0. */
	xorl	%eax, %eax
	ret

	SET_SIZE(__cp_tsc_read)

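/*
 * For reference, the flow of __cp_tsc_read expressed as a rough C sketch.
 * This is illustrative only: the lower-case field names are assumptions
 * mirroring the CP_* offsets used above, read_tsc_serialized() stands in for
 * the lfence/cpuid + rdtsc variants, and getcpu_gdt() for the lsl-based CPU
 * query.
 *
 *	uint64_t
 *	cp_tsc_read_sketch(comm_page_t *cp)
 *	{
 *		uint32_t cpu, verify;
 *		uint64_t tsc;
 *
 *		if (cp->cp_tsc_type == TSC_TSCP) {
 *			tsc = rdtscp(&cpu);	// cpu id from IA32_TSC_AUX
 *			if (cp->cp_tsc_ncpu == 0)
 *				return (tsc);
 *			return (tsc + cp->cp_tsc_sync_tick_delta[cpu]);
 *		}
 *		if (cp->cp_tsc_ncpu == 0)
 *			return (read_tsc_serialized(cp->cp_tsc_type));
 *
 *		for (uint_t i = 0; i <= TSC_READ_MAXLOOP; i++) {
 *			cpu = getcpu_gdt();
 *			tsc = read_tsc_serialized(cp->cp_tsc_type);
 *			verify = getcpu_gdt();
 *			if (cpu == verify)
 *				return (tsc + cp->cp_tsc_sync_tick_delta[cpu]);
 *		}
 *		return (0);		// bail out; callers must handle this
 *	}
 */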

/*
 * uint_t
 * __cp_getcpu(comm_page_t *)
 *
 * Stack usage: 0 bytes
 */
	ENTRY_NP(__cp_getcpu)
	movl	CP_TSC_TYPE(%rdi), %edi
	/*
	 * If RDTSCP is available, it is a quick way to grab the cpu_id which
	 * is stored in the TSC_AUX MSR by the kernel.
	 */
	cmpl	$TSC_TSCP, %edi
	jne	1f
	rdtscp
	movl	%ecx, %eax
	ret
1:
	mov	$GETCPU_GDT_OFFSET, %eax
	lsl	%eax, %eax
	ret
	SET_SIZE(__cp_getcpu)

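/*
 * A rough C equivalent of the two paths above, for reference only.  The
 * inline assembly is GNU-style and the field name is an assumption based on
 * the CP_TSC_TYPE offset.  In the non-RDTSCP case the kernel encodes the CPU
 * id in the segment limit of the GDT_CPUID descriptor, which lsl can fetch
 * from user mode.
 *
 *	uint_t
 *	cp_getcpu_sketch(comm_page_t *cp)
 *	{
 *		uint32_t lo, hi, cpu;
 *
 *		if (cp->cp_tsc_type == TSC_TSCP) {
 *			// rdtscp leaves the IA32_TSC_AUX value in %ecx
 *			__asm__ __volatile__("rdtscp"
 *			    : "=a" (lo), "=d" (hi), "=c" (cpu));
 *			return (cpu);
 *		}
 *		__asm__ __volatile__("lsl %1, %0"
 *		    : "=r" (cpu) : "r" (GETCPU_GDT_OFFSET));
 *		return (cpu);
 *	}
 */
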
/*
 * hrtime_t
 * __cp_gethrtime(comm_page_t *cp)
 *
 * Stack usage: 0x20 local + 0x8 call = 0x28 bytes
 *
 * %rsp+0x00 - hrtime_t tsc_last
 * %rsp+0x08 - hrtime_t hrtime_base
 * %rsp+0x10 - comm_page_t *cp
 * %rsp+0x18 - int hres_lock
 */
	ENTRY_NP(__cp_gethrtime)
	subq	$0x20, %rsp
	movq	%rdi, 0x10(%rsp)
1:
	movl	CP_HRES_LOCK(%rdi), %r9d
	movl	%r9d, 0x18(%rsp)

	movq	CP_TSC_LAST(%rdi), %rax
	movq	CP_TSC_HRTIME_BASE(%rdi), %rdx
	movq	%rax, (%rsp)
	movq	%rdx, 0x8(%rsp)

	call	__cp_tsc_read

	/*
	 * Failure is inferred from a TSC reading of 0.  The normal fasttrap
	 * mechanism can be used as a fallback in such cases.
	 */
	testq	%rax, %rax
	jz	6f

	movq	0x10(%rsp), %rdi
	movl	0x18(%rsp), %r9d
	movl	CP_HRES_LOCK(%rdi), %edx
	andl	$0xfffffffe, %r9d
	cmpl	%r9d, %edx
	jne	1b

	/*
	 * The in-kernel logic for calculating hrtime performs several checks
	 * to protect against edge cases.  That logic is summarized as:
	 * if (tsc >= tsc_last) {
	 *         delta -= tsc_last;
	 * } else if (tsc >= tsc_last - 2*tsc_max_delta) {
	 *         delta = 0;
	 * } else {
	 *         delta = MIN(tsc, tsc_resume_cap);
	 * }
	 *
	 * The below implementation achieves the same result, although it is
	 * structured for speed and optimized for the fast path:
	 *
	 * delta = tsc - tsc_last;
	 * if (delta < 0) {
	 *         delta += (tsc_max_delta << 1);
	 *         if (delta >= 0) {
	 *                 delta = 0;
	 *         } else {
	 *                 delta = MIN(tsc, tsc_resume_cap);
	 *         }
	 * }
	 */
	movq	(%rsp), %rdx
	subq	%rdx, %rax		/* delta = tsc - tsc_last */
	jbe	3f			/* if (delta < 0) */

2:
	/*
	 * Optimized TSC_CONVERT_AND_ADD:
	 * hrtime_base += (tsc_delta * nsec_scale) >> (32 - NSEC_SHIFT)
	 *
	 * Since the multiply and shift are done in 128-bit, there is no need
	 * to worry about overflow.
	 */
	movl	CP_NSEC_SCALE(%rdi), %ecx
	mulq	%rcx
	shrdq	$_CONST(32 - NSEC_SHIFT), %rdx, %rax
	movq	0x8(%rsp), %r8
	addq	%r8, %rax

	addq	$0x20, %rsp
	ret

3:
	movq	%rax, %r9		/* save (tsc - tsc_last) in r9 */
	movl	CP_TSC_MAX_DELTA(%rdi), %ecx
	sall	$1, %ecx
	addq	%rcx, %rax		/* delta += (tsc_max_delta << 1) */
	jae	4f			/* delta < 0 */
	xorq	%rax, %rax
	jmp	2b

4:
	/*
	 * Repopulate %rax with the TSC reading by adding tsc_last to %r9
	 * (which holds tsc - tsc_last)
	 */
	movq	(%rsp), %rax
	addq	%r9, %rax

	/* delta = MIN(tsc, resume_cap) */
	movq	CP_TSC_RESUME_CAP(%rdi), %rcx
	cmpq	%rcx, %rax
	jbe	5f
	movq	%rcx, %rax
5:
	jmp	2b

6:
	movl	$T_GETHRTIME, %eax
	int	$T_FASTTRAP
	addq	$0x20, %rsp
	ret

	SET_SIZE(__cp_gethrtime)

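/*
 * Taken together, __cp_gethrtime is roughly the following C.  This is a
 * sketch only: field names are assumptions based on the CP_* offsets, MIN()
 * is used as in the comments above, and gethrtime_fasttrap() is a
 * hypothetical stand-in for the T_GETHRTIME fasttrap fallback.
 *
 *	hrtime_t
 *	cp_gethrtime_sketch(comm_page_t *cp)
 *	{
 *		uint64_t tsc, tsc_last, base, delta;
 *		uint32_t lock;
 *
 *		do {
 *			lock = cp->cp_hres_lock;
 *			tsc_last = cp->cp_tsc_last;
 *			base = cp->cp_tsc_hrtime_base;
 *			tsc = __cp_tsc_read(cp);
 *			if (tsc == 0)
 *				return (gethrtime_fasttrap());
 *		} while (cp->cp_hres_lock != (lock & ~1U));
 *
 *		if (tsc >= tsc_last)
 *			delta = tsc - tsc_last;
 *		else if (tsc >= tsc_last - 2 * cp->cp_tsc_max_delta)
 *			delta = 0;
 *		else
 *			delta = MIN(tsc, cp->cp_tsc_resume_cap);
 *
 *		// TSC_CONVERT_AND_ADD: the 128-bit product avoids overflow
 *		return (base + (hrtime_t)(((unsigned __int128)delta *
 *		    cp->cp_nsec_scale) >> (32 - NSEC_SHIFT)));
 *	}
 */
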
/*
 * int
 * __cp_clock_gettime_monotonic(comm_page_t *cp, timespec_t *tsp)
 *
 * Stack usage: 0x8 local + 0x8 call + 0x28 called func. = 0x38 bytes
 *
 * %rsp+0x00 - timespec_t *tsp
 */
	ENTRY_NP(__cp_clock_gettime_monotonic)
	subq	$0x8, %rsp
	movq	%rsi, (%rsp)

	call	__cp_gethrtime

	/*
	 * Convert from hrtime_t (int64_t in nanoseconds) to timespec_t.
	 * This uses the same approach as hrt2ts, although it has been updated
	 * to utilize 64-bit math.
	 * 1 / 1,000,000,000 =
	 * 1000100101110000010111110100000100110110101101001010110110011B-26
	 * = 0x112e0be826d694b3 * 2^-26
	 *
	 * secs = (nsecs * 0x112e0be826d694b3) >> 26
	 *
	 * To account for the two's complement encoding of negative inputs, a
	 * final operation completes the process:
	 *
	 * secs -= (nsecs >> 63)
	 */
	movq	%rax, %r11
	movq	$0x112e0be826d694b3, %rdx
	imulq	%rdx
	sarq	$0x1a, %rdx
	movq	%r11, %rax
	sarq	$0x3f, %rax
	subq	%rax, %rdx
	movq	(%rsp), %rsi
	movq	%rdx, (%rsi)
	/*
	 * Populating tv_nsec is easier:
	 * tv_nsec = nsecs - (secs * NANOSEC)
	 */
	imulq	$NANOSEC, %rdx, %rdx
	subq	%rdx, %r11
	movq	%r11, 0x8(%rsi)

	xorl	%eax, %eax
	addq	$0x8, %rsp
	ret
	SET_SIZE(__cp_clock_gettime_monotonic)

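/*
 * The hrt2ts-style conversion above in C form, for reference (a sketch
 * only).  The constant 0x112e0be826d694b3 is the fixed-point reciprocal of
 * NANOSEC: the imulq keeps the high 64 bits of the 128-bit signed product,
 * and the sarq $0x1a applies the remaining >> 26.
 *
 *	void
 *	hrt2ts_sketch(hrtime_t hrt, timespec_t *tsp)
 *	{
 *		int64_t secs;
 *
 *		secs = (int64_t)(((__int128)hrt *
 *		    (int64_t)0x112e0be826d694b3) >> 64) >> 26;
 *		secs -= (hrt >> 63);	// correct for negative inputs
 *		tsp->tv_sec = secs;
 *		tsp->tv_nsec = hrt - (secs * NANOSEC);
 *	}
 */
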
/*
 * int
 * __cp_clock_gettime_realtime(comm_page_t *cp, timespec_t *tsp)
 *
 * Stack usage: 0x18 local + 0x8 call + 0x28 called func. = 0x48 bytes
 *
 * %rsp+0x00 - comm_page_t *cp
 * %rsp+0x08 - timespec_t *tsp
 * %rsp+0x10 - int hres_lock
 */
	ENTRY_NP(__cp_clock_gettime_realtime)
	subq	$0x18, %rsp
	movq	%rdi, (%rsp)
	movq	%rsi, 0x8(%rsp)

1:
	movl	CP_HRES_LOCK(%rdi), %eax
	movl	%eax, 0x10(%rsp)

	call	__cp_gethrtime
	movq	(%rsp), %rdi
	movq	CP_HRES_LAST_TICK(%rdi), %rdx
	subq	%rdx, %rax			/* nslt = hrtime - last_tick */
	jb	1b
	movq	CP_HRESTIME(%rdi), %r9
	movq	_CONST(CP_HRESTIME + CP_HRESTIME_INCR)(%rdi), %r10
	movl	CP_HRESTIME_ADJ(%rdi), %r11d

	addq	%rax, %r10			/* now.tv_nsec += nslt */

	cmpl	$0, %r11d
	jb	4f				/* hres_adj > 0 */
	ja	6f				/* hres_adj < 0 */

2:
	cmpq	$NANOSEC, %r10
	jae	8f				/* tv_nsec >= NANOSEC */

3:
	movl	0x10(%rsp), %eax
	movl	CP_HRES_LOCK(%rdi), %edx
	andl	$0xfffffffe, %edx
	cmpl	%eax, %edx
	jne	1b

	movq	0x8(%rsp), %rsi
	movq	%r9, (%rsi)
	movq	%r10, 0x8(%rsi)

	xorl	%eax, %eax
	addq	$0x18, %rsp
	ret


4:						/* hres_adj > 0 */
	sarq	$ADJ_SHIFT, %rax
	cmpl	%r11d, %eax
	jbe	5f
	movl	%r11d, %eax
5:
	addq	%rax, %r10
	jmp	2b

6:						/* hres_adj < 0 */
	sarq	$ADJ_SHIFT, %rax
	negl	%r11d
	cmpl	%r11d, %eax
	jbe	7f
	movl	%r11d, %eax
7:
	subq	%rax, %r10
	jmp	2b

8:						/* tv_nsec >= NANOSEC */
	subq	$NANOSEC, %r10
	incq	%r9
	cmpq	$NANOSEC, %r10
	jae	8b
	jmp	3b

	SET_SIZE(__cp_clock_gettime_realtime)
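
/*
 * For reference, the loop above expressed as a rough C sketch.  Field names
 * are assumptions based on the CP_* offsets, cp_hrestime is treated as the
 * { tv_sec, tv_nsec } pair loaded above, and MIN() is used as in the earlier
 * comments.
 *
 *	int
 *	cp_clock_gettime_realtime_sketch(comm_page_t *cp, timespec_t *tsp)
 *	{
 *		int64_t nslt, adj;
 *		uint32_t lock;
 *		timespec_t now;
 *
 *		for (;;) {
 *			lock = cp->cp_hres_lock;
 *			nslt = __cp_gethrtime(cp) - cp->cp_hres_last_tick;
 *			if (nslt < 0)
 *				continue;	// tick in progress; retry
 *			now.tv_sec = cp->cp_hrestime[0];
 *			now.tv_nsec = cp->cp_hrestime[1] + nslt;
 *
 *			// Apply hrestime_adj, capped at nslt >> ADJ_SHIFT
 *			adj = nslt >> ADJ_SHIFT;
 *			if (cp->cp_hrestime_adj > 0)
 *				now.tv_nsec += MIN(adj, cp->cp_hrestime_adj);
 *			else if (cp->cp_hrestime_adj < 0)
 *				now.tv_nsec -= MIN(adj, -cp->cp_hrestime_adj);
 *
 *			while (now.tv_nsec >= NANOSEC) {
 *				now.tv_nsec -= NANOSEC;
 *				now.tv_sec++;
 *			}
 *			if ((cp->cp_hres_lock & ~1U) == lock)
 *				break;		// consistent snapshot
 *		}
 *		*tsp = now;
 *		return (0);
 *	}
 */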