xref: /illumos-gate/usr/src/lib/commpage/amd64/cp_subr.S (revision 4d8d108f42a089b7b4441353f2ad7a75e1c7b31d)
1/*
2 * This file and its contents are supplied under the terms of the
3 * Common Development and Distribution License ("CDDL"), version 1.0.
4 * You may only use this file in accordance with the terms of version
5 * 1.0 of the CDDL.
6 *
7 * A full copy of the text of the CDDL should have accompanied this
8 * source.  A copy of the CDDL is also available via the Internet at
9 * http://www.illumos.org/license/CDDL.
10 */
11
12/*
13 * Copyright 2019 Joyent, Inc.
14 * Copyright 2020 Oxide Computer Company
15 */
16
17#include <sys/asm_linkage.h>
18#include <sys/segments.h>
19#include <sys/time_impl.h>
20#include <sys/tsc.h>
21#include <cp_offsets.h>
22
/*
 * Selector value loaded ahead of the lsl instruction whenever the current
 * CPU ID is queried without rdtscp.
 */
#define	GETCPU_GDT_OFFSET	SEL_GDT(GDT_CPUID, SEL_UPL)

	.file	"cp_subr.s"

/*
 * Local copies of constants used by the kernel's TSC and time code.  They
 * are not shared through a header, so any change to the kernel values must
 * be mirrored here by hand.
 * See: uts/i86pc/os/timestamp.c
 */
#define	NSEC_SHIFT	5
#define	ADJ_SHIFT	4
#define	NANOSEC		0x3b9aca00	/* 1,000,000,000 */

/*
 * Retry budget for __cp_tsc_read.  When a CPU migration is detected around
 * the TSC read, the read is repeated; once this many retries have been
 * spent the function gives up and reports failure.
 */
#define	TSC_READ_MAXLOOP	4
41
/*
 * hrtime_t
 * __cp_tsc_read(comm_page_t *cp)
 *
 * Read the TSC using the method selected by cp_tsc_type and, when
 * cp_tsc_ncpu is non-zero, add the calling CPU's entry from
 * cp_tsc_sync_tick_delta to the reading.
 *
 * Returns the (possibly offset) TSC value, or 0 when cp_tsc_type is not a
 * method handled here or when the migration retry budget is exhausted.
 * Callers must treat 0 as failure.
 *
 * Register roles:
 *   %esi  - cp_tsc_type
 *   %r8d  - cp_tsc_ncpu on entry; reused as the retry counter on the
 *           non-rdtscp path
 *   %r9   - address of the per-CPU delta array, or 0 if no offsets apply
 *   %r10d - CPU ID sampled before the TSC read (non-rdtscp path)
 *   %r11  - holds the caller's %rbx across cpuid
 *
 * Stack usage: 0 bytes
 */
	ENTRY_NP(__cp_tsc_read)
	movl	CP_TSC_TYPE(%rdi), %esi
	movl	CP_TSC_NCPU(%rdi), %r8d

	cmpl	$TSC_TSCP, %esi
	jne	.Ltsc_no_rdtscp

	/*
	 * rdtscp returns the low half of the TSC in %eax, the high half in
	 * %edx and the contents of IA32_TSC_AUX in %ecx.  Merge the two
	 * halves into a single 64-bit value in %rax.
	 */
	rdtscp
	shlq	$32, %rdx
	orq	%rdx, %rax

	/* cp_tsc_ncpu == 0: the raw reading is the answer. */
	testl	%r8d, %r8d
	jnz	.Ltsc_rdtscp_delta
	ret

.Ltsc_rdtscp_delta:
	/*
	 * cp_tsc_ncpu != 0: it is the length of cp_tsc_sync_tick_delta.  The
	 * CPU ID that rdtscp delivered in %rcx (from IA32_TSC_AUX) indexes
	 * that array to find the offset for this reading.
	 */
	leaq	CP_TSC_SYNC_TICK_DELTA(%rdi), %r9
	movq	(%r9, %rcx, 8), %rdx
	addq	%rdx, %rax
	ret

.Ltsc_no_rdtscp:
	/*
	 * TSC reading without RDTSCP
	 *
	 * If no per-CPU offsets are in use, %r9 stays 0 and control goes
	 * straight to the rdtsc sequence.
	 *
	 * Otherwise things are more involved: without rdtscp the TSC and the
	 * CPU ID cannot be sampled together.  To catch a migration, the CPU
	 * ID is queried before and after rdtsc and the whole read is repeated
	 * (up to TSC_READ_MAXLOOP retries) if the two IDs differ.
	 */
	xorl	%r9d, %r9d
	testl	%r8d, %r8d
	jz	.Ltsc_sample

	/*
	 * Offsets are needed: point %r9 at the delta array and turn %r8d
	 * into the retry counter, starting from zero.
	 */
	leaq	CP_TSC_SYNC_TICK_DELTA(%rdi), %r9
	xorl	%r8d, %r8d

	/* First CPU ID sample, kept in %r10d for the later comparison. */
	movl	$GETCPU_GDT_OFFSET, %edx
	lsl	%edx, %edx
	movl	%edx, %r10d

.Ltsc_sample:
	/*
	 * TSC_RDTSC_MFENCE was used in the past for AMD chips, but has been
	 * supplanted by TSC_RDTSC_LFENCE, which works on Intel and AMD (when
	 * lfence can be confirmed as serializing).
	 */
	cmpl	$TSC_RDTSC_LFENCE, %esi
	jne	.Ltsc_try_cpuid
	lfence
	rdtsc
	jmp	.Ltsc_combine

.Ltsc_try_cpuid:
	cmpl	$TSC_RDTSC_CPUID, %esi
	jne	.Ltsc_bail
	/*
	 * cpuid serves as the serializing instruction here, but it
	 * overwrites %rbx, which the amd64 ABI makes callee-saved.  Park the
	 * caller's value in %r11 and put it back afterwards.
	 */
	movq	%rbx, %r11
	xorl	%eax, %eax
	cpuid
	rdtsc
	movq	%r11, %rbx
	jmp	.Ltsc_combine

.Ltsc_bail:
	/*
	 * Reached for an unhandled cp_tsc_type (which other protections
	 * should have prevented) and when the retry budget runs out.  Both
	 * report failure the same way: a return value of 0.
	 */
	xorl	%eax, %eax
	ret

.Ltsc_combine:
	shlq	$32, %rdx
	orq	%rdx, %rax

	/*
	 * %r9 is only non-zero when per-CPU offset handling was set up
	 * above; if it is 0 the offset step is unnecessary.
	 */
	testq	%r9, %r9
	jnz	.Ltsc_verify_cpu
	ret

.Ltsc_verify_cpu:
	/* Second CPU ID sample; it must match the first to trust the read. */
	movl	$GETCPU_GDT_OFFSET, %edx
	lsl	%edx, %edx
	cmpl	%edx, %r10d
	jne	.Ltsc_migrated
	movq	(%r9, %rdx, 8), %rdx
	addq	%rdx, %rax
	ret

.Ltsc_migrated:
	/*
	 * The CPU ID changed between the two samples, so a migration seems
	 * to have happened.  Retry unless the budget is spent; the ID just
	 * read becomes the "before" sample of the next attempt.
	 */
	cmpl	$TSC_READ_MAXLOOP, %r8d
	jge	.Ltsc_bail
	incl	%r8d
	movl	%edx, %r10d
	jmp	.Ltsc_sample

	SET_SIZE(__cp_tsc_read)
191
192
/*
 * uint_t
 * __cp_getcpu(comm_page_t *cp)
 *
 * Return the ID of the CPU the caller is running on.
 *
 * Stack usage: 0 bytes
 */
	ENTRY_NP(__cp_getcpu)
	/*
	 * When cp_tsc_type says RDTSCP is in use, take the cpu_id from %ecx:
	 * rdtscp loads it from the TSC_AUX MSR, where the kernel stores it.
	 */
	cmpl	$TSC_TSCP, CP_TSC_TYPE(%rdi)
	jne	.Lgetcpu_lsl
	rdtscp
	movl	%ecx, %eax
	ret

.Lgetcpu_lsl:
	/* Otherwise obtain the ID with lsl on the GETCPU_GDT_OFFSET selector. */
	movl	$GETCPU_GDT_OFFSET, %eax
	lsl	%eax, %eax
	ret
	SET_SIZE(__cp_getcpu)
215
/*
 * hrtime_t
 * __cp_gethrtime(comm_page_t *cp)
 *
 * Compute the current hrtime from the comm page:
 *   hrtime_base + ((delta * nsec_scale) >> (32 - NSEC_SHIFT))
 * where delta is derived from the TSC reading and tsc_last as described
 * below.  The inputs are re-sampled until cp_hres_lock shows that they
 * were not being updated while they were read.  If __cp_tsc_read reports
 * failure (0), the T_GETHRTIME fasttrap is used instead.
 *
 * Stack usage: 0x20 local + 0x8 call = 0x28 bytes
 *
 * %rsp+0x00 - hrtime_t tsc_last
 * %rsp+0x08 - hrtime_t hrtime_base
 * %rsp+0x10 - commpage_t *cp
 * %rsp+0x18 - int hres_lock
 */
	ENTRY_NP(__cp_gethrtime)
	subq	$0x20, %rsp
	movq	%rdi, 0x10(%rsp)

.Lhrt_retry:
	/* Snapshot the lock word first, then the values it guards. */
	movl	CP_HRES_LOCK(%rdi), %r9d
	movl	%r9d, 0x18(%rsp)

	movq	CP_TSC_LAST(%rdi), %rax
	movq	CP_TSC_HRTIME_BASE(%rdi), %rdx
	movq	%rax, (%rsp)
	movq	%rdx, 0x8(%rsp)

	call	__cp_tsc_read

	/*
	 * Failure is inferred from a TSC reading of 0.  The normal fasttrap
	 * mechanism can be used as a fallback in such cases.
	 */
	testq	%rax, %rax
	jz	.Lhrt_fasttrap

	/*
	 * Accept the snapshot only if the lock word read now equals the
	 * earlier value with its low bit cleared; otherwise start over.
	 * %rdi is reloaded because the call may have clobbered it.
	 */
	movq	0x10(%rsp), %rdi
	movl	0x18(%rsp), %r9d
	movl	CP_HRES_LOCK(%rdi), %edx
	andl	$0xfffffffe, %r9d
	cmpl	%r9d, %edx
	jne	.Lhrt_retry

	/*
	 * The in-kernel logic for calculating hrtime performs several checks
	 * to protect against edge cases.  That logic is summarized as:
	 * if (tsc >= tsc_last) {
	 *         delta = tsc - tsc_last;
	 * } else if (tsc >= tsc_last - 2*tsc_max_delta) {
	 *         delta = 0;
	 * } else {
	 *         delta = MIN(tsc, tsc_resume_cap);
	 * }
	 *
	 * The below implementation achieves the same result, although it is
	 * structured for speed and optimized for the fast path:
	 *
	 * delta = tsc - tsc_last;
	 * if (delta < 0) {
	 *         delta += (tsc_max_delta << 1);
	 *         if (delta >= 0) {
	 *                 delta = 0;
	 *         } else {
	 *                 delta = MIN(tsc, tsc_resume_cap);
	 *         }
	 * }
	 *
	 * Only a borrow from the subtraction (tsc < tsc_last) may leave the
	 * fast path.  tsc == tsc_last is an ordinary delta of zero and must
	 * fall through: the slow path would find no carry when adding
	 * 2 * tsc_max_delta to zero and wrongly pick the resume-cap case.
	 */
	movq	(%rsp), %rdx
	subq	%rdx, %rax		/* delta = tsc - tsc_last */
	jb	.Lhrt_backwards		/* if (tsc < tsc_last) */

.Lhrt_convert:
	/*
	 * Optimized TSC_CONVERT_AND_ADD:
	 * hrtime_base += (tsc_delta * nsec_scale) >> (32 - NSEC_SHIFT)
	 *
	 * Since the multiply and shift are done in 128-bit, there is no need
	 * to worry about overflow.
	 */
	movl	CP_NSEC_SCALE(%rdi), %ecx
	mulq	%rcx
	shrdq	$_CONST(32 - NSEC_SHIFT), %rdx, %rax
	movq	0x8(%rsp), %r8
	addq	%r8, %rax

	addq	$0x20, %rsp
	ret

.Lhrt_backwards:
	movq	%rax, %r9		/* save (tsc - tsc_last) in r9 */
	movl	CP_TSC_MAX_DELTA(%rdi), %ecx
	sall	$1, %ecx
	addq	%rcx, %rax		/* delta += (tsc_max_delta << 1) */
	jae	.Lhrt_resume		/* no carry: delta is still < 0 */
	xorl	%eax, %eax		/* within tolerance: delta = 0 */
	jmp	.Lhrt_convert

.Lhrt_resume:
	/*
	 * Repopulate %rax with the TSC reading by adding tsc_last to %r9
	 * (which holds tsc - tsc_last)
	 */
	movq	(%rsp), %rax
	addq	%r9, %rax

	/* delta = MIN(tsc, resume_cap) */
	movq	CP_TSC_RESUME_CAP(%rdi), %rcx
	cmpq	%rcx, %rax
	jbe	.Lhrt_convert
	movq	%rcx, %rax
	jmp	.Lhrt_convert

.Lhrt_fasttrap:
	/* TSC read failed: obtain hrtime through the fasttrap instead. */
	movl	$T_GETHRTIME, %eax
	int	$T_FASTTRAP
	addq	$0x20, %rsp
	ret

	SET_SIZE(__cp_gethrtime)
332
/*
 * int
 * __cp_clock_gettime_monotonic(comm_page_t *cp, timespec_t *tsp)
 *
 * Fetch hrtime via __cp_gethrtime and store it in *tsp as seconds and
 * nanoseconds.  Always returns 0.
 *
 * Stack usage: 0x8 local + 0x8 call + 0x28 called func. = 0x38 bytes
 *
 * %rsp+0x00 - timespec_t *tsp
 */
	ENTRY_NP(__cp_clock_gettime_monotonic)
	/* Keep tsp on the stack across the call; %rsi is not preserved. */
	pushq	%rsi

	call	__cp_gethrtime

	popq	%rsi

	/*
	 * Convert from hrtime_t (int64_t in nanoseconds) to timespec_t.
	 * This uses the same approach as hrt2ts, although it has been updated
	 * to utilize 64-bit math.
	 * 1 / 1,000,000,000 =
	 * 1000100101110000010111110100000100110110101101001010110110011B-26
	 * = 0x112e0be826d694b3 * 2^-26
	 *
	 * secs = (nsecs * 0x112e0be826d694b3) >> 26
	 *
	 * The shift is applied to the high 64 bits of the 128-bit product
	 * (%rdx after imulq).  To account for the two's complement
	 * representation of negative inputs, a final operation completes the
	 * process:
	 *
	 * secs -= (nsecs >> 63)
	 */
	movq	%rax, %r11		/* r11 = nsecs, needed again below */
	movabsq	$0x112e0be826d694b3, %rdx
	imulq	%rdx			/* rdx:rax = nsecs * reciprocal */
	sarq	$26, %rdx
	movq	%r11, %rax
	sarq	$63, %rax		/* rax = -1 if nsecs < 0, else 0 */
	subq	%rax, %rdx
	movq	%rdx, (%rsi)		/* tsp->tv_sec = secs */

	/*
	 * Populating tv_nsec is easier:
	 * tv_nsec = nsecs - (secs * NANOSEC)
	 */
	imulq	$NANOSEC, %rdx, %rdx
	subq	%rdx, %r11
	movq	%r11, 0x8(%rsi)

	xorl	%eax, %eax
	ret
	SET_SIZE(__cp_clock_gettime_monotonic)
383
/*
 * int
 * __cp_clock_gettime_realtime(comm_page_t *cp, timespec_t *tsp)
 *
 * Compute the current wall-clock time from the comm page and store it in
 * *tsp.  Starting from cp_hrestime, the nanoseconds elapsed since
 * cp_hres_last_tick (nslt) are added, followed by a slice of the pending
 * clock adjustment:
 *
 *   hres_adj > 0:  tv_nsec += MIN(nslt >> ADJ_SHIFT,  hres_adj)
 *   hres_adj < 0:  tv_nsec -= MIN(nslt >> ADJ_SHIFT, -hres_adj)
 *
 * tv_nsec is then normalized into [0, NANOSEC).  The whole computation is
 * repeated if cp_hres_lock changed while it was in progress.
 * Always returns 0.
 *
 * Stack usage: 0x18 local + 0x8 call + 0x28 called func. = 0x48 bytes
 *
 * %rsp+0x00 - commpage_t *cp
 * %rsp+0x08 - timespec_t *tsp
 * %rsp+0x10 - int hres_lock
 */
	ENTRY_NP(__cp_clock_gettime_realtime)
	subq	$0x18, %rsp
	movq	%rdi, (%rsp)
	movq	%rsi, 0x8(%rsp)

.Lrt_retry:
	/* Remember the lock word as seen before any guarded data is read. */
	movl	CP_HRES_LOCK(%rdi), %eax
	movl	%eax, 0x10(%rsp)

	call	__cp_gethrtime
	movq	(%rsp), %rdi			/* call may clobber %rdi */
	movq	CP_HRES_LAST_TICK(%rdi), %rdx
	subq	%rdx, %rax			/* nslt = hrtime - last_tick */
	jb	.Lrt_retry			/* hrtime < last_tick: resample */
	movq	CP_HRESTIME(%rdi), %r9		/* r9  = now.tv_sec */
	movq	_CONST(CP_HRESTIME + CP_HRESTIME_INCR)(%rdi), %r10
						/* r10 = now.tv_nsec */
	/*
	 * NOTE(review): only 32 bits of the adjustment are loaded and
	 * examined here; confirm this matches the width of the field.
	 */
	movl	CP_HRESTIME_ADJ(%rdi), %r11d

	addq	%rax, %r10			/* now.tv_nsec += nslt */

	/*
	 * Dispatch on the sign of hres_adj.  This has to be a signed test:
	 * an unsigned comparison against zero can never be "below" and is
	 * "above" for every non-zero value, which would send positive
	 * adjustments down the negative path.
	 */
	testl	%r11d, %r11d
	jg	.Lrt_adj_pos			/* hres_adj > 0 */
	js	.Lrt_adj_neg			/* hres_adj < 0 */

.Lrt_normalize:
	cmpq	$NANOSEC, %r10
	jae	.Lrt_carry_sec			/* tv_nsec >= NANOSEC */

.Lrt_validate:
	/*
	 * Accept the result only if the lock word read now, with its low
	 * bit cleared, equals the value saved before the computation.
	 */
	movl	0x10(%rsp), %eax
	movl	CP_HRES_LOCK(%rdi), %edx
	andl	$0xfffffffe, %edx
	cmpl	%eax, %edx
	jne	.Lrt_retry

	movq	0x8(%rsp), %rsi
	movq	%r9, (%rsi)			/* tsp->tv_sec */
	movq	%r10, 0x8(%rsi)			/* tsp->tv_nsec */

	xorl	%eax, %eax
	addq	$0x18, %rsp
	ret


.Lrt_adj_pos:					/* hres_adj > 0 */
	sarq	$ADJ_SHIFT, %rax		/* adj = nslt >> ADJ_SHIFT */
	cmpl	%r11d, %eax
	jbe	.Lrt_adj_pos_apply
	movl	%r11d, %eax			/* clamp adj to hres_adj */
.Lrt_adj_pos_apply:
	addq	%rax, %r10			/* now.tv_nsec += adj */
	jmp	.Lrt_normalize

.Lrt_adj_neg:					/* hres_adj < 0 */
	sarq	$ADJ_SHIFT, %rax		/* adj = nslt >> ADJ_SHIFT */
	negl	%r11d				/* r11d = -hres_adj */
	cmpl	%r11d, %eax
	jbe	.Lrt_adj_neg_apply
	movl	%r11d, %eax			/* clamp adj to -hres_adj */
.Lrt_adj_neg_apply:
	subq	%rax, %r10			/* now.tv_nsec -= adj */
	jmp	.Lrt_normalize

.Lrt_carry_sec:					/* tv_nsec >= NANOSEC */
	/* Move whole seconds from tv_nsec into tv_sec until it is in range. */
	subq	$NANOSEC, %r10
	incq	%r9
	cmpq	$NANOSEC, %r10
	jae	.Lrt_carry_sec
	jmp	.Lrt_validate

	SET_SIZE(__cp_clock_gettime_realtime)
465