xref: /titanic_50/usr/src/lib/commpage/amd64/cp_subr.s (revision c702eacaf4d917acee909844383924df2accf8f5)
1/*
2 * This file and its contents are supplied under the terms of the
3 * Common Development and Distribution License ("CDDL"), version 1.0.
4 * You may only use this file in accordance with the terms of version
5 * 1.0 of the CDDL.
6 *
7 * A full copy of the text of the CDDL should have accompanied this
8 * source.  A copy of the CDDL is also available via the Internet at
9 * http://www.illumos.org/license/CDDL.
10 */
11
12/*
13 * Copyright 2016 Joyent, Inc.
14 */
15
16#include <sys/asm_linkage.h>
17#include <sys/segments.h>
18#include <sys/time_impl.h>
19#include <sys/tsc.h>
20#include <cp_offsets.h>
21
22#define	GETCPU_GDT_OFFSET	SEL_GDT(GDT_CPUID, SEL_UPL)
23
24	.file	"cp_subr.s"
25
26/*
27 * These are cloned from TSC and time related code in the kernel.  They should
28 * be kept in sync in the case that the source values are changed.
29 * See: uts/i86pc/os/timestamp.c
30 */
31#define	NSEC_SHIFT	5
32#define	ADJ_SHIFT	4
33#define	NANOSEC		0x3b9aca00
34
/*
 * hrtime_t
 * __cp_tsc_read(comm_page_t *cp)
 *
 * Read the TSC using the method selected by cp_tsc_type and return the
 * 64-bit reading in %rax.  When cp_tsc_ncpu is non-zero, the entry of the
 * cp_tsc_sync_tick_delta array belonging to the current CPU is added to
 * the reading before it is returned.
 *
 * Register roles:
 *   %esi  - cp_tsc_type
 *   %r8d  - cp_tsc_ncpu
 *   %r9   - address of cp_tsc_sync_tick_delta
 *   %r10d - CPU ID sampled before the TSC read (non-rdtscp paths only)
 *
 * Stack usage: 0 bytes
 */
	ENTRY_NP(__cp_tsc_read)
	movl	CP_TSC_TYPE(%rdi), %esi
	movl	CP_TSC_NCPU(%rdi), %r8d
	leaq	CP_TSC_SYNC_TICK_DELTA(%rdi), %r9

	cmpl	$TSC_TSCP, %esi
	jne	2f
	rdtscp
	/*
	 * When the TSC is read, the low 32 bits are placed in %eax while the
	 * high 32 bits are placed in %edx.  They are shifted and ORed together
	 * to obtain the full 64-bit value.
	 */
	shlq	$0x20, %rdx
	orq	%rdx, %rax

	/*
	 * A zero cp_tsc_ncpu (held in %r8d) means no per-CPU offset is to be
	 * applied.  The test must be made against %r8d: %esi holds the TSC
	 * type, which is always TSC_TSCP on this path and says nothing about
	 * whether offsets are present.
	 */
	testl	%r8d, %r8d
	jnz	1f
	ret
1:
	/*
	 * When cp_tsc_ncpu is non-zero, it indicates the length of the
	 * cp_tsc_sync_tick_delta array, which contains per-CPU offsets for the
	 * TSC.  The CPU ID furnished by the IA32_TSC_AUX register via rdtscp
	 * (delivered in %ecx) is used to look up an offset value in that array
	 * and apply it to the TSC reading.
	 */
	movq	(%r9, %rcx, 8), %rdx
	addq	%rdx, %rax
	ret

2:
	/*
	 * Without rdtscp, there is no way to perform a TSC reading and
	 * simultaneously query the current CPU.  If tsc_ncpu indicates that
	 * per-CPU TSC offsets are present, the ID of the current CPU is
	 * queried before performing a TSC reading.  It will be later compared
	 * to a second CPU ID lookup to catch CPU migrations.
	 *
	 * This method will catch all but the most pathological scheduling.
	 */
	testl	%r8d, %r8d
	jz	3f
	movl	$GETCPU_GDT_OFFSET, %edx
	lsl	%dx, %edx

3:
	/*
	 * Save the most recently queried CPU ID for later comparison.  This
	 * label is also the retry target after a detected migration, in which
	 * case %edx already holds the ID from the second lookup.  When
	 * cp_tsc_ncpu is zero the saved value is never consulted.
	 */
	movl	%edx, %r10d

	cmpl	$TSC_RDTSC_MFENCE, %esi
	jne	4f
	mfence
	rdtsc
	jmp	7f

4:
	cmpl	$TSC_RDTSC_LFENCE, %esi
	jne	5f
	lfence
	rdtsc
	jmp	7f

5:
	cmpl	$TSC_RDTSC_CPUID, %esi
	jne	6f
	/*
	 * Since the amd64 ABI dictates that %rbx is callee-saved, it must be
	 * preserved here.  Its contents will be overwritten when cpuid is used
	 * as a serializing instruction.
	 */
	movq	%rbx, %r11
	xorl	%eax, %eax
	cpuid
	rdtsc
	movq	%r11, %rbx
	jmp	7f

6:
	/*
	 * Other protections should have prevented this function from being
	 * called in the first place.  The only sane action is to abort.
	 * The easiest means in this context is via SIGILL.
	 */
	ud2a

7:
	/* Combine the %edx:%eax halves into a single 64-bit value in %rax. */
	shlq	$0x20, %rdx
	orq	%rdx, %rax

	/*
	 * Query the current CPU again if a per-CPU offset is being applied to
	 * the TSC reading.  If the result differs from the earlier reading,
	 * then a migration has occurred and the TSC must be read again.
	 */
	testl	%r8d, %r8d
	jz	8f
	movl	$GETCPU_GDT_OFFSET, %edx
	lsl	%dx, %edx
	cmpl	%edx, %r10d
	jne	3b
	movq	(%r9, %rdx, 8), %rdx
	addq	%rdx, %rax
8:
	ret
	SET_SIZE(__cp_tsc_read)
146
147
/*
 * uint_t
 * __cp_getcpu(comm_page_t *)
 *
 * Return the ID of the CPU the caller is running on in %eax.  The lookup
 * method is chosen from cp_tsc_type.
 *
 * Stack usage: 0 bytes
 */
	ENTRY_NP(__cp_getcpu)
	movl	CP_TSC_TYPE(%rdi), %edi
	cmpl	$TSC_TSCP, %edi
	je	1f

	/*
	 * No RDTSCP: use lsl to fetch the segment limit of the GDT_CPUID
	 * selector and return that as the CPU ID.
	 */
	movl	$GETCPU_GDT_OFFSET, %eax
	lsl	%ax, %eax
	ret

1:
	/*
	 * With RDTSCP available, the cpu_id which the kernel stores in the
	 * TSC_AUX MSR is delivered in %ecx alongside the TSC reading.  The
	 * TSC value itself (%edx:%eax) is discarded.
	 */
	rdtscp
	movl	%ecx, %eax
	ret
	SET_SIZE(__cp_getcpu)
170
/*
 * hrtime_t
 * __cp_gethrtime(comm_page_t *cp)
 *
 * Compute the current hrtime from the comm page: snapshot cp_tsc_last and
 * cp_tsc_hrtime_base, read the TSC via __cp_tsc_read, retry if cp_hres_lock
 * changed in the meantime, then scale the TSC delta with cp_nsec_scale and
 * add it to the snapshotted base.  The result is returned in %rax.
 *
 * Stack usage: 0x20 local + 0x8 call = 0x28 bytes
 *
 * %rsp+0x00 - hrtime_t tsc_last
 * %rsp+0x08 - hrtime_t hrtime_base
 * %rsp+0x10 - commpage_t *cp
 * %rsp+0x18 - int hres_lock
 */
	ENTRY_NP(__cp_gethrtime)
	subq	$0x20, %rsp
	movq	%rdi, 0x10(%rsp)
1:
	/* Sample the lock word first so a concurrent update can be detected. */
	movl	CP_HRES_LOCK(%rdi), %r9d
	movl	%r9d, 0x18(%rsp)

	movq	CP_TSC_LAST(%rdi), %rax
	movq	CP_TSC_HRTIME_BASE(%rdi), %rdx
	movq	%rax, (%rsp)
	movq	%rdx, 0x8(%rsp)

	call	__cp_tsc_read
	movq	0x10(%rsp), %rdi	/* %rdi is not preserved by the call */

	/*
	 * Retry unless the current lock word equals the earlier sample with
	 * its low bit cleared.
	 */
	movl	0x18(%rsp), %r9d
	movl	CP_HRES_LOCK(%rdi), %edx
	andl	$0xfffffffe, %r9d
	cmpl	%r9d, %edx
	jne	1b

	/*
	 * The in-kernel logic for calculating hrtime performs several checks
	 * to protect against edge cases.  That logic is summarized as:
	 * if (tsc >= tsc_last) {
	 *         delta -= tsc_last;
	 * } else if (tsc >= tsc_last - 2*tsc_max_delta) {
	 *         delta = 0;
	 * } else {
	 *         delta = MIN(tsc, tsc_resume_cap);
	 * }
	 *
	 * The below implementation achieves the same result, although it is
	 * structured for speed and optimized for the fast path:
	 *
	 * delta = tsc - tsc_last;
	 * if (delta < 0) {
	 *         delta += (tsc_max_delta << 1);
	 *         if (delta >= 0) {
	 *                 delta = 0;
	 *         } else {
	 *                 delta = MIN(tsc, tsc_resume_cap);
	 *         }
	 * }
	 */
	movq	(%rsp), %rdx
	subq	%rdx, %rax		/* delta = tsc - tsc_last */
	/*
	 * Branch only on a borrow (tsc < tsc_last).  An exact match must stay
	 * on the fast path with delta == 0; routing it to the slow path would
	 * select MIN(tsc, tsc_resume_cap), since 0 + 2*tsc_max_delta never
	 * carries.
	 */
	jb	3f			/* if (delta < 0) */

2:
	/*
	 * Optimized TSC_CONVERT_AND_ADD:
	 * hrtime_base += (tsc_delta * nsec_scale) >> (32 - NSEC_SHIFT)
	 *
	 * Since the multiply and shift are done in 128-bit, there is no need
	 * to worry about overflow.
	 */
	movl	CP_NSEC_SCALE(%rdi), %ecx
	mulq	%rcx
	shrdq	$_CONST(32 - NSEC_SHIFT), %rdx, %rax
	movq	0x8(%rsp), %r8
	addq	%r8, %rax

	addq	$0x20, %rsp
	ret

3:
	movq	%rax, %r9		/* save (tsc - tsc_last) in r9 */
	movl	CP_TSC_MAX_DELTA(%rdi), %ecx
	sall	$1, %ecx
	addq	%rcx, %rax		/* delta += (tsc_max_delta << 1) */
	jae	4f			/* no carry: delta is still < 0 */
	xorq	%rax, %rax		/* within tolerance: delta = 0 */
	jmp	2b

4:
	/*
	 * Repopulate %rax with the TSC reading by adding tsc_last to %r9
	 * (which holds tsc - tsc_last)
	 */
	movq	(%rsp), %rax
	addq	%r9, %rax

	/* delta = MIN(tsc, resume_cap), compared as unsigned values */
	movq	CP_TSC_RESUME_CAP(%rdi), %rcx
	cmpq	%rcx, %rax
	cmovaq	%rcx, %rax
	jmp	2b

	SET_SIZE(__cp_gethrtime)
274
/*
 * int
 * __cp_clock_gettime_monotonic(comm_page_t *cp, timespec_t *tsp)
 *
 * Fetch the current hrtime via __cp_gethrtime, split it into seconds and
 * nanoseconds, and store the pair through tsp.  Always returns 0.
 *
 * Stack usage: 0x8 local + 0x8 call + 0x28 called func. = 0x38 bytes
 *
 * %rsp+0x00 - timespec_t *tsp
 */
	ENTRY_NP(__cp_clock_gettime_monotonic)
	pushq	%rsi			/* keep tsp across the call */
	call	__cp_gethrtime
	popq	%rsi

	/*
	 * Convert from hrtime_t (int64_t in nanoseconds) to timespec_t.
	 * This uses the same approach as hrt2ts, although it has been updated
	 * to utilize 64-bit math.
	 * 1 / 1,000,000,000 =
	 * 1000100101110000010111110100000100110110101101001010110110011B-26
	 * = 0x112e0be826d694b3 * 2^-26
	 *
	 * secs = (nsecs * 0x112e0be826d694b3) >> 26
	 *
	 * The product is taken from the high 64 bits of the signed 128-bit
	 * multiply (%rdx).  In order to account for the 2s-complement of
	 * negative inputs, a final operation completes the process:
	 *
	 * secs -= (nsecs >> 63)
	 */
	movq	%rax, %rcx		/* rcx = nsecs */
	movq	$0x112e0be826d694b3, %rdx
	imulq	%rdx			/* rdx:rax = nsecs * reciprocal */
	sarq	$26, %rdx
	movq	%rcx, %rax
	sarq	$63, %rax		/* rax = -1 if nsecs < 0, else 0 */
	subq	%rax, %rdx		/* rdx = secs */
	movq	%rdx, (%rsi)		/* first 8 bytes of *tsp = secs */

	/*
	 * Populating tv_nsec is easier:
	 * tv_nsec = nsecs - (secs * NANOSEC)
	 */
	imulq	$NANOSEC, %rdx, %rdx
	subq	%rdx, %rcx
	movq	%rcx, 0x8(%rsi)

	xorl	%eax, %eax
	ret
	SET_SIZE(__cp_clock_gettime_monotonic)
325
/*
 * int
 * __cp_clock_gettime_realtime(comm_page_t *cp, timespec_t *tsp)
 *
 * Compute the current wall-clock time from the comm page: take the
 * cp_hrestime snapshot, add the nanoseconds elapsed since
 * cp_hres_last_tick (per __cp_gethrtime), apply the pending
 * cp_hrestime_adj adjustment (limited to elapsed >> ADJ_SHIFT), normalize
 * tv_nsec into [0, NANOSEC), and store the result through tsp.  The whole
 * computation is retried if cp_hres_lock changed while it ran.
 * Always returns 0.
 *
 * Stack usage: 0x18 local + 0x8 call + 0x28 called func. = 0x48 bytes
 *
 * %rsp+0x00 - commpage_t *cp
 * %rsp+0x08 - timespec_t *tsp
 * %rsp+0x10 - int hres_lock
 */
	ENTRY_NP(__cp_clock_gettime_realtime)
	subq	$0x18, %rsp
	movq	%rdi, (%rsp)
	movq	%rsi, 0x8(%rsp)

1:
	/* Sample the lock word before reading any time state. */
	movl	CP_HRES_LOCK(%rdi), %eax
	movl	%eax, 0x10(%rsp)

	call	__cp_gethrtime
	movq	(%rsp), %rdi
	movq	CP_HRES_LAST_TICK(%rdi), %rdx
	subq	%rdx, %rax			/* nslt = hrtime - last_tick */
	jb	1b				/* hrtime < last_tick: retry */
	movq	CP_HRESTIME(%rdi), %r9		/* now.tv_sec */
	movq	_CONST(CP_HRESTIME + CP_HRESTIME_INCR)(%rdi), %r10
	/*
	 * NOTE(review): only 32 bits of the adjustment are loaded here;
	 * confirm this matches the declared width of the field.
	 */
	movl	CP_HRESTIME_ADJ(%rdi), %r11d

	addq	%rax, %r10			/* now.tv_nsec += nslt */

	/*
	 * hres_adj is a signed quantity, so its sign must be tested with the
	 * signed condition codes.  (An unsigned "below zero" test can never
	 * succeed, and an unsigned "above zero" test succeeds for every
	 * non-zero value, which would route positive adjustments through the
	 * negative-adjustment path.)
	 */
	testl	%r11d, %r11d
	jg	4f				/* hres_adj > 0 */
	jl	6f				/* hres_adj < 0 */

2:
	cmpq	$NANOSEC, %r10
	jae	8f				/* tv_nsec >= NANOSEC */

3:
	/*
	 * Retry unless the current lock word, with its low bit cleared,
	 * equals the value sampled at the top of the loop.
	 */
	movl	0x10(%rsp), %eax
	movl	CP_HRES_LOCK(%rdi), %edx
	andl	$0xfffffffe, %edx
	cmpl	%eax, %edx
	jne	1b

	movq	0x8(%rsp), %rsi
	movq	%r9, (%rsi)
	movq	%r10, 0x8(%rsi)

	xorl	%eax, %eax
	addq	$0x18, %rsp
	ret


4:						/* hres_adj > 0 */
	/* adj = MIN(nslt >> ADJ_SHIFT, hres_adj); tv_nsec += adj */
	sarq	$ADJ_SHIFT, %rax
	cmpl	%r11d, %eax
	jbe	5f
	movl	%r11d, %eax
5:
	addq	%rax, %r10
	jmp	2b

6:						/* hres_adj < 0 */
	/* adj = MIN(nslt >> ADJ_SHIFT, -hres_adj); tv_nsec -= adj */
	sarq	$ADJ_SHIFT, %rax
	negl	%r11d
	cmpl	%r11d, %eax
	jbe	7f
	movl	%r11d, %eax
7:
	subq	%rax, %r10
	jmp	2b

8:						/* tv_nsec >= NANOSEC */
	/* Carry whole seconds out of tv_nsec until it is below NANOSEC. */
	subq	$NANOSEC, %r10
	incq	%r9
	cmpq	$NANOSEC, %r10
	jae	8b
	jmp	3b

	SET_SIZE(__cp_clock_gettime_realtime)
407