/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright 2016 Joyent, Inc.
 */

#include <sys/asm_linkage.h>
#include <sys/segments.h>
#include <sys/time_impl.h>
#include <sys/tsc.h>
#include <cp_offsets.h>

#define	GETCPU_GDT_OFFSET	SEL_GDT(GDT_CPUID, SEL_UPL)

	.file	"cp_subr.s"

/*
 * These are cloned from TSC and time related code in the kernel.  They should
 * be kept in sync in the case that the source values are changed.
 * See: uts/i86pc/os/timestamp.c
 */
#define	NSEC_SHIFT	5
#define	ADJ_SHIFT	4
#define	NANOSEC		0x3b9aca00

/*
 * hrtime_t
 * __cp_tsc_read(comm_page_t *cp)
 *
 * Stack usage: 0 bytes
 */
	ENTRY_NP(__cp_tsc_read)
	movl	CP_TSC_TYPE(%rdi), %esi
	movl	CP_TSC_NCPU(%rdi), %r8d
	leaq	CP_TSC_SYNC_TICK_DELTA(%rdi), %r9

	cmpl	$TSC_TSCP, %esi
	jne	2f
	rdtscp
	/*
	 * When the TSC is read, the low 32 bits are placed in %eax while the
	 * high 32 bits are placed in %edx.  They are shifted and ORed together
	 * to obtain the full 64-bit value.
	 */
	shlq	$0x20, %rdx
	orq	%rdx, %rax
	cmpl	$0, %esi
	jne	1f
	ret
1:
	/*
	 * When cp_tsc_ncpu is non-zero, it indicates the length of the
	 * cp_tsc_sync_tick_delta array, which contains per-CPU offsets for the
	 * TSC.  The CPU ID furnished by the IA32_TSC_AUX register via rdtscp
	 * is used to look up an offset value in that array and apply it to the
	 * TSC reading.
	 */
	movq	(%r9, %rcx, 8), %rdx
	addq	%rdx, %rax
	ret

2:
	/*
	 * Without rdtscp, there is no way to perform a TSC reading and
	 * simultaneously query the current CPU.  If tsc_ncpu indicates that
	 * per-CPU TSC offsets are present, the ID of the current CPU is
	 * queried before performing a TSC reading.  It will be later compared
	 * to a second CPU ID lookup to catch CPU migrations.
	 *
	 * This method will catch all but the most pathological scheduling.
	 */
	cmpl	$0, %r8d
	je	3f
	movl	$GETCPU_GDT_OFFSET, %edx
	lsl	%dx, %edx

3:
	/* Save the most recently queried CPU ID for later comparison. */
	movl	%edx, %r10d

	cmpl	$TSC_RDTSC_MFENCE, %esi
	jne	4f
	mfence
	rdtsc
	jmp	7f

4:
	cmpl	$TSC_RDTSC_LFENCE, %esi
	jne	5f
	lfence
	rdtsc
	jmp	7f

5:
	cmpl	$TSC_RDTSC_CPUID, %esi
	jne	6f
	/*
	 * Since the amd64 ABI dictates that %rbx is callee-saved, it must be
	 * preserved here.  Its contents will be overwritten when cpuid is used
	 * as a serializing instruction.
	 */
	movq	%rbx, %r11
	xorl	%eax, %eax
	cpuid
	rdtsc
	movq	%r11, %rbx
	jmp	7f

6:
	/*
	 * Other protections should have prevented this function from being
	 * called in the first place.  The only sane action is to abort.
	 * The easiest means in this context is via SIGILL.
	 */
	ud2a

7:
	shlq	$0x20, %rdx
	orq	%rdx, %rax

	/*
	 * Query the current CPU again if a per-CPU offset is being applied to
	 * the TSC reading.  If the result differs from the earlier reading,
	 * then a migration has occurred and the TSC must be read again.
	 */
	cmpl	$0, %r8d
	je	8f
	movl	$GETCPU_GDT_OFFSET, %edx
	lsl	%dx, %edx
	cmpl	%edx, %r10d
	jne	3b
	movq	(%r9, %rdx, 8), %rdx
	addq	%rdx, %rax
8:
	ret
	SET_SIZE(__cp_tsc_read)
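
/*
 * For reference, the fallback (non-rdtscp) path of __cp_tsc_read above is
 * roughly equivalent to the following C sketch.  This is illustrative only:
 * getcpu() stands in for the LSL-based CPU query and fenced_rdtsc() for the
 * fence-plus-rdtsc pairs; neither is a real function in this file.
 *
 *	if (cp->cp_tsc_ncpu == 0)
 *		return (fenced_rdtsc());
 *	cpu = getcpu();
 *	for (;;) {
 *		tsc = fenced_rdtsc();
 *		prev = cpu;
 *		cpu = getcpu();
 *		if (cpu == prev)
 *			break;
 *	}
 *	return (tsc + cp->cp_tsc_sync_tick_delta[cpu]);
 */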

/*
 * uint_t
 * __cp_getcpu(comm_page_t *)
 *
 * Stack usage: 0 bytes
 */
	ENTRY_NP(__cp_getcpu)
	movl	CP_TSC_TYPE(%rdi), %edi
	/*
	 * If RDTSCP is available, it is a quick way to grab the cpu_id which
	 * is stored in the TSC_AUX MSR by the kernel.
	 */
	cmpl	$TSC_TSCP, %edi
	jne	1f
	rdtscp
	movl	%ecx, %eax
	ret
1:
	mov	$GETCPU_GDT_OFFSET, %eax
	lsl	%ax, %eax
	ret
	SET_SIZE(__cp_getcpu)

/*
 * hrtime_t
 * __cp_gethrtime(comm_page_t *cp)
 *
 * Stack usage: 0x20 local + 0x8 call = 0x28 bytes
 *
 * %rsp+0x00 - hrtime_t tsc_last
 * %rsp+0x08 - hrtime_t hrtime_base
 * %rsp+0x10 - commpage_t *cp
 * %rsp+0x18 - int hres_lock
 */
	ENTRY_NP(__cp_gethrtime)
	subq	$0x20, %rsp
	movq	%rdi, 0x10(%rsp)
1:
	movl	CP_HRES_LOCK(%rdi), %r9d
	movl	%r9d, 0x18(%rsp)

	movq	CP_TSC_LAST(%rdi), %rax
	movq	CP_TSC_HRTIME_BASE(%rdi), %rdx
	movq	%rax, (%rsp)
	movq	%rdx, 0x8(%rsp)

	call	__cp_tsc_read
	movq	0x10(%rsp), %rdi

	movl	0x18(%rsp), %r9d
	movl	CP_HRES_LOCK(%rdi), %edx
	andl	$0xfffffffe, %r9d
	cmpl	%r9d, %edx
	jne	1b

	/*
	 * The in-kernel logic for calculating hrtime performs several checks
	 * to protect against edge cases.  That logic is summarized as:
	 *
	 *	if (tsc >= tsc_last) {
	 *		delta -= tsc_last;
	 *	} else if (tsc >= tsc_last - 2*tsc_max_delta) {
	 *		delta = 0;
	 *	} else {
	 *		delta = MIN(tsc, tsc_resume_cap);
	 *	}
	 *
	 * The below implementation achieves the same result, although it is
	 * structured for speed and optimized for the fast path:
	 *
	 *	delta = tsc - tsc_last;
	 *	if (delta < 0) {
	 *		delta += (tsc_max_delta << 1);
	 *		if (delta >= 0) {
	 *			delta = 0;
	 *		} else {
	 *			delta = MIN(tsc, tsc_resume_cap);
	 *		}
	 *	}
	 */
	movq	(%rsp), %rdx
	subq	%rdx, %rax		/* delta = tsc - tsc_last */
	jbe	3f			/* if (delta < 0) */

2:
	/*
	 * Optimized TSC_CONVERT_AND_ADD:
	 * hrtime_base += (tsc_delta * nsec_scale) >> (32 - NSEC_SHIFT)
	 *
	 * Since the multiply and shift are done in 128-bit, there is no need
	 * to worry about overflow.
	 */
	movl	CP_NSEC_SCALE(%rdi), %ecx
	mulq	%rcx
	shrdq	$_CONST(32 - NSEC_SHIFT), %rdx, %rax
	movq	0x8(%rsp), %r8
	addq	%r8, %rax

	addq	$0x20, %rsp
	ret

3:
	movq	%rax, %r9		/* save (tsc - tsc_last) in r9 */
	movl	CP_TSC_MAX_DELTA(%rdi), %ecx
	sall	$1, %ecx
	addq	%rcx, %rax		/* delta += (tsc_max_delta << 1) */
	jae	4f			/* delta < 0 */
	xorq	%rax, %rax
	jmp	2b

4:
	/*
	 * Repopulate %rax with the TSC reading by adding tsc_last to %r9
	 * (which holds tsc - tsc_last)
	 */
	movq	(%rsp), %rax
	addq	%r9, %rax

	/* delta = MIN(tsc, resume_cap) */
	movq	CP_TSC_RESUME_CAP(%rdi), %rcx
	cmpq	%rcx, %rax
	jbe	5f
	movq	%rcx, %rax
5:
	jmp	2b

	SET_SIZE(__cp_gethrtime)
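
/*
 * Taken as a whole, __cp_gethrtime above corresponds roughly to the following
 * C sketch.  This is illustrative only; the field names follow the CP_* offset
 * macros used above and the delta clamping is abbreviated:
 *
 *	do {
 *		lock = cp->cp_hres_lock;
 *		tsc_last = cp->cp_tsc_last;
 *		base = cp->cp_tsc_hrtime_base;
 *		tsc = __cp_tsc_read(cp);
 *	} while (cp->cp_hres_lock != (lock & ~1));
 *	delta = tsc - tsc_last;		(clamped as described above)
 *	return (base + ((delta * cp->cp_nsec_scale) >> (32 - NSEC_SHIFT)));
 */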

/*
 * int
 * __cp_clock_gettime_monotonic(comm_page_t *cp, timespec_t *tsp)
 *
 * Stack usage: 0x8 local + 0x8 call + 0x28 called func. = 0x38 bytes
 *
 * %rsp+0x00 - timespec_t *tsp
 */
	ENTRY_NP(__cp_clock_gettime_monotonic)
	subq	$0x8, %rsp
	movq	%rsi, (%rsp)

	call	__cp_gethrtime

	/*
	 * Convert from hrtime_t (int64_t in nanoseconds) to timespec_t.
	 * This uses the same approach as hrt2ts, although it has been updated
	 * to utilize 64-bit math.
	 * 1 / 1,000,000,000 =
	 *	1000100101110000010111110100000100110110101101001010110110011B-26
	 *	= 0x112e0be826d694b3 * 2^-26
	 *
	 * secs = (nsecs * 0x112e0be826d694b3) >> 26
	 *
	 * Note that the multiply below produces a 128-bit product and the
	 * shift is applied to its upper 64 bits, so the effective shift on
	 * the full product is 64 + 26 = 90.
	 *
	 * In order to account for the 2's complement representation of
	 * negative inputs, a final operation completes the process:
	 *
	 * secs -= (nsecs >> 63)
	 */
	movq	%rax, %r11
	movq	$0x112e0be826d694b3, %rdx
	imulq	%rdx
	sarq	$0x1a, %rdx
	movq	%r11, %rax
	sarq	$0x3f, %rax
	subq	%rax, %rdx
	movq	(%rsp), %rsi
	movq	%rdx, (%rsi)
	/*
	 * Populating tv_nsec is easier:
	 * tv_nsec = nsecs - (secs * NANOSEC)
	 */
	imulq	$NANOSEC, %rdx, %rdx
	subq	%rdx, %r11
	movq	%r11, 0x8(%rsi)

	xorl	%eax, %eax
	addq	$0x8, %rsp
	ret
	SET_SIZE(__cp_clock_gettime_monotonic)

/*
 * int
 * __cp_clock_gettime_realtime(comm_page_t *cp, timespec_t *tsp)
 *
 * Stack usage: 0x18 local + 0x8 call + 0x28 called func. = 0x48 bytes
 *
 * %rsp+0x00 - commpage_t *cp
 * %rsp+0x08 - timespec_t *tsp
 * %rsp+0x10 - int hres_lock
 */
	ENTRY_NP(__cp_clock_gettime_realtime)
	subq	$0x18, %rsp
	movq	%rdi, (%rsp)
	movq	%rsi, 0x8(%rsp)

1:
	movl	CP_HRES_LOCK(%rdi), %eax
	movl	%eax, 0x10(%rsp)

	call	__cp_gethrtime
	movq	(%rsp), %rdi
	movq	CP_HRES_LAST_TICK(%rdi), %rdx
	subq	%rdx, %rax		/* nslt = hrtime - last_tick */
	jb	1b
	movq	CP_HRESTIME(%rdi), %r9
	movq	_CONST(CP_HRESTIME + CP_HRESTIME_INCR)(%rdi), %r10
	movl	CP_HRESTIME_ADJ(%rdi), %r11d

	addq	%rax, %r10		/* now.tv_nsec += nslt */

	cmpl	$0, %r11d
	jb	4f			/* hres_adj > 0 */
	ja	6f			/* hres_adj < 0 */

2:
	cmpq	$NANOSEC, %r10
	jae	8f			/* tv_nsec >= NANOSEC */

3:
	movl	0x10(%rsp), %eax
	movl	CP_HRES_LOCK(%rdi), %edx
	andl	$0xfffffffe, %edx
	cmpl	%eax, %edx
	jne	1b

	movq	0x8(%rsp), %rsi
	movq	%r9, (%rsi)
	movq	%r10, 0x8(%rsi)

	xorl	%eax, %eax
	addq	$0x18, %rsp
	ret

4:	/* hres_adj > 0 */
	sarq	$ADJ_SHIFT, %rax
	cmpl	%r11d, %eax
	jbe	5f
	movl	%r11d, %eax
5:
	addq	%rax, %r10
	jmp	2b

6:	/* hres_adj < 0 */
	sarq	$ADJ_SHIFT, %rax
	negl	%r11d
	cmpl	%r11d, %eax
	jbe	7f
	movl	%r11d, %eax
7:
	subq	%rax, %r10
	jmp	2b

8:	/* tv_nsec >= NANOSEC */
	subq	$NANOSEC, %r10
	incq	%r9
	cmpq	$NANOSEC, %r10
	jae	8b
	jmp	3b

	SET_SIZE(__cp_clock_gettime_realtime)
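
/*
 * For reference, the hrestime_adj handling in __cp_clock_gettime_realtime
 * above corresponds roughly to the following C sketch.  This is illustrative
 * only; "now" is the loaded cp_hrestime pair and nslt is the time elapsed
 * since cp_hres_last_tick, as in the comments above:
 *
 *	now.tv_nsec += nslt;
 *	if (hrestime_adj > 0)
 *		now.tv_nsec += MIN(nslt >> ADJ_SHIFT, hrestime_adj);
 *	else if (hrestime_adj < 0)
 *		now.tv_nsec -= MIN(nslt >> ADJ_SHIFT, -hrestime_adj);
 *	while (now.tv_nsec >= NANOSEC) {
 *		now.tv_nsec -= NANOSEC;
 *		now.tv_sec++;
 *	}
 */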