xref: /linux/drivers/misc/sgi-gru/grufault.c (revision 367b8112fe2ea5c39a7bb4d263dcdd9b612fae18)
/*
 * SN Platform GRU Driver
 *
 *              FAULT HANDLER FOR GRU DETECTED TLB MISSES
 *
 * This file contains code that handles TLB misses within the GRU.
 * These misses are reported either via interrupts or user polling of
 * the user CB.
 *
 *  Copyright (c) 2008 Silicon Graphics, Inc.  All Rights Reserved.
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
 */

#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/spinlock.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/device.h>
#include <linux/io.h>
#include <linux/uaccess.h>
#include <asm/pgtable.h>
#include "gru.h"
#include "grutables.h"
#include "grulib.h"
#include "gru_instructions.h"
#include <asm/uv/uv_hub.h>

/*
 * Test if a physical address is a valid GRU GSEG address
 */
static inline int is_gru_paddr(unsigned long paddr)
{
	return paddr >= gru_start_paddr && paddr < gru_end_paddr;
}

/*
 * Find the vma of a GRU segment. Caller must hold mmap_sem.
 */
struct vm_area_struct *gru_find_vma(unsigned long vaddr)
{
	struct vm_area_struct *vma;

	vma = find_vma(current->mm, vaddr);
	if (vma && vma->vm_start <= vaddr && vma->vm_ops == &gru_vm_ops)
		return vma;
	return NULL;
}

/*
 * Find and lock the gts that contains the specified user vaddr.
 *
 * Returns:
 *	- *gts with the mmap_sem locked for read and the GTS locked.
 *	- NULL if vaddr is invalid or is not a valid GSEG vaddr.
 */
static struct gru_thread_state *gru_find_lock_gts(unsigned long vaddr)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	struct gru_thread_state *gts = NULL;

	down_read(&mm->mmap_sem);
	vma = gru_find_vma(vaddr);
	if (vma)
		gts = gru_find_thread_state(vma, TSID(vaddr, vma));
	if (gts)
		mutex_lock(&gts->ts_ctxlock);
	else
		up_read(&mm->mmap_sem);
	return gts;
}

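/*
 * Find the gts that contains the specified user vaddr, allocating it if it
 * does not yet exist. mmap_sem is taken for write because the GTS may need
 * to be created; on success it is downgraded to read so that
 * gru_unlock_gts() can release it with up_read().
 */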
static struct gru_thread_state *gru_alloc_locked_gts(unsigned long vaddr)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	struct gru_thread_state *gts = NULL;

	down_write(&mm->mmap_sem);
	vma = gru_find_vma(vaddr);
	if (vma)
		gts = gru_alloc_thread_state(vma, TSID(vaddr, vma));
	if (gts) {
		mutex_lock(&gts->ts_ctxlock);
		downgrade_write(&mm->mmap_sem);
	} else {
		up_write(&mm->mmap_sem);
	}

	return gts;
}

/*
 * Unlock a GTS that was previously locked with gru_find_lock_gts() or
 * gru_alloc_locked_gts().
 */
static void gru_unlock_gts(struct gru_thread_state *gts)
{
	mutex_unlock(&gts->ts_ctxlock);
	up_read(&current->mm->mmap_sem);
}

/*
 * Set a CB.istatus to active using a user virtual address. This must be done
 * just prior to a TFH RESTART. The new cb.istatus is an in-cache status ONLY.
 * If the line is evicted, the status may be lost. The in-cache update
 * is necessary to prevent the user from seeing a stale cb.istatus that will
 * change as soon as the TFH restart is complete. Races may cause an
 * occasional failure to clear the cb.istatus, but that is ok.
 *
 * If the cb address is not valid (should not happen, but...), nothing
 * bad will happen. The get_user()/put_user() will fail but there
 * are no bad side-effects.
 */
static void gru_cb_set_istatus_active(unsigned long __user *cb)
{
	union {
		struct gru_instruction_bits bits;
		unsigned long dw;
	} u;

	if (cb) {
		get_user(u.dw, cb);
		u.bits.istatus = CBS_ACTIVE;
		put_user(u.dw, cb);
	}
}

/*
 * Convert an interrupt IRQ to a pointer to the GRU state (struct gru_state)
 * of the chiplet that caused the interrupt. Interrupts are always sent to a
 * cpu on the blade that contains the GRU (except for headless blades, which
 * are not currently supported). A blade has N grus; a block of N consecutive
 * IRQs is assigned to the GRUs. The IRQ number uniquely identifies the GRU
 * chiplet on the local blade that caused the interrupt. Always called in
 * interrupt context.
 */
static inline struct gru_state *irq_to_gru(int irq)
{
	return &gru_base[uv_numa_blade_id()]->bs_grus[irq - IRQ_GRU];
}

/*
 * Read & clear a TFM
 *
 * The GRU has an array of fault maps. A map is private to a cpu.
 * Only one cpu will be accessing a cpu's fault map.
 *
 * This function scans the cpu-private fault map & clears all bits that
 * are set. The function returns a bitmap that indicates the bits that
 * were cleared. Note that since the maps may be updated asynchronously by
 * the GRU, atomic operations must be used to clear bits.
 */
static void get_clear_fault_map(struct gru_state *gru,
				struct gru_tlb_fault_map *map)
{
	unsigned long i, k;
	struct gru_tlb_fault_map *tfm;

	tfm = get_tfm_for_cpu(gru, gru_cpu_fault_map_id());
	prefetchw(tfm);		/* Helps on hardware, required for emulator */
	for (i = 0; i < BITS_TO_LONGS(GRU_NUM_CBE); i++) {
		k = tfm->fault_bits[i];
		if (k)
			k = xchg(&tfm->fault_bits[i], 0UL);
		map->fault_bits[i] = k;
	}

	/*
	 * Not functionally required but helps performance. (Required
	 * on emulator)
	 */
	gru_flush_cache(tfm);
}

/*
 * Atomic (interrupt context) & non-atomic (user context) functions to
 * convert a vaddr into a physical address. The size of the page
 * is returned in pageshift.
 *	returns:
 *		  0 - successful
 *		< 0 - error code
 *		  1 - (atomic only) try again in non-atomic context
 */
static int non_atomic_pte_lookup(struct vm_area_struct *vma,
				 unsigned long vaddr, int write,
				 unsigned long *paddr, int *pageshift)
{
	struct page *page;

	/* ZZZ Need to handle HUGE pages */
	if (is_vm_hugetlb_page(vma))
		return -EFAULT;
	*pageshift = PAGE_SHIFT;
	if (get_user_pages
	    (current, current->mm, vaddr, 1, write, 0, &page, NULL) <= 0)
		return -EFAULT;
	*paddr = page_to_phys(page);
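	/*
	 * Only the physical address is needed; the page reference from
	 * get_user_pages() is dropped immediately. Races with a concurrent
	 * invalidate are handled by the GRU dropin hardware (see the NOTE
	 * in gru_try_dropin()).
	 */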
	put_page(page);
	return 0;
}

/*
 * atomic_pte_lookup
 *
 * Convert a user virtual address to a physical address
 * Only supports Intel large pages (2MB only) on x86_64.
 *	ZZZ - hugepage support is incomplete
 *
 * NOTE: mmap_sem is already held on entry to this function. This
 * guarantees existence of the page tables.
 */
static int atomic_pte_lookup(struct vm_area_struct *vma, unsigned long vaddr,
	int write, unsigned long *paddr, int *pageshift)
{
	pgd_t *pgdp;
	pmd_t *pmdp;
	pud_t *pudp;
	pte_t pte;

	pgdp = pgd_offset(vma->vm_mm, vaddr);
	if (unlikely(pgd_none(*pgdp)))
		goto err;

	pudp = pud_offset(pgdp, vaddr);
	if (unlikely(pud_none(*pudp)))
		goto err;

	pmdp = pmd_offset(pudp, vaddr);
	if (unlikely(pmd_none(*pmdp)))
		goto err;
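	/*
	 * A large pmd maps the region directly (2MB page); in that case the
	 * pmd entry itself is used as the pte.
	 */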
#ifdef CONFIG_X86_64
	if (unlikely(pmd_large(*pmdp)))
		pte = *(pte_t *) pmdp;
	else
#endif
		pte = *pte_offset_kernel(pmdp, vaddr);

	if (unlikely(!pte_present(pte) ||
		     (write && (!pte_write(pte) || !pte_dirty(pte)))))
		return 1;

	*paddr = pte_pfn(pte) << PAGE_SHIFT;
	*pageshift = is_vm_hugetlb_page(vma) ? HPAGE_SHIFT : PAGE_SHIFT;
	return 0;

err:
	local_irq_enable();
	return 1;
}

/*
 * Drop a TLB entry into the GRU. The fault is described by info in a TFH.
 *	Input:
 *		cb    Address of user CBR. Null if not running in user context
 *	Return:
 *		  0 = dropin, exception, or switch to UPM successful
 *		  1 = range invalidate active
 *		< 0 = error code
 */
static int gru_try_dropin(struct gru_thread_state *gts,
			  struct gru_tlb_fault_handle *tfh,
			  unsigned long __user *cb)
{
	struct mm_struct *mm = gts->ts_mm;
	struct vm_area_struct *vma;
	int pageshift, asid, write, ret;
	unsigned long paddr, gpa, vaddr;

	/*
	 * NOTE: The GRU contains magic hardware that eliminates races between
	 * TLB invalidates and TLB dropins. If an invalidate occurs
	 * in the window between reading the TFH and the subsequent TLB dropin,
	 * the dropin is ignored. This eliminates the need for additional locks.
	 */

	/*
	 * Error if TFH state is IDLE or FMM mode and the user is issuing a
	 * UPM call. Might be a hardware race OR a stupid user. Ignore FMM
	 * because FMM is a transient state.
	 */
	if (tfh->state == TFHSTATE_IDLE)
		goto failidle;
	if (tfh->state == TFHSTATE_MISS_FMM && cb)
		goto failfmm;

	write = (tfh->cause & TFHCAUSE_TLB_MOD) != 0;
	vaddr = tfh->missvaddr;
	asid = tfh->missasid;
	if (asid == 0)
		goto failnoasid;

	rmb();	/* TFH must be cache resident before reading ms_range_active */

	/*
	 * TFH is cache resident - at least briefly. Fail the dropin
	 * if a range invalidate is active.
	 */
	if (atomic_read(&gts->ts_gms->ms_range_active))
		goto failactive;

	vma = find_vma(mm, vaddr);
	if (!vma)
		goto failinval;

	/*
	 * Atomic lookup is faster & usually works even if called in non-atomic
	 * context.
	 */
	rmb();	/* Must check ms_range_active before loading PTEs */
	ret = atomic_pte_lookup(vma, vaddr, write, &paddr, &pageshift);
	if (ret) {
		if (!cb)
			goto failupm;
		if (non_atomic_pte_lookup(vma, vaddr, write, &paddr,
					  &pageshift))
			goto failinval;
	}
	if (is_gru_paddr(paddr))
		goto failinval;

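	/* Mask off the page offset and convert the physical address to a
	   global physical address (gpa) for the dropin */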
	paddr = paddr & ~((1UL << pageshift) - 1);
	gpa = uv_soc_phys_ram_to_gpa(paddr);
	gru_cb_set_istatus_active(cb);
	tfh_write_restart(tfh, gpa, GAA_RAM, vaddr, asid, write,
			  GRU_PAGESIZE(pageshift));
	STAT(tlb_dropin);
	gru_dbg(grudev,
		"%s: tfh 0x%p, vaddr 0x%lx, asid 0x%x, ps %d, gpa 0x%lx\n",
		ret ? "non-atomic" : "atomic", tfh, vaddr, asid,
		pageshift, gpa);
	return 0;

failnoasid:
	/* No asid (delayed unload). */
	STAT(tlb_dropin_fail_no_asid);
	gru_dbg(grudev, "FAILED no_asid tfh: 0x%p, vaddr 0x%lx\n", tfh, vaddr);
	if (!cb)
		tfh_user_polling_mode(tfh);
	else
		gru_flush_cache(tfh);
	return -EAGAIN;

failupm:
	/* Atomic failure; switch CBR to UPM */
	tfh_user_polling_mode(tfh);
	STAT(tlb_dropin_fail_upm);
	gru_dbg(grudev, "FAILED upm tfh: 0x%p, vaddr 0x%lx\n", tfh, vaddr);
	return 1;

failfmm:
	/* FMM state on UPM call */
	STAT(tlb_dropin_fail_fmm);
	gru_dbg(grudev, "FAILED fmm tfh: 0x%p, state %d\n", tfh, tfh->state);
	return 0;

failidle:
	/* TFH was idle - no miss pending */
	gru_flush_cache(tfh);
	if (cb)
		gru_flush_cache(cb);
	STAT(tlb_dropin_fail_idle);
	gru_dbg(grudev, "FAILED idle tfh: 0x%p, state %d\n", tfh, tfh->state);
	return 0;

failinval:
	/* All errors (atomic & non-atomic) switch CBR to EXCEPTION state */
	tfh_exception(tfh);
	STAT(tlb_dropin_fail_invalid);
	gru_dbg(grudev, "FAILED inval tfh: 0x%p, vaddr 0x%lx\n", tfh, vaddr);
	return -EFAULT;

failactive:
	/* Range invalidate active. Switch to UPM iff atomic */
	if (!cb)
		tfh_user_polling_mode(tfh);
	else
		gru_flush_cache(tfh);
	STAT(tlb_dropin_fail_range_active);
	gru_dbg(grudev, "FAILED range active: tfh 0x%p, vaddr 0x%lx\n",
		tfh, vaddr);
	return 1;
}

/*
 * Process an external interrupt from the GRU. This interrupt is
 * caused by a TLB miss.
 * Note that this is the interrupt handler that is registered with the
 * Linux interrupt subsystem.
 */
irqreturn_t gru_intr(int irq, void *dev_id)
{
	struct gru_state *gru;
	struct gru_tlb_fault_map map;
	struct gru_thread_state *gts;
	struct gru_tlb_fault_handle *tfh = NULL;
	int cbrnum, ctxnum;

	STAT(intr);

	gru = irq_to_gru(irq);
	if (!gru) {
		dev_err(grudev, "GRU: invalid interrupt: cpu %d, irq %d\n",
			raw_smp_processor_id(), irq);
		return IRQ_NONE;
	}
	get_clear_fault_map(gru, &map);
	gru_dbg(grudev, "irq %d, gru %x, map 0x%lx\n", irq, gru->gs_gid,
		map.fault_bits[0]);

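	/* Attempt a TLB dropin for each CBR that has a fault bit set */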
	for_each_cbr_in_tfm(cbrnum, map.fault_bits) {
		tfh = get_tfh_by_index(gru, cbrnum);
		prefetchw(tfh);	/* Helps on hardware, required for emulator */

		/*
		 * When hardware sets a bit in the faultmap, it implicitly
		 * locks the GRU context so that it cannot be unloaded.
		 * The gts cannot change until a TFH start/writestart command
		 * is issued.
		 */
		ctxnum = tfh->ctxnum;
		gts = gru->gs_gts[ctxnum];

		/*
		 * This is running in interrupt context. Trylock the mmap_sem.
		 * If it fails, retry the fault in user context.
		 */
		if (down_read_trylock(&gts->ts_mm->mmap_sem)) {
			gru_try_dropin(gts, tfh, NULL);
			up_read(&gts->ts_mm->mmap_sem);
		} else {
			tfh_user_polling_mode(tfh);
		}
	}
	return IRQ_HANDLED;
}


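/*
 * Called from the user "call OS" path. Wait for any active range
 * invalidates to finish, then retry the TLB dropin until it either
 * succeeds or fails with an error.
 */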
static int gru_user_dropin(struct gru_thread_state *gts,
			   struct gru_tlb_fault_handle *tfh,
			   unsigned long __user *cb)
{
	struct gru_mm_struct *gms = gts->ts_gms;
	int ret;

	while (1) {
		wait_event(gms->ms_wait_queue,
			   atomic_read(&gms->ms_range_active) == 0);
		prefetchw(tfh);	/* Helps on hardware, required for emulator */
		ret = gru_try_dropin(gts, tfh, cb);
		if (ret <= 0)
			return ret;
		STAT(call_os_wait_queue);
	}
}

/*
 * This interface is called as a result of a user detecting a "call OS" bit
 * in a user CB. This normally means that a TLB fault has occurred.
 *	cb - user virtual address of the CB
 */
int gru_handle_user_call_os(unsigned long cb)
{
	struct gru_tlb_fault_handle *tfh;
	struct gru_thread_state *gts;
	unsigned long __user *cbp;
	int ucbnum, cbrnum, ret = -EINVAL;

	STAT(call_os);
	gru_dbg(grudev, "address 0x%lx\n", cb);

	/* sanity check the cb pointer */
	ucbnum = get_cb_number((void *)cb);
	if ((cb & (GRU_HANDLE_STRIDE - 1)) || ucbnum >= GRU_NUM_CB)
		return -EINVAL;
	cbp = (unsigned long *)cb;

	gts = gru_find_lock_gts(cb);
	if (!gts)
		return -EINVAL;

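	/* The CB must be within the CBRs allocated to this thread */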
	if (ucbnum >= gts->ts_cbr_au_count * GRU_CBR_AU_SIZE) {
		ret = -EINVAL;
		goto exit;
	}

	/*
	 * If force_unload is set, the UPM TLB fault is phony. The task
	 * has migrated to another node and the GSEG must be moved. Just
	 * unload the context. The task will page fault and assign a new
	 * context.
	 */
	ret = -EAGAIN;
	cbrnum = thread_cbr_number(gts, ucbnum);
	if (gts->ts_force_unload) {
		gru_unload_context(gts, 1);
	} else if (gts->ts_gru) {
		tfh = get_tfh_by_index(gts->ts_gru, cbrnum);
		ret = gru_user_dropin(gts, tfh, cbp);
	}
exit:
	gru_unlock_gts(gts);
	return ret;
}

/*
 * Fetch the exception detail information for a CB that terminated with
 * an exception.
 */
int gru_get_exception_detail(unsigned long arg)
{
	struct control_block_extended_exc_detail excdet;
	struct gru_control_block_extended *cbe;
	struct gru_thread_state *gts;
	int ucbnum, cbrnum, ret;

	STAT(user_exception);
	if (copy_from_user(&excdet, (void __user *)arg, sizeof(excdet)))
		return -EFAULT;

	gru_dbg(grudev, "address 0x%lx\n", excdet.cb);
	gts = gru_find_lock_gts(excdet.cb);
	if (!gts)
		return -EINVAL;

	if (gts->ts_gru) {
		ucbnum = get_cb_number((void *)excdet.cb);
		cbrnum = thread_cbr_number(gts, ucbnum);
		cbe = get_cbe_by_index(gts->ts_gru, cbrnum);
		prefetchw(cbe);		/* Harmless on hardware, required for emulator */
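		/* Copy the exception details saved in the CBE when the CB
		   took the exception */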
		excdet.opc = cbe->opccpy;
		excdet.exopc = cbe->exopccpy;
		excdet.ecause = cbe->ecause;
		excdet.exceptdet0 = cbe->idef1upd;
		excdet.exceptdet1 = cbe->idef3upd;
		ret = 0;
	} else {
		ret = -EAGAIN;
	}
	gru_unlock_gts(gts);

	gru_dbg(grudev, "address 0x%lx, ecause 0x%x\n", excdet.cb,
		excdet.ecause);
	if (!ret && copy_to_user((void __user *)arg, &excdet, sizeof(excdet)))
		ret = -EFAULT;
	return ret;
}

/*
 * User request to unload a context. Content is saved for possible reload.
 */
int gru_user_unload_context(unsigned long arg)
{
	struct gru_thread_state *gts;
	struct gru_unload_context_req req;

	STAT(user_unload_context);
	if (copy_from_user(&req, (void __user *)arg, sizeof(req)))
		return -EFAULT;

	gru_dbg(grudev, "gseg 0x%lx\n", req.gseg);

	gts = gru_find_lock_gts(req.gseg);
	if (!gts)
		return -EINVAL;

	if (gts->ts_gru)
		gru_unload_context(gts, 1);
	gru_unlock_gts(gts);

	return 0;
}

/*
 * User request to flush a range of virtual addresses from the GRU TLB
 * (Mainly for testing).
 */
int gru_user_flush_tlb(unsigned long arg)
{
	struct gru_thread_state *gts;
	struct gru_flush_tlb_req req;

	STAT(user_flush_tlb);
	if (copy_from_user(&req, (void __user *)arg, sizeof(req)))
		return -EFAULT;

	gru_dbg(grudev, "gseg 0x%lx, vaddr 0x%lx, len 0x%lx\n", req.gseg,
		req.vaddr, req.len);

	gts = gru_find_lock_gts(req.gseg);
	if (!gts)
		return -EINVAL;

	gru_flush_tlb_range(gts->ts_gms, req.vaddr, req.vaddr + req.len);
	gru_unlock_gts(gts);

	return 0;
}

/*
 * Register the current task as the user of the GSEG slice.
 * Needed for TLB fault interrupt targeting.
 */
int gru_set_task_slice(long address)
{
	struct gru_thread_state *gts;

	STAT(set_task_slice);
	gru_dbg(grudev, "address 0x%lx\n", address);
	gts = gru_alloc_locked_gts(address);
	if (!gts)
		return -EINVAL;

	gts->ts_tgid_owner = current->tgid;
	gru_unlock_gts(gts);

	return 0;
}
633