/*
 * Copyright 2013 Red Hat Inc.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * Authors: Jérôme Glisse <jglisse@redhat.com>
 */
/*
 * Refer to include/linux/hmm.h for information about heterogeneous memory
 * management or HMM for short.
19133ff0eaSJérôme Glisse */ 20133ff0eaSJérôme Glisse #include <linux/mm.h> 21133ff0eaSJérôme Glisse #include <linux/hmm.h> 22858b54daSJérôme Glisse #include <linux/init.h> 23da4c3c73SJérôme Glisse #include <linux/rmap.h> 24da4c3c73SJérôme Glisse #include <linux/swap.h> 25133ff0eaSJérôme Glisse #include <linux/slab.h> 26133ff0eaSJérôme Glisse #include <linux/sched.h> 274ef589dcSJérôme Glisse #include <linux/mmzone.h> 284ef589dcSJérôme Glisse #include <linux/pagemap.h> 29da4c3c73SJérôme Glisse #include <linux/swapops.h> 30da4c3c73SJérôme Glisse #include <linux/hugetlb.h> 314ef589dcSJérôme Glisse #include <linux/memremap.h> 327b2d55d2SJérôme Glisse #include <linux/jump_label.h> 33c0b12405SJérôme Glisse #include <linux/mmu_notifier.h> 344ef589dcSJérôme Glisse #include <linux/memory_hotplug.h> 354ef589dcSJérôme Glisse 364ef589dcSJérôme Glisse #define PA_SECTION_SIZE (1UL << PA_SECTION_SHIFT) 37133ff0eaSJérôme Glisse 386b368cd4SJérôme Glisse #if IS_ENABLED(CONFIG_HMM_MIRROR) 39c0b12405SJérôme Glisse static const struct mmu_notifier_ops hmm_mmu_notifier_ops; 40c0b12405SJérôme Glisse 41704f3f2cSJérôme Glisse static inline struct hmm *mm_get_hmm(struct mm_struct *mm) 42133ff0eaSJérôme Glisse { 43c0b12405SJérôme Glisse struct hmm *hmm = READ_ONCE(mm->hmm); 44704f3f2cSJérôme Glisse 45704f3f2cSJérôme Glisse if (hmm && kref_get_unless_zero(&hmm->kref)) 46704f3f2cSJérôme Glisse return hmm; 47704f3f2cSJérôme Glisse 48704f3f2cSJérôme Glisse return NULL; 49704f3f2cSJérôme Glisse } 50704f3f2cSJérôme Glisse 51704f3f2cSJérôme Glisse /** 52704f3f2cSJérôme Glisse * hmm_get_or_create - register HMM against an mm (HMM internal) 53704f3f2cSJérôme Glisse * 54704f3f2cSJérôme Glisse * @mm: mm struct to attach to 55704f3f2cSJérôme Glisse * Returns: returns an HMM object, either by referencing the existing 56704f3f2cSJérôme Glisse * (per-process) object, or by creating a new one. 57704f3f2cSJérôme Glisse * 58704f3f2cSJérôme Glisse * This is not intended to be used directly by device drivers. 
If mm already 59704f3f2cSJérôme Glisse * has an HMM struct then it get a reference on it and returns it. Otherwise 60704f3f2cSJérôme Glisse * it allocates an HMM struct, initializes it, associate it with the mm and 61704f3f2cSJérôme Glisse * returns it. 62704f3f2cSJérôme Glisse */ 63704f3f2cSJérôme Glisse static struct hmm *hmm_get_or_create(struct mm_struct *mm) 64704f3f2cSJérôme Glisse { 65704f3f2cSJérôme Glisse struct hmm *hmm = mm_get_hmm(mm); 66c0b12405SJérôme Glisse bool cleanup = false; 67133ff0eaSJérôme Glisse 68c0b12405SJérôme Glisse if (hmm) 69c0b12405SJérôme Glisse return hmm; 70c0b12405SJérôme Glisse 71c0b12405SJérôme Glisse hmm = kmalloc(sizeof(*hmm), GFP_KERNEL); 72c0b12405SJérôme Glisse if (!hmm) 73c0b12405SJérôme Glisse return NULL; 74*a3e0d41cSJérôme Glisse init_waitqueue_head(&hmm->wq); 75c0b12405SJérôme Glisse INIT_LIST_HEAD(&hmm->mirrors); 76c0b12405SJérôme Glisse init_rwsem(&hmm->mirrors_sem); 77c0b12405SJérôme Glisse hmm->mmu_notifier.ops = NULL; 78da4c3c73SJérôme Glisse INIT_LIST_HEAD(&hmm->ranges); 79*a3e0d41cSJérôme Glisse mutex_init(&hmm->lock); 80704f3f2cSJérôme Glisse kref_init(&hmm->kref); 81*a3e0d41cSJérôme Glisse hmm->notifiers = 0; 82*a3e0d41cSJérôme Glisse hmm->dead = false; 83c0b12405SJérôme Glisse hmm->mm = mm; 84c0b12405SJérôme Glisse 85c0b12405SJérôme Glisse spin_lock(&mm->page_table_lock); 86c0b12405SJérôme Glisse if (!mm->hmm) 87c0b12405SJérôme Glisse mm->hmm = hmm; 88c0b12405SJérôme Glisse else 89c0b12405SJérôme Glisse cleanup = true; 90c0b12405SJérôme Glisse spin_unlock(&mm->page_table_lock); 91c0b12405SJérôme Glisse 9286a2d598SRalph Campbell if (cleanup) 9386a2d598SRalph Campbell goto error; 9486a2d598SRalph Campbell 9586a2d598SRalph Campbell /* 9686a2d598SRalph Campbell * We should only get here if hold the mmap_sem in write mode ie on 9786a2d598SRalph Campbell * registration of first mirror through hmm_mirror_register() 9886a2d598SRalph Campbell */ 9986a2d598SRalph Campbell hmm->mmu_notifier.ops = &hmm_mmu_notifier_ops; 
10086a2d598SRalph Campbell if (__mmu_notifier_register(&hmm->mmu_notifier, mm)) 10186a2d598SRalph Campbell goto error_mm; 102c0b12405SJérôme Glisse 103704f3f2cSJérôme Glisse return hmm; 10486a2d598SRalph Campbell 10586a2d598SRalph Campbell error_mm: 10686a2d598SRalph Campbell spin_lock(&mm->page_table_lock); 10786a2d598SRalph Campbell if (mm->hmm == hmm) 10886a2d598SRalph Campbell mm->hmm = NULL; 10986a2d598SRalph Campbell spin_unlock(&mm->page_table_lock); 11086a2d598SRalph Campbell error: 11186a2d598SRalph Campbell kfree(hmm); 11286a2d598SRalph Campbell return NULL; 113133ff0eaSJérôme Glisse } 114133ff0eaSJérôme Glisse 115704f3f2cSJérôme Glisse static void hmm_free(struct kref *kref) 116704f3f2cSJérôme Glisse { 117704f3f2cSJérôme Glisse struct hmm *hmm = container_of(kref, struct hmm, kref); 118704f3f2cSJérôme Glisse struct mm_struct *mm = hmm->mm; 119704f3f2cSJérôme Glisse 120704f3f2cSJérôme Glisse mmu_notifier_unregister_no_release(&hmm->mmu_notifier, mm); 121704f3f2cSJérôme Glisse 122704f3f2cSJérôme Glisse spin_lock(&mm->page_table_lock); 123704f3f2cSJérôme Glisse if (mm->hmm == hmm) 124704f3f2cSJérôme Glisse mm->hmm = NULL; 125704f3f2cSJérôme Glisse spin_unlock(&mm->page_table_lock); 126704f3f2cSJérôme Glisse 127704f3f2cSJérôme Glisse kfree(hmm); 128704f3f2cSJérôme Glisse } 129704f3f2cSJérôme Glisse 130704f3f2cSJérôme Glisse static inline void hmm_put(struct hmm *hmm) 131704f3f2cSJérôme Glisse { 132704f3f2cSJérôme Glisse kref_put(&hmm->kref, hmm_free); 133704f3f2cSJérôme Glisse } 134704f3f2cSJérôme Glisse 135133ff0eaSJérôme Glisse void hmm_mm_destroy(struct mm_struct *mm) 136133ff0eaSJérôme Glisse { 137704f3f2cSJérôme Glisse struct hmm *hmm; 138704f3f2cSJérôme Glisse 139704f3f2cSJérôme Glisse spin_lock(&mm->page_table_lock); 140704f3f2cSJérôme Glisse hmm = mm_get_hmm(mm); 141704f3f2cSJérôme Glisse mm->hmm = NULL; 142704f3f2cSJérôme Glisse if (hmm) { 143704f3f2cSJérôme Glisse hmm->mm = NULL; 144*a3e0d41cSJérôme Glisse hmm->dead = true; 145704f3f2cSJérôme 
Glisse spin_unlock(&mm->page_table_lock); 146704f3f2cSJérôme Glisse hmm_put(hmm); 147704f3f2cSJérôme Glisse return; 148704f3f2cSJérôme Glisse } 149704f3f2cSJérôme Glisse 150704f3f2cSJérôme Glisse spin_unlock(&mm->page_table_lock); 151133ff0eaSJérôme Glisse } 152c0b12405SJérôme Glisse 153*a3e0d41cSJérôme Glisse static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm) 154c0b12405SJérôme Glisse { 155*a3e0d41cSJérôme Glisse struct hmm *hmm = mm_get_hmm(mm); 156c0b12405SJérôme Glisse struct hmm_mirror *mirror; 157da4c3c73SJérôme Glisse struct hmm_range *range; 158da4c3c73SJérôme Glisse 159*a3e0d41cSJérôme Glisse /* Report this HMM as dying. */ 160*a3e0d41cSJérôme Glisse hmm->dead = true; 161da4c3c73SJérôme Glisse 162*a3e0d41cSJérôme Glisse /* Wake-up everyone waiting on any range. */ 163*a3e0d41cSJérôme Glisse mutex_lock(&hmm->lock); 164*a3e0d41cSJérôme Glisse list_for_each_entry(range, &hmm->ranges, list) { 165da4c3c73SJérôme Glisse range->valid = false; 166da4c3c73SJérôme Glisse } 167*a3e0d41cSJérôme Glisse wake_up_all(&hmm->wq); 168*a3e0d41cSJérôme Glisse mutex_unlock(&hmm->lock); 169e1401513SRalph Campbell 170e1401513SRalph Campbell down_write(&hmm->mirrors_sem); 171e1401513SRalph Campbell mirror = list_first_entry_or_null(&hmm->mirrors, struct hmm_mirror, 172e1401513SRalph Campbell list); 173e1401513SRalph Campbell while (mirror) { 174e1401513SRalph Campbell list_del_init(&mirror->list); 175e1401513SRalph Campbell if (mirror->ops->release) { 176e1401513SRalph Campbell /* 177e1401513SRalph Campbell * Drop mirrors_sem so callback can wait on any pending 178e1401513SRalph Campbell * work that might itself trigger mmu_notifier callback 179e1401513SRalph Campbell * and thus would deadlock with us. 
180e1401513SRalph Campbell */ 181e1401513SRalph Campbell up_write(&hmm->mirrors_sem); 182e1401513SRalph Campbell mirror->ops->release(mirror); 183e1401513SRalph Campbell down_write(&hmm->mirrors_sem); 184e1401513SRalph Campbell } 185e1401513SRalph Campbell mirror = list_first_entry_or_null(&hmm->mirrors, 186e1401513SRalph Campbell struct hmm_mirror, list); 187e1401513SRalph Campbell } 188e1401513SRalph Campbell up_write(&hmm->mirrors_sem); 189704f3f2cSJérôme Glisse 190704f3f2cSJérôme Glisse hmm_put(hmm); 191e1401513SRalph Campbell } 192e1401513SRalph Campbell 19393065ac7SMichal Hocko static int hmm_invalidate_range_start(struct mmu_notifier *mn, 194*a3e0d41cSJérôme Glisse const struct mmu_notifier_range *nrange) 195c0b12405SJérôme Glisse { 196*a3e0d41cSJérôme Glisse struct hmm *hmm = mm_get_hmm(nrange->mm); 197*a3e0d41cSJérôme Glisse struct hmm_mirror *mirror; 198ec131b2dSJérôme Glisse struct hmm_update update; 199*a3e0d41cSJérôme Glisse struct hmm_range *range; 200*a3e0d41cSJérôme Glisse int ret = 0; 201c0b12405SJérôme Glisse 202c0b12405SJérôme Glisse VM_BUG_ON(!hmm); 203c0b12405SJérôme Glisse 204*a3e0d41cSJérôme Glisse update.start = nrange->start; 205*a3e0d41cSJérôme Glisse update.end = nrange->end; 206ec131b2dSJérôme Glisse update.event = HMM_UPDATE_INVALIDATE; 207*a3e0d41cSJérôme Glisse update.blockable = nrange->blockable; 208*a3e0d41cSJérôme Glisse 209*a3e0d41cSJérôme Glisse if (nrange->blockable) 210*a3e0d41cSJérôme Glisse mutex_lock(&hmm->lock); 211*a3e0d41cSJérôme Glisse else if (!mutex_trylock(&hmm->lock)) { 212*a3e0d41cSJérôme Glisse ret = -EAGAIN; 213*a3e0d41cSJérôme Glisse goto out; 214*a3e0d41cSJérôme Glisse } 215*a3e0d41cSJérôme Glisse hmm->notifiers++; 216*a3e0d41cSJérôme Glisse list_for_each_entry(range, &hmm->ranges, list) { 217*a3e0d41cSJérôme Glisse if (update.end < range->start || update.start >= range->end) 218*a3e0d41cSJérôme Glisse continue; 219*a3e0d41cSJérôme Glisse 220*a3e0d41cSJérôme Glisse range->valid = false; 221*a3e0d41cSJérôme 
Glisse } 222*a3e0d41cSJérôme Glisse mutex_unlock(&hmm->lock); 223*a3e0d41cSJérôme Glisse 224*a3e0d41cSJérôme Glisse if (nrange->blockable) 225*a3e0d41cSJérôme Glisse down_read(&hmm->mirrors_sem); 226*a3e0d41cSJérôme Glisse else if (!down_read_trylock(&hmm->mirrors_sem)) { 227*a3e0d41cSJérôme Glisse ret = -EAGAIN; 228*a3e0d41cSJérôme Glisse goto out; 229*a3e0d41cSJérôme Glisse } 230*a3e0d41cSJérôme Glisse list_for_each_entry(mirror, &hmm->mirrors, list) { 231*a3e0d41cSJérôme Glisse int ret; 232*a3e0d41cSJérôme Glisse 233*a3e0d41cSJérôme Glisse ret = mirror->ops->sync_cpu_device_pagetables(mirror, &update); 234*a3e0d41cSJérôme Glisse if (!update.blockable && ret == -EAGAIN) { 235*a3e0d41cSJérôme Glisse up_read(&hmm->mirrors_sem); 236*a3e0d41cSJérôme Glisse ret = -EAGAIN; 237*a3e0d41cSJérôme Glisse goto out; 238*a3e0d41cSJérôme Glisse } 239*a3e0d41cSJérôme Glisse } 240*a3e0d41cSJérôme Glisse up_read(&hmm->mirrors_sem); 241*a3e0d41cSJérôme Glisse 242*a3e0d41cSJérôme Glisse out: 243704f3f2cSJérôme Glisse hmm_put(hmm); 244704f3f2cSJérôme Glisse return ret; 245c0b12405SJérôme Glisse } 246c0b12405SJérôme Glisse 247c0b12405SJérôme Glisse static void hmm_invalidate_range_end(struct mmu_notifier *mn, 248*a3e0d41cSJérôme Glisse const struct mmu_notifier_range *nrange) 249c0b12405SJérôme Glisse { 250*a3e0d41cSJérôme Glisse struct hmm *hmm = mm_get_hmm(nrange->mm); 251c0b12405SJérôme Glisse 252c0b12405SJérôme Glisse VM_BUG_ON(!hmm); 253c0b12405SJérôme Glisse 254*a3e0d41cSJérôme Glisse mutex_lock(&hmm->lock); 255*a3e0d41cSJérôme Glisse hmm->notifiers--; 256*a3e0d41cSJérôme Glisse if (!hmm->notifiers) { 257*a3e0d41cSJérôme Glisse struct hmm_range *range; 258*a3e0d41cSJérôme Glisse 259*a3e0d41cSJérôme Glisse list_for_each_entry(range, &hmm->ranges, list) { 260*a3e0d41cSJérôme Glisse if (range->valid) 261*a3e0d41cSJérôme Glisse continue; 262*a3e0d41cSJérôme Glisse range->valid = true; 263*a3e0d41cSJérôme Glisse } 264*a3e0d41cSJérôme Glisse wake_up_all(&hmm->wq); 265*a3e0d41cSJérôme 
Glisse } 266*a3e0d41cSJérôme Glisse mutex_unlock(&hmm->lock); 267*a3e0d41cSJérôme Glisse 268704f3f2cSJérôme Glisse hmm_put(hmm); 269c0b12405SJérôme Glisse } 270c0b12405SJérôme Glisse 271c0b12405SJérôme Glisse static const struct mmu_notifier_ops hmm_mmu_notifier_ops = { 272e1401513SRalph Campbell .release = hmm_release, 273c0b12405SJérôme Glisse .invalidate_range_start = hmm_invalidate_range_start, 274c0b12405SJérôme Glisse .invalidate_range_end = hmm_invalidate_range_end, 275c0b12405SJérôme Glisse }; 276c0b12405SJérôme Glisse 277c0b12405SJérôme Glisse /* 278c0b12405SJérôme Glisse * hmm_mirror_register() - register a mirror against an mm 279c0b12405SJérôme Glisse * 280c0b12405SJérôme Glisse * @mirror: new mirror struct to register 281c0b12405SJérôme Glisse * @mm: mm to register against 282c0b12405SJérôme Glisse * 283c0b12405SJérôme Glisse * To start mirroring a process address space, the device driver must register 284c0b12405SJérôme Glisse * an HMM mirror struct. 285c0b12405SJérôme Glisse * 286c0b12405SJérôme Glisse * THE mm->mmap_sem MUST BE HELD IN WRITE MODE ! 
287c0b12405SJérôme Glisse */ 288c0b12405SJérôme Glisse int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm) 289c0b12405SJérôme Glisse { 290c0b12405SJérôme Glisse /* Sanity check */ 291c0b12405SJérôme Glisse if (!mm || !mirror || !mirror->ops) 292c0b12405SJérôme Glisse return -EINVAL; 293c0b12405SJérôme Glisse 294704f3f2cSJérôme Glisse mirror->hmm = hmm_get_or_create(mm); 295c0b12405SJérôme Glisse if (!mirror->hmm) 296c0b12405SJérôme Glisse return -ENOMEM; 297c0b12405SJérôme Glisse 298c0b12405SJérôme Glisse down_write(&mirror->hmm->mirrors_sem); 299c0b12405SJérôme Glisse list_add(&mirror->list, &mirror->hmm->mirrors); 300c0b12405SJérôme Glisse up_write(&mirror->hmm->mirrors_sem); 301c0b12405SJérôme Glisse 302c0b12405SJérôme Glisse return 0; 303c0b12405SJérôme Glisse } 304c0b12405SJérôme Glisse EXPORT_SYMBOL(hmm_mirror_register); 305c0b12405SJérôme Glisse 306c0b12405SJérôme Glisse /* 307c0b12405SJérôme Glisse * hmm_mirror_unregister() - unregister a mirror 308c0b12405SJérôme Glisse * 309c0b12405SJérôme Glisse * @mirror: new mirror struct to register 310c0b12405SJérôme Glisse * 311c0b12405SJérôme Glisse * Stop mirroring a process address space, and cleanup. 312c0b12405SJérôme Glisse */ 313c0b12405SJérôme Glisse void hmm_mirror_unregister(struct hmm_mirror *mirror) 314c0b12405SJérôme Glisse { 315704f3f2cSJérôme Glisse struct hmm *hmm = READ_ONCE(mirror->hmm); 316c0b12405SJérôme Glisse 317704f3f2cSJérôme Glisse if (hmm == NULL) 318c01cbba2SJérôme Glisse return; 319c01cbba2SJérôme Glisse 320c0b12405SJérôme Glisse down_write(&hmm->mirrors_sem); 321e1401513SRalph Campbell list_del_init(&mirror->list); 322704f3f2cSJérôme Glisse /* To protect us against double unregister ... 
*/ 323c01cbba2SJérôme Glisse mirror->hmm = NULL; 324c0b12405SJérôme Glisse up_write(&hmm->mirrors_sem); 325c01cbba2SJérôme Glisse 326704f3f2cSJérôme Glisse hmm_put(hmm); 327c0b12405SJérôme Glisse } 328c0b12405SJérôme Glisse EXPORT_SYMBOL(hmm_mirror_unregister); 329da4c3c73SJérôme Glisse 33074eee180SJérôme Glisse struct hmm_vma_walk { 33174eee180SJérôme Glisse struct hmm_range *range; 33274eee180SJérôme Glisse unsigned long last; 33374eee180SJérôme Glisse bool fault; 33474eee180SJérôme Glisse bool block; 33574eee180SJérôme Glisse }; 33674eee180SJérôme Glisse 3372aee09d8SJérôme Glisse static int hmm_vma_do_fault(struct mm_walk *walk, unsigned long addr, 3382aee09d8SJérôme Glisse bool write_fault, uint64_t *pfn) 33974eee180SJérôme Glisse { 34074eee180SJérôme Glisse unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_REMOTE; 34174eee180SJérôme Glisse struct hmm_vma_walk *hmm_vma_walk = walk->private; 342f88a1e90SJérôme Glisse struct hmm_range *range = hmm_vma_walk->range; 34374eee180SJérôme Glisse struct vm_area_struct *vma = walk->vma; 34450a7ca3cSSouptick Joarder vm_fault_t ret; 34574eee180SJérôme Glisse 34674eee180SJérôme Glisse flags |= hmm_vma_walk->block ? 0 : FAULT_FLAG_ALLOW_RETRY; 3472aee09d8SJérôme Glisse flags |= write_fault ? 
FAULT_FLAG_WRITE : 0; 34850a7ca3cSSouptick Joarder ret = handle_mm_fault(vma, addr, flags); 34950a7ca3cSSouptick Joarder if (ret & VM_FAULT_RETRY) 35073231612SJérôme Glisse return -EAGAIN; 35150a7ca3cSSouptick Joarder if (ret & VM_FAULT_ERROR) { 352f88a1e90SJérôme Glisse *pfn = range->values[HMM_PFN_ERROR]; 35374eee180SJérôme Glisse return -EFAULT; 35474eee180SJérôme Glisse } 35574eee180SJérôme Glisse 35673231612SJérôme Glisse return -EBUSY; 35774eee180SJérôme Glisse } 35874eee180SJérôme Glisse 359da4c3c73SJérôme Glisse static int hmm_pfns_bad(unsigned long addr, 360da4c3c73SJérôme Glisse unsigned long end, 361da4c3c73SJérôme Glisse struct mm_walk *walk) 362da4c3c73SJérôme Glisse { 363c719547fSJérôme Glisse struct hmm_vma_walk *hmm_vma_walk = walk->private; 364c719547fSJérôme Glisse struct hmm_range *range = hmm_vma_walk->range; 365ff05c0c6SJérôme Glisse uint64_t *pfns = range->pfns; 366da4c3c73SJérôme Glisse unsigned long i; 367da4c3c73SJérôme Glisse 368da4c3c73SJérôme Glisse i = (addr - range->start) >> PAGE_SHIFT; 369da4c3c73SJérôme Glisse for (; addr < end; addr += PAGE_SIZE, i++) 370f88a1e90SJérôme Glisse pfns[i] = range->values[HMM_PFN_ERROR]; 371da4c3c73SJérôme Glisse 372da4c3c73SJérôme Glisse return 0; 373da4c3c73SJérôme Glisse } 374da4c3c73SJérôme Glisse 3755504ed29SJérôme Glisse /* 3765504ed29SJérôme Glisse * hmm_vma_walk_hole() - handle a range lacking valid pmd or pte(s) 3775504ed29SJérôme Glisse * @start: range virtual start address (inclusive) 3785504ed29SJérôme Glisse * @end: range virtual end address (exclusive) 3792aee09d8SJérôme Glisse * @fault: should we fault or not ? 3802aee09d8SJérôme Glisse * @write_fault: write fault ? 
3815504ed29SJérôme Glisse * @walk: mm_walk structure 38273231612SJérôme Glisse * Returns: 0 on success, -EBUSY after page fault, or page fault error 3835504ed29SJérôme Glisse * 3845504ed29SJérôme Glisse * This function will be called whenever pmd_none() or pte_none() returns true, 3855504ed29SJérôme Glisse * or whenever there is no page directory covering the virtual address range. 3865504ed29SJérôme Glisse */ 3872aee09d8SJérôme Glisse static int hmm_vma_walk_hole_(unsigned long addr, unsigned long end, 3882aee09d8SJérôme Glisse bool fault, bool write_fault, 389da4c3c73SJérôme Glisse struct mm_walk *walk) 390da4c3c73SJérôme Glisse { 39174eee180SJérôme Glisse struct hmm_vma_walk *hmm_vma_walk = walk->private; 39274eee180SJérôme Glisse struct hmm_range *range = hmm_vma_walk->range; 393ff05c0c6SJérôme Glisse uint64_t *pfns = range->pfns; 394da4c3c73SJérôme Glisse unsigned long i; 395da4c3c73SJérôme Glisse 39674eee180SJérôme Glisse hmm_vma_walk->last = addr; 397da4c3c73SJérôme Glisse i = (addr - range->start) >> PAGE_SHIFT; 39874eee180SJérôme Glisse for (; addr < end; addr += PAGE_SIZE, i++) { 399f88a1e90SJérôme Glisse pfns[i] = range->values[HMM_PFN_NONE]; 4002aee09d8SJérôme Glisse if (fault || write_fault) { 40174eee180SJérôme Glisse int ret; 402da4c3c73SJérôme Glisse 4032aee09d8SJérôme Glisse ret = hmm_vma_do_fault(walk, addr, write_fault, 4042aee09d8SJérôme Glisse &pfns[i]); 40573231612SJérôme Glisse if (ret != -EBUSY) 40674eee180SJérôme Glisse return ret; 40774eee180SJérôme Glisse } 40874eee180SJérôme Glisse } 40974eee180SJérôme Glisse 41073231612SJérôme Glisse return (fault || write_fault) ? 
-EBUSY : 0; 4112aee09d8SJérôme Glisse } 4122aee09d8SJérôme Glisse 4132aee09d8SJérôme Glisse static inline void hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk, 4142aee09d8SJérôme Glisse uint64_t pfns, uint64_t cpu_flags, 4152aee09d8SJérôme Glisse bool *fault, bool *write_fault) 4162aee09d8SJérôme Glisse { 417f88a1e90SJérôme Glisse struct hmm_range *range = hmm_vma_walk->range; 418f88a1e90SJérôme Glisse 4192aee09d8SJérôme Glisse if (!hmm_vma_walk->fault) 4202aee09d8SJérôme Glisse return; 4212aee09d8SJérôme Glisse 4222aee09d8SJérôme Glisse /* We aren't ask to do anything ... */ 423f88a1e90SJérôme Glisse if (!(pfns & range->flags[HMM_PFN_VALID])) 4242aee09d8SJérôme Glisse return; 425f88a1e90SJérôme Glisse /* If this is device memory than only fault if explicitly requested */ 426f88a1e90SJérôme Glisse if ((cpu_flags & range->flags[HMM_PFN_DEVICE_PRIVATE])) { 427f88a1e90SJérôme Glisse /* Do we fault on device memory ? */ 428f88a1e90SJérôme Glisse if (pfns & range->flags[HMM_PFN_DEVICE_PRIVATE]) { 429f88a1e90SJérôme Glisse *write_fault = pfns & range->flags[HMM_PFN_WRITE]; 430f88a1e90SJérôme Glisse *fault = true; 431f88a1e90SJérôme Glisse } 4322aee09d8SJérôme Glisse return; 4332aee09d8SJérôme Glisse } 434f88a1e90SJérôme Glisse 435f88a1e90SJérôme Glisse /* If CPU page table is not valid then we need to fault */ 436f88a1e90SJérôme Glisse *fault = !(cpu_flags & range->flags[HMM_PFN_VALID]); 437f88a1e90SJérôme Glisse /* Need to write fault ? 
*/ 438f88a1e90SJérôme Glisse if ((pfns & range->flags[HMM_PFN_WRITE]) && 439f88a1e90SJérôme Glisse !(cpu_flags & range->flags[HMM_PFN_WRITE])) { 440f88a1e90SJérôme Glisse *write_fault = true; 4412aee09d8SJérôme Glisse *fault = true; 4422aee09d8SJérôme Glisse } 4432aee09d8SJérôme Glisse } 4442aee09d8SJérôme Glisse 4452aee09d8SJérôme Glisse static void hmm_range_need_fault(const struct hmm_vma_walk *hmm_vma_walk, 4462aee09d8SJérôme Glisse const uint64_t *pfns, unsigned long npages, 4472aee09d8SJérôme Glisse uint64_t cpu_flags, bool *fault, 4482aee09d8SJérôme Glisse bool *write_fault) 4492aee09d8SJérôme Glisse { 4502aee09d8SJérôme Glisse unsigned long i; 4512aee09d8SJérôme Glisse 4522aee09d8SJérôme Glisse if (!hmm_vma_walk->fault) { 4532aee09d8SJérôme Glisse *fault = *write_fault = false; 4542aee09d8SJérôme Glisse return; 4552aee09d8SJérôme Glisse } 4562aee09d8SJérôme Glisse 457*a3e0d41cSJérôme Glisse *fault = *write_fault = false; 4582aee09d8SJérôme Glisse for (i = 0; i < npages; ++i) { 4592aee09d8SJérôme Glisse hmm_pte_need_fault(hmm_vma_walk, pfns[i], cpu_flags, 4602aee09d8SJérôme Glisse fault, write_fault); 461*a3e0d41cSJérôme Glisse if ((*write_fault)) 4622aee09d8SJérôme Glisse return; 4632aee09d8SJérôme Glisse } 4642aee09d8SJérôme Glisse } 4652aee09d8SJérôme Glisse 4662aee09d8SJérôme Glisse static int hmm_vma_walk_hole(unsigned long addr, unsigned long end, 4672aee09d8SJérôme Glisse struct mm_walk *walk) 4682aee09d8SJérôme Glisse { 4692aee09d8SJérôme Glisse struct hmm_vma_walk *hmm_vma_walk = walk->private; 4702aee09d8SJérôme Glisse struct hmm_range *range = hmm_vma_walk->range; 4712aee09d8SJérôme Glisse bool fault, write_fault; 4722aee09d8SJérôme Glisse unsigned long i, npages; 4732aee09d8SJérôme Glisse uint64_t *pfns; 4742aee09d8SJérôme Glisse 4752aee09d8SJérôme Glisse i = (addr - range->start) >> PAGE_SHIFT; 4762aee09d8SJérôme Glisse npages = (end - addr) >> PAGE_SHIFT; 4772aee09d8SJérôme Glisse pfns = &range->pfns[i]; 4782aee09d8SJérôme Glisse 
hmm_range_need_fault(hmm_vma_walk, pfns, npages, 4792aee09d8SJérôme Glisse 0, &fault, &write_fault); 4802aee09d8SJérôme Glisse return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk); 4812aee09d8SJérôme Glisse } 4822aee09d8SJérôme Glisse 483f88a1e90SJérôme Glisse static inline uint64_t pmd_to_hmm_pfn_flags(struct hmm_range *range, pmd_t pmd) 4842aee09d8SJérôme Glisse { 4852aee09d8SJérôme Glisse if (pmd_protnone(pmd)) 4862aee09d8SJérôme Glisse return 0; 487f88a1e90SJérôme Glisse return pmd_write(pmd) ? range->flags[HMM_PFN_VALID] | 488f88a1e90SJérôme Glisse range->flags[HMM_PFN_WRITE] : 489f88a1e90SJérôme Glisse range->flags[HMM_PFN_VALID]; 490da4c3c73SJérôme Glisse } 491da4c3c73SJérôme Glisse 49253f5c3f4SJérôme Glisse static int hmm_vma_handle_pmd(struct mm_walk *walk, 49353f5c3f4SJérôme Glisse unsigned long addr, 49453f5c3f4SJérôme Glisse unsigned long end, 49553f5c3f4SJérôme Glisse uint64_t *pfns, 49653f5c3f4SJérôme Glisse pmd_t pmd) 49753f5c3f4SJérôme Glisse { 49853f5c3f4SJérôme Glisse struct hmm_vma_walk *hmm_vma_walk = walk->private; 499f88a1e90SJérôme Glisse struct hmm_range *range = hmm_vma_walk->range; 5002aee09d8SJérôme Glisse unsigned long pfn, npages, i; 5012aee09d8SJérôme Glisse bool fault, write_fault; 502f88a1e90SJérôme Glisse uint64_t cpu_flags; 50353f5c3f4SJérôme Glisse 5042aee09d8SJérôme Glisse npages = (end - addr) >> PAGE_SHIFT; 505f88a1e90SJérôme Glisse cpu_flags = pmd_to_hmm_pfn_flags(range, pmd); 5062aee09d8SJérôme Glisse hmm_range_need_fault(hmm_vma_walk, pfns, npages, cpu_flags, 5072aee09d8SJérôme Glisse &fault, &write_fault); 50853f5c3f4SJérôme Glisse 5092aee09d8SJérôme Glisse if (pmd_protnone(pmd) || fault || write_fault) 5102aee09d8SJérôme Glisse return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk); 51153f5c3f4SJérôme Glisse 51253f5c3f4SJérôme Glisse pfn = pmd_pfn(pmd) + pte_index(addr); 51353f5c3f4SJérôme Glisse for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++) 514f88a1e90SJérôme Glisse pfns[i] = 
hmm_pfn_from_pfn(range, pfn) | cpu_flags; 51553f5c3f4SJérôme Glisse hmm_vma_walk->last = end; 51653f5c3f4SJérôme Glisse return 0; 51753f5c3f4SJérôme Glisse } 51853f5c3f4SJérôme Glisse 519f88a1e90SJérôme Glisse static inline uint64_t pte_to_hmm_pfn_flags(struct hmm_range *range, pte_t pte) 5202aee09d8SJérôme Glisse { 5212aee09d8SJérôme Glisse if (pte_none(pte) || !pte_present(pte)) 5222aee09d8SJérôme Glisse return 0; 523f88a1e90SJérôme Glisse return pte_write(pte) ? range->flags[HMM_PFN_VALID] | 524f88a1e90SJérôme Glisse range->flags[HMM_PFN_WRITE] : 525f88a1e90SJérôme Glisse range->flags[HMM_PFN_VALID]; 5262aee09d8SJérôme Glisse } 5272aee09d8SJérôme Glisse 52853f5c3f4SJérôme Glisse static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, 52953f5c3f4SJérôme Glisse unsigned long end, pmd_t *pmdp, pte_t *ptep, 53053f5c3f4SJérôme Glisse uint64_t *pfn) 53153f5c3f4SJérôme Glisse { 53253f5c3f4SJérôme Glisse struct hmm_vma_walk *hmm_vma_walk = walk->private; 533f88a1e90SJérôme Glisse struct hmm_range *range = hmm_vma_walk->range; 53453f5c3f4SJérôme Glisse struct vm_area_struct *vma = walk->vma; 5352aee09d8SJérôme Glisse bool fault, write_fault; 5362aee09d8SJérôme Glisse uint64_t cpu_flags; 53753f5c3f4SJérôme Glisse pte_t pte = *ptep; 538f88a1e90SJérôme Glisse uint64_t orig_pfn = *pfn; 53953f5c3f4SJérôme Glisse 540f88a1e90SJérôme Glisse *pfn = range->values[HMM_PFN_NONE]; 54173231612SJérôme Glisse fault = write_fault = false; 54253f5c3f4SJérôme Glisse 54353f5c3f4SJérôme Glisse if (pte_none(pte)) { 54473231612SJérôme Glisse hmm_pte_need_fault(hmm_vma_walk, orig_pfn, 0, 54573231612SJérôme Glisse &fault, &write_fault); 5462aee09d8SJérôme Glisse if (fault || write_fault) 54753f5c3f4SJérôme Glisse goto fault; 54853f5c3f4SJérôme Glisse return 0; 54953f5c3f4SJérôme Glisse } 55053f5c3f4SJérôme Glisse 55153f5c3f4SJérôme Glisse if (!pte_present(pte)) { 55253f5c3f4SJérôme Glisse swp_entry_t entry = pte_to_swp_entry(pte); 55353f5c3f4SJérôme Glisse 55453f5c3f4SJérôme 
Glisse if (!non_swap_entry(entry)) { 5552aee09d8SJérôme Glisse if (fault || write_fault) 55653f5c3f4SJérôme Glisse goto fault; 55753f5c3f4SJérôme Glisse return 0; 55853f5c3f4SJérôme Glisse } 55953f5c3f4SJérôme Glisse 56053f5c3f4SJérôme Glisse /* 56153f5c3f4SJérôme Glisse * This is a special swap entry, ignore migration, use 56253f5c3f4SJérôme Glisse * device and report anything else as error. 56353f5c3f4SJérôme Glisse */ 56453f5c3f4SJérôme Glisse if (is_device_private_entry(entry)) { 565f88a1e90SJérôme Glisse cpu_flags = range->flags[HMM_PFN_VALID] | 566f88a1e90SJérôme Glisse range->flags[HMM_PFN_DEVICE_PRIVATE]; 5672aee09d8SJérôme Glisse cpu_flags |= is_write_device_private_entry(entry) ? 568f88a1e90SJérôme Glisse range->flags[HMM_PFN_WRITE] : 0; 569f88a1e90SJérôme Glisse hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags, 570f88a1e90SJérôme Glisse &fault, &write_fault); 571f88a1e90SJérôme Glisse if (fault || write_fault) 572f88a1e90SJérôme Glisse goto fault; 573f88a1e90SJérôme Glisse *pfn = hmm_pfn_from_pfn(range, swp_offset(entry)); 574f88a1e90SJérôme Glisse *pfn |= cpu_flags; 57553f5c3f4SJérôme Glisse return 0; 57653f5c3f4SJérôme Glisse } 57753f5c3f4SJérôme Glisse 57853f5c3f4SJérôme Glisse if (is_migration_entry(entry)) { 5792aee09d8SJérôme Glisse if (fault || write_fault) { 58053f5c3f4SJérôme Glisse pte_unmap(ptep); 58153f5c3f4SJérôme Glisse hmm_vma_walk->last = addr; 58253f5c3f4SJérôme Glisse migration_entry_wait(vma->vm_mm, 58353f5c3f4SJérôme Glisse pmdp, addr); 58473231612SJérôme Glisse return -EBUSY; 58553f5c3f4SJérôme Glisse } 58653f5c3f4SJérôme Glisse return 0; 58753f5c3f4SJérôme Glisse } 58853f5c3f4SJérôme Glisse 58953f5c3f4SJérôme Glisse /* Report error for everything else */ 590f88a1e90SJérôme Glisse *pfn = range->values[HMM_PFN_ERROR]; 59153f5c3f4SJérôme Glisse return -EFAULT; 59273231612SJérôme Glisse } else { 59373231612SJérôme Glisse cpu_flags = pte_to_hmm_pfn_flags(range, pte); 59473231612SJérôme Glisse hmm_pte_need_fault(hmm_vma_walk, 
orig_pfn, cpu_flags, 59573231612SJérôme Glisse &fault, &write_fault); 59653f5c3f4SJérôme Glisse } 59753f5c3f4SJérôme Glisse 5982aee09d8SJérôme Glisse if (fault || write_fault) 59953f5c3f4SJérôme Glisse goto fault; 60053f5c3f4SJérôme Glisse 601f88a1e90SJérôme Glisse *pfn = hmm_pfn_from_pfn(range, pte_pfn(pte)) | cpu_flags; 60253f5c3f4SJérôme Glisse return 0; 60353f5c3f4SJérôme Glisse 60453f5c3f4SJérôme Glisse fault: 60553f5c3f4SJérôme Glisse pte_unmap(ptep); 60653f5c3f4SJérôme Glisse /* Fault any virtual address we were asked to fault */ 6072aee09d8SJérôme Glisse return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk); 60853f5c3f4SJérôme Glisse } 60953f5c3f4SJérôme Glisse 610da4c3c73SJérôme Glisse static int hmm_vma_walk_pmd(pmd_t *pmdp, 611da4c3c73SJérôme Glisse unsigned long start, 612da4c3c73SJérôme Glisse unsigned long end, 613da4c3c73SJérôme Glisse struct mm_walk *walk) 614da4c3c73SJérôme Glisse { 61574eee180SJérôme Glisse struct hmm_vma_walk *hmm_vma_walk = walk->private; 61674eee180SJérôme Glisse struct hmm_range *range = hmm_vma_walk->range; 617d08faca0SJérôme Glisse struct vm_area_struct *vma = walk->vma; 618ff05c0c6SJérôme Glisse uint64_t *pfns = range->pfns; 619da4c3c73SJérôme Glisse unsigned long addr = start, i; 620da4c3c73SJérôme Glisse pte_t *ptep; 621da4c3c73SJérôme Glisse pmd_t pmd; 622da4c3c73SJérôme Glisse 623d08faca0SJérôme Glisse 624d08faca0SJérôme Glisse again: 625d08faca0SJérôme Glisse pmd = READ_ONCE(*pmdp); 626d08faca0SJérôme Glisse if (pmd_none(pmd)) 627d08faca0SJérôme Glisse return hmm_vma_walk_hole(start, end, walk); 628d08faca0SJérôme Glisse 629d08faca0SJérôme Glisse if (pmd_huge(pmd) && (range->vma->vm_flags & VM_HUGETLB)) 630d08faca0SJérôme Glisse return hmm_pfns_bad(start, end, walk); 631d08faca0SJérôme Glisse 632d08faca0SJérôme Glisse if (thp_migration_supported() && is_pmd_migration_entry(pmd)) { 633d08faca0SJérôme Glisse bool fault, write_fault; 634d08faca0SJérôme Glisse unsigned long npages; 635d08faca0SJérôme Glisse 
		uint64_t *pfns;

		i = (addr - range->start) >> PAGE_SHIFT;
		npages = (end - addr) >> PAGE_SHIFT;
		pfns = &range->pfns[i];

		hmm_range_need_fault(hmm_vma_walk, pfns, npages,
				     0, &fault, &write_fault);
		if (fault || write_fault) {
			/*
			 * Record how far we got, then sleep until the
			 * migration entry is resolved and ask the caller
			 * to retry the walk.
			 */
			hmm_vma_walk->last = addr;
			pmd_migration_entry_wait(vma->vm_mm, pmdp);
			return -EBUSY;
		}
		return 0;
	} else if (!pmd_present(pmd))
		return hmm_pfns_bad(start, end, walk);

	if (pmd_devmap(pmd) || pmd_trans_huge(pmd)) {
		/*
		 * No need to take pmd_lock here, even if some other threads
		 * is splitting the huge pmd we will get that event through
		 * mmu_notifier callback.
		 *
		 * So just read pmd value and check again its a transparent
		 * huge or device mapping one and compute corresponding pfn
		 * values.
		 */
		pmd = pmd_read_atomic(pmdp);
		barrier();
		if (!pmd_devmap(pmd) && !pmd_trans_huge(pmd))
			goto again;

		i = (addr - range->start) >> PAGE_SHIFT;
		return hmm_vma_handle_pmd(walk, addr, end, &pfns[i], pmd);
	}

	/*
	 * We have handled all the valid case above ie either none, migration,
	 * huge or transparent huge. At this point either it is a valid pmd
	 * entry pointing to pte directory or it is a bad pmd that will not
	 * recover.
	 */
	if (pmd_bad(pmd))
		return hmm_pfns_bad(start, end, walk);

	ptep = pte_offset_map(pmdp, addr);
	i = (addr - range->start) >> PAGE_SHIFT;
	for (; addr < end; addr += PAGE_SIZE, ptep++, i++) {
		int r;

		r = hmm_vma_handle_pte(walk, addr, end, pmdp, ptep, &pfns[i]);
		if (r) {
			/* hmm_vma_handle_pte() did unmap pte directory */
			hmm_vma_walk->last = addr;
			return r;
		}
	}
	pte_unmap(ptep - 1);

	hmm_vma_walk->last = addr;
	return 0;
}

/*
 * hmm_pfns_clear() - fill a sub-array of range->pfns with the "none" value
 * @range: range the pfn array belongs to (provides the value encoding)
 * @pfns: first entry to clear
 * @addr: start virtual address (inclusive) of the span being cleared
 * @end: end virtual address (exclusive)
 *
 * One entry is written per page in [addr, end).
 */
static void hmm_pfns_clear(struct hmm_range *range,
			   uint64_t *pfns,
			   unsigned long addr,
			   unsigned long end)
{
	for (; addr < end; addr += PAGE_SIZE, pfns++)
		*pfns = range->values[HMM_PFN_NONE];
}

/*
 * hmm_pfns_special() - mark every page of the range with the "special" value
 * @range: range whose whole pfns[] array is overwritten
 */
static void hmm_pfns_special(struct hmm_range *range)
{
	unsigned long addr = range->start, i = 0;

	for (; addr < range->end; addr += PAGE_SIZE, i++)
		range->pfns[i] = range->values[HMM_PFN_SPECIAL];
}

/*
 * hmm_range_register() - start tracking change to CPU page table over a range
 * @range: range
 * @mm: the mm struct for the range of virtual address
 * @start: start virtual address (inclusive)
 * @end: end virtual address (exclusive)
 * Returns 0 on success, -EFAULT if the address space is no longer valid,
 * -EINVAL if the range is empty or inverted.
 *
 * Track updates to the CPU page table see include/linux/hmm.h
 */
int hmm_range_register(struct hmm_range *range,
		       struct mm_struct *mm,
		       unsigned long start,
		       unsigned long end)
{
	/*
	 * NOTE(review): these page-masked assignments are dead — they are
	 * overwritten with the unmasked start/end a few lines below, so
	 * only the emptiness check sees the masked values. Presumably the
	 * intent was to reject unaligned start/end; confirm against the
	 * upstream version of this function.
	 */
	range->start = start & PAGE_MASK;
	range->end = end & PAGE_MASK;
	range->valid = false;
	range->hmm = NULL;

	if (range->start >= range->end)
		return -EINVAL;

	range->start = start;
	range->end = end;

	/* Takes a reference on the per-mm hmm struct (dropped on unregister). */
	range->hmm = hmm_get_or_create(mm);
	if (!range->hmm)
		return -EFAULT;

	/* Check if hmm_mm_destroy() was called. */
	if (range->hmm->mm == NULL || range->hmm->dead) {
		hmm_put(range->hmm);
		return -EFAULT;
	}

	/* Initialize range to track CPU page table update */
	mutex_lock(&range->hmm->lock);

	list_add_rcu(&range->list, &range->hmm->ranges);

	/*
	 * If there are any concurrent notifiers we have to wait for them for
	 * the range to be valid (see hmm_range_wait_until_valid()).
	 */
	if (!range->hmm->notifiers)
		range->valid = true;
	mutex_unlock(&range->hmm->lock);

	return 0;
}
EXPORT_SYMBOL(hmm_range_register);

/*
 * hmm_range_unregister() - stop tracking change to CPU page table over a range
 * @range: range
 *
 * Range struct is used to track updates to the CPU page table after a call to
 * hmm_range_register(). See include/linux/hmm.h for how to use it.
 */
void hmm_range_unregister(struct hmm_range *range)
{
	/* Sanity check this really should not happen. */
	if (range->hmm == NULL || range->end <= range->start)
		return;

	mutex_lock(&range->hmm->lock);
	list_del_rcu(&range->list);
	mutex_unlock(&range->hmm->lock);

	/* Drop reference taken by hmm_range_register() */
	range->valid = false;
	hmm_put(range->hmm);
	range->hmm = NULL;
}
EXPORT_SYMBOL(hmm_range_unregister);

/*
 * hmm_range_snapshot() - snapshot CPU page table for a range
 * @range: range
 * Returns: -EINVAL if invalid argument, -ENOMEM out of memory, -EPERM invalid
 *          permission (for instance asking for write and range is read only),
 *          -EAGAIN if you need to retry, -EFAULT invalid (ie either no valid
 *          vma or it is illegal to access that range), number of valid pages
 *          in range->pfns[] (from range start address).
 *
 * This snapshots the CPU page table for a range of virtual addresses. Snapshot
 * validity is tracked by range struct. See in include/linux/hmm.h for example
 * on how to use.
 */
long hmm_range_snapshot(struct hmm_range *range)
{
	unsigned long start = range->start, end;
	struct hmm_vma_walk hmm_vma_walk;
	struct hmm *hmm = range->hmm;
	struct vm_area_struct *vma;
	struct mm_walk mm_walk;

	/* Check if hmm_mm_destroy() was called. */
	if (hmm->mm == NULL || hmm->dead)
		return -EFAULT;

	/* Walk the range one vma at a time; caller is expected to hold mmap_sem. */
	do {
		/* If range is no longer valid force retry. */
		if (!range->valid)
			return -EAGAIN;

		vma = find_vma(hmm->mm, start);
		if (vma == NULL || (vma->vm_flags & VM_SPECIAL))
			return -EFAULT;

		/* FIXME support hugetlb fs/dax */
		if (is_vm_hugetlb_page(vma) || vma_is_dax(vma)) {
			hmm_pfns_special(range);
			return -EINVAL;
		}

		if (!(vma->vm_flags & VM_READ)) {
			/*
			 * If vma do not allow read access, then assume that it
			 * does not allow write access, either. HMM does not
			 * support architecture that allow write without read.
			 */
			hmm_pfns_clear(range, range->pfns,
				       range->start, range->end);
			return -EPERM;
		}

		range->vma = vma;
		hmm_vma_walk.last = start;
		/* fault == false: snapshot only, never trigger page faults. */
		hmm_vma_walk.fault = false;
		hmm_vma_walk.range = range;
		mm_walk.private = &hmm_vma_walk;
		/* Clamp this iteration to the current vma. */
		end = min(range->end, vma->vm_end);

		mm_walk.vma = vma;
		mm_walk.mm = vma->vm_mm;
		mm_walk.pte_entry = NULL;
		mm_walk.test_walk = NULL;
		mm_walk.hugetlb_entry = NULL;
		mm_walk.pmd_entry = hmm_vma_walk_pmd;
		mm_walk.pte_hole = hmm_vma_walk_hole;

		walk_page_range(start, end, &mm_walk);
		start = end;
	} while (start < range->end);

	return (hmm_vma_walk.last - range->start) >> PAGE_SHIFT;
}
EXPORT_SYMBOL(hmm_range_snapshot);

/*
 * hmm_range_fault() - try to fault some address in a virtual address range
 * @range: range being faulted
 * @block: allow blocking on fault (if true it sleeps and do not drop mmap_sem)
 * Returns: number of valid pages in range->pfns[] (from range start
 *          address). This may be zero. If the return value is negative,
 *          then one of the following values may be returned:
 *
 *           -EINVAL  invalid arguments or mm or virtual address are in an
 *                    invalid vma (ie either hugetlbfs or device file vma).
 *           -ENOMEM: Out of memory.
 *           -EPERM:  Invalid permission (for instance asking for write and
 *                    range is read only).
 *           -EAGAIN: If you need to retry and mmap_sem was dropped. This can
 *                    only happen if block argument is false.
 *           -EBUSY:  If the range is being invalidated and you should wait
 *                    for invalidation to finish.
 *           -EFAULT: Invalid (ie either no valid vma or it is illegal to
 *                    access that range), number of valid pages in
 *                    range->pfns[] (from range start address).
 *
 * This is similar to a regular CPU page fault except that it will not trigger
 * any memory migration if the memory being faulted is not accessible by CPUs
 * and caller does not ask for migration.
 *
 * On error, for one virtual address in the range, the function will mark the
 * corresponding HMM pfn entry with an error flag.
 */
long hmm_range_fault(struct hmm_range *range, bool block)
{
	unsigned long start = range->start, end;
	struct hmm_vma_walk hmm_vma_walk;
	struct hmm *hmm = range->hmm;
	struct vm_area_struct *vma;
	struct mm_walk mm_walk;
	int ret;

	/* Check if hmm_mm_destroy() was called. */
	if (hmm->mm == NULL || hmm->dead)
		return -EFAULT;

	do {
		/*
		 * If range is no longer valid force retry.
		 *
		 * NOTE(review): this path drops mmap_sem before returning
		 * -EAGAIN while the other error paths below (e.g. the
		 * find_vma() -EFAULT path) return with mmap_sem still held.
		 * Callers must be aware of this asymmetry — confirm against
		 * the documented locking contract in include/linux/hmm.h.
		 */
		if (!range->valid) {
			up_read(&hmm->mm->mmap_sem);
			return -EAGAIN;
		}

		vma = find_vma(hmm->mm, start);
		if (vma == NULL || (vma->vm_flags & VM_SPECIAL))
			return -EFAULT;

		/* FIXME support hugetlb fs/dax */
		if (is_vm_hugetlb_page(vma) || vma_is_dax(vma)) {
			hmm_pfns_special(range);
			return -EINVAL;
		}

		if (!(vma->vm_flags & VM_READ)) {
			/*
			 * If vma do not allow read access, then assume that it
			 * does not allow write access, either. HMM does not
			 * support architecture that allow write without read.
			 */
			hmm_pfns_clear(range, range->pfns,
				       range->start, range->end);
			return -EPERM;
		}

		range->vma = vma;
		hmm_vma_walk.last = start;
		/* fault == true: the walk may fault pages in. */
		hmm_vma_walk.fault = true;
		hmm_vma_walk.block = block;
		hmm_vma_walk.range = range;
		mm_walk.private = &hmm_vma_walk;
		/* Clamp this iteration to the current vma. */
		end = min(range->end, vma->vm_end);

		mm_walk.vma = vma;
		mm_walk.mm = vma->vm_mm;
		mm_walk.pte_entry = NULL;
		mm_walk.test_walk = NULL;
		mm_walk.hugetlb_entry = NULL;
		mm_walk.pmd_entry = hmm_vma_walk_pmd;
		mm_walk.pte_hole = hmm_vma_walk_hole;

		do {
			ret = walk_page_range(start, end, &mm_walk);
			/* Resume from the last address the walk reached. */
			start = hmm_vma_walk.last;

			/* Keep trying while the range is valid. */
		} while (ret == -EBUSY && range->valid);

		if (ret) {
			unsigned long i;

			/* Reset the entries we did not get to before failing. */
			i = (hmm_vma_walk.last - range->start) >> PAGE_SHIFT;
			hmm_pfns_clear(range, &range->pfns[i],
				       hmm_vma_walk.last, range->end);
			return ret;
		}
		start = end;

	} while (start < range->end);

	return (hmm_vma_walk.last - range->start) >> PAGE_SHIFT;
}
EXPORT_SYMBOL(hmm_range_fault);
#endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */


#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) || IS_ENABLED(CONFIG_DEVICE_PUBLIC)
/*
 * hmm_vma_alloc_locked_page() - allocate a page for @vma at @addr and return
 * it locked, or NULL on allocation failure. Caller is responsible for
 * unlocking and freeing the page.
 */
struct page *hmm_vma_alloc_locked_page(struct vm_area_struct *vma,
				       unsigned long addr)
{
	struct page *page;

	page = alloc_page_vma(GFP_HIGHUSER, vma, addr);
	if (!page)
		return NULL;
	lock_page(page);
	return page;
}
EXPORT_SYMBOL(hmm_vma_alloc_locked_page);


/*
 * percpu_ref release callback: signals hmm_devmem_ref_exit() that the last
 * reference on the device memory is gone.
 */
static void hmm_devmem_ref_release(struct percpu_ref *ref)
{
	struct hmm_devmem *devmem;

	devmem = container_of(ref, struct hmm_devmem, ref);
	complete(&devmem->completion);
}

/*
 * devm action: wait for the last reference (see hmm_devmem_ref_release())
 * before tearing down the percpu_ref. Registered via
 * devm_add_action_or_reset() in hmm_devmem_add()/hmm_devmem_add_resource().
 */
static void hmm_devmem_ref_exit(void *data)
{
	struct percpu_ref *ref = data;
	struct hmm_devmem *devmem;

	devmem = container_of(ref, struct hmm_devmem, ref);
	wait_for_completion(&devmem->completion);
	percpu_ref_exit(ref);
}

/* dev_pagemap kill callback: start the percpu_ref shutdown. */
static void hmm_devmem_ref_kill(struct percpu_ref *ref)
{
	percpu_ref_kill(ref);
}

/*
 * Device memory CPU fault handler: forward the fault to the driver's
 * hmm_devmem_ops->fault() callback.
 */
static vm_fault_t hmm_devmem_fault(struct vm_area_struct *vma,
			    unsigned long addr,
			    const struct page *page,
			    unsigned int flags,
			    pmd_t *pmdp)
{
	struct hmm_devmem *devmem = page->pgmap->data;

	return devmem->ops->fault(devmem, vma, addr, page, flags, pmdp);
}

/*
 * dev_pagemap page_free callback: clear the stale mapping and hand the page
 * back to the driver via hmm_devmem_ops->free().
 */
static void hmm_devmem_free(struct page *page, void *data)
{
	struct hmm_devmem *devmem = data;

	page->mapping = NULL;

	devmem->ops->free(devmem, page);
}

/*
 * hmm_devmem_add() - hotplug ZONE_DEVICE memory for device memory
 *
 * @ops: memory event device driver callback (see struct hmm_devmem_ops)
 * @device: device struct to bind the resource to
 * @size: size in bytes of the device memory to add
 * Returns: pointer to new hmm_devmem struct ERR_PTR otherwise
 *
 * This function first finds an empty range of physical address big enough to
 * contain the new resource, and then hotplugs it as ZONE_DEVICE memory, which
 * in turn allocates struct pages. It does not do anything beyond that; all
 * events affecting the memory will go through the various callbacks provided
 * by hmm_devmem_ops struct.
 *
 * Device driver should call this function during device initialization and
 * is then responsible for memory management. HMM only provides helpers.
 *
 * All allocations (devmem, resource, page map) are device-managed, so error
 * paths simply return: devm takes care of the cleanup.
 */
struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops,
				  struct device *device,
				  unsigned long size)
{
	struct hmm_devmem *devmem;
	resource_size_t addr;
	void *result;
	int ret;

	dev_pagemap_get_ops();

	devmem = devm_kzalloc(device, sizeof(*devmem), GFP_KERNEL);
	if (!devmem)
		return ERR_PTR(-ENOMEM);

	init_completion(&devmem->completion);
	devmem->pfn_first = -1UL;
	devmem->pfn_last = -1UL;
	devmem->resource = NULL;
	devmem->device = device;
	devmem->ops = ops;

	ret = percpu_ref_init(&devmem->ref, &hmm_devmem_ref_release,
			      0, GFP_KERNEL);
	if (ret)
		return ERR_PTR(ret);

	ret = devm_add_action_or_reset(device, hmm_devmem_ref_exit, &devmem->ref);
	if (ret)
		return ERR_PTR(ret);

	/* Search top-down for a free physical range big enough for @size. */
	size = ALIGN(size, PA_SECTION_SIZE);
	addr = min((unsigned long)iomem_resource.end,
		   (1UL << MAX_PHYSMEM_BITS) - 1);
	addr = addr - size + 1UL;

	/*
	 * FIXME add a new helper to quickly walk resource tree and find free
	 * range
	 *
	 * FIXME what about ioport_resource resource ?
	 */
	for (; addr > size && addr >= iomem_resource.start; addr -= size) {
		ret = region_intersects(addr, size, 0, IORES_DESC_NONE);
		if (ret != REGION_DISJOINT)
			continue;

		devmem->resource = devm_request_mem_region(device, addr, size,
							   dev_name(device));
		if (!devmem->resource)
			return ERR_PTR(-ENOMEM);
		break;
	}
	if (!devmem->resource)
		return ERR_PTR(-ERANGE);

	devmem->resource->desc = IORES_DESC_DEVICE_PRIVATE_MEMORY;
	devmem->pfn_first = devmem->resource->start >> PAGE_SHIFT;
	devmem->pfn_last = devmem->pfn_first +
			   (resource_size(devmem->resource) >> PAGE_SHIFT);
	devmem->page_fault = hmm_devmem_fault;

	devmem->pagemap.type = MEMORY_DEVICE_PRIVATE;
	devmem->pagemap.res = *devmem->resource;
	devmem->pagemap.page_free = hmm_devmem_free;
	devmem->pagemap.altmap_valid = false;
	devmem->pagemap.ref = &devmem->ref;
	devmem->pagemap.data = devmem;
	devmem->pagemap.kill = hmm_devmem_ref_kill;

	result = devm_memremap_pages(devmem->device, &devmem->pagemap);
	if (IS_ERR(result))
		return result;
	return devmem;
}
EXPORT_SYMBOL_GPL(hmm_devmem_add);

/*
 * hmm_devmem_add_resource() - like hmm_devmem_add() but for an existing
 * device-public memory resource (@res->desc must be
 * IORES_DESC_DEVICE_PUBLIC_MEMORY) instead of hotplugging a new range.
 */
struct hmm_devmem *hmm_devmem_add_resource(const struct hmm_devmem_ops *ops,
					   struct device *device,
					   struct resource *res)
{
	struct hmm_devmem *devmem;
	void *result;
	int ret;

	if (res->desc != IORES_DESC_DEVICE_PUBLIC_MEMORY)
		return ERR_PTR(-EINVAL);

	dev_pagemap_get_ops();

	devmem = devm_kzalloc(device, sizeof(*devmem), GFP_KERNEL);
	if (!devmem)
		return ERR_PTR(-ENOMEM);

	init_completion(&devmem->completion);
	devmem->pfn_first = -1UL;
	devmem->pfn_last = -1UL;
	devmem->resource = res;
	devmem->device = device;
	devmem->ops = ops;

	ret = percpu_ref_init(&devmem->ref, &hmm_devmem_ref_release,
			      0, GFP_KERNEL);
	if (ret)
		return ERR_PTR(ret);

	ret = devm_add_action_or_reset(device, hmm_devmem_ref_exit,
				       &devmem->ref);
	if (ret)
		return ERR_PTR(ret);

	devmem->pfn_first = devmem->resource->start >> PAGE_SHIFT;
	devmem->pfn_last = devmem->pfn_first +
			   (resource_size(devmem->resource) >> PAGE_SHIFT);
	devmem->page_fault = hmm_devmem_fault;

	devmem->pagemap.type = MEMORY_DEVICE_PUBLIC;
	devmem->pagemap.res = *devmem->resource;
	devmem->pagemap.page_free = hmm_devmem_free;
	devmem->pagemap.altmap_valid = false;
	devmem->pagemap.ref = &devmem->ref;
	devmem->pagemap.data = devmem;
	devmem->pagemap.kill = hmm_devmem_ref_kill;

	result = devm_memremap_pages(devmem->device, &devmem->pagemap);
	if (IS_ERR(result))
		return result;
	return devmem;
}
EXPORT_SYMBOL_GPL(hmm_devmem_add_resource);

/*
 * A device driver that wants to handle multiple devices memory through a
 * single fake device can use hmm_device to do so. This is purely a helper
 * and it is not needed to make use of any HMM functionality.
 */
#define HMM_DEVICE_MAX 256

/* Bitmap of allocated minors, protected by hmm_device_lock. */
static DECLARE_BITMAP(hmm_device_mask, HMM_DEVICE_MAX);
static DEFINE_SPINLOCK(hmm_device_lock);
static struct class *hmm_device_class;
static dev_t hmm_device_devt;

/*
 * device release callback: return the minor to the bitmap and free the
 * hmm_device allocated by hmm_device_new(). Runs on the last put_device().
 */
static void hmm_device_release(struct device *device)
{
	struct hmm_device *hmm_device;

	hmm_device = container_of(device, struct hmm_device, device);
	spin_lock(&hmm_device_lock);
	clear_bit(hmm_device->minor, hmm_device_mask);
	spin_unlock(&hmm_device_lock);

	kfree(hmm_device);
}

/*
 * hmm_device_new() - allocate and initialize a fake hmm_device
 * @drvdata: driver private data stored via dev_set_drvdata()
 * Returns: new hmm_device, ERR_PTR(-ENOMEM) on allocation failure or
 *          ERR_PTR(-EBUSY) when all HMM_DEVICE_MAX minors are in use.
 *
 * Release the result with hmm_device_put().
 */
struct hmm_device *hmm_device_new(void *drvdata)
{
	struct hmm_device *hmm_device;

	hmm_device = kzalloc(sizeof(*hmm_device), GFP_KERNEL);
	if (!hmm_device)
		return ERR_PTR(-ENOMEM);

	/* Reserve an unused minor number. */
	spin_lock(&hmm_device_lock);
	hmm_device->minor = find_first_zero_bit(hmm_device_mask, HMM_DEVICE_MAX);
	if (hmm_device->minor >= HMM_DEVICE_MAX) {
		spin_unlock(&hmm_device_lock);
		kfree(hmm_device);
		return ERR_PTR(-EBUSY);
	}
	set_bit(hmm_device->minor, hmm_device_mask);
	spin_unlock(&hmm_device_lock);

	dev_set_name(&hmm_device->device, "hmm_device%d", hmm_device->minor);
	hmm_device->device.devt = MKDEV(MAJOR(hmm_device_devt),
					hmm_device->minor);
	hmm_device->device.release = hmm_device_release;
	dev_set_drvdata(&hmm_device->device, drvdata);
	hmm_device->device.class = hmm_device_class;
	device_initialize(&hmm_device->device);

	return hmm_device;
}
EXPORT_SYMBOL(hmm_device_new);

/* Drop the reference taken by hmm_device_new(). */
void hmm_device_put(struct hmm_device *hmm_device)
{
	put_device(&hmm_device->device);
}
EXPORT_SYMBOL(hmm_device_put);

/*
 * Allocate the char-device region and class backing hmm_device_new().
 * Runs once at boot via device_initcall().
 */
static int __init hmm_init(void)
{
	int ret;

	ret = alloc_chrdev_region(&hmm_device_devt, 0,
				  HMM_DEVICE_MAX,
				  "hmm_device");
	if (ret)
		return ret;

	hmm_device_class = class_create(THIS_MODULE, "hmm_device");
	if (IS_ERR(hmm_device_class)) {
		unregister_chrdev_region(hmm_device_devt, HMM_DEVICE_MAX);
		return PTR_ERR(hmm_device_class);
	}
	return 0;
}

device_initcall(hmm_init);
1261df6ad698SJérôme Glisse #endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */ 1262