/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_PAGEWALK_H
#define _LINUX_PAGEWALK_H

#include <linux/mm.h>

struct mm_walk;

/* Locking requirement during a page walk. */
enum page_walk_lock {
	/* mmap_lock should be locked for read to stabilize the vma tree */
	PGWALK_RDLOCK = 0,
	/* vma will be write-locked during the walk */
	PGWALK_WRLOCK = 1,
	/* vma is expected to be already write-locked during the walk */
	PGWALK_WRLOCK_VERIFY = 2,
};

/**
 * struct mm_walk_ops - callbacks for walk_page_range
 * @pgd_entry:		if set, called for each non-empty PGD (top-level) entry
 * @p4d_entry:		if set, called for each non-empty P4D entry
 * @pud_entry:		if set, called for each non-empty PUD entry
 * @pmd_entry:		if set, called for each non-empty PMD entry.
 *			This handler is required to be able to handle
 *			pmd_trans_huge() pmds. It may simply choose to
 *			split_huge_page() instead of handling them explicitly.
 * @pte_entry:		if set, called for each PTE (lowest-level) entry,
 *			including empty ones, except if @install_pte is set.
 *			If @install_pte is set, @pte_entry is called only for
 *			existing PTEs.
 * @pte_hole:		if set, called for each hole at all levels,
 *			depth is -1 if not known, 0:PGD, 1:P4D, 2:PUD, 3:PMD.
 *			Any folded depths (where PTRS_PER_P?D is equal to 1)
 *			are skipped. If @install_pte is specified, this will
 *			not trigger for any populated ranges.
 * @hugetlb_entry:	if set, called for each hugetlb entry. This hook
 *			function is called with the vma lock held, in order to
 *			protect against a concurrent freeing of the pte_t* or
 *			the ptl. In some cases, the hook function needs to drop
 *			and retake the vma lock in order to avoid deadlocks
 *			while calling other functions. In such cases the hook
 *			function must either refrain from accessing the pte or
 *			ptl after dropping the vma lock, or else revalidate
 *			those items after re-acquiring the vma lock and before
 *			accessing them.
 * @test_walk:		caller-specific callback function to determine whether
 *			we walk over the current vma or not. Returning 0 means
 *			"do page table walk over the current vma", returning
 *			a negative value means "abort current page table walk
 *			right now" and returning 1 means "skip the current vma".
 *			Note that this callback is not called when the caller
 *			passes in a single VMA as for walk_page_vma().
 * @pre_vma:		if set, called before starting walk on a non-null vma.
 * @post_vma:		if set, called after a walk on a non-null vma, provided
 *			that @pre_vma and the vma walk succeeded.
 * @install_pte:	if set, missing page table entries are installed and
 *			thus all levels are always walked in the specified
 *			range. This callback is then invoked at the PTE level
 *			(having split any THP pages prior), providing the PTE to
 *			install. If allocations fail, the walk is aborted. This
 *			operation is only available for userland memory. Not
 *			usable for hugetlb ranges.
 *
 * p?d_entry callbacks are called even if those levels are folded on a
 * particular architecture/configuration.
 */
struct mm_walk_ops {
	int (*pgd_entry)(pgd_t *pgd, unsigned long addr,
			 unsigned long next, struct mm_walk *walk);
	int (*p4d_entry)(p4d_t *p4d, unsigned long addr,
			 unsigned long next, struct mm_walk *walk);
	int (*pud_entry)(pud_t *pud, unsigned long addr,
			 unsigned long next, struct mm_walk *walk);
	int (*pmd_entry)(pmd_t *pmd, unsigned long addr,
			 unsigned long next, struct mm_walk *walk);
	int (*pte_entry)(pte_t *pte, unsigned long addr,
			 unsigned long next, struct mm_walk *walk);
	int (*pte_hole)(unsigned long addr, unsigned long next,
			int depth, struct mm_walk *walk);
	int (*hugetlb_entry)(pte_t *pte, unsigned long hmask,
			     unsigned long addr, unsigned long next,
			     struct mm_walk *walk);
	int (*test_walk)(unsigned long addr, unsigned long next,
			 struct mm_walk *walk);
	int (*pre_vma)(unsigned long start, unsigned long end,
		       struct mm_walk *walk);
	void (*post_vma)(struct mm_walk *walk);
	int (*install_pte)(unsigned long addr, unsigned long next,
			   pte_t *ptep, struct mm_walk *walk);
	enum page_walk_lock walk_lock;
};
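/*
 * Example: a minimal sketch of implementing &struct mm_walk_ops. The names
 * used here (count_pte_entry, count_ops) are hypothetical and not part of
 * this header. The callback counts present PTEs, passing its accumulator
 * through @walk->private; the walker calls it with the PTE mapped and
 * locked.
 *
 *	static int count_pte_entry(pte_t *pte, unsigned long addr,
 *				   unsigned long next, struct mm_walk *walk)
 *	{
 *		unsigned long *count = walk->private;
 *
 *		if (pte_present(ptep_get(pte)))
 *			(*count)++;
 *		return 0;
 *	}
 *
 *	static const struct mm_walk_ops count_ops = {
 *		.pte_entry	= count_pte_entry,
 *		.walk_lock	= PGWALK_RDLOCK,
 *	};
 */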
/*
 * Action for pud_entry / pmd_entry callbacks.
 * ACTION_SUBTREE is the default
 */
enum page_walk_action {
	/* Descend to next level, splitting huge pages if needed and possible */
	ACTION_SUBTREE = 0,
	/* Continue to next entry at this level (ignoring any subtree) */
	ACTION_CONTINUE = 1,
	/* Call again for this entry */
	ACTION_AGAIN = 2
};

/**
 * struct mm_walk - walk_page_range data
 * @ops:	operation to call during the walk
 * @mm:		mm_struct representing the target process of page table walk
 * @pgd:	pointer to PGD; only valid with no_vma (otherwise set to NULL)
 * @vma:	vma currently walked (NULL if walking outside vmas)
 * @action:	next action to perform (see enum page_walk_action)
 * @no_vma:	walk ignoring vmas (vma will always be NULL)
 * @private:	private data for callbacks' usage
 *
 * (see the comment on walk_page_range() for more details)
 */
struct mm_walk {
	const struct mm_walk_ops *ops;
	struct mm_struct *mm;
	pgd_t *pgd;
	struct vm_area_struct *vma;
	enum page_walk_action action;
	bool no_vma;
	void *private;
};

int walk_page_range(struct mm_struct *mm, unsigned long start,
		unsigned long end, const struct mm_walk_ops *ops,
		void *private);
int walk_page_range_novma(struct mm_struct *mm, unsigned long start,
			  unsigned long end, const struct mm_walk_ops *ops,
			  pgd_t *pgd,
			  void *private);
int walk_page_range_vma(struct vm_area_struct *vma, unsigned long start,
			unsigned long end, const struct mm_walk_ops *ops,
			void *private);
int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops,
		  void *private);
int walk_page_mapping(struct address_space *mapping, pgoff_t first_index,
		      pgoff_t nr, const struct mm_walk_ops *ops,
		      void *private);
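/*
 * Example: a minimal sketch of running a walk with the hypothetical
 * count_ops from the example above. walk_page_range() requires the
 * mmap_lock to be held (here in read mode, matching PGWALK_RDLOCK);
 * @mm, @start and @end are assumed to be supplied by the caller.
 *
 *	unsigned long count = 0;
 *	int err;
 *
 *	mmap_read_lock(mm);
 *	err = walk_page_range(mm, start, end, &count_ops, &count);
 *	mmap_read_unlock(mm);
 */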
typedef int __bitwise folio_walk_flags_t;

/*
 * Walk migration entries as well. Careful: a large folio might get split
 * concurrently.
 */
#define FW_MIGRATION			((__force folio_walk_flags_t)BIT(0))

/* Walk shared zeropages (small + huge) as well. */
#define FW_ZEROPAGE			((__force folio_walk_flags_t)BIT(1))

enum folio_walk_level {
	FW_LEVEL_PTE,
	FW_LEVEL_PMD,
	FW_LEVEL_PUD,
};

/**
 * struct folio_walk - folio_walk_start() / folio_walk_end() data
 * @page:	exact folio page referenced (if applicable)
 * @level:	page table level identifying the entry type
 * @ptep:	pointer to the page table entry (FW_LEVEL_PTE).
 * @pmdp:	pointer to the page table entry (FW_LEVEL_PMD).
 * @pudp:	pointer to the page table entry (FW_LEVEL_PUD).
 * @pte:	value of the page table entry (FW_LEVEL_PTE).
 * @pmd:	value of the page table entry (FW_LEVEL_PMD).
 * @pud:	value of the page table entry (FW_LEVEL_PUD).
 * @ptl:	pointer to the page table lock.
 *
 * (see folio_walk_start() documentation for more details)
 */
struct folio_walk {
	/* public */
	struct page *page;
	enum folio_walk_level level;
	union {
		pte_t *ptep;
		pud_t *pudp;
		pmd_t *pmdp;
	};
	union {
		pte_t pte;
		pud_t pud;
		pmd_t pmd;
	};
	/* private */
	struct vm_area_struct *vma;
	spinlock_t *ptl;
};

struct folio *folio_walk_start(struct folio_walk *fw,
		struct vm_area_struct *vma, unsigned long addr,
		folio_walk_flags_t flags);

#define folio_walk_end(__fw, __vma) do { \
	spin_unlock((__fw)->ptl); \
	if (likely((__fw)->level == FW_LEVEL_PTE)) \
		pte_unmap((__fw)->ptep); \
	vma_pgtable_walk_end(__vma); \
} while (0)

#endif /* _LINUX_PAGEWALK_H */
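/*
 * Example: a minimal sketch of looking up the folio mapped at @addr in
 * @vma with folio_walk_start() / folio_walk_end(). The caller is assumed
 * to hold the mmap_lock in read mode; @vma and @addr are supplied by the
 * caller, and no FW_* flags are requested.
 *
 *	struct folio_walk fw;
 *	struct folio *folio;
 *
 *	folio = folio_walk_start(&fw, vma, addr, 0);
 *	if (folio) {
 *		// page table is mapped and locked here: fw.page, fw.level
 *		// and the fw.pte/fw.pmd/fw.pud values may be inspected
 *		folio_walk_end(&fw, vma);
 *	}
 */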