// SPDX-License-Identifier: GPL-2.0
#include <linux/mm.h>
#include <linux/gfp.h>
#include <linux/hugetlb.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/fixmap.h>
#include <asm/mtrr.h>

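/*
 * Note: with CONFIG_DYNAMIC_PHYSICAL_MASK the mask below is not a
 * compile-time constant; early boot code may narrow it (e.g. when memory
 * encryption reserves the top physical address bits) before the value
 * becomes read-only.
 */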
#ifdef CONFIG_DYNAMIC_PHYSICAL_MASK
phys_addr_t physical_mask __ro_after_init = (1ULL << __PHYSICAL_MASK_SHIFT) - 1;
EXPORT_SYMBOL(physical_mask);
#endif

#ifdef CONFIG_HIGHPTE
#define PGTABLE_HIGHMEM __GFP_HIGHMEM
#else
#define PGTABLE_HIGHMEM 0
#endif

#ifndef CONFIG_PARAVIRT
#ifndef CONFIG_PT_RECLAIM
static inline
void paravirt_tlb_remove_table(struct mmu_gather *tlb, void *table)
{
	struct ptdesc *ptdesc = (struct ptdesc *)table;

	pagetable_dtor(ptdesc);
	tlb_remove_page(tlb, ptdesc_page(ptdesc));
}
#else
static inline
void paravirt_tlb_remove_table(struct mmu_gather *tlb, void *table)
{
	tlb_remove_table(tlb, table);
}
#endif /* !CONFIG_PT_RECLAIM */
#endif /* !CONFIG_PARAVIRT */

gfp_t __userpte_alloc_gfp = GFP_PGTABLE_USER | PGTABLE_HIGHMEM;

pgtable_t pte_alloc_one(struct mm_struct *mm)
{
	return __pte_alloc_one(mm, __userpte_alloc_gfp);
}

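/*
 * "userpte=nohigh" on the kernel command line forces user PTE pages into
 * lowmem even on CONFIG_HIGHPTE kernels, by clearing __GFP_HIGHMEM from
 * __userpte_alloc_gfp above.
 */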
static int __init setup_userpte(char *arg)
{
	if (!arg)
		return -EINVAL;

	/*
	 * "userpte=nohigh" disables allocation of user pagetables in
	 * high memory.
	 */
	if (strcmp(arg, "nohigh") == 0)
		__userpte_alloc_gfp &= ~__GFP_HIGHMEM;
	else
		return -EINVAL;
	return 0;
}
early_param("userpte", setup_userpte);

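/*
 * The ___*_free_tlb() helpers below hand a page-table page to the mmu_gather
 * batch for freeing. The paravirt_release_*() calls first tell a paravirt
 * hypervisor (e.g. Xen) that the page is no longer used as a page table.
 */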
void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
{
	paravirt_release_pte(page_to_pfn(pte));
	paravirt_tlb_remove_table(tlb, page_ptdesc(pte));
}

#if CONFIG_PGTABLE_LEVELS > 2
void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
{
	paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT);
	/*
	 * NOTE! For PAE, any changes to the top page-directory-pointer-table
	 * entries need a full cr3 reload to flush.
	 */
#ifdef CONFIG_X86_PAE
	tlb->need_flush_all = 1;
#endif
	paravirt_tlb_remove_table(tlb, virt_to_ptdesc(pmd));
}

#if CONFIG_PGTABLE_LEVELS > 3
void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud)
{
	paravirt_release_pud(__pa(pud) >> PAGE_SHIFT);
	paravirt_tlb_remove_table(tlb, virt_to_ptdesc(pud));
}

#if CONFIG_PGTABLE_LEVELS > 4
void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d)
{
	paravirt_release_p4d(__pa(p4d) >> PAGE_SHIFT);
	paravirt_tlb_remove_table(tlb, virt_to_ptdesc(p4d));
}
#endif	/* CONFIG_PGTABLE_LEVELS > 4 */
#endif	/* CONFIG_PGTABLE_LEVELS > 3 */
#endif	/* CONFIG_PGTABLE_LEVELS > 2 */

static inline void pgd_list_add(pgd_t *pgd)
{
	struct ptdesc *ptdesc = virt_to_ptdesc(pgd);

	list_add(&ptdesc->pt_list, &pgd_list);
}

static inline void pgd_list_del(pgd_t *pgd)
{
	struct ptdesc *ptdesc = virt_to_ptdesc(pgd);

	list_del(&ptdesc->pt_list);
}

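/*
 * Number of pgd entries that are private to each pgd: when the kernel pmd is
 * shared (PAE without Xen), only the user entries below KERNEL_PGD_BOUNDARY
 * are per-pgd; otherwise all PTRS_PER_PGD entries are. The MAX_ variant is
 * the compile-time upper bound used (via MAX_PREALLOCATED_PMDS) to size the
 * on-stack arrays in pgd_alloc().
 */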
#define UNSHARED_PTRS_PER_PGD				\
	(SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD)
#define MAX_UNSHARED_PTRS_PER_PGD			\
	MAX_T(size_t, KERNEL_PGD_BOUNDARY, PTRS_PER_PGD)

static void pgd_set_mm(pgd_t *pgd, struct mm_struct *mm)
{
	virt_to_ptdesc(pgd)->pt_mm = mm;
}

struct mm_struct *pgd_page_get_mm(struct page *page)
{
	return page_ptdesc(page)->pt_mm;
}

static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd)
{
	/* If the pgd points to a shared pagetable level (either the
	   ptes in non-PAE, or shared PMD in PAE), then just copy the
	   references from swapper_pg_dir. */
	if (CONFIG_PGTABLE_LEVELS == 2 ||
	    (CONFIG_PGTABLE_LEVELS == 3 && SHARED_KERNEL_PMD) ||
	    CONFIG_PGTABLE_LEVELS >= 4) {
		clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY,
				swapper_pg_dir + KERNEL_PGD_BOUNDARY,
				KERNEL_PGD_PTRS);
	}

	/* list required to sync kernel mapping updates */
	if (!SHARED_KERNEL_PMD) {
		pgd_set_mm(pgd, mm);
		pgd_list_add(pgd);
	}
}

static void pgd_dtor(pgd_t *pgd)
{
	if (SHARED_KERNEL_PMD)
		return;

	spin_lock(&pgd_lock);
	pgd_list_del(pgd);
	spin_unlock(&pgd_lock);
}

/*
 * List of all pgd's needed for non-PAE so it can invalidate entries
 * in both cached and uncached pgd's; not needed for PAE since the
 * kernel pmd is shared. If PAE were not to share the pmd a similar
 * tactic would be needed. This is essentially codepath-based locking
 * against pageattr.c; it is the unique case in which a valid change
 * of kernel pagetables can't be lazily synchronized by vmalloc faults.
 * vmalloc faults work because attached pagetables are never freed.
 * -- nyc
 */

#ifdef CONFIG_X86_PAE
/*
 * In PAE mode, we need to do a cr3 reload (=tlb flush) when
 * updating the top-level pagetable entries to guarantee the
 * processor notices the update. Since this is expensive, and
 * all 4 top-level entries are used almost immediately in a
 * new process's life, we just pre-populate them here.
 *
 * Also, if we're in a paravirt environment where the kernel pmd is
 * not shared between pagetables (!SHARED_KERNEL_PMD), we allocate
 * and initialize the kernel pmds here.
 */
#define PREALLOCATED_PMDS	UNSHARED_PTRS_PER_PGD
#define MAX_PREALLOCATED_PMDS	MAX_UNSHARED_PTRS_PER_PGD

/*
 * We allocate separate PMDs for the kernel part of the user page-table
 * when PTI is enabled. We need them to map the per-process LDT into the
 * user-space page-table.
 */
#define PREALLOCATED_USER_PMDS	 (boot_cpu_has(X86_FEATURE_PTI) ? \
					KERNEL_PGD_PTRS : 0)
#define MAX_PREALLOCATED_USER_PMDS KERNEL_PGD_PTRS

void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
{
	paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT);

	/* Note: almost everything apart from _PAGE_PRESENT is
	   reserved at the pmd (PDPT) level. */
	set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT));

	/*
	 * According to Intel App note "TLBs, Paging-Structure Caches,
	 * and Their Invalidation", April 2007, document 317080-001,
	 * section 8.1: in PAE mode we explicitly have to flush the
	 * TLB via cr3 if the top-level pgd is changed...
	 */
	flush_tlb_mm(mm);
}
#else  /* !CONFIG_X86_PAE */

/* No need to prepopulate any pagetable entries in non-PAE modes. */
#define PREALLOCATED_PMDS	0
#define MAX_PREALLOCATED_PMDS	0
#define PREALLOCATED_USER_PMDS	 0
#define MAX_PREALLOCATED_USER_PMDS 0
#endif	/* CONFIG_X86_PAE */

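/*
 * Helpers for the pmd preallocation done at pgd_alloc() time: on PAE (and
 * for the PTI user page-table) the pmds are allocated up front so that a new
 * pgd never has to be populated while it is already visible on pgd_list.
 */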
static void free_pmds(struct mm_struct *mm, pmd_t *pmds[], int count)
{
	int i;
	struct ptdesc *ptdesc;

	for (i = 0; i < count; i++)
		if (pmds[i]) {
			ptdesc = virt_to_ptdesc(pmds[i]);

			pagetable_dtor(ptdesc);
			pagetable_free(ptdesc);
			mm_dec_nr_pmds(mm);
		}
}

static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[], int count)
{
	int i;
	bool failed = false;
	gfp_t gfp = GFP_PGTABLE_USER;

	/* Kernel page tables are never charged to a memory cgroup. */
	if (mm == &init_mm)
		gfp &= ~__GFP_ACCOUNT;
	/* pmd pages cannot live in highmem. */
	gfp &= ~__GFP_HIGHMEM;

	for (i = 0; i < count; i++) {
		pmd_t *pmd = NULL;
		struct ptdesc *ptdesc = pagetable_alloc(gfp, 0);

		if (!ptdesc)
			failed = true;
		if (ptdesc && !pagetable_pmd_ctor(ptdesc)) {
			pagetable_free(ptdesc);
			ptdesc = NULL;
			failed = true;
		}
		if (ptdesc) {
			mm_inc_nr_pmds(mm);
			pmd = ptdesc_address(ptdesc);
		}

		pmds[i] = pmd;
	}

	if (failed) {
		free_pmds(mm, pmds, count);
		return -ENOMEM;
	}

	return 0;
}

/*
 * Mop up any pmd pages which may still be attached to the pgd.
 * Normally they will be freed by munmap/exit_mmap, but any pmd we
 * preallocate which never got a corresponding vma will need to be
 * freed manually.
 */
static void mop_up_one_pmd(struct mm_struct *mm, pgd_t *pgdp)
{
	pgd_t pgd = *pgdp;

	if (pgd_val(pgd) != 0) {
		pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd);

		pgd_clear(pgdp);

		paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT);
		pmd_free(mm, pmd);
		mm_dec_nr_pmds(mm);
	}
}

static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
{
	int i;

	for (i = 0; i < PREALLOCATED_PMDS; i++)
		mop_up_one_pmd(mm, &pgdp[i]);

#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION

	if (!boot_cpu_has(X86_FEATURE_PTI))
		return;

	pgdp = kernel_to_user_pgdp(pgdp);

	for (i = 0; i < PREALLOCATED_USER_PMDS; i++)
		mop_up_one_pmd(mm, &pgdp[i + KERNEL_PGD_BOUNDARY]);
#endif
}

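/*
 * Hook the preallocated pmds into the new pgd. For indices at or above
 * KERNEL_PGD_BOUNDARY the pmd contents are first copied from swapper_pg_dir,
 * so the kernel mappings are visible through this pgd as well.
 */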
static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[])
{
	p4d_t *p4d;
	pud_t *pud;
	int i;

	p4d = p4d_offset(pgd, 0);
	pud = pud_offset(p4d, 0);

	for (i = 0; i < PREALLOCATED_PMDS; i++, pud++) {
		pmd_t *pmd = pmds[i];

		if (i >= KERNEL_PGD_BOUNDARY)
			memcpy(pmd, (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]),
			       sizeof(pmd_t) * PTRS_PER_PMD);

		pud_populate(mm, pud, pmd);
	}
}

#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
static void pgd_prepopulate_user_pmd(struct mm_struct *mm,
				     pgd_t *k_pgd, pmd_t *pmds[])
{
	pgd_t *s_pgd = kernel_to_user_pgdp(swapper_pg_dir);
	pgd_t *u_pgd = kernel_to_user_pgdp(k_pgd);
	p4d_t *u_p4d;
	pud_t *u_pud;
	int i;

	u_p4d = p4d_offset(u_pgd, 0);
	u_pud = pud_offset(u_p4d, 0);

	s_pgd += KERNEL_PGD_BOUNDARY;
	u_pud += KERNEL_PGD_BOUNDARY;

	for (i = 0; i < PREALLOCATED_USER_PMDS; i++, u_pud++, s_pgd++) {
		pmd_t *pmd = pmds[i];

		memcpy(pmd, (pmd_t *)pgd_page_vaddr(*s_pgd),
		       sizeof(pmd_t) * PTRS_PER_PMD);

		pud_populate(mm, u_pud, pmd);
	}
}
#else
static void pgd_prepopulate_user_pmd(struct mm_struct *mm,
				     pgd_t *k_pgd, pmd_t *pmds[])
{
}
#endif
/*
 * Xen paravirt expects the pgd table to occupy a whole page, and 64-bit
 * kernels also assume a one-page pgd.
 *
 * But a kernel with PAE paging that is not running as a Xen domain only
 * needs to allocate 32 bytes for the pgd instead of one page.
 */
#ifdef CONFIG_X86_PAE

#include <linux/slab.h>

#define PGD_SIZE	(PTRS_PER_PGD * sizeof(pgd_t))
#define PGD_ALIGN	32

static struct kmem_cache *pgd_cache;

void __init pgtable_cache_init(void)
{
	/*
	 * When a PAE kernel is running as a Xen domain, it does not use a
	 * shared kernel pmd, and that requires a whole page for the pgd.
	 */
	if (!SHARED_KERNEL_PMD)
		return;

	/*
	 * When a PAE kernel is not running as a Xen domain, it uses a shared
	 * kernel pmd, which does not require a whole page for the pgd: 32
	 * bytes are enough. Create a 32-byte slab cache for pgd allocations
	 * at boot time.
	 */
	pgd_cache = kmem_cache_create("pgd_cache", PGD_SIZE, PGD_ALIGN,
				      SLAB_PANIC, NULL);
}

static inline pgd_t *_pgd_alloc(struct mm_struct *mm)
{
	/*
	 * Without SHARED_KERNEL_PMD the PAE kernel is running as a Xen
	 * domain, so allocate a whole page for the pgd.
	 */
	if (!SHARED_KERNEL_PMD)
		return __pgd_alloc(mm, PGD_ALLOCATION_ORDER);

	/*
	 * Otherwise the PAE kernel is not running as a Xen domain and a
	 * 32-byte slab object is enough for the pgd.
	 */
	return kmem_cache_alloc(pgd_cache, GFP_PGTABLE_USER);
}

static inline void _pgd_free(struct mm_struct *mm, pgd_t *pgd)
{
	if (!SHARED_KERNEL_PMD)
		__pgd_free(mm, pgd);
	else
		kmem_cache_free(pgd_cache, pgd);
}
#else

static inline pgd_t *_pgd_alloc(struct mm_struct *mm)
{
	return __pgd_alloc(mm, PGD_ALLOCATION_ORDER);
}

static inline void _pgd_free(struct mm_struct *mm, pgd_t *pgd)
{
	__pgd_free(mm, pgd);
}
#endif /* CONFIG_X86_PAE */

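/*
 * In the non-PAE case both MAX_PREALLOCATED_PMDS and
 * MAX_PREALLOCATED_USER_PMDS are 0, so the on-stack arrays below are
 * zero-sized and the sizeof() != 0 checks let the compiler drop the
 * preallocation calls entirely.
 */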
pgd_t *pgd_alloc(struct mm_struct *mm)
{
	pgd_t *pgd;
	pmd_t *u_pmds[MAX_PREALLOCATED_USER_PMDS];
	pmd_t *pmds[MAX_PREALLOCATED_PMDS];

	pgd = _pgd_alloc(mm);

	if (pgd == NULL)
		goto out;

	mm->pgd = pgd;

	if (sizeof(pmds) != 0 &&
	    preallocate_pmds(mm, pmds, PREALLOCATED_PMDS) != 0)
		goto out_free_pgd;

	if (sizeof(u_pmds) != 0 &&
	    preallocate_pmds(mm, u_pmds, PREALLOCATED_USER_PMDS) != 0)
		goto out_free_pmds;

	if (paravirt_pgd_alloc(mm) != 0)
		goto out_free_user_pmds;

	/*
	 * Make sure that pre-populating the pmds is atomic with
	 * respect to anything walking the pgd_list, so that they
	 * never see a partially populated pgd.
	 */
	spin_lock(&pgd_lock);

	pgd_ctor(mm, pgd);
	if (sizeof(pmds) != 0)
		pgd_prepopulate_pmd(mm, pgd, pmds);

	if (sizeof(u_pmds) != 0)
		pgd_prepopulate_user_pmd(mm, pgd, u_pmds);

	spin_unlock(&pgd_lock);

	return pgd;

out_free_user_pmds:
	if (sizeof(u_pmds) != 0)
		free_pmds(mm, u_pmds, PREALLOCATED_USER_PMDS);
out_free_pmds:
	if (sizeof(pmds) != 0)
		free_pmds(mm, pmds, PREALLOCATED_PMDS);
out_free_pgd:
	_pgd_free(mm, pgd);
out:
	return NULL;
}

void pgd_free(struct mm_struct *mm, pgd_t *pgd)
{
	pgd_mop_up_pmds(mm, pgd);
	pgd_dtor(pgd);
	paravirt_pgd_free(mm, pgd);
	_pgd_free(mm, pgd);
}

/*
 * Used to set accessed or dirty bits in the page table entries
 * on other architectures. On x86, the accessed and dirty bits
 * are tracked by hardware. However, do_wp_page calls this function
 * to also make the pte writeable at the same time the dirty bit is
 * set. In that case we do actually need to write the PTE.
 */
int ptep_set_access_flags(struct vm_area_struct *vma,
			  unsigned long address, pte_t *ptep,
			  pte_t entry, int dirty)
{
	int changed = !pte_same(*ptep, entry);

	if (changed && dirty)
		set_pte(ptep, entry);

	return changed;
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
int pmdp_set_access_flags(struct vm_area_struct *vma,
			  unsigned long address, pmd_t *pmdp,
			  pmd_t entry, int dirty)
{
	int changed = !pmd_same(*pmdp, entry);

	VM_BUG_ON(address & ~HPAGE_PMD_MASK);

	if (changed && dirty) {
		set_pmd(pmdp, entry);
		/*
		 * We had a write-protection fault here and changed the pmd
		 * to be more permissive. No need to flush the TLB for that,
		 * #PF is architecturally guaranteed to do that and in the
		 * worst-case we'll generate a spurious fault.
		 */
	}

	return changed;
}

int pudp_set_access_flags(struct vm_area_struct *vma, unsigned long address,
			  pud_t *pudp, pud_t entry, int dirty)
{
	int changed = !pud_same(*pudp, entry);

	VM_BUG_ON(address & ~HPAGE_PUD_MASK);

	if (changed && dirty) {
		set_pud(pudp, entry);
		/*
		 * We had a write-protection fault here and changed the pud
		 * to be more permissive. No need to flush the TLB for that,
		 * #PF is architecturally guaranteed to do that and in the
		 * worst-case we'll generate a spurious fault.
		 */
	}

	return changed;
}
#endif

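/*
 * The accessed bit is cleared with an atomic test_and_clear_bit() on the
 * entry rather than a read-modify-write of the whole entry, so a concurrent
 * hardware update of e.g. the dirty bit cannot be lost.
 */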
int ptep_test_and_clear_young(struct vm_area_struct *vma,
			      unsigned long addr, pte_t *ptep)
{
	int ret = 0;

	if (pte_young(*ptep))
		ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
					 (unsigned long *) &ptep->pte);

	return ret;
}

#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG)
int pmdp_test_and_clear_young(struct vm_area_struct *vma,
			      unsigned long addr, pmd_t *pmdp)
{
	int ret = 0;

	if (pmd_young(*pmdp))
		ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
					 (unsigned long *)pmdp);

	return ret;
}
#endif

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
int pudp_test_and_clear_young(struct vm_area_struct *vma,
			      unsigned long addr, pud_t *pudp)
{
	int ret = 0;

	if (pud_young(*pudp))
		ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
					 (unsigned long *)pudp);

	return ret;
}
#endif

int ptep_clear_flush_young(struct vm_area_struct *vma,
			   unsigned long address, pte_t *ptep)
{
	/*
	 * On x86 CPUs, clearing the accessed bit without a TLB flush
	 * doesn't cause data corruption. [ It could cause incorrect
	 * page aging and the (mistaken) reclaim of hot pages, but the
	 * chance of that should be relatively low. ]
	 *
	 * So as a performance optimization don't flush the TLB when
	 * clearing the accessed bit, it will eventually be flushed by
	 * a context switch or a VM operation anyway. [ In the rare
	 * event of it not getting flushed for a long time the delay
	 * shouldn't really matter because there's no real memory
	 * pressure for swapout to react to. ]
	 */
	return ptep_test_and_clear_young(vma, address, ptep);
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
int pmdp_clear_flush_young(struct vm_area_struct *vma,
			   unsigned long address, pmd_t *pmdp)
{
	int young;

	VM_BUG_ON(address & ~HPAGE_PMD_MASK);

	young = pmdp_test_and_clear_young(vma, address, pmdp);
	if (young)
		flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);

	return young;
}

pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, unsigned long address,
			 pmd_t *pmdp)
{
	VM_WARN_ON_ONCE(!pmd_present(*pmdp));

	/*
	 * No flush is necessary. Once an invalid PTE is established, the PTE's
	 * access and dirty bits cannot be updated.
	 */
	return pmdp_establish(vma, address, pmdp, pmd_mkinvalid(*pmdp));
}
#endif

#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \
	defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
pud_t pudp_invalidate(struct vm_area_struct *vma, unsigned long address,
		      pud_t *pudp)
{
	VM_WARN_ON_ONCE(!pud_present(*pudp));
	pud_t old = pudp_establish(vma, address, pudp, pud_mkinvalid(*pudp));
	flush_pud_tlb_range(vma, address, address + HPAGE_PUD_SIZE);
	return old;
}
#endif

/**
 * reserve_top_address - reserves a hole in the top of kernel address space
 * @reserve: size of hole to reserve
 *
 * Can be used to relocate the fixmap area and poke a hole in the top
 * of kernel address space to make room for a hypervisor.
 */
void __init reserve_top_address(unsigned long reserve)
{
#ifdef CONFIG_X86_32
	BUG_ON(fixmaps_set > 0);
	__FIXADDR_TOP = round_down(-reserve, 1 << PMD_SHIFT) - PAGE_SIZE;
	printk(KERN_INFO "Reserving virtual address space above 0x%08lx (rounded to 0x%08lx)\n",
	       -reserve, __FIXADDR_TOP + PAGE_SIZE);
#endif
}

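/*
 * Number of fixmap entries installed so far; checked by
 * reserve_top_address() above to catch attempts to move the fixmap after
 * entries have already been set.
 */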
int fixmaps_set;

void __native_set_fixmap(enum fixed_addresses idx, pte_t pte)
{
	unsigned long address = __fix_to_virt(idx);

#ifdef CONFIG_X86_64
	/*
	 * Ensure that the static initial page tables are covering the
	 * fixmap completely.
	 */
	BUILD_BUG_ON(__end_of_permanent_fixed_addresses >
		     (FIXMAP_PMD_NUM * PTRS_PER_PTE));
#endif

	if (idx >= __end_of_fixed_addresses) {
		BUG();
		return;
	}
	set_pte_vaddr(address, pte);
	fixmaps_set++;
}

void native_set_fixmap(unsigned /* enum fixed_addresses */ idx,
		       phys_addr_t phys, pgprot_t flags)
{
	/* Sanitize 'flags' against any unsupported bits: */
	pgprot_val(flags) &= __default_kernel_pte_mask;

	__native_set_fixmap(idx, pfn_pte(phys >> PAGE_SHIFT, flags));
}

#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
#ifdef CONFIG_X86_5LEVEL
/**
 * p4d_set_huge - setup kernel P4D mapping
 *
 * No 512GB pages yet -- always return 0
 */
int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot)
{
	return 0;
}

/**
 * p4d_clear_huge - clear kernel P4D mapping when it is set
 *
 * No 512GB pages yet -- nothing to do
 */
void p4d_clear_huge(p4d_t *p4d)
{
}
#endif

/**
 * pud_set_huge - setup kernel PUD mapping
 *
 * MTRRs can override PAT memory types with 4KiB granularity. Therefore, this
 * function sets up a huge page only if the complete range has the same MTRR
 * caching mode.
 *
 * Callers should try to decrease page size (1GB -> 2MB -> 4K) if the bigger
 * page mapping attempt fails.
 *
 * Returns 1 on success and 0 on failure.
 */
int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot)
{
	u8 uniform;

	mtrr_type_lookup(addr, addr + PUD_SIZE, &uniform);
	if (!uniform)
		return 0;

	/* Bail out if we are on a populated non-leaf entry: */
	if (pud_present(*pud) && !pud_leaf(*pud))
		return 0;

	set_pte((pte_t *)pud, pfn_pte(
		(u64)addr >> PAGE_SHIFT,
		__pgprot(protval_4k_2_large(pgprot_val(prot)) | _PAGE_PSE)));

	return 1;
}

/**
 * pmd_set_huge - setup kernel PMD mapping
 *
 * See text over pud_set_huge() above.
 *
 * Returns 1 on success and 0 on failure.
 */
int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot)
{
	u8 uniform;

	mtrr_type_lookup(addr, addr + PMD_SIZE, &uniform);
	if (!uniform) {
		pr_warn_once("%s: Cannot satisfy [mem %#010llx-%#010llx] with a huge-page mapping due to MTRR override.\n",
			     __func__, addr, addr + PMD_SIZE);
		return 0;
	}

	/* Bail out if we are on a populated non-leaf entry: */
	if (pmd_present(*pmd) && !pmd_leaf(*pmd))
		return 0;

	set_pte((pte_t *)pmd, pfn_pte(
		(u64)addr >> PAGE_SHIFT,
		__pgprot(protval_4k_2_large(pgprot_val(prot)) | _PAGE_PSE)));

	return 1;
}

/**
 * pud_clear_huge - clear kernel PUD mapping when it is set
 *
 * Returns 1 on success and 0 on failure (no PUD map is found).
 */
int pud_clear_huge(pud_t *pud)
{
	if (pud_leaf(*pud)) {
		pud_clear(pud);
		return 1;
	}

	return 0;
}

/**
 * pmd_clear_huge - clear kernel PMD mapping when it is set
 *
 * Returns 1 on success and 0 on failure (no PMD map is found).
 */
int pmd_clear_huge(pmd_t *pmd)
{
	if (pmd_leaf(*pmd)) {
		pmd_clear(pmd);
		return 1;
	}

	return 0;
}

#ifdef CONFIG_X86_64
/**
 * pud_free_pmd_page - Clear pud entry and free pmd page.
 * @pud: Pointer to a PUD.
 * @addr: Virtual address associated with pud.
 *
 * Context: The pud range has been unmapped and TLB purged.
 * Return: 1 if clearing the entry succeeded. 0 otherwise.
 *
 * NOTE: Callers must allow a single page allocation.
 */
int pud_free_pmd_page(pud_t *pud, unsigned long addr)
{
	pmd_t *pmd, *pmd_sv;
	pte_t *pte;
	int i;

	pmd = pud_pgtable(*pud);
	pmd_sv = (pmd_t *)__get_free_page(GFP_KERNEL);
	if (!pmd_sv)
		return 0;

	/*
	 * Save the pmd entries before clearing them, so the pte pages they
	 * point to are only freed after the flush below.
	 */
	for (i = 0; i < PTRS_PER_PMD; i++) {
		pmd_sv[i] = pmd[i];
		if (!pmd_none(pmd[i]))
			pmd_clear(&pmd[i]);
	}

	pud_clear(pud);

	/* INVLPG to clear all paging-structure caches */
	flush_tlb_kernel_range(addr, addr + PAGE_SIZE-1);

	for (i = 0; i < PTRS_PER_PMD; i++) {
		if (!pmd_none(pmd_sv[i])) {
			pte = (pte_t *)pmd_page_vaddr(pmd_sv[i]);
			free_page((unsigned long)pte);
		}
	}

	free_page((unsigned long)pmd_sv);

	pagetable_dtor(virt_to_ptdesc(pmd));
	free_page((unsigned long)pmd);

	return 1;
}

/**
 * pmd_free_pte_page - Clear pmd entry and free pte page.
 * @pmd: Pointer to a PMD.
 * @addr: Virtual address associated with pmd.
 *
 * Context: The pmd range has been unmapped and TLB purged.
 * Return: 1 if clearing the entry succeeded. 0 otherwise.
 */
int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
{
	pte_t *pte;

	pte = (pte_t *)pmd_page_vaddr(*pmd);
	pmd_clear(pmd);

	/* INVLPG to clear all paging-structure caches */
	flush_tlb_kernel_range(addr, addr + PAGE_SIZE-1);

	free_page((unsigned long)pte);

	return 1;
}

#else /* !CONFIG_X86_64 */

/*
 * Disable free page handling on x86-PAE. This assures that ioremap()
 * does not update sync'd pmd entries. See vmalloc_sync_one().
 */
int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
{
	return pmd_none(*pmd);
}

#endif /* CONFIG_X86_64 */
#endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */

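/*
 * Shadow stack PTEs are encoded as Write=0,Dirty=1, so "making an entry
 * writable" in a VM_SHADOW_STACK vma means applying that encoding instead of
 * setting the Write bit as the _novma() variants below do.
 */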
pte_t pte_mkwrite(pte_t pte, struct vm_area_struct *vma)
{
	if (vma->vm_flags & VM_SHADOW_STACK)
		return pte_mkwrite_shstk(pte);

	pte = pte_mkwrite_novma(pte);

	return pte_clear_saveddirty(pte);
}

pmd_t pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
{
	if (vma->vm_flags & VM_SHADOW_STACK)
		return pmd_mkwrite_shstk(pmd);

	pmd = pmd_mkwrite_novma(pmd);

	return pmd_clear_saveddirty(pmd);
}

void arch_check_zapped_pte(struct vm_area_struct *vma, pte_t pte)
{
	/*
	 * Hardware before shadow stack can (rarely) set Dirty=1
	 * on a Write=0 PTE. So the below condition
	 * only indicates a software bug when shadow stack is
	 * supported by the HW. This checking is covered in
	 * pte_shstk().
	 */
	VM_WARN_ON_ONCE(!(vma->vm_flags & VM_SHADOW_STACK) &&
			pte_shstk(pte));
}

void arch_check_zapped_pmd(struct vm_area_struct *vma, pmd_t pmd)
{
	/* See note in arch_check_zapped_pte() */
	VM_WARN_ON_ONCE(!(vma->vm_flags & VM_SHADOW_STACK) &&
			pmd_shstk(pmd));
}

void arch_check_zapped_pud(struct vm_area_struct *vma, pud_t pud)
{
	/* See note in arch_check_zapped_pte() */
	VM_WARN_ON_ONCE(!(vma->vm_flags & VM_SHADOW_STACK) && pud_shstk(pud));
}
