1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3 * Page table handling routines for radix page table.
4 *
5 * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation.
6 */
7
8 #define pr_fmt(fmt) "radix-mmu: " fmt
9
10 #include <linux/io.h>
11 #include <linux/kernel.h>
12 #include <linux/sched/mm.h>
13 #include <linux/memblock.h>
14 #include <linux/of.h>
15 #include <linux/of_fdt.h>
16 #include <linux/mm.h>
17 #include <linux/hugetlb.h>
18 #include <linux/string_helpers.h>
19 #include <linux/memory.h>
20 #include <linux/kfence.h>
21
22 #include <asm/pgalloc.h>
23 #include <asm/mmu_context.h>
24 #include <asm/dma.h>
25 #include <asm/machdep.h>
26 #include <asm/mmu.h>
27 #include <asm/firmware.h>
28 #include <asm/powernv.h>
29 #include <asm/sections.h>
30 #include <asm/smp.h>
31 #include <asm/trace.h>
32 #include <asm/uaccess.h>
33 #include <asm/ultravisor.h>
34 #include <asm/set_memory.h>
35 #include <asm/kfence.h>
36
37 #include <trace/events/thp.h>
38
39 #include <mm/mmu_decl.h>
40
41 unsigned int mmu_base_pid;
42
43 static __ref void *early_alloc_pgtable(unsigned long size, int nid,
44 unsigned long region_start, unsigned long region_end)
45 {
46 phys_addr_t min_addr = MEMBLOCK_LOW_LIMIT;
47 phys_addr_t max_addr = MEMBLOCK_ALLOC_ANYWHERE;
48 void *ptr;
49
50 if (region_start)
51 min_addr = region_start;
52 if (region_end)
53 max_addr = region_end;
54
55 ptr = memblock_alloc_try_nid(size, size, min_addr, max_addr, nid);
56
57 if (!ptr)
58 panic("%s: Failed to allocate %lu bytes align=0x%lx nid=%d from=%pa max_addr=%pa\n",
59 __func__, size, size, nid, &min_addr, &max_addr);
60
61 return ptr;
62 }
63
64 /*
65 * When allocating pud or pmd pointers, we allocate a complete page
66 * of PAGE_SIZE rather than PUD_TABLE_SIZE or PMD_TABLE_SIZE. This
67 * is to ensure that the page obtained from the memblock allocator
68 * can be completely used as page table page and can be freed
69 * correctly when the page table entries are removed.
70 */
71 static int early_map_kernel_page(unsigned long ea, unsigned long pa,
72 pgprot_t flags,
73 unsigned int map_page_size,
74 int nid,
75 unsigned long region_start, unsigned long region_end)
76 {
77 unsigned long pfn = pa >> PAGE_SHIFT;
78 pgd_t *pgdp;
79 p4d_t *p4dp;
80 pud_t *pudp;
81 pmd_t *pmdp;
82 pte_t *ptep;
83
84 pgdp = pgd_offset_k(ea);
85 p4dp = p4d_offset(pgdp, ea);
86 if (p4d_none(*p4dp)) {
87 pudp = early_alloc_pgtable(PAGE_SIZE, nid,
88 region_start, region_end);
89 p4d_populate(&init_mm, p4dp, pudp);
90 }
91 pudp = pud_offset(p4dp, ea);
92 if (map_page_size == PUD_SIZE) {
93 ptep = (pte_t *)pudp;
94 goto set_the_pte;
95 }
96 if (pud_none(*pudp)) {
97 pmdp = early_alloc_pgtable(PAGE_SIZE, nid, region_start,
98 region_end);
99 pud_populate(&init_mm, pudp, pmdp);
100 }
101 pmdp = pmd_offset(pudp, ea);
102 if (map_page_size == PMD_SIZE) {
103 ptep = pmdp_ptep(pmdp);
104 goto set_the_pte;
105 }
106 if (!pmd_present(*pmdp)) {
107 ptep = early_alloc_pgtable(PAGE_SIZE, nid,
108 region_start, region_end);
109 pmd_populate_kernel(&init_mm, pmdp, ptep);
110 }
111 ptep = pte_offset_kernel(pmdp, ea);
112
113 set_the_pte:
114 set_pte_at(&init_mm, ea, ptep, pfn_pte(pfn, flags));
115 asm volatile("ptesync": : :"memory");
116 return 0;
117 }
118
119 /*
120 * nid, region_start, and region_end are hints to try to place the page
121 * table memory in the same node or region.
122 */
123 static int __map_kernel_page(unsigned long ea, unsigned long pa,
124 pgprot_t flags,
125 unsigned int map_page_size,
126 int nid,
127 unsigned long region_start, unsigned long region_end)
128 {
129 unsigned long pfn = pa >> PAGE_SHIFT;
130 pgd_t *pgdp;
131 p4d_t *p4dp;
132 pud_t *pudp;
133 pmd_t *pmdp;
134 pte_t *ptep;
135 /*
136 * Make sure task size is correct as per the max addr
137 */
138 BUILD_BUG_ON(TASK_SIZE_USER64 > RADIX_PGTABLE_RANGE);
139
140 #ifdef CONFIG_PPC_64K_PAGES
141 BUILD_BUG_ON(RADIX_KERN_MAP_SIZE != (1UL << MAX_EA_BITS_PER_CONTEXT));
142 #endif
143
144 if (unlikely(!slab_is_available()))
145 return early_map_kernel_page(ea, pa, flags, map_page_size,
146 nid, region_start, region_end);
147
148 /*
149 * Should make page table allocation functions be able to take a
150 * node, so we can place kernel page tables on the right nodes after
151 * boot.
152 */
153 pgdp = pgd_offset_k(ea);
154 p4dp = p4d_offset(pgdp, ea);
155 pudp = pud_alloc(&init_mm, p4dp, ea);
156 if (!pudp)
157 return -ENOMEM;
158 if (map_page_size == PUD_SIZE) {
159 ptep = (pte_t *)pudp;
160 goto set_the_pte;
161 }
162 pmdp = pmd_alloc(&init_mm, pudp, ea);
163 if (!pmdp)
164 return -ENOMEM;
165 if (map_page_size == PMD_SIZE) {
166 ptep = pmdp_ptep(pmdp);
167 goto set_the_pte;
168 }
169 ptep = pte_alloc_kernel(pmdp, ea);
170 if (!ptep)
171 return -ENOMEM;
172
173 set_the_pte:
174 set_pte_at(&init_mm, ea, ptep, pfn_pte(pfn, flags));
175 asm volatile("ptesync": : :"memory");
176 return 0;
177 }
178
179 int radix__map_kernel_page(unsigned long ea, unsigned long pa,
180 pgprot_t flags,
181 unsigned int map_page_size)
182 {
183 return __map_kernel_page(ea, pa, flags, map_page_size, -1, 0, 0);
184 }
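/*
 * Illustrative usage (not taken from this file): a caller that needs a
 * single kernel mapping at 'ea' backed by physical 'pa' could do:
 *
 *	rc = radix__map_kernel_page(ea, pa, PAGE_KERNEL, PAGE_SIZE);
 *
 * map_page_size selects the level at which the leaf PTE is installed
 * (PUD_SIZE, PMD_SIZE or PAGE_SIZE); callers are assumed to pass ea/pa
 * aligned to that size.
 */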
185
186 #ifdef CONFIG_STRICT_KERNEL_RWX
187 static void radix__change_memory_range(unsigned long start, unsigned long end,
188 unsigned long clear)
189 {
190 unsigned long idx;
191 pgd_t *pgdp;
192 p4d_t *p4dp;
193 pud_t *pudp;
194 pmd_t *pmdp;
195 pte_t *ptep;
196
197 start = ALIGN_DOWN(start, PAGE_SIZE);
198 end = PAGE_ALIGN(end); // aligns up
199
200 pr_debug("Changing flags on range %lx-%lx removing 0x%lx\n",
201 start, end, clear);
202
203 for (idx = start; idx < end; idx += PAGE_SIZE) {
204 pgdp = pgd_offset_k(idx);
205 p4dp = p4d_offset(pgdp, idx);
206 pudp = pud_alloc(&init_mm, p4dp, idx);
207 if (!pudp)
208 continue;
209 if (pud_leaf(*pudp)) {
210 ptep = (pte_t *)pudp;
211 goto update_the_pte;
212 }
213 pmdp = pmd_alloc(&init_mm, pudp, idx);
214 if (!pmdp)
215 continue;
216 if (pmd_leaf(*pmdp)) {
217 ptep = pmdp_ptep(pmdp);
218 goto update_the_pte;
219 }
220 ptep = pte_alloc_kernel(pmdp, idx);
221 if (!ptep)
222 continue;
223 update_the_pte:
224 radix__pte_update(&init_mm, idx, ptep, clear, 0, 0);
225 }
226
227 radix__flush_tlb_kernel_range(start, end);
228 }
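/*
 * Note: radix__change_memory_range() only clears bits from whatever leaf
 * entries already exist (at PUD, PMD or PTE level); it does not split huge
 * mappings. The callers below therefore rely on the linear-map boundaries
 * chosen by next_boundary()/create_physical_mapping() lining up with the
 * text/rodata boundaries.
 */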
229
230 void radix__mark_rodata_ro(void)
231 {
232 unsigned long start, end;
233
234 start = (unsigned long)_stext;
235 end = (unsigned long)__end_rodata;
236
237 radix__change_memory_range(start, end, _PAGE_WRITE);
238
239 for (start = PAGE_OFFSET; start < (unsigned long)_stext; start += PAGE_SIZE) {
240 end = start + PAGE_SIZE;
241 if (overlaps_interrupt_vector_text(start, end))
242 radix__change_memory_range(start, end, _PAGE_WRITE);
243 else
244 break;
245 }
246 }
247
248 void radix__mark_initmem_nx(void)
249 {
250 unsigned long start = (unsigned long)__init_begin;
251 unsigned long end = (unsigned long)__init_end;
252
253 radix__change_memory_range(start, end, _PAGE_EXEC);
254 }
255 #endif /* CONFIG_STRICT_KERNEL_RWX */
256
257 static inline void __meminit
258 print_mapping(unsigned long start, unsigned long end, unsigned long size, bool exec)
259 {
260 char buf[10];
261
262 if (end <= start)
263 return;
264
265 string_get_size(size, 1, STRING_UNITS_2, buf, sizeof(buf));
266
267 pr_info("Mapped 0x%016lx-0x%016lx with %s pages%s\n", start, end, buf,
268 exec ? " (exec)" : "");
269 }
270
271 static unsigned long next_boundary(unsigned long addr, unsigned long end)
272 {
273 #ifdef CONFIG_STRICT_KERNEL_RWX
274 unsigned long stext_phys;
275
276 stext_phys = __pa_symbol(_stext);
277
278 // Relocatable kernel running at non-zero real address
279 if (stext_phys != 0) {
280 // The end of interrupts code at zero is a rodata boundary
281 unsigned long end_intr = __pa_symbol(__end_interrupts) - stext_phys;
282 if (addr < end_intr)
283 return end_intr;
284
285 // Start of relocated kernel text is a rodata boundary
286 if (addr < stext_phys)
287 return stext_phys;
288 }
289
290 if (addr < __pa_symbol(__srwx_boundary))
291 return __pa_symbol(__srwx_boundary);
292 #endif
293 return end;
294 }
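/*
 * For example, on a non-relocated kernel (stext_phys == 0) the only extra
 * boundary is __srwx_boundary, so create_physical_mapping() below should
 * not choose a large page that straddles the end of the kernel's
 * strict-RWX protected region.
 */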
295
296 static int __meminit create_physical_mapping(unsigned long start,
297 unsigned long end,
298 int nid, pgprot_t _prot,
299 unsigned long mapping_sz_limit)
300 {
301 unsigned long vaddr, addr, mapping_size = 0;
302 bool prev_exec, exec = false;
303 pgprot_t prot;
304 int psize;
305 unsigned long max_mapping_size = memory_block_size;
306
307 if (mapping_sz_limit < max_mapping_size)
308 max_mapping_size = mapping_sz_limit;
309
310 if (debug_pagealloc_enabled())
311 max_mapping_size = PAGE_SIZE;
312
313 start = ALIGN(start, PAGE_SIZE);
314 end = ALIGN_DOWN(end, PAGE_SIZE);
315 for (addr = start; addr < end; addr += mapping_size) {
316 unsigned long gap, previous_size;
317 int rc;
318
319 gap = next_boundary(addr, end) - addr;
320 if (gap > max_mapping_size)
321 gap = max_mapping_size;
322 previous_size = mapping_size;
323 prev_exec = exec;
324
325 if (IS_ALIGNED(addr, PUD_SIZE) && gap >= PUD_SIZE &&
326 mmu_psize_defs[MMU_PAGE_1G].shift) {
327 mapping_size = PUD_SIZE;
328 psize = MMU_PAGE_1G;
329 } else if (IS_ALIGNED(addr, PMD_SIZE) && gap >= PMD_SIZE &&
330 mmu_psize_defs[MMU_PAGE_2M].shift) {
331 mapping_size = PMD_SIZE;
332 psize = MMU_PAGE_2M;
333 } else {
334 mapping_size = PAGE_SIZE;
335 psize = mmu_virtual_psize;
336 }
337
338 vaddr = (unsigned long)__va(addr);
339
340 if (overlaps_kernel_text(vaddr, vaddr + mapping_size) ||
341 overlaps_interrupt_vector_text(vaddr, vaddr + mapping_size)) {
342 prot = PAGE_KERNEL_X;
343 exec = true;
344 } else {
345 prot = _prot;
346 exec = false;
347 }
348
349 if (mapping_size != previous_size || exec != prev_exec) {
350 print_mapping(start, addr, previous_size, prev_exec);
351 start = addr;
352 }
353
354 rc = __map_kernel_page(vaddr, addr, prot, mapping_size, nid, start, end);
355 if (rc)
356 return rc;
357
358 update_page_count(psize, 1);
359 }
360
361 print_mapping(start, addr, mapping_size, exec);
362 return 0;
363 }
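/*
 * In short: for each chunk, create_physical_mapping() picks the largest
 * page size that (a) the MMU reports as supported (mmu_psize_defs), (b)
 * the current address is aligned to, and (c) fits before the next rodata
 * or memory-block boundary: 1G, then 2M, then the base page size.
 */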
364
365 #ifdef CONFIG_KFENCE
366 static inline phys_addr_t alloc_kfence_pool(void)
367 {
368 phys_addr_t kfence_pool;
369
370 /*
371 * TODO: Support to enable KFENCE after bootup depends on the ability to
372 * split page table mappings. As such support is not currently
373 * implemented for radix pagetables, support enabling KFENCE
374 * only at system startup for now.
375 *
376 * After support for splitting mappings is available on radix,
377 * alloc_kfence_pool() & map_kfence_pool() can be dropped and
378 * mapping for __kfence_pool memory can be
379 * split during arch_kfence_init_pool().
380 */
381 if (!kfence_early_init)
382 goto no_kfence;
383
384 kfence_pool = memblock_phys_alloc(KFENCE_POOL_SIZE, PAGE_SIZE);
385 if (!kfence_pool)
386 goto no_kfence;
387
388 memblock_mark_nomap(kfence_pool, KFENCE_POOL_SIZE);
389 return kfence_pool;
390
391 no_kfence:
392 disable_kfence();
393 return 0;
394 }
395
396 static inline void map_kfence_pool(phys_addr_t kfence_pool)
397 {
398 if (!kfence_pool)
399 return;
400
401 if (create_physical_mapping(kfence_pool, kfence_pool + KFENCE_POOL_SIZE,
402 -1, PAGE_KERNEL, PAGE_SIZE))
403 goto err;
404
405 memblock_clear_nomap(kfence_pool, KFENCE_POOL_SIZE);
406 __kfence_pool = __va(kfence_pool);
407 return;
408
409 err:
410 memblock_phys_free(kfence_pool, KFENCE_POOL_SIZE);
411 disable_kfence();
412 }
413 #else
414 static inline phys_addr_t alloc_kfence_pool(void) { return 0; }
415 static inline void map_kfence_pool(phys_addr_t kfence_pool) { }
416 #endif
417
418 static void __init radix_init_pgtable(void)
419 {
420 phys_addr_t kfence_pool;
421 unsigned long rts_field;
422 phys_addr_t start, end;
423 u64 i;
424
425 /* We don't support slb for radix */
426 slb_set_size(0);
427
428 kfence_pool = alloc_kfence_pool();
429
430 /*
431 * Create the linear mapping
432 */
433 for_each_mem_range(i, &start, &end) {
434 /*
435 * The memblock allocator is up at this point, so the
436 * page tables will be allocated within the range. No
437 * need for a node (which we don't have yet).
438 */
439
440 if (end >= RADIX_VMALLOC_START) {
441 pr_warn("Outside the supported range\n");
442 continue;
443 }
444
445 WARN_ON(create_physical_mapping(start, end,
446 -1, PAGE_KERNEL, ~0UL));
447 }
448
449 map_kfence_pool(kfence_pool);
450
451 if (!cpu_has_feature(CPU_FTR_HVMODE) &&
452 cpu_has_feature(CPU_FTR_P9_RADIX_PREFETCH_BUG)) {
453 /*
454 * Older versions of KVM on these machines prefer that the
455 * guest only uses the low 19 PID bits.
456 */
457 mmu_pid_bits = 19;
458 }
459 mmu_base_pid = 1;
460
461 /*
462 * Allocate Partition table and process table for the
463 * host.
464 */
465 BUG_ON(PRTB_SIZE_SHIFT > 36);
466 process_tb = early_alloc_pgtable(1UL << PRTB_SIZE_SHIFT, -1, 0, 0);
467 /*
468 * Fill in the process table.
469 */
470 rts_field = radix__get_tree_size();
471 process_tb->prtb0 = cpu_to_be64(rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE);
472
473 /*
474 * The init_mm context is given the first available (non-zero) PID,
475 * which is the "guard PID" and contains no page table. PIDR should
476 * never be set to zero because that duplicates the kernel address
477 * space at the 0x0... offset (quadrant 0)!
478 *
479 * An arbitrary PID that may later be allocated by the PID allocator
480 * for userspace processes must not be used either, because that
481 * would cause stale user mappings for that PID on CPUs outside of
482 * the TLB invalidation scheme (because it won't be in mm_cpumask).
483 *
484 * So permanently carve out one PID for the purpose of a guard PID.
485 */
486 init_mm.context.id = mmu_base_pid;
487 mmu_base_pid++;
488 }
489
490 static void __init radix_init_partition_table(void)
491 {
492 unsigned long rts_field, dw0, dw1;
493
494 mmu_partition_table_init();
495 rts_field = radix__get_tree_size();
496 dw0 = rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE | PATB_HR;
497 dw1 = __pa(process_tb) | (PRTB_SIZE_SHIFT - 12) | PATB_GR;
498 mmu_partition_table_set_entry(0, dw0, dw1, false);
499
500 pr_info("Initializing Radix MMU\n");
501 }
502
503 static int __init get_idx_from_shift(unsigned int shift)
504 {
505 int idx = -1;
506
507 switch (shift) {
508 case 0xc:
509 idx = MMU_PAGE_4K;
510 break;
511 case 0x10:
512 idx = MMU_PAGE_64K;
513 break;
514 case 0x15:
515 idx = MMU_PAGE_2M;
516 break;
517 case 0x1e:
518 idx = MMU_PAGE_1G;
519 break;
520 }
521 return idx;
522 }
523
524 static int __init radix_dt_scan_page_sizes(unsigned long node,
525 const char *uname, int depth,
526 void *data)
527 {
528 int size = 0;
529 int shift, idx;
530 unsigned int ap;
531 const __be32 *prop;
532 const char *type = of_get_flat_dt_prop(node, "device_type", NULL);
533
534 /* We are scanning "cpu" nodes only */
535 if (type == NULL || strcmp(type, "cpu") != 0)
536 return 0;
537
538 /* Grab page size encodings */
539 prop = of_get_flat_dt_prop(node, "ibm,processor-radix-AP-encodings", &size);
540 if (!prop)
541 return 0;
542
543 pr_info("Page sizes from device-tree:\n");
544 for (; size >= 4; size -= 4, ++prop) {
545
546 struct mmu_psize_def *def;
547
548 /* top 3 bits are the AP encoding */
549 shift = be32_to_cpu(prop[0]) & ~(0xe << 28);
550 ap = be32_to_cpu(prop[0]) >> 29;
551 pr_info("Page size shift = %d AP=0x%x\n", shift, ap);
552
553 idx = get_idx_from_shift(shift);
554 if (idx < 0)
555 continue;
556
557 def = &mmu_psize_defs[idx];
558 def->shift = shift;
559 def->ap = ap;
560 def->h_rpt_pgsize = psize_to_rpti_pgsize(idx);
561 }
562
563 /* needed ? */
564 cur_cpu_spec->mmu_features &= ~MMU_FTR_NO_SLBIE_B;
565 return 1;
566 }
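/*
 * Each "ibm,processor-radix-AP-encodings" cell packs the AP value into the
 * top three bits and the page-size shift into the remaining bits. For
 * example, a cell of 0xa0000010 decodes to AP = 0x5 and shift = 16, i.e.
 * the 64K entry set up by the fallback in radix__early_init_devtree()
 * below.
 */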
567
568 void __init radix__early_init_devtree(void)
569 {
570 int rc;
571
572 /*
573 * Try to find the available page sizes in the device-tree
574 */
575 rc = of_scan_flat_dt(radix_dt_scan_page_sizes, NULL);
576 if (!rc) {
577 /*
578 * No page size details found in the device tree.
579 * Let's assume we have 4k and 64k page support.
580 */
581 mmu_psize_defs[MMU_PAGE_4K].shift = 12;
582 mmu_psize_defs[MMU_PAGE_4K].ap = 0x0;
583 mmu_psize_defs[MMU_PAGE_4K].h_rpt_pgsize =
584 psize_to_rpti_pgsize(MMU_PAGE_4K);
585
586 mmu_psize_defs[MMU_PAGE_64K].shift = 16;
587 mmu_psize_defs[MMU_PAGE_64K].ap = 0x5;
588 mmu_psize_defs[MMU_PAGE_64K].h_rpt_pgsize =
589 psize_to_rpti_pgsize(MMU_PAGE_64K);
590 }
591 return;
592 }
593
594 void __init radix__early_init_mmu(void)
595 {
596 unsigned long lpcr;
597
598 #ifdef CONFIG_PPC_64S_HASH_MMU
599 #ifdef CONFIG_PPC_64K_PAGES
600 /* PAGE_SIZE mappings */
601 mmu_virtual_psize = MMU_PAGE_64K;
602 #else
603 mmu_virtual_psize = MMU_PAGE_4K;
604 #endif
605 #endif
606 /*
607 * initialize page table size
608 */
609 __pte_index_size = RADIX_PTE_INDEX_SIZE;
610 __pmd_index_size = RADIX_PMD_INDEX_SIZE;
611 __pud_index_size = RADIX_PUD_INDEX_SIZE;
612 __pgd_index_size = RADIX_PGD_INDEX_SIZE;
613 __pud_cache_index = RADIX_PUD_INDEX_SIZE;
614 __pte_table_size = RADIX_PTE_TABLE_SIZE;
615 __pmd_table_size = RADIX_PMD_TABLE_SIZE;
616 __pud_table_size = RADIX_PUD_TABLE_SIZE;
617 __pgd_table_size = RADIX_PGD_TABLE_SIZE;
618
619 __pmd_val_bits = RADIX_PMD_VAL_BITS;
620 __pud_val_bits = RADIX_PUD_VAL_BITS;
621 __pgd_val_bits = RADIX_PGD_VAL_BITS;
622
623 __kernel_virt_start = RADIX_KERN_VIRT_START;
624 __vmalloc_start = RADIX_VMALLOC_START;
625 __vmalloc_end = RADIX_VMALLOC_END;
626 __kernel_io_start = RADIX_KERN_IO_START;
627 __kernel_io_end = RADIX_KERN_IO_END;
628 vmemmap = (struct page *)RADIX_VMEMMAP_START;
629 ioremap_bot = IOREMAP_BASE;
630
631 #ifdef CONFIG_PCI
632 pci_io_base = ISA_IO_BASE;
633 #endif
634 __pte_frag_nr = RADIX_PTE_FRAG_NR;
635 __pte_frag_size_shift = RADIX_PTE_FRAG_SIZE_SHIFT;
636 __pmd_frag_nr = RADIX_PMD_FRAG_NR;
637 __pmd_frag_size_shift = RADIX_PMD_FRAG_SIZE_SHIFT;
638
639 radix_init_pgtable();
640
641 if (!firmware_has_feature(FW_FEATURE_LPAR)) {
642 lpcr = mfspr(SPRN_LPCR);
643 mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR);
644 radix_init_partition_table();
645 } else {
646 radix_init_pseries();
647 }
648
649 memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE);
650
651 /* Switch to the guard PID before turning on MMU */
652 radix__switch_mmu_context(NULL, &init_mm);
653 tlbiel_all();
654 }
655
656 void radix__early_init_mmu_secondary(void)
657 {
658 unsigned long lpcr;
659 /*
660 * update partition table control register and UPRT
661 */
662 if (!firmware_has_feature(FW_FEATURE_LPAR)) {
663 lpcr = mfspr(SPRN_LPCR);
664 mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR);
665
666 set_ptcr_when_no_uv(__pa(partition_tb) |
667 (PATB_SIZE_SHIFT - 12));
668 }
669
670 radix__switch_mmu_context(NULL, &init_mm);
671 tlbiel_all();
672
673 /* Make sure userspace can't change the AMR */
674 mtspr(SPRN_UAMOR, 0);
675 }
676
677 /* Called during kexec sequence with MMU off */
678 notrace void radix__mmu_cleanup_all(void)
679 {
680 unsigned long lpcr;
681
682 if (!firmware_has_feature(FW_FEATURE_LPAR)) {
683 lpcr = mfspr(SPRN_LPCR);
684 mtspr(SPRN_LPCR, lpcr & ~LPCR_UPRT);
685 set_ptcr_when_no_uv(0);
686 powernv_set_nmmu_ptcr(0);
687 radix__flush_tlb_all();
688 }
689 }
690
691 #ifdef CONFIG_MEMORY_HOTPLUG
692 static void free_pte_table(pte_t *pte_start, pmd_t *pmd)
693 {
694 pte_t *pte;
695 int i;
696
697 for (i = 0; i < PTRS_PER_PTE; i++) {
698 pte = pte_start + i;
699 if (!pte_none(*pte))
700 return;
701 }
702
703 pte_free_kernel(&init_mm, pte_start);
704 pmd_clear(pmd);
705 }
706
707 static void free_pmd_table(pmd_t *pmd_start, pud_t *pud)
708 {
709 pmd_t *pmd;
710 int i;
711
712 for (i = 0; i < PTRS_PER_PMD; i++) {
713 pmd = pmd_start + i;
714 if (!pmd_none(*pmd))
715 return;
716 }
717
718 pmd_free(&init_mm, pmd_start);
719 pud_clear(pud);
720 }
721
722 static void free_pud_table(pud_t *pud_start, p4d_t *p4d)
723 {
724 pud_t *pud;
725 int i;
726
727 for (i = 0; i < PTRS_PER_PUD; i++) {
728 pud = pud_start + i;
729 if (!pud_none(*pud))
730 return;
731 }
732
733 pud_free(&init_mm, pud_start);
734 p4d_clear(p4d);
735 }
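/*
 * The three helpers above free a page-table page only once every entry in
 * it is none, and then clear the pointer in the level above. The TLB flush
 * for the whole range is done by remove_pagetable() after the walk.
 */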
736
737 #ifdef CONFIG_SPARSEMEM_VMEMMAP
738 static bool __meminit vmemmap_pmd_is_unused(unsigned long addr, unsigned long end)
739 {
740 unsigned long start = ALIGN_DOWN(addr, PMD_SIZE);
741
742 return !vmemmap_populated(start, PMD_SIZE);
743 }
744
745 static bool __meminit vmemmap_page_is_unused(unsigned long addr, unsigned long end)
746 {
747 unsigned long start = ALIGN_DOWN(addr, PAGE_SIZE);
748
749 return !vmemmap_populated(start, PAGE_SIZE);
750
751 }
752 #endif
753
754 static void __meminit free_vmemmap_pages(struct page *page,
755 struct vmem_altmap *altmap,
756 int order)
757 {
758 unsigned int nr_pages = 1 << order;
759
760 if (altmap) {
761 unsigned long alt_start, alt_end;
762 unsigned long base_pfn = page_to_pfn(page);
763
764 /*
765 * With 2M vmemmap mappings we can have things set up
766 * such that even though an altmap is specified we never
767 * used the altmap.
768 */
769 alt_start = altmap->base_pfn;
770 alt_end = altmap->base_pfn + altmap->reserve + altmap->free;
771
772 if (base_pfn >= alt_start && base_pfn < alt_end) {
773 vmem_altmap_free(altmap, nr_pages);
774 return;
775 }
776 }
777
778 if (PageReserved(page)) {
779 /* allocated from memblock */
780 while (nr_pages--)
781 free_reserved_page(page++);
782 } else
783 free_pages((unsigned long)page_address(page), order);
784 }
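/*
 * free_vmemmap_pages() has to cope with three backing sources: pages that
 * came from the altmap (returned via vmem_altmap_free()), pages allocated
 * from memblock at boot (PageReserved, returned one by one), and ordinary
 * buddy allocations (returned with free_pages()).
 */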
785
786 static void __meminit remove_pte_table(pte_t *pte_start, unsigned long addr,
787 unsigned long end, bool direct,
788 struct vmem_altmap *altmap)
789 {
790 unsigned long next, pages = 0;
791 pte_t *pte;
792
793 pte = pte_start + pte_index(addr);
794 for (; addr < end; addr = next, pte++) {
795 next = (addr + PAGE_SIZE) & PAGE_MASK;
796 if (next > end)
797 next = end;
798
799 if (!pte_present(*pte))
800 continue;
801
802 if (PAGE_ALIGNED(addr) && PAGE_ALIGNED(next)) {
803 if (!direct)
804 free_vmemmap_pages(pte_page(*pte), altmap, 0);
805 pte_clear(&init_mm, addr, pte);
806 pages++;
807 }
808 #ifdef CONFIG_SPARSEMEM_VMEMMAP
809 else if (!direct && vmemmap_page_is_unused(addr, next)) {
810 free_vmemmap_pages(pte_page(*pte), altmap, 0);
811 pte_clear(&init_mm, addr, pte);
812 }
813 #endif
814 }
815 if (direct)
816 update_page_count(mmu_virtual_psize, -pages);
817 }
818
819 static void __meminit remove_pmd_table(pmd_t *pmd_start, unsigned long addr,
820 unsigned long end, bool direct,
821 struct vmem_altmap *altmap)
822 {
823 unsigned long next, pages = 0;
824 pte_t *pte_base;
825 pmd_t *pmd;
826
827 pmd = pmd_start + pmd_index(addr);
828 for (; addr < end; addr = next, pmd++) {
829 next = pmd_addr_end(addr, end);
830
831 if (!pmd_present(*pmd))
832 continue;
833
834 if (pmd_leaf(*pmd)) {
835 if (IS_ALIGNED(addr, PMD_SIZE) &&
836 IS_ALIGNED(next, PMD_SIZE)) {
837 if (!direct)
838 free_vmemmap_pages(pmd_page(*pmd), altmap, get_order(PMD_SIZE));
839 pte_clear(&init_mm, addr, (pte_t *)pmd);
840 pages++;
841 }
842 #ifdef CONFIG_SPARSEMEM_VMEMMAP
843 else if (!direct && vmemmap_pmd_is_unused(addr, next)) {
844 free_vmemmap_pages(pmd_page(*pmd), altmap, get_order(PMD_SIZE));
845 pte_clear(&init_mm, addr, (pte_t *)pmd);
846 }
847 #endif
848 continue;
849 }
850
851 pte_base = (pte_t *)pmd_page_vaddr(*pmd);
852 remove_pte_table(pte_base, addr, next, direct, altmap);
853 free_pte_table(pte_base, pmd);
854 }
855 if (direct)
856 update_page_count(MMU_PAGE_2M, -pages);
857 }
858
859 static void __meminit remove_pud_table(pud_t *pud_start, unsigned long addr,
860 unsigned long end, bool direct,
861 struct vmem_altmap *altmap)
862 {
863 unsigned long next, pages = 0;
864 pmd_t *pmd_base;
865 pud_t *pud;
866
867 pud = pud_start + pud_index(addr);
868 for (; addr < end; addr = next, pud++) {
869 next = pud_addr_end(addr, end);
870
871 if (!pud_present(*pud))
872 continue;
873
874 if (pud_leaf(*pud)) {
875 if (!IS_ALIGNED(addr, PUD_SIZE) ||
876 !IS_ALIGNED(next, PUD_SIZE)) {
877 WARN_ONCE(1, "%s: unaligned range\n", __func__);
878 continue;
879 }
880 pte_clear(&init_mm, addr, (pte_t *)pud);
881 pages++;
882 continue;
883 }
884
885 pmd_base = pud_pgtable(*pud);
886 remove_pmd_table(pmd_base, addr, next, direct, altmap);
887 free_pmd_table(pmd_base, pud);
888 }
889 if (direct)
890 update_page_count(MMU_PAGE_1G, -pages);
891 }
892
893 static void __meminit
894 remove_pagetable(unsigned long start, unsigned long end, bool direct,
895 struct vmem_altmap *altmap)
896 {
897 unsigned long addr, next;
898 pud_t *pud_base;
899 pgd_t *pgd;
900 p4d_t *p4d;
901
902 spin_lock(&init_mm.page_table_lock);
903
904 for (addr = start; addr < end; addr = next) {
905 next = pgd_addr_end(addr, end);
906
907 pgd = pgd_offset_k(addr);
908 p4d = p4d_offset(pgd, addr);
909 if (!p4d_present(*p4d))
910 continue;
911
912 if (p4d_leaf(*p4d)) {
913 if (!IS_ALIGNED(addr, P4D_SIZE) ||
914 !IS_ALIGNED(next, P4D_SIZE)) {
915 WARN_ONCE(1, "%s: unaligned range\n", __func__);
916 continue;
917 }
918
919 pte_clear(&init_mm, addr, (pte_t *)pgd);
920 continue;
921 }
922
923 pud_base = p4d_pgtable(*p4d);
924 remove_pud_table(pud_base, addr, next, direct, altmap);
925 free_pud_table(pud_base, p4d);
926 }
927
928 spin_unlock(&init_mm.page_table_lock);
929 radix__flush_tlb_kernel_range(start, end);
930 }
931
932 int __meminit radix__create_section_mapping(unsigned long start,
933 unsigned long end, int nid,
934 pgprot_t prot)
935 {
936 if (end >= RADIX_VMALLOC_START) {
937 pr_warn("Outside the supported range\n");
938 return -1;
939 }
940
941 return create_physical_mapping(__pa(start), __pa(end),
942 nid, prot, ~0UL);
943 }
944
945 int __meminit radix__remove_section_mapping(unsigned long start, unsigned long end)
946 {
947 remove_pagetable(start, end, true, NULL);
948 return 0;
949 }
950 #endif /* CONFIG_MEMORY_HOTPLUG */
951
952 #ifdef CONFIG_SPARSEMEM_VMEMMAP
953 static int __map_kernel_page_nid(unsigned long ea, unsigned long pa,
954 pgprot_t flags, unsigned int map_page_size,
955 int nid)
956 {
957 return __map_kernel_page(ea, pa, flags, map_page_size, nid, 0, 0);
958 }
959
960 int __meminit radix__vmemmap_create_mapping(unsigned long start,
961 unsigned long page_size,
962 unsigned long phys)
963 {
964 /* Create a PTE encoding */
965 int nid = early_pfn_to_nid(phys >> PAGE_SHIFT);
966 int ret;
967
968 if ((start + page_size) >= RADIX_VMEMMAP_END) {
969 pr_warn("Outside the supported range\n");
970 return -1;
971 }
972
973 ret = __map_kernel_page_nid(start, phys, PAGE_KERNEL, page_size, nid);
974 BUG_ON(ret);
975
976 return 0;
977 }
978
979
980 bool vmemmap_can_optimize(struct vmem_altmap *altmap, struct dev_pagemap *pgmap)
981 {
982 if (radix_enabled())
983 return __vmemmap_can_optimize(altmap, pgmap);
984
985 return false;
986 }
987
988 int __meminit vmemmap_check_pmd(pmd_t *pmdp, int node,
989 unsigned long addr, unsigned long next)
990 {
991 int large = pmd_leaf(*pmdp);
992
993 if (large)
994 vmemmap_verify(pmdp_ptep(pmdp), node, addr, next);
995
996 return large;
997 }
998
999 void __meminit vmemmap_set_pmd(pmd_t *pmdp, void *p, int node,
1000 unsigned long addr, unsigned long next)
1001 {
1002 pte_t entry;
1003 pte_t *ptep = pmdp_ptep(pmdp);
1004
1005 VM_BUG_ON(!IS_ALIGNED(addr, PMD_SIZE));
1006 entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
1007 set_pte_at(&init_mm, addr, ptep, entry);
1008 asm volatile("ptesync": : :"memory");
1009
1010 vmemmap_verify(ptep, node, addr, next);
1011 }
1012
1013 static pte_t * __meminit radix__vmemmap_pte_populate(pmd_t *pmdp, unsigned long addr,
1014 int node,
1015 struct vmem_altmap *altmap,
1016 struct page *reuse)
1017 {
1018 pte_t *pte = pte_offset_kernel(pmdp, addr);
1019
1020 if (pte_none(*pte)) {
1021 pte_t entry;
1022 void *p;
1023
1024 if (!reuse) {
1025 /*
1026 * make sure we don't create altmap mappings
1027 * covering things outside the device.
1028 */
1029 if (altmap && altmap_cross_boundary(altmap, addr, PAGE_SIZE))
1030 altmap = NULL;
1031
1032 p = vmemmap_alloc_block_buf(PAGE_SIZE, node, altmap);
1033 if (!p && altmap)
1034 p = vmemmap_alloc_block_buf(PAGE_SIZE, node, NULL);
1035 if (!p)
1036 return NULL;
1037 pr_debug("PAGE_SIZE vmemmap mapping\n");
1038 } else {
1039 /*
1040 * When a PTE/PMD entry is freed from the init_mm
1041 * there's a free_pages() call to this page allocated
1042 * above. Thus this get_page() is paired with the
1043 * put_page_testzero() on the freeing path.
1044 * This can only be called by certain ZONE_DEVICE paths,
1045 * and through vmemmap_populate_compound_pages() when
1046 * slab is available.
1047 */
1048 get_page(reuse);
1049 p = page_to_virt(reuse);
1050 pr_debug("Tail page reuse vmemmap mapping\n");
1051 }
1052
1053 VM_BUG_ON(!PAGE_ALIGNED(addr));
1054 entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
1055 set_pte_at(&init_mm, addr, pte, entry);
1056 asm volatile("ptesync": : :"memory");
1057 }
1058 return pte;
1059 }
1060
1061 static inline pud_t *vmemmap_pud_alloc(p4d_t *p4dp, int node,
1062 unsigned long address)
1063 {
1064 pud_t *pud;
1065
1066 /* To keep it simple, all early vmemmap mappings are done at PAGE_SIZE */
1067 if (unlikely(p4d_none(*p4dp))) {
1068 if (unlikely(!slab_is_available())) {
1069 pud = early_alloc_pgtable(PAGE_SIZE, node, 0, 0);
1070 p4d_populate(&init_mm, p4dp, pud);
1071 /* go to the pud_offset */
1072 } else
1073 return pud_alloc(&init_mm, p4dp, address);
1074 }
1075 return pud_offset(p4dp, address);
1076 }
1077
1078 static inline pmd_t *vmemmap_pmd_alloc(pud_t *pudp, int node,
1079 unsigned long address)
1080 {
1081 pmd_t *pmd;
1082
1083 /* To keep it simple, all early vmemmap mappings are done at PAGE_SIZE */
1084 if (unlikely(pud_none(*pudp))) {
1085 if (unlikely(!slab_is_available())) {
1086 pmd = early_alloc_pgtable(PAGE_SIZE, node, 0, 0);
1087 pud_populate(&init_mm, pudp, pmd);
1088 } else
1089 return pmd_alloc(&init_mm, pudp, address);
1090 }
1091 return pmd_offset(pudp, address);
1092 }
1093
1094 static inline pte_t *vmemmap_pte_alloc(pmd_t *pmdp, int node,
1095 unsigned long address)
1096 {
1097 pte_t *pte;
1098
1099 /* To keep it simple, all early vmemmap mappings are done at PAGE_SIZE */
1100 if (unlikely(pmd_none(*pmdp))) {
1101 if (unlikely(!slab_is_available())) {
1102 pte = early_alloc_pgtable(PAGE_SIZE, node, 0, 0);
1103 pmd_populate(&init_mm, pmdp, pte);
1104 } else
1105 return pte_alloc_kernel(pmdp, address);
1106 }
1107 return pte_offset_kernel(pmdp, address);
1108 }
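/*
 * The vmemmap_{pud,pmd,pte}_alloc() helpers above mirror the generic
 * allocators but fall back to early_alloc_pgtable() before slab is up, so
 * the same populate paths work for both early boot and memory hotplug.
 */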
1109
1110
1111
1112 int __meminit radix__vmemmap_populate(unsigned long start, unsigned long end, int node,
1113 struct vmem_altmap *altmap)
1114 {
1115 unsigned long addr;
1116 unsigned long next;
1117 pgd_t *pgd;
1118 p4d_t *p4d;
1119 pud_t *pud;
1120 pmd_t *pmd;
1121 pte_t *pte;
1122
1123 for (addr = start; addr < end; addr = next) {
1124 next = pmd_addr_end(addr, end);
1125
1126 pgd = pgd_offset_k(addr);
1127 p4d = p4d_offset(pgd, addr);
1128 pud = vmemmap_pud_alloc(p4d, node, addr);
1129 if (!pud)
1130 return -ENOMEM;
1131 pmd = vmemmap_pmd_alloc(pud, node, addr);
1132 if (!pmd)
1133 return -ENOMEM;
1134
1135 if (pmd_none(READ_ONCE(*pmd))) {
1136 void *p;
1137
1138 /*
1139 * keep it simple by checking addr PMD_SIZE alignment
1140 * and verifying the device boundary condition.
1141 * For us to use a pmd mapping, both addr and pfn should
1142 * be aligned. We skip if addr is not aligned and for
1143 * pfn we hope we have extra area in the altmap that
1144 * can help to find an aligned block. This can result
1145 * in altmap block allocation failures, in which case
1146 * we fall back to RAM for the vmemmap allocation.
1147 */
1148 if (altmap && (!IS_ALIGNED(addr, PMD_SIZE) ||
1149 altmap_cross_boundary(altmap, addr, PMD_SIZE))) {
1150 /*
1151 * make sure we don't create altmap mappings
1152 * covering things outside the device.
1153 */
1154 goto base_mapping;
1155 }
1156
1157 p = vmemmap_alloc_block_buf(PMD_SIZE, node, altmap);
1158 if (p) {
1159 vmemmap_set_pmd(pmd, p, node, addr, next);
1160 pr_debug("PMD_SIZE vmemmap mapping\n");
1161 continue;
1162 } else if (altmap) {
1163 /*
1164 * A vmemmap block allocation can fail due to
1165 * alignment requirements when we try to align
1166 * things aggressively and thereby run out of
1167 * space. Try base mapping on failure.
1168 */
1169 goto base_mapping;
1170 }
1171 } else if (vmemmap_check_pmd(pmd, node, addr, next)) {
1172 /*
1173 * If a huge mapping exists due to an early call to
1174 * vmemmap_populate(), let's try to use that.
1175 */
1176 continue;
1177 }
1178 base_mapping:
1179 /*
1180 * Not able to allocate higher order memory to back the memmap,
1181 * or we found a pointer to a pte page. Allocate base page
1182 * size vmemmap.
1183 */
1184 pte = vmemmap_pte_alloc(pmd, node, addr);
1185 if (!pte)
1186 return -ENOMEM;
1187
1188 pte = radix__vmemmap_pte_populate(pmd, addr, node, altmap, NULL);
1189 if (!pte)
1190 return -ENOMEM;
1191
1192 vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
1193 next = addr + PAGE_SIZE;
1194 }
1195 return 0;
1196 }
1197
1198 static pte_t * __meminit radix__vmemmap_populate_address(unsigned long addr, int node,
1199 struct vmem_altmap *altmap,
1200 struct page *reuse)
1201 {
1202 pgd_t *pgd;
1203 p4d_t *p4d;
1204 pud_t *pud;
1205 pmd_t *pmd;
1206 pte_t *pte;
1207
1208 pgd = pgd_offset_k(addr);
1209 p4d = p4d_offset(pgd, addr);
1210 pud = vmemmap_pud_alloc(p4d, node, addr);
1211 if (!pud)
1212 return NULL;
1213 pmd = vmemmap_pmd_alloc(pud, node, addr);
1214 if (!pmd)
1215 return NULL;
1216 if (pmd_leaf(*pmd))
1217 /*
1218 * The second page is mapped as a hugepage due to a nearby request.
1219 * Force our mapping to page size without deduplication
1220 */
1221 return NULL;
1222 pte = vmemmap_pte_alloc(pmd, node, addr);
1223 if (!pte)
1224 return NULL;
1225 radix__vmemmap_pte_populate(pmd, addr, node, NULL, NULL);
1226 vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
1227
1228 return pte;
1229 }
1230
1231 static pte_t * __meminit vmemmap_compound_tail_page(unsigned long addr,
1232 unsigned long pfn_offset, int node)
1233 {
1234 pgd_t *pgd;
1235 p4d_t *p4d;
1236 pud_t *pud;
1237 pmd_t *pmd;
1238 pte_t *pte;
1239 unsigned long map_addr;
1240
1241 /* the second vmemmap page which we use for duplication */
1242 map_addr = addr - pfn_offset * sizeof(struct page) + PAGE_SIZE;
1243 pgd = pgd_offset_k(map_addr);
1244 p4d = p4d_offset(pgd, map_addr);
1245 pud = vmemmap_pud_alloc(p4d, node, map_addr);
1246 if (!pud)
1247 return NULL;
1248 pmd = vmemmap_pmd_alloc(pud, node, map_addr);
1249 if (!pmd)
1250 return NULL;
1251 if (pmd_leaf(*pmd))
1252 /*
1253 * The second page is mapped as a hugepage due to a nearby request.
1254 * Force our mapping to page size without deduplication
1255 */
1256 return NULL;
1257 pte = vmemmap_pte_alloc(pmd, node, map_addr);
1258 if (!pte)
1259 return NULL;
1260 /*
1261 * Check if there exists a mapping to the left
1262 */
1263 if (pte_none(*pte)) {
1264 /*
1265 * Populate the head page vmemmap page.
1266 * It can fall in different pmd, hence
1267 * vmemmap_populate_address()
1268 */
1269 pte = radix__vmemmap_populate_address(map_addr - PAGE_SIZE, node, NULL, NULL);
1270 if (!pte)
1271 return NULL;
1272 /*
1273 * Populate the tail pages vmemmap page
1274 */
1275 pte = radix__vmemmap_pte_populate(pmd, map_addr, node, NULL, NULL);
1276 if (!pte)
1277 return NULL;
1278 vmemmap_verify(pte, node, map_addr, map_addr + PAGE_SIZE);
1279 return pte;
1280 }
1281 return pte;
1282 }
1283
1284 int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn,
1285 unsigned long start,
1286 unsigned long end, int node,
1287 struct dev_pagemap *pgmap)
1288 {
1289 /*
1290 * We want to map things with base page size mappings so that
1291 * we can save space in the vmemmap. A huge mapping could
1292 * otherwise cover beyond both edges of the range.
1293 */
1294 unsigned long addr;
1295 unsigned long addr_pfn = start_pfn;
1296 unsigned long next;
1297 pgd_t *pgd;
1298 p4d_t *p4d;
1299 pud_t *pud;
1300 pmd_t *pmd;
1301 pte_t *pte;
1302
1303 for (addr = start; addr < end; addr = next) {
1304
1305 pgd = pgd_offset_k(addr);
1306 p4d = p4d_offset(pgd, addr);
1307 pud = vmemmap_pud_alloc(p4d, node, addr);
1308 if (!pud)
1309 return -ENOMEM;
1310 pmd = vmemmap_pmd_alloc(pud, node, addr);
1311 if (!pmd)
1312 return -ENOMEM;
1313
1314 if (pmd_leaf(READ_ONCE(*pmd))) {
1315 /* existing huge mapping. Skip the range */
1316 addr_pfn += (PMD_SIZE >> PAGE_SHIFT);
1317 next = pmd_addr_end(addr, end);
1318 continue;
1319 }
1320 pte = vmemmap_pte_alloc(pmd, node, addr);
1321 if (!pte)
1322 return -ENOMEM;
1323 if (!pte_none(*pte)) {
1324 /*
1325 * This could be because we already have a compound
1326 * page whose VMEMMAP_RESERVE_NR pages were mapped and
1327 * this request falls within those pages.
1328 */
1329 addr_pfn += 1;
1330 next = addr + PAGE_SIZE;
1331 continue;
1332 } else {
1333 unsigned long nr_pages = pgmap_vmemmap_nr(pgmap);
1334 unsigned long pfn_offset = addr_pfn - ALIGN_DOWN(addr_pfn, nr_pages);
1335 pte_t *tail_page_pte;
1336
1337 /*
1338 * if the address is aligned to huge page size it is the
1339 * head mapping.
1340 */
1341 if (pfn_offset == 0) {
1342 /* Populate the head page vmemmap page */
1343 pte = radix__vmemmap_pte_populate(pmd, addr, node, NULL, NULL);
1344 if (!pte)
1345 return -ENOMEM;
1346 vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
1347
1348 /*
1349 * Populate the tail pages vmemmap page
1350 * It can fall in different pmd, hence
1351 * vmemmap_populate_address()
1352 */
1353 pte = radix__vmemmap_populate_address(addr + PAGE_SIZE, node, NULL, NULL);
1354 if (!pte)
1355 return -ENOMEM;
1356
1357 addr_pfn += 2;
1358 next = addr + 2 * PAGE_SIZE;
1359 continue;
1360 }
1361 /*
1362 * Get the details of the 2nd vmemmap mapping page.
1363 * Also create it if it doesn't exist.
1364 */
1365 tail_page_pte = vmemmap_compound_tail_page(addr, pfn_offset, node);
1366 if (!tail_page_pte) {
1367
1368 pte = radix__vmemmap_pte_populate(pmd, addr, node, NULL, NULL);
1369 if (!pte)
1370 return -ENOMEM;
1371 vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
1372
1373 addr_pfn += 1;
1374 next = addr + PAGE_SIZE;
1375 continue;
1376 }
1377
1378 pte = radix__vmemmap_pte_populate(pmd, addr, node, NULL, pte_page(*tail_page_pte));
1379 if (!pte)
1380 return -ENOMEM;
1381 vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
1382
1383 addr_pfn += 1;
1384 next = addr + PAGE_SIZE;
1385 continue;
1386 }
1387 }
1388 return 0;
1389 }
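/*
 * Rough layout produced above for a compound (e.g. device-dax) page: the
 * head gets two real vmemmap pages (head + first tail), and every later
 * tail PTE is pointed back at that second page via the
 * pte_page(*tail_page_pte) reuse, so most of the vmemmap for a large
 * compound page is deduplicated.
 */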
1390
1391
1392 #ifdef CONFIG_MEMORY_HOTPLUG
1393 void __meminit radix__vmemmap_remove_mapping(unsigned long start, unsigned long page_size)
1394 {
1395 remove_pagetable(start, start + page_size, true, NULL);
1396 }
1397
1398 void __ref radix__vmemmap_free(unsigned long start, unsigned long end,
1399 struct vmem_altmap *altmap)
1400 {
1401 remove_pagetable(start, end, false, altmap);
1402 }
1403 #endif
1404 #endif
1405
1406 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
1407
1408 unsigned long radix__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr,
1409 pmd_t *pmdp, unsigned long clr,
1410 unsigned long set)
1411 {
1412 unsigned long old;
1413
1414 #ifdef CONFIG_DEBUG_VM
1415 WARN_ON(!radix__pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp));
1416 assert_spin_locked(pmd_lockptr(mm, pmdp));
1417 #endif
1418
1419 old = radix__pte_update(mm, addr, pmdp_ptep(pmdp), clr, set, 1);
1420 trace_hugepage_update_pmd(addr, old, clr, set);
1421
1422 return old;
1423 }
1424
1425 unsigned long radix__pud_hugepage_update(struct mm_struct *mm, unsigned long addr,
1426 pud_t *pudp, unsigned long clr,
1427 unsigned long set)
1428 {
1429 unsigned long old;
1430
1431 #ifdef CONFIG_DEBUG_VM
1432 WARN_ON(!pud_devmap(*pudp));
1433 assert_spin_locked(pud_lockptr(mm, pudp));
1434 #endif
1435
1436 old = radix__pte_update(mm, addr, pudp_ptep(pudp), clr, set, 1);
1437 trace_hugepage_update_pud(addr, old, clr, set);
1438
1439 return old;
1440 }
1441
1442 pmd_t radix__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
1443 pmd_t *pmdp)
1444
1445 {
1446 pmd_t pmd;
1447
1448 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
1449 VM_BUG_ON(radix__pmd_trans_huge(*pmdp));
1450 VM_BUG_ON(pmd_devmap(*pmdp));
1451 /*
1452 * khugepaged calls this for normal pmd
1453 */
1454 pmd = *pmdp;
1455 pmd_clear(pmdp);
1456
1457 radix__flush_tlb_collapsed_pmd(vma->vm_mm, address);
1458
1459 return pmd;
1460 }
1461
1462 /*
1463 * For us pgtable_t is pte_t *. In order to save the deposited
1464 * page table, we consider the allocated page table as a list
1465 * head. On withdraw we need to make sure we zero out the used
1466 * list_head memory area.
1467 */
1468 void radix__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
1469 pgtable_t pgtable)
1470 {
1471 struct list_head *lh = (struct list_head *) pgtable;
1472
1473 assert_spin_locked(pmd_lockptr(mm, pmdp));
1474
1475 /* FIFO */
1476 if (!pmd_huge_pte(mm, pmdp))
1477 INIT_LIST_HEAD(lh);
1478 else
1479 list_add(lh, (struct list_head *) pmd_huge_pte(mm, pmdp));
1480 pmd_huge_pte(mm, pmdp) = pgtable;
1481 }
1482
1483 pgtable_t radix__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
1484 {
1485 pte_t *ptep;
1486 pgtable_t pgtable;
1487 struct list_head *lh;
1488
1489 assert_spin_locked(pmd_lockptr(mm, pmdp));
1490
1491 /* FIFO */
1492 pgtable = pmd_huge_pte(mm, pmdp);
1493 lh = (struct list_head *) pgtable;
1494 if (list_empty(lh))
1495 pmd_huge_pte(mm, pmdp) = NULL;
1496 else {
1497 pmd_huge_pte(mm, pmdp) = (pgtable_t) lh->next;
1498 list_del(lh);
1499 }
1500 ptep = (pte_t *) pgtable;
1501 *ptep = __pte(0);
1502 ptep++;
1503 *ptep = __pte(0);
1504 return pgtable;
1505 }
1506
1507 pmd_t radix__pmdp_huge_get_and_clear(struct mm_struct *mm,
1508 unsigned long addr, pmd_t *pmdp)
1509 {
1510 pmd_t old_pmd;
1511 unsigned long old;
1512
1513 old = radix__pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0);
1514 old_pmd = __pmd(old);
1515 return old_pmd;
1516 }
1517
1518 pud_t radix__pudp_huge_get_and_clear(struct mm_struct *mm,
1519 unsigned long addr, pud_t *pudp)
1520 {
1521 pud_t old_pud;
1522 unsigned long old;
1523
1524 old = radix__pud_hugepage_update(mm, addr, pudp, ~0UL, 0);
1525 old_pud = __pud(old);
1526 return old_pud;
1527 }
1528
1529 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
1530
1531 void radix__ptep_set_access_flags(struct vm_area_struct *vma, pte_t *ptep,
1532 pte_t entry, unsigned long address, int psize)
1533 {
1534 struct mm_struct *mm = vma->vm_mm;
1535 unsigned long set = pte_val(entry) & (_PAGE_DIRTY | _PAGE_SOFT_DIRTY |
1536 _PAGE_ACCESSED | _PAGE_RW | _PAGE_EXEC);
1537
1538 unsigned long change = pte_val(entry) ^ pte_val(*ptep);
1539 /*
1540 * On POWER9, the NMMU is not able to relax PTE access permissions
1541 * for a translation it has cached in its TLB. The PTE must be
1542 * invalidated and the TLB flushed before the new PTE is installed.
1543 *
1544 * This only needs to be done for radix, because hash translation does
1545 * flush when updating the linux pte (and we don't support NMMU
1546 * accelerators on HPT on POWER9 anyway XXX: do we?).
1547 *
1548 * POWER10 (and P9P) NMMU does behave as per ISA.
1549 */
1550 if (!cpu_has_feature(CPU_FTR_ARCH_31) && (change & _PAGE_RW) &&
1551 atomic_read(&mm->context.copros) > 0) {
1552 unsigned long old_pte, new_pte;
1553
1554 old_pte = __radix_pte_update(ptep, _PAGE_PRESENT, _PAGE_INVALID);
1555 new_pte = old_pte | set;
1556 radix__flush_tlb_page_psize(mm, address, psize);
1557 __radix_pte_update(ptep, _PAGE_INVALID, new_pte);
1558 } else {
1559 __radix_pte_update(ptep, 0, set);
1560 /*
1561 * Book3S does not require a TLB flush when relaxing access
1562 * restrictions (modulo the POWER9 nest MMU issue above),
1563 * because the MMU will reload the PTE after
1564 * taking an access fault, as defined by the architecture. See
1565 * "Setting a Reference or Change Bit or Upgrading Access
1566 * Authority (PTE Subject to Atomic Hardware Updates)" in
1567 * Power ISA Version 3.1B.
1568 */
1569 }
1570 /* See ptesync comment in radix__set_pte_at */
1571 }
1572
1573 void radix__ptep_modify_prot_commit(struct vm_area_struct *vma,
1574 unsigned long addr, pte_t *ptep,
1575 pte_t old_pte, pte_t pte)
1576 {
1577 struct mm_struct *mm = vma->vm_mm;
1578
1579 /*
1580 * POWER9 NMMU must flush the TLB after clearing the PTE before
1581 * installing a PTE with more relaxed access permissions, see
1582 * radix__ptep_set_access_flags.
1583 */
1584 if (!cpu_has_feature(CPU_FTR_ARCH_31) &&
1585 is_pte_rw_upgrade(pte_val(old_pte), pte_val(pte)) &&
1586 (atomic_read(&mm->context.copros) > 0))
1587 radix__flush_tlb_page(vma, addr);
1588
1589 set_pte_at(mm, addr, ptep, pte);
1590 }
1591
1592 int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot)
1593 {
1594 pte_t *ptep = (pte_t *)pud;
1595 pte_t new_pud = pfn_pte(__phys_to_pfn(addr), prot);
1596
1597 if (!radix_enabled())
1598 return 0;
1599
1600 set_pte_at(&init_mm, 0 /* radix unused */, ptep, new_pud);
1601
1602 return 1;
1603 }
1604
1605 int pud_clear_huge(pud_t *pud)
1606 {
1607 if (pud_leaf(*pud)) {
1608 pud_clear(pud);
1609 return 1;
1610 }
1611
1612 return 0;
1613 }
1614
1615 int pud_free_pmd_page(pud_t *pud, unsigned long addr)
1616 {
1617 pmd_t *pmd;
1618 int i;
1619
1620 pmd = pud_pgtable(*pud);
1621 pud_clear(pud);
1622
1623 flush_tlb_kernel_range(addr, addr + PUD_SIZE);
1624
1625 for (i = 0; i < PTRS_PER_PMD; i++) {
1626 if (!pmd_none(pmd[i])) {
1627 pte_t *pte;
1628 pte = (pte_t *)pmd_page_vaddr(pmd[i]);
1629
1630 pte_free_kernel(&init_mm, pte);
1631 }
1632 }
1633
1634 pmd_free(&init_mm, pmd);
1635
1636 return 1;
1637 }
1638
1639 int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot)
1640 {
1641 pte_t *ptep = (pte_t *)pmd;
1642 pte_t new_pmd = pfn_pte(__phys_to_pfn(addr), prot);
1643
1644 if (!radix_enabled())
1645 return 0;
1646
1647 set_pte_at(&init_mm, 0 /* radix unused */, ptep, new_pmd);
1648
1649 return 1;
1650 }
1651
1652 int pmd_clear_huge(pmd_t *pmd)
1653 {
1654 if (pmd_leaf(*pmd)) {
1655 pmd_clear(pmd);
1656 return 1;
1657 }
1658
1659 return 0;
1660 }
1661
1662 int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
1663 {
1664 pte_t *pte;
1665
1666 pte = (pte_t *)pmd_page_vaddr(*pmd);
1667 pmd_clear(pmd);
1668
1669 flush_tlb_kernel_range(addr, addr + PMD_SIZE);
1670
1671 pte_free_kernel(&init_mm, pte);
1672
1673 return 1;
1674 }
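/*
 * The p{u,m}d_set_huge()/p{u,m}d_clear_huge()/p{u,m}d_free_*_page() helpers
 * above are presumably invoked by the generic huge-vmap/ioremap code; on
 * radix a huge kernel mapping is just a leaf PTE at the PUD or PMD level,
 * so "set" is a plain set_pte_at() and "free" clears the entry, flushes
 * the kernel TLB range and then releases the lower-level tables.
 */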
1675