1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3 * Copyright (C) 2012 - Virtual Open Systems and Columbia University
4 * Author: Christoffer Dall <[email protected]>
5 */
6
7 #include <linux/bug.h>
8 #include <linux/cpu_pm.h>
9 #include <linux/entry-kvm.h>
10 #include <linux/errno.h>
11 #include <linux/err.h>
12 #include <linux/kvm_host.h>
13 #include <linux/list.h>
14 #include <linux/module.h>
15 #include <linux/vmalloc.h>
16 #include <linux/fs.h>
17 #include <linux/mman.h>
18 #include <linux/sched.h>
19 #include <linux/kvm.h>
20 #include <linux/kvm_irqfd.h>
21 #include <linux/irqbypass.h>
22 #include <linux/sched/stat.h>
23 #include <linux/psci.h>
24 #include <trace/events/kvm.h>
25
26 #define CREATE_TRACE_POINTS
27 #include "trace_arm.h"
28
29 #include <linux/uaccess.h>
30 #include <asm/ptrace.h>
31 #include <asm/mman.h>
32 #include <asm/tlbflush.h>
33 #include <asm/cacheflush.h>
34 #include <asm/cpufeature.h>
35 #include <asm/virt.h>
36 #include <asm/kvm_arm.h>
37 #include <asm/kvm_asm.h>
38 #include <asm/kvm_emulate.h>
39 #include <asm/kvm_mmu.h>
40 #include <asm/kvm_nested.h>
41 #include <asm/kvm_pkvm.h>
42 #include <asm/kvm_ptrauth.h>
43 #include <asm/sections.h>
44
45 #include <kvm/arm_hypercalls.h>
46 #include <kvm/arm_pmu.h>
47 #include <kvm/arm_psci.h>
48
49 #include "sys_regs.h"
50
51 static enum kvm_mode kvm_mode = KVM_MODE_DEFAULT;
52
53 enum kvm_wfx_trap_policy {
54 KVM_WFX_NOTRAP_SINGLE_TASK, /* Default option */
55 KVM_WFX_NOTRAP,
56 KVM_WFX_TRAP,
57 };
58
59 static enum kvm_wfx_trap_policy kvm_wfi_trap_policy __read_mostly = KVM_WFX_NOTRAP_SINGLE_TASK;
60 static enum kvm_wfx_trap_policy kvm_wfe_trap_policy __read_mostly = KVM_WFX_NOTRAP_SINGLE_TASK;
61
62 DECLARE_KVM_HYP_PER_CPU(unsigned long, kvm_hyp_vector);
63
64 DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_base);
65 DECLARE_KVM_NVHE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params);
66
67 DECLARE_KVM_NVHE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt);
68
69 static bool vgic_present, kvm_arm_initialised;
70
71 static DEFINE_PER_CPU(unsigned char, kvm_hyp_initialized);
72
is_kvm_arm_initialised(void)73 bool is_kvm_arm_initialised(void)
74 {
75 return kvm_arm_initialised;
76 }
77
kvm_arch_vcpu_should_kick(struct kvm_vcpu * vcpu)78 int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
79 {
80 return kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE;
81 }
82
kvm_vm_ioctl_enable_cap(struct kvm * kvm,struct kvm_enable_cap * cap)83 int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
84 struct kvm_enable_cap *cap)
85 {
86 int r = -EINVAL;
87
88 if (cap->flags)
89 return -EINVAL;
90
91 if (kvm_vm_is_protected(kvm) && !kvm_pvm_ext_allowed(cap->cap))
92 return -EINVAL;
93
94 switch (cap->cap) {
95 case KVM_CAP_ARM_NISV_TO_USER:
96 r = 0;
97 set_bit(KVM_ARCH_FLAG_RETURN_NISV_IO_ABORT_TO_USER,
98 &kvm->arch.flags);
99 break;
100 case KVM_CAP_ARM_MTE:
101 mutex_lock(&kvm->lock);
102 if (system_supports_mte() && !kvm->created_vcpus) {
103 r = 0;
104 set_bit(KVM_ARCH_FLAG_MTE_ENABLED, &kvm->arch.flags);
105 }
106 mutex_unlock(&kvm->lock);
107 break;
108 case KVM_CAP_ARM_SYSTEM_SUSPEND:
109 r = 0;
110 set_bit(KVM_ARCH_FLAG_SYSTEM_SUSPEND_ENABLED, &kvm->arch.flags);
111 break;
112 case KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE:
113 mutex_lock(&kvm->slots_lock);
114 /*
115 * To keep things simple, allow changing the chunk
116 * size only when no memory slots have been created.
117 */
118 if (kvm_are_all_memslots_empty(kvm)) {
119 u64 new_cap = cap->args[0];
120
121 if (!new_cap || kvm_is_block_size_supported(new_cap)) {
122 r = 0;
123 kvm->arch.mmu.split_page_chunk_size = new_cap;
124 }
125 }
126 mutex_unlock(&kvm->slots_lock);
127 break;
128 default:
129 break;
130 }
131
132 return r;
133 }
134
kvm_arm_default_max_vcpus(void)135 static int kvm_arm_default_max_vcpus(void)
136 {
137 return vgic_present ? kvm_vgic_get_max_vcpus() : KVM_MAX_VCPUS;
138 }
139
140 /**
141 * kvm_arch_init_vm - initializes a VM data structure
142 * @kvm: pointer to the KVM struct
143 * @type: kvm device type
144 */
kvm_arch_init_vm(struct kvm * kvm,unsigned long type)145 int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
146 {
147 int ret;
148
149 mutex_init(&kvm->arch.config_lock);
150
151 #ifdef CONFIG_LOCKDEP
152 /* Clue in lockdep that the config_lock must be taken inside kvm->lock */
153 mutex_lock(&kvm->lock);
154 mutex_lock(&kvm->arch.config_lock);
155 mutex_unlock(&kvm->arch.config_lock);
156 mutex_unlock(&kvm->lock);
157 #endif
158
159 kvm_init_nested(kvm);
160
161 ret = kvm_share_hyp(kvm, kvm + 1);
162 if (ret)
163 return ret;
164
165 ret = pkvm_init_host_vm(kvm);
166 if (ret)
167 goto err_unshare_kvm;
168
169 if (!zalloc_cpumask_var(&kvm->arch.supported_cpus, GFP_KERNEL_ACCOUNT)) {
170 ret = -ENOMEM;
171 goto err_unshare_kvm;
172 }
173 cpumask_copy(kvm->arch.supported_cpus, cpu_possible_mask);
174
175 ret = kvm_init_stage2_mmu(kvm, &kvm->arch.mmu, type);
176 if (ret)
177 goto err_free_cpumask;
178
179 kvm_vgic_early_init(kvm);
180
181 kvm_timer_init_vm(kvm);
182
183 /* The maximum number of VCPUs is limited by the host's GIC model */
184 kvm->max_vcpus = kvm_arm_default_max_vcpus();
185
186 kvm_arm_init_hypercalls(kvm);
187
188 bitmap_zero(kvm->arch.vcpu_features, KVM_VCPU_MAX_FEATURES);
189
190 return 0;
191
192 err_free_cpumask:
193 free_cpumask_var(kvm->arch.supported_cpus);
194 err_unshare_kvm:
195 kvm_unshare_hyp(kvm, kvm + 1);
196 return ret;
197 }
198
kvm_arch_vcpu_fault(struct kvm_vcpu * vcpu,struct vm_fault * vmf)199 vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
200 {
201 return VM_FAULT_SIGBUS;
202 }
203
kvm_arch_create_vm_debugfs(struct kvm * kvm)204 void kvm_arch_create_vm_debugfs(struct kvm *kvm)
205 {
206 kvm_sys_regs_create_debugfs(kvm);
207 kvm_s2_ptdump_create_debugfs(kvm);
208 }
209
kvm_destroy_mpidr_data(struct kvm * kvm)210 static void kvm_destroy_mpidr_data(struct kvm *kvm)
211 {
212 struct kvm_mpidr_data *data;
213
214 mutex_lock(&kvm->arch.config_lock);
215
216 data = rcu_dereference_protected(kvm->arch.mpidr_data,
217 lockdep_is_held(&kvm->arch.config_lock));
218 if (data) {
219 rcu_assign_pointer(kvm->arch.mpidr_data, NULL);
220 synchronize_rcu();
221 kfree(data);
222 }
223
224 mutex_unlock(&kvm->arch.config_lock);
225 }
226
227 /**
228 * kvm_arch_destroy_vm - destroy the VM data structure
229 * @kvm: pointer to the KVM struct
230 */
kvm_arch_destroy_vm(struct kvm * kvm)231 void kvm_arch_destroy_vm(struct kvm *kvm)
232 {
233 bitmap_free(kvm->arch.pmu_filter);
234 free_cpumask_var(kvm->arch.supported_cpus);
235
236 kvm_vgic_destroy(kvm);
237
238 if (is_protected_kvm_enabled())
239 pkvm_destroy_hyp_vm(kvm);
240
241 kvm_destroy_mpidr_data(kvm);
242
243 kfree(kvm->arch.sysreg_masks);
244 kvm_destroy_vcpus(kvm);
245
246 kvm_unshare_hyp(kvm, kvm + 1);
247
248 kvm_arm_teardown_hypercalls(kvm);
249 }
250
kvm_has_full_ptr_auth(void)251 static bool kvm_has_full_ptr_auth(void)
252 {
253 bool apa, gpa, api, gpi, apa3, gpa3;
254 u64 isar1, isar2, val;
255
256 /*
257 * Check that:
258 *
259 * - both Address and Generic auth are implemented for a given
260 * algorithm (Q5, IMPDEF or Q3)
261 * - only a single algorithm is implemented.
262 */
263 if (!system_has_full_ptr_auth())
264 return false;
265
266 isar1 = read_sanitised_ftr_reg(SYS_ID_AA64ISAR1_EL1);
267 isar2 = read_sanitised_ftr_reg(SYS_ID_AA64ISAR2_EL1);
268
269 apa = !!FIELD_GET(ID_AA64ISAR1_EL1_APA_MASK, isar1);
270 val = FIELD_GET(ID_AA64ISAR1_EL1_GPA_MASK, isar1);
271 gpa = (val == ID_AA64ISAR1_EL1_GPA_IMP);
272
273 api = !!FIELD_GET(ID_AA64ISAR1_EL1_API_MASK, isar1);
274 val = FIELD_GET(ID_AA64ISAR1_EL1_GPI_MASK, isar1);
275 gpi = (val == ID_AA64ISAR1_EL1_GPI_IMP);
276
277 apa3 = !!FIELD_GET(ID_AA64ISAR2_EL1_APA3_MASK, isar2);
278 val = FIELD_GET(ID_AA64ISAR2_EL1_GPA3_MASK, isar2);
279 gpa3 = (val == ID_AA64ISAR2_EL1_GPA3_IMP);
280
281 return (apa == gpa && api == gpi && apa3 == gpa3 &&
282 (apa + api + apa3) == 1);
283 }
284
kvm_vm_ioctl_check_extension(struct kvm * kvm,long ext)285 int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
286 {
287 int r;
288
289 if (kvm && kvm_vm_is_protected(kvm) && !kvm_pvm_ext_allowed(ext))
290 return 0;
291
292 switch (ext) {
293 case KVM_CAP_IRQCHIP:
294 r = vgic_present;
295 break;
296 case KVM_CAP_IOEVENTFD:
297 case KVM_CAP_USER_MEMORY:
298 case KVM_CAP_SYNC_MMU:
299 case KVM_CAP_DESTROY_MEMORY_REGION_WORKS:
300 case KVM_CAP_ONE_REG:
301 case KVM_CAP_ARM_PSCI:
302 case KVM_CAP_ARM_PSCI_0_2:
303 case KVM_CAP_READONLY_MEM:
304 case KVM_CAP_MP_STATE:
305 case KVM_CAP_IMMEDIATE_EXIT:
306 case KVM_CAP_VCPU_EVENTS:
307 case KVM_CAP_ARM_IRQ_LINE_LAYOUT_2:
308 case KVM_CAP_ARM_NISV_TO_USER:
309 case KVM_CAP_ARM_INJECT_EXT_DABT:
310 case KVM_CAP_SET_GUEST_DEBUG:
311 case KVM_CAP_VCPU_ATTRIBUTES:
312 case KVM_CAP_PTP_KVM:
313 case KVM_CAP_ARM_SYSTEM_SUSPEND:
314 case KVM_CAP_IRQFD_RESAMPLE:
315 case KVM_CAP_COUNTER_OFFSET:
316 r = 1;
317 break;
318 case KVM_CAP_SET_GUEST_DEBUG2:
319 return KVM_GUESTDBG_VALID_MASK;
320 case KVM_CAP_ARM_SET_DEVICE_ADDR:
321 r = 1;
322 break;
323 case KVM_CAP_NR_VCPUS:
324 /*
325 * ARM64 treats KVM_CAP_NR_CPUS differently from all other
326 * architectures, as it does not always bound it to
327 * KVM_CAP_MAX_VCPUS. It should not matter much because
328 * this is just an advisory value.
329 */
330 r = min_t(unsigned int, num_online_cpus(),
331 kvm_arm_default_max_vcpus());
332 break;
333 case KVM_CAP_MAX_VCPUS:
334 case KVM_CAP_MAX_VCPU_ID:
335 if (kvm)
336 r = kvm->max_vcpus;
337 else
338 r = kvm_arm_default_max_vcpus();
339 break;
340 case KVM_CAP_MSI_DEVID:
341 if (!kvm)
342 r = -EINVAL;
343 else
344 r = kvm->arch.vgic.msis_require_devid;
345 break;
346 case KVM_CAP_ARM_USER_IRQ:
347 /*
348 * 1: EL1_VTIMER, EL1_PTIMER, and PMU.
349 * (bump this number if adding more devices)
350 */
351 r = 1;
352 break;
353 case KVM_CAP_ARM_MTE:
354 r = system_supports_mte();
355 break;
356 case KVM_CAP_STEAL_TIME:
357 r = kvm_arm_pvtime_supported();
358 break;
359 case KVM_CAP_ARM_EL1_32BIT:
360 r = cpus_have_final_cap(ARM64_HAS_32BIT_EL1);
361 break;
362 case KVM_CAP_GUEST_DEBUG_HW_BPS:
363 r = get_num_brps();
364 break;
365 case KVM_CAP_GUEST_DEBUG_HW_WPS:
366 r = get_num_wrps();
367 break;
368 case KVM_CAP_ARM_PMU_V3:
369 r = kvm_arm_support_pmu_v3();
370 break;
371 case KVM_CAP_ARM_INJECT_SERROR_ESR:
372 r = cpus_have_final_cap(ARM64_HAS_RAS_EXTN);
373 break;
374 case KVM_CAP_ARM_VM_IPA_SIZE:
375 r = get_kvm_ipa_limit();
376 break;
377 case KVM_CAP_ARM_SVE:
378 r = system_supports_sve();
379 break;
380 case KVM_CAP_ARM_PTRAUTH_ADDRESS:
381 case KVM_CAP_ARM_PTRAUTH_GENERIC:
382 r = kvm_has_full_ptr_auth();
383 break;
384 case KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE:
385 if (kvm)
386 r = kvm->arch.mmu.split_page_chunk_size;
387 else
388 r = KVM_ARM_EAGER_SPLIT_CHUNK_SIZE_DEFAULT;
389 break;
390 case KVM_CAP_ARM_SUPPORTED_BLOCK_SIZES:
391 r = kvm_supported_block_sizes();
392 break;
393 case KVM_CAP_ARM_SUPPORTED_REG_MASK_RANGES:
394 r = BIT(0);
395 break;
396 default:
397 r = 0;
398 }
399
400 return r;
401 }
402
kvm_arch_dev_ioctl(struct file * filp,unsigned int ioctl,unsigned long arg)403 long kvm_arch_dev_ioctl(struct file *filp,
404 unsigned int ioctl, unsigned long arg)
405 {
406 return -EINVAL;
407 }
408
kvm_arch_alloc_vm(void)409 struct kvm *kvm_arch_alloc_vm(void)
410 {
411 size_t sz = sizeof(struct kvm);
412
413 if (!has_vhe())
414 return kzalloc(sz, GFP_KERNEL_ACCOUNT);
415
416 return __vmalloc(sz, GFP_KERNEL_ACCOUNT | __GFP_HIGHMEM | __GFP_ZERO);
417 }
418
kvm_arch_vcpu_precreate(struct kvm * kvm,unsigned int id)419 int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id)
420 {
421 if (irqchip_in_kernel(kvm) && vgic_initialized(kvm))
422 return -EBUSY;
423
424 if (id >= kvm->max_vcpus)
425 return -EINVAL;
426
427 return 0;
428 }
429
kvm_arch_vcpu_create(struct kvm_vcpu * vcpu)430 int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
431 {
432 int err;
433
434 spin_lock_init(&vcpu->arch.mp_state_lock);
435
436 #ifdef CONFIG_LOCKDEP
437 /* Inform lockdep that the config_lock is acquired after vcpu->mutex */
438 mutex_lock(&vcpu->mutex);
439 mutex_lock(&vcpu->kvm->arch.config_lock);
440 mutex_unlock(&vcpu->kvm->arch.config_lock);
441 mutex_unlock(&vcpu->mutex);
442 #endif
443
444 /* Force users to call KVM_ARM_VCPU_INIT */
445 vcpu_clear_flag(vcpu, VCPU_INITIALIZED);
446
447 vcpu->arch.mmu_page_cache.gfp_zero = __GFP_ZERO;
448
449 /* Set up the timer */
450 kvm_timer_vcpu_init(vcpu);
451
452 kvm_pmu_vcpu_init(vcpu);
453
454 kvm_arm_pvtime_vcpu_init(&vcpu->arch);
455
456 vcpu->arch.hw_mmu = &vcpu->kvm->arch.mmu;
457
458 /*
459 * This vCPU may have been created after mpidr_data was initialized.
460 * Throw out the pre-computed mappings if that is the case which forces
461 * KVM to fall back to iteratively searching the vCPUs.
462 */
463 kvm_destroy_mpidr_data(vcpu->kvm);
464
465 err = kvm_vgic_vcpu_init(vcpu);
466 if (err)
467 return err;
468
469 err = kvm_share_hyp(vcpu, vcpu + 1);
470 if (err)
471 kvm_vgic_vcpu_destroy(vcpu);
472
473 return err;
474 }
475
kvm_arch_vcpu_postcreate(struct kvm_vcpu * vcpu)476 void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
477 {
478 }
479
kvm_arch_vcpu_destroy(struct kvm_vcpu * vcpu)480 void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
481 {
482 if (!is_protected_kvm_enabled())
483 kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_cache);
484 else
485 free_hyp_memcache(&vcpu->arch.pkvm_memcache);
486 kvm_timer_vcpu_terminate(vcpu);
487 kvm_pmu_vcpu_destroy(vcpu);
488 kvm_vgic_vcpu_destroy(vcpu);
489 kvm_arm_vcpu_destroy(vcpu);
490 }
491
kvm_arch_vcpu_blocking(struct kvm_vcpu * vcpu)492 void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu)
493 {
494
495 }
496
kvm_arch_vcpu_unblocking(struct kvm_vcpu * vcpu)497 void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu)
498 {
499
500 }
501
vcpu_set_pauth_traps(struct kvm_vcpu * vcpu)502 static void vcpu_set_pauth_traps(struct kvm_vcpu *vcpu)
503 {
504 if (vcpu_has_ptrauth(vcpu) && !is_protected_kvm_enabled()) {
505 /*
506 * Either we're running an L2 guest, and the API/APK bits come
507 * from L1's HCR_EL2, or API/APK are both set.
508 */
509 if (unlikely(vcpu_has_nv(vcpu) && !is_hyp_ctxt(vcpu))) {
510 u64 val;
511
512 val = __vcpu_sys_reg(vcpu, HCR_EL2);
513 val &= (HCR_API | HCR_APK);
514 vcpu->arch.hcr_el2 &= ~(HCR_API | HCR_APK);
515 vcpu->arch.hcr_el2 |= val;
516 } else {
517 vcpu->arch.hcr_el2 |= (HCR_API | HCR_APK);
518 }
519
520 /*
521 * Save the host keys if there is any chance for the guest
522 * to use pauth, as the entry code will reload the guest
523 * keys in that case.
524 */
525 if (vcpu->arch.hcr_el2 & (HCR_API | HCR_APK)) {
526 struct kvm_cpu_context *ctxt;
527
528 ctxt = this_cpu_ptr_hyp_sym(kvm_hyp_ctxt);
529 ptrauth_save_keys(ctxt);
530 }
531 }
532 }
533
kvm_vcpu_should_clear_twi(struct kvm_vcpu * vcpu)534 static bool kvm_vcpu_should_clear_twi(struct kvm_vcpu *vcpu)
535 {
536 if (unlikely(kvm_wfi_trap_policy != KVM_WFX_NOTRAP_SINGLE_TASK))
537 return kvm_wfi_trap_policy == KVM_WFX_NOTRAP;
538
539 return single_task_running() &&
540 (atomic_read(&vcpu->arch.vgic_cpu.vgic_v3.its_vpe.vlpi_count) ||
541 vcpu->kvm->arch.vgic.nassgireq);
542 }
543
kvm_vcpu_should_clear_twe(struct kvm_vcpu * vcpu)544 static bool kvm_vcpu_should_clear_twe(struct kvm_vcpu *vcpu)
545 {
546 if (unlikely(kvm_wfe_trap_policy != KVM_WFX_NOTRAP_SINGLE_TASK))
547 return kvm_wfe_trap_policy == KVM_WFX_NOTRAP;
548
549 return single_task_running();
550 }
551
kvm_arch_vcpu_load(struct kvm_vcpu * vcpu,int cpu)552 void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
553 {
554 struct kvm_s2_mmu *mmu;
555 int *last_ran;
556
557 if (is_protected_kvm_enabled())
558 goto nommu;
559
560 if (vcpu_has_nv(vcpu))
561 kvm_vcpu_load_hw_mmu(vcpu);
562
563 mmu = vcpu->arch.hw_mmu;
564 last_ran = this_cpu_ptr(mmu->last_vcpu_ran);
565
566 /*
567 * Ensure a VMID is allocated for the MMU before programming VTTBR_EL2,
568 * which happens eagerly in VHE.
569 *
570 * Also, the VMID allocator only preserves VMIDs that are active at the
571 * time of rollover, so KVM might need to grab a new VMID for the MMU if
572 * this is called from kvm_sched_in().
573 */
574 kvm_arm_vmid_update(&mmu->vmid);
575
576 /*
577 * We guarantee that both TLBs and I-cache are private to each
578 * vcpu. If detecting that a vcpu from the same VM has
579 * previously run on the same physical CPU, call into the
580 * hypervisor code to nuke the relevant contexts.
581 *
582 * We might get preempted before the vCPU actually runs, but
583 * over-invalidation doesn't affect correctness.
584 */
585 if (*last_ran != vcpu->vcpu_idx) {
586 kvm_call_hyp(__kvm_flush_cpu_context, mmu);
587 *last_ran = vcpu->vcpu_idx;
588 }
589
590 nommu:
591 vcpu->cpu = cpu;
592
593 kvm_vgic_load(vcpu);
594 kvm_timer_vcpu_load(vcpu);
595 kvm_vcpu_load_debug(vcpu);
596 if (has_vhe())
597 kvm_vcpu_load_vhe(vcpu);
598 kvm_arch_vcpu_load_fp(vcpu);
599 kvm_vcpu_pmu_restore_guest(vcpu);
600 if (kvm_arm_is_pvtime_enabled(&vcpu->arch))
601 kvm_make_request(KVM_REQ_RECORD_STEAL, vcpu);
602
603 if (kvm_vcpu_should_clear_twe(vcpu))
604 vcpu->arch.hcr_el2 &= ~HCR_TWE;
605 else
606 vcpu->arch.hcr_el2 |= HCR_TWE;
607
608 if (kvm_vcpu_should_clear_twi(vcpu))
609 vcpu->arch.hcr_el2 &= ~HCR_TWI;
610 else
611 vcpu->arch.hcr_el2 |= HCR_TWI;
612
613 vcpu_set_pauth_traps(vcpu);
614
615 if (is_protected_kvm_enabled()) {
616 kvm_call_hyp_nvhe(__pkvm_vcpu_load,
617 vcpu->kvm->arch.pkvm.handle,
618 vcpu->vcpu_idx, vcpu->arch.hcr_el2);
619 kvm_call_hyp(__vgic_v3_restore_vmcr_aprs,
620 &vcpu->arch.vgic_cpu.vgic_v3);
621 }
622
623 if (!cpumask_test_cpu(cpu, vcpu->kvm->arch.supported_cpus))
624 vcpu_set_on_unsupported_cpu(vcpu);
625 }
626
kvm_arch_vcpu_put(struct kvm_vcpu * vcpu)627 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
628 {
629 if (is_protected_kvm_enabled()) {
630 kvm_call_hyp(__vgic_v3_save_vmcr_aprs,
631 &vcpu->arch.vgic_cpu.vgic_v3);
632 kvm_call_hyp_nvhe(__pkvm_vcpu_put);
633 }
634
635 kvm_vcpu_put_debug(vcpu);
636 kvm_arch_vcpu_put_fp(vcpu);
637 if (has_vhe())
638 kvm_vcpu_put_vhe(vcpu);
639 kvm_timer_vcpu_put(vcpu);
640 kvm_vgic_put(vcpu);
641 kvm_vcpu_pmu_restore_host(vcpu);
642 if (vcpu_has_nv(vcpu))
643 kvm_vcpu_put_hw_mmu(vcpu);
644 kvm_arm_vmid_clear_active();
645
646 vcpu_clear_on_unsupported_cpu(vcpu);
647 vcpu->cpu = -1;
648 }
649
__kvm_arm_vcpu_power_off(struct kvm_vcpu * vcpu)650 static void __kvm_arm_vcpu_power_off(struct kvm_vcpu *vcpu)
651 {
652 WRITE_ONCE(vcpu->arch.mp_state.mp_state, KVM_MP_STATE_STOPPED);
653 kvm_make_request(KVM_REQ_SLEEP, vcpu);
654 kvm_vcpu_kick(vcpu);
655 }
656
kvm_arm_vcpu_power_off(struct kvm_vcpu * vcpu)657 void kvm_arm_vcpu_power_off(struct kvm_vcpu *vcpu)
658 {
659 spin_lock(&vcpu->arch.mp_state_lock);
660 __kvm_arm_vcpu_power_off(vcpu);
661 spin_unlock(&vcpu->arch.mp_state_lock);
662 }
663
kvm_arm_vcpu_stopped(struct kvm_vcpu * vcpu)664 bool kvm_arm_vcpu_stopped(struct kvm_vcpu *vcpu)
665 {
666 return READ_ONCE(vcpu->arch.mp_state.mp_state) == KVM_MP_STATE_STOPPED;
667 }
668
kvm_arm_vcpu_suspend(struct kvm_vcpu * vcpu)669 static void kvm_arm_vcpu_suspend(struct kvm_vcpu *vcpu)
670 {
671 WRITE_ONCE(vcpu->arch.mp_state.mp_state, KVM_MP_STATE_SUSPENDED);
672 kvm_make_request(KVM_REQ_SUSPEND, vcpu);
673 kvm_vcpu_kick(vcpu);
674 }
675
kvm_arm_vcpu_suspended(struct kvm_vcpu * vcpu)676 static bool kvm_arm_vcpu_suspended(struct kvm_vcpu *vcpu)
677 {
678 return READ_ONCE(vcpu->arch.mp_state.mp_state) == KVM_MP_STATE_SUSPENDED;
679 }
680
kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu * vcpu,struct kvm_mp_state * mp_state)681 int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
682 struct kvm_mp_state *mp_state)
683 {
684 *mp_state = READ_ONCE(vcpu->arch.mp_state);
685
686 return 0;
687 }
688
kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu * vcpu,struct kvm_mp_state * mp_state)689 int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
690 struct kvm_mp_state *mp_state)
691 {
692 int ret = 0;
693
694 spin_lock(&vcpu->arch.mp_state_lock);
695
696 switch (mp_state->mp_state) {
697 case KVM_MP_STATE_RUNNABLE:
698 WRITE_ONCE(vcpu->arch.mp_state, *mp_state);
699 break;
700 case KVM_MP_STATE_STOPPED:
701 __kvm_arm_vcpu_power_off(vcpu);
702 break;
703 case KVM_MP_STATE_SUSPENDED:
704 kvm_arm_vcpu_suspend(vcpu);
705 break;
706 default:
707 ret = -EINVAL;
708 }
709
710 spin_unlock(&vcpu->arch.mp_state_lock);
711
712 return ret;
713 }
714
715 /**
716 * kvm_arch_vcpu_runnable - determine if the vcpu can be scheduled
717 * @v: The VCPU pointer
718 *
719 * If the guest CPU is not waiting for interrupts or an interrupt line is
720 * asserted, the CPU is by definition runnable.
721 */
kvm_arch_vcpu_runnable(struct kvm_vcpu * v)722 int kvm_arch_vcpu_runnable(struct kvm_vcpu *v)
723 {
724 bool irq_lines = *vcpu_hcr(v) & (HCR_VI | HCR_VF);
725 return ((irq_lines || kvm_vgic_vcpu_pending_irq(v))
726 && !kvm_arm_vcpu_stopped(v) && !v->arch.pause);
727 }
728
kvm_arch_vcpu_in_kernel(struct kvm_vcpu * vcpu)729 bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu)
730 {
731 return vcpu_mode_priv(vcpu);
732 }
733
734 #ifdef CONFIG_GUEST_PERF_EVENTS
kvm_arch_vcpu_get_ip(struct kvm_vcpu * vcpu)735 unsigned long kvm_arch_vcpu_get_ip(struct kvm_vcpu *vcpu)
736 {
737 return *vcpu_pc(vcpu);
738 }
739 #endif
740
kvm_init_mpidr_data(struct kvm * kvm)741 static void kvm_init_mpidr_data(struct kvm *kvm)
742 {
743 struct kvm_mpidr_data *data = NULL;
744 unsigned long c, mask, nr_entries;
745 u64 aff_set = 0, aff_clr = ~0UL;
746 struct kvm_vcpu *vcpu;
747
748 mutex_lock(&kvm->arch.config_lock);
749
750 if (rcu_access_pointer(kvm->arch.mpidr_data) ||
751 atomic_read(&kvm->online_vcpus) == 1)
752 goto out;
753
754 kvm_for_each_vcpu(c, vcpu, kvm) {
755 u64 aff = kvm_vcpu_get_mpidr_aff(vcpu);
756 aff_set |= aff;
757 aff_clr &= aff;
758 }
759
760 /*
761 * A significant bit can be either 0 or 1, and will only appear in
762 * aff_set. Use aff_clr to weed out the useless stuff.
763 */
764 mask = aff_set ^ aff_clr;
765 nr_entries = BIT_ULL(hweight_long(mask));
766
767 /*
768 * Don't let userspace fool us. If we need more than a single page
769 * to describe the compressed MPIDR array, just fall back to the
770 * iterative method. Single vcpu VMs do not need this either.
771 */
772 if (struct_size(data, cmpidr_to_idx, nr_entries) <= PAGE_SIZE)
773 data = kzalloc(struct_size(data, cmpidr_to_idx, nr_entries),
774 GFP_KERNEL_ACCOUNT);
775
776 if (!data)
777 goto out;
778
779 data->mpidr_mask = mask;
780
781 kvm_for_each_vcpu(c, vcpu, kvm) {
782 u64 aff = kvm_vcpu_get_mpidr_aff(vcpu);
783 u16 index = kvm_mpidr_index(data, aff);
784
785 data->cmpidr_to_idx[index] = c;
786 }
787
788 rcu_assign_pointer(kvm->arch.mpidr_data, data);
789 out:
790 mutex_unlock(&kvm->arch.config_lock);
791 }
792
793 /*
794 * Handle both the initialisation that is being done when the vcpu is
795 * run for the first time, as well as the updates that must be
796 * performed each time we get a new thread dealing with this vcpu.
797 */
kvm_arch_vcpu_run_pid_change(struct kvm_vcpu * vcpu)798 int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu)
799 {
800 struct kvm *kvm = vcpu->kvm;
801 int ret;
802
803 if (!kvm_vcpu_initialized(vcpu))
804 return -ENOEXEC;
805
806 if (!kvm_arm_vcpu_is_finalized(vcpu))
807 return -EPERM;
808
809 ret = kvm_arch_vcpu_run_map_fp(vcpu);
810 if (ret)
811 return ret;
812
813 if (likely(vcpu_has_run_once(vcpu)))
814 return 0;
815
816 kvm_init_mpidr_data(kvm);
817
818 if (likely(irqchip_in_kernel(kvm))) {
819 /*
820 * Map the VGIC hardware resources before running a vcpu the
821 * first time on this VM.
822 */
823 ret = kvm_vgic_map_resources(kvm);
824 if (ret)
825 return ret;
826 }
827
828 ret = kvm_finalize_sys_regs(vcpu);
829 if (ret)
830 return ret;
831
832 /*
833 * This needs to happen after any restriction has been applied
834 * to the feature set.
835 */
836 kvm_calculate_traps(vcpu);
837
838 ret = kvm_timer_enable(vcpu);
839 if (ret)
840 return ret;
841
842 ret = kvm_arm_pmu_v3_enable(vcpu);
843 if (ret)
844 return ret;
845
846 if (is_protected_kvm_enabled()) {
847 ret = pkvm_create_hyp_vm(kvm);
848 if (ret)
849 return ret;
850 }
851
852 mutex_lock(&kvm->arch.config_lock);
853 set_bit(KVM_ARCH_FLAG_HAS_RAN_ONCE, &kvm->arch.flags);
854 mutex_unlock(&kvm->arch.config_lock);
855
856 return ret;
857 }
858
kvm_arch_intc_initialized(struct kvm * kvm)859 bool kvm_arch_intc_initialized(struct kvm *kvm)
860 {
861 return vgic_initialized(kvm);
862 }
863
kvm_arm_halt_guest(struct kvm * kvm)864 void kvm_arm_halt_guest(struct kvm *kvm)
865 {
866 unsigned long i;
867 struct kvm_vcpu *vcpu;
868
869 kvm_for_each_vcpu(i, vcpu, kvm)
870 vcpu->arch.pause = true;
871 kvm_make_all_cpus_request(kvm, KVM_REQ_SLEEP);
872 }
873
kvm_arm_resume_guest(struct kvm * kvm)874 void kvm_arm_resume_guest(struct kvm *kvm)
875 {
876 unsigned long i;
877 struct kvm_vcpu *vcpu;
878
879 kvm_for_each_vcpu(i, vcpu, kvm) {
880 vcpu->arch.pause = false;
881 __kvm_vcpu_wake_up(vcpu);
882 }
883 }
884
kvm_vcpu_sleep(struct kvm_vcpu * vcpu)885 static void kvm_vcpu_sleep(struct kvm_vcpu *vcpu)
886 {
887 struct rcuwait *wait = kvm_arch_vcpu_get_wait(vcpu);
888
889 rcuwait_wait_event(wait,
890 (!kvm_arm_vcpu_stopped(vcpu)) && (!vcpu->arch.pause),
891 TASK_INTERRUPTIBLE);
892
893 if (kvm_arm_vcpu_stopped(vcpu) || vcpu->arch.pause) {
894 /* Awaken to handle a signal, request we sleep again later. */
895 kvm_make_request(KVM_REQ_SLEEP, vcpu);
896 }
897
898 /*
899 * Make sure we will observe a potential reset request if we've
900 * observed a change to the power state. Pairs with the smp_wmb() in
901 * kvm_psci_vcpu_on().
902 */
903 smp_rmb();
904 }
905
906 /**
907 * kvm_vcpu_wfi - emulate Wait-For-Interrupt behavior
908 * @vcpu: The VCPU pointer
909 *
910 * Suspend execution of a vCPU until a valid wake event is detected, i.e. until
911 * the vCPU is runnable. The vCPU may or may not be scheduled out, depending
912 * on when a wake event arrives, e.g. there may already be a pending wake event.
913 */
kvm_vcpu_wfi(struct kvm_vcpu * vcpu)914 void kvm_vcpu_wfi(struct kvm_vcpu *vcpu)
915 {
916 /*
917 * Sync back the state of the GIC CPU interface so that we have
918 * the latest PMR and group enables. This ensures that
919 * kvm_arch_vcpu_runnable has up-to-date data to decide whether
920 * we have pending interrupts, e.g. when determining if the
921 * vCPU should block.
922 *
923 * For the same reason, we want to tell GICv4 that we need
924 * doorbells to be signalled, should an interrupt become pending.
925 */
926 preempt_disable();
927 vcpu_set_flag(vcpu, IN_WFI);
928 kvm_vgic_put(vcpu);
929 preempt_enable();
930
931 kvm_vcpu_halt(vcpu);
932 vcpu_clear_flag(vcpu, IN_WFIT);
933
934 preempt_disable();
935 vcpu_clear_flag(vcpu, IN_WFI);
936 kvm_vgic_load(vcpu);
937 preempt_enable();
938 }
939
kvm_vcpu_suspend(struct kvm_vcpu * vcpu)940 static int kvm_vcpu_suspend(struct kvm_vcpu *vcpu)
941 {
942 if (!kvm_arm_vcpu_suspended(vcpu))
943 return 1;
944
945 kvm_vcpu_wfi(vcpu);
946
947 /*
948 * The suspend state is sticky; we do not leave it until userspace
949 * explicitly marks the vCPU as runnable. Request that we suspend again
950 * later.
951 */
952 kvm_make_request(KVM_REQ_SUSPEND, vcpu);
953
954 /*
955 * Check to make sure the vCPU is actually runnable. If so, exit to
956 * userspace informing it of the wakeup condition.
957 */
958 if (kvm_arch_vcpu_runnable(vcpu)) {
959 memset(&vcpu->run->system_event, 0, sizeof(vcpu->run->system_event));
960 vcpu->run->system_event.type = KVM_SYSTEM_EVENT_WAKEUP;
961 vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
962 return 0;
963 }
964
965 /*
966 * Otherwise, we were unblocked to process a different event, such as a
967 * pending signal. Return 1 and allow kvm_arch_vcpu_ioctl_run() to
968 * process the event.
969 */
970 return 1;
971 }
972
973 /**
974 * check_vcpu_requests - check and handle pending vCPU requests
975 * @vcpu: the VCPU pointer
976 *
977 * Return: 1 if we should enter the guest
978 * 0 if we should exit to userspace
979 * < 0 if we should exit to userspace, where the return value indicates
980 * an error
981 */
check_vcpu_requests(struct kvm_vcpu * vcpu)982 static int check_vcpu_requests(struct kvm_vcpu *vcpu)
983 {
984 if (kvm_request_pending(vcpu)) {
985 if (kvm_check_request(KVM_REQ_VM_DEAD, vcpu))
986 return -EIO;
987
988 if (kvm_check_request(KVM_REQ_SLEEP, vcpu))
989 kvm_vcpu_sleep(vcpu);
990
991 if (kvm_check_request(KVM_REQ_VCPU_RESET, vcpu))
992 kvm_reset_vcpu(vcpu);
993
994 /*
995 * Clear IRQ_PENDING requests that were made to guarantee
996 * that a VCPU sees new virtual interrupts.
997 */
998 kvm_check_request(KVM_REQ_IRQ_PENDING, vcpu);
999
1000 if (kvm_check_request(KVM_REQ_RECORD_STEAL, vcpu))
1001 kvm_update_stolen_time(vcpu);
1002
1003 if (kvm_check_request(KVM_REQ_RELOAD_GICv4, vcpu)) {
1004 /* The distributor enable bits were changed */
1005 preempt_disable();
1006 vgic_v4_put(vcpu);
1007 vgic_v4_load(vcpu);
1008 preempt_enable();
1009 }
1010
1011 if (kvm_check_request(KVM_REQ_RELOAD_PMU, vcpu))
1012 kvm_vcpu_reload_pmu(vcpu);
1013
1014 if (kvm_check_request(KVM_REQ_RESYNC_PMU_EL0, vcpu))
1015 kvm_vcpu_pmu_restore_guest(vcpu);
1016
1017 if (kvm_check_request(KVM_REQ_SUSPEND, vcpu))
1018 return kvm_vcpu_suspend(vcpu);
1019
1020 if (kvm_dirty_ring_check_request(vcpu))
1021 return 0;
1022
1023 check_nested_vcpu_requests(vcpu);
1024 }
1025
1026 return 1;
1027 }
1028
vcpu_mode_is_bad_32bit(struct kvm_vcpu * vcpu)1029 static bool vcpu_mode_is_bad_32bit(struct kvm_vcpu *vcpu)
1030 {
1031 if (likely(!vcpu_mode_is_32bit(vcpu)))
1032 return false;
1033
1034 if (vcpu_has_nv(vcpu))
1035 return true;
1036
1037 return !kvm_supports_32bit_el0();
1038 }
1039
1040 /**
1041 * kvm_vcpu_exit_request - returns true if the VCPU should *not* enter the guest
1042 * @vcpu: The VCPU pointer
1043 * @ret: Pointer to write optional return code
1044 *
1045 * Returns: true if the VCPU needs to return to a preemptible + interruptible
1046 * and skip guest entry.
1047 *
1048 * This function disambiguates between two different types of exits: exits to a
1049 * preemptible + interruptible kernel context and exits to userspace. For an
1050 * exit to userspace, this function will write the return code to ret and return
1051 * true. For an exit to preemptible + interruptible kernel context (i.e. check
1052 * for pending work and re-enter), return true without writing to ret.
1053 */
kvm_vcpu_exit_request(struct kvm_vcpu * vcpu,int * ret)1054 static bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu, int *ret)
1055 {
1056 struct kvm_run *run = vcpu->run;
1057
1058 /*
1059 * If we're using a userspace irqchip, then check if we need
1060 * to tell a userspace irqchip about timer or PMU level
1061 * changes and if so, exit to userspace (the actual level
1062 * state gets updated in kvm_timer_update_run and
1063 * kvm_pmu_update_run below).
1064 */
1065 if (unlikely(!irqchip_in_kernel(vcpu->kvm))) {
1066 if (kvm_timer_should_notify_user(vcpu) ||
1067 kvm_pmu_should_notify_user(vcpu)) {
1068 *ret = -EINTR;
1069 run->exit_reason = KVM_EXIT_INTR;
1070 return true;
1071 }
1072 }
1073
1074 if (unlikely(vcpu_on_unsupported_cpu(vcpu))) {
1075 run->exit_reason = KVM_EXIT_FAIL_ENTRY;
1076 run->fail_entry.hardware_entry_failure_reason = KVM_EXIT_FAIL_ENTRY_CPU_UNSUPPORTED;
1077 run->fail_entry.cpu = smp_processor_id();
1078 *ret = 0;
1079 return true;
1080 }
1081
1082 return kvm_request_pending(vcpu) ||
1083 xfer_to_guest_mode_work_pending();
1084 }
1085
1086 /*
1087 * Actually run the vCPU, entering an RCU extended quiescent state (EQS) while
1088 * the vCPU is running.
1089 *
1090 * This must be noinstr as instrumentation may make use of RCU, and this is not
1091 * safe during the EQS.
1092 */
kvm_arm_vcpu_enter_exit(struct kvm_vcpu * vcpu)1093 static int noinstr kvm_arm_vcpu_enter_exit(struct kvm_vcpu *vcpu)
1094 {
1095 int ret;
1096
1097 guest_state_enter_irqoff();
1098 ret = kvm_call_hyp_ret(__kvm_vcpu_run, vcpu);
1099 guest_state_exit_irqoff();
1100
1101 return ret;
1102 }
1103
1104 /**
1105 * kvm_arch_vcpu_ioctl_run - the main VCPU run function to execute guest code
1106 * @vcpu: The VCPU pointer
1107 *
1108 * This function is called through the VCPU_RUN ioctl called from user space. It
1109 * will execute VM code in a loop until the time slice for the process is used
1110 * or some emulation is needed from user space in which case the function will
1111 * return with return value 0 and with the kvm_run structure filled in with the
1112 * required data for the requested emulation.
1113 */
kvm_arch_vcpu_ioctl_run(struct kvm_vcpu * vcpu)1114 int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
1115 {
1116 struct kvm_run *run = vcpu->run;
1117 int ret;
1118
1119 if (run->exit_reason == KVM_EXIT_MMIO) {
1120 ret = kvm_handle_mmio_return(vcpu);
1121 if (ret <= 0)
1122 return ret;
1123 }
1124
1125 vcpu_load(vcpu);
1126
1127 if (!vcpu->wants_to_run) {
1128 ret = -EINTR;
1129 goto out;
1130 }
1131
1132 kvm_sigset_activate(vcpu);
1133
1134 ret = 1;
1135 run->exit_reason = KVM_EXIT_UNKNOWN;
1136 run->flags = 0;
1137 while (ret > 0) {
1138 /*
1139 * Check conditions before entering the guest
1140 */
1141 ret = xfer_to_guest_mode_handle_work(vcpu);
1142 if (!ret)
1143 ret = 1;
1144
1145 if (ret > 0)
1146 ret = check_vcpu_requests(vcpu);
1147
1148 /*
1149 * Preparing the interrupts to be injected also
1150 * involves poking the GIC, which must be done in a
1151 * non-preemptible context.
1152 */
1153 preempt_disable();
1154
1155 kvm_pmu_flush_hwstate(vcpu);
1156
1157 local_irq_disable();
1158
1159 kvm_vgic_flush_hwstate(vcpu);
1160
1161 kvm_pmu_update_vcpu_events(vcpu);
1162
1163 /*
1164 * Ensure we set mode to IN_GUEST_MODE after we disable
1165 * interrupts and before the final VCPU requests check.
1166 * See the comment in kvm_vcpu_exiting_guest_mode() and
1167 * Documentation/virt/kvm/vcpu-requests.rst
1168 */
1169 smp_store_mb(vcpu->mode, IN_GUEST_MODE);
1170
1171 if (ret <= 0 || kvm_vcpu_exit_request(vcpu, &ret)) {
1172 vcpu->mode = OUTSIDE_GUEST_MODE;
1173 isb(); /* Ensure work in x_flush_hwstate is committed */
1174 kvm_pmu_sync_hwstate(vcpu);
1175 if (unlikely(!irqchip_in_kernel(vcpu->kvm)))
1176 kvm_timer_sync_user(vcpu);
1177 kvm_vgic_sync_hwstate(vcpu);
1178 local_irq_enable();
1179 preempt_enable();
1180 continue;
1181 }
1182
1183 kvm_arch_vcpu_ctxflush_fp(vcpu);
1184
1185 /**************************************************************
1186 * Enter the guest
1187 */
1188 trace_kvm_entry(*vcpu_pc(vcpu));
1189 guest_timing_enter_irqoff();
1190
1191 ret = kvm_arm_vcpu_enter_exit(vcpu);
1192
1193 vcpu->mode = OUTSIDE_GUEST_MODE;
1194 vcpu->stat.exits++;
1195 /*
1196 * Back from guest
1197 *************************************************************/
1198
1199 /*
1200 * We must sync the PMU state before the vgic state so
1201 * that the vgic can properly sample the updated state of the
1202 * interrupt line.
1203 */
1204 kvm_pmu_sync_hwstate(vcpu);
1205
1206 /*
1207 * Sync the vgic state before syncing the timer state because
1208 * the timer code needs to know if the virtual timer
1209 * interrupts are active.
1210 */
1211 kvm_vgic_sync_hwstate(vcpu);
1212
1213 /*
1214 * Sync the timer hardware state before enabling interrupts as
1215 * we don't want vtimer interrupts to race with syncing the
1216 * timer virtual interrupt state.
1217 */
1218 if (unlikely(!irqchip_in_kernel(vcpu->kvm)))
1219 kvm_timer_sync_user(vcpu);
1220
1221 if (is_hyp_ctxt(vcpu))
1222 kvm_timer_sync_nested(vcpu);
1223
1224 kvm_arch_vcpu_ctxsync_fp(vcpu);
1225
1226 /*
1227 * We must ensure that any pending interrupts are taken before
1228 * we exit guest timing so that timer ticks are accounted as
1229 * guest time. Transiently unmask interrupts so that any
1230 * pending interrupts are taken.
1231 *
1232 * Per ARM DDI 0487G.b section D1.13.4, an ISB (or other
1233 * context synchronization event) is necessary to ensure that
1234 * pending interrupts are taken.
1235 */
1236 if (ARM_EXCEPTION_CODE(ret) == ARM_EXCEPTION_IRQ) {
1237 local_irq_enable();
1238 isb();
1239 local_irq_disable();
1240 }
1241
1242 guest_timing_exit_irqoff();
1243
1244 local_irq_enable();
1245
1246 trace_kvm_exit(ret, kvm_vcpu_trap_get_class(vcpu), *vcpu_pc(vcpu));
1247
1248 /* Exit types that need handling before we can be preempted */
1249 handle_exit_early(vcpu, ret);
1250
1251 preempt_enable();
1252
1253 /*
1254 * The ARMv8 architecture doesn't give the hypervisor
1255 * a mechanism to prevent a guest from dropping to AArch32 EL0
1256 * if implemented by the CPU. If we spot the guest in such
1257 * state and that we decided it wasn't supposed to do so (like
1258 * with the asymmetric AArch32 case), return to userspace with
1259 * a fatal error.
1260 */
1261 if (vcpu_mode_is_bad_32bit(vcpu)) {
1262 /*
1263 * As we have caught the guest red-handed, decide that
1264 * it isn't fit for purpose anymore by making the vcpu
1265 * invalid. The VMM can try and fix it by issuing a
1266 * KVM_ARM_VCPU_INIT if it really wants to.
1267 */
1268 vcpu_clear_flag(vcpu, VCPU_INITIALIZED);
1269 ret = ARM_EXCEPTION_IL;
1270 }
1271
1272 ret = handle_exit(vcpu, ret);
1273 }
1274
1275 /* Tell userspace about in-kernel device output levels */
1276 if (unlikely(!irqchip_in_kernel(vcpu->kvm))) {
1277 kvm_timer_update_run(vcpu);
1278 kvm_pmu_update_run(vcpu);
1279 }
1280
1281 kvm_sigset_deactivate(vcpu);
1282
1283 out:
1284 /*
1285 * In the unlikely event that we are returning to userspace
1286 * with pending exceptions or PC adjustment, commit these
1287 * adjustments in order to give userspace a consistent view of
1288 * the vcpu state. Note that this relies on __kvm_adjust_pc()
1289 * being preempt-safe on VHE.
1290 */
1291 if (unlikely(vcpu_get_flag(vcpu, PENDING_EXCEPTION) ||
1292 vcpu_get_flag(vcpu, INCREMENT_PC)))
1293 kvm_call_hyp(__kvm_adjust_pc, vcpu);
1294
1295 vcpu_put(vcpu);
1296 return ret;
1297 }
1298
vcpu_interrupt_line(struct kvm_vcpu * vcpu,int number,bool level)1299 static int vcpu_interrupt_line(struct kvm_vcpu *vcpu, int number, bool level)
1300 {
1301 int bit_index;
1302 bool set;
1303 unsigned long *hcr;
1304
1305 if (number == KVM_ARM_IRQ_CPU_IRQ)
1306 bit_index = __ffs(HCR_VI);
1307 else /* KVM_ARM_IRQ_CPU_FIQ */
1308 bit_index = __ffs(HCR_VF);
1309
1310 hcr = vcpu_hcr(vcpu);
1311 if (level)
1312 set = test_and_set_bit(bit_index, hcr);
1313 else
1314 set = test_and_clear_bit(bit_index, hcr);
1315
1316 /*
1317 * If we didn't change anything, no need to wake up or kick other CPUs
1318 */
1319 if (set == level)
1320 return 0;
1321
1322 /*
1323 * The vcpu irq_lines field was updated, wake up sleeping VCPUs and
1324 * trigger a world-switch round on the running physical CPU to set the
1325 * virtual IRQ/FIQ fields in the HCR appropriately.
1326 */
1327 kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu);
1328 kvm_vcpu_kick(vcpu);
1329
1330 return 0;
1331 }
1332
kvm_vm_ioctl_irq_line(struct kvm * kvm,struct kvm_irq_level * irq_level,bool line_status)1333 int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level,
1334 bool line_status)
1335 {
1336 u32 irq = irq_level->irq;
1337 unsigned int irq_type, vcpu_id, irq_num;
1338 struct kvm_vcpu *vcpu = NULL;
1339 bool level = irq_level->level;
1340
1341 irq_type = (irq >> KVM_ARM_IRQ_TYPE_SHIFT) & KVM_ARM_IRQ_TYPE_MASK;
1342 vcpu_id = (irq >> KVM_ARM_IRQ_VCPU_SHIFT) & KVM_ARM_IRQ_VCPU_MASK;
1343 vcpu_id += ((irq >> KVM_ARM_IRQ_VCPU2_SHIFT) & KVM_ARM_IRQ_VCPU2_MASK) * (KVM_ARM_IRQ_VCPU_MASK + 1);
1344 irq_num = (irq >> KVM_ARM_IRQ_NUM_SHIFT) & KVM_ARM_IRQ_NUM_MASK;
1345
1346 trace_kvm_irq_line(irq_type, vcpu_id, irq_num, irq_level->level);
1347
1348 switch (irq_type) {
1349 case KVM_ARM_IRQ_TYPE_CPU:
1350 if (irqchip_in_kernel(kvm))
1351 return -ENXIO;
1352
1353 vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id);
1354 if (!vcpu)
1355 return -EINVAL;
1356
1357 if (irq_num > KVM_ARM_IRQ_CPU_FIQ)
1358 return -EINVAL;
1359
1360 return vcpu_interrupt_line(vcpu, irq_num, level);
1361 case KVM_ARM_IRQ_TYPE_PPI:
1362 if (!irqchip_in_kernel(kvm))
1363 return -ENXIO;
1364
1365 vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id);
1366 if (!vcpu)
1367 return -EINVAL;
1368
1369 if (irq_num < VGIC_NR_SGIS || irq_num >= VGIC_NR_PRIVATE_IRQS)
1370 return -EINVAL;
1371
1372 return kvm_vgic_inject_irq(kvm, vcpu, irq_num, level, NULL);
1373 case KVM_ARM_IRQ_TYPE_SPI:
1374 if (!irqchip_in_kernel(kvm))
1375 return -ENXIO;
1376
1377 if (irq_num < VGIC_NR_PRIVATE_IRQS)
1378 return -EINVAL;
1379
1380 return kvm_vgic_inject_irq(kvm, NULL, irq_num, level, NULL);
1381 }
1382
1383 return -EINVAL;
1384 }
1385
system_supported_vcpu_features(void)1386 static unsigned long system_supported_vcpu_features(void)
1387 {
1388 unsigned long features = KVM_VCPU_VALID_FEATURES;
1389
1390 if (!cpus_have_final_cap(ARM64_HAS_32BIT_EL1))
1391 clear_bit(KVM_ARM_VCPU_EL1_32BIT, &features);
1392
1393 if (!kvm_arm_support_pmu_v3())
1394 clear_bit(KVM_ARM_VCPU_PMU_V3, &features);
1395
1396 if (!system_supports_sve())
1397 clear_bit(KVM_ARM_VCPU_SVE, &features);
1398
1399 if (!kvm_has_full_ptr_auth()) {
1400 clear_bit(KVM_ARM_VCPU_PTRAUTH_ADDRESS, &features);
1401 clear_bit(KVM_ARM_VCPU_PTRAUTH_GENERIC, &features);
1402 }
1403
1404 if (!cpus_have_final_cap(ARM64_HAS_NESTED_VIRT))
1405 clear_bit(KVM_ARM_VCPU_HAS_EL2, &features);
1406
1407 return features;
1408 }
1409
kvm_vcpu_init_check_features(struct kvm_vcpu * vcpu,const struct kvm_vcpu_init * init)1410 static int kvm_vcpu_init_check_features(struct kvm_vcpu *vcpu,
1411 const struct kvm_vcpu_init *init)
1412 {
1413 unsigned long features = init->features[0];
1414 int i;
1415
1416 if (features & ~KVM_VCPU_VALID_FEATURES)
1417 return -ENOENT;
1418
1419 for (i = 1; i < ARRAY_SIZE(init->features); i++) {
1420 if (init->features[i])
1421 return -ENOENT;
1422 }
1423
1424 if (features & ~system_supported_vcpu_features())
1425 return -EINVAL;
1426
1427 /*
1428 * For now make sure that both address/generic pointer authentication
1429 * features are requested by the userspace together.
1430 */
1431 if (test_bit(KVM_ARM_VCPU_PTRAUTH_ADDRESS, &features) !=
1432 test_bit(KVM_ARM_VCPU_PTRAUTH_GENERIC, &features))
1433 return -EINVAL;
1434
1435 if (!test_bit(KVM_ARM_VCPU_EL1_32BIT, &features))
1436 return 0;
1437
1438 /* MTE is incompatible with AArch32 */
1439 if (kvm_has_mte(vcpu->kvm))
1440 return -EINVAL;
1441
1442 /* NV is incompatible with AArch32 */
1443 if (test_bit(KVM_ARM_VCPU_HAS_EL2, &features))
1444 return -EINVAL;
1445
1446 return 0;
1447 }
1448
kvm_vcpu_init_changed(struct kvm_vcpu * vcpu,const struct kvm_vcpu_init * init)1449 static bool kvm_vcpu_init_changed(struct kvm_vcpu *vcpu,
1450 const struct kvm_vcpu_init *init)
1451 {
1452 unsigned long features = init->features[0];
1453
1454 return !bitmap_equal(vcpu->kvm->arch.vcpu_features, &features,
1455 KVM_VCPU_MAX_FEATURES);
1456 }
1457
kvm_setup_vcpu(struct kvm_vcpu * vcpu)1458 static int kvm_setup_vcpu(struct kvm_vcpu *vcpu)
1459 {
1460 struct kvm *kvm = vcpu->kvm;
1461 int ret = 0;
1462
1463 /*
1464 * When the vCPU has a PMU, but no PMU is set for the guest
1465 * yet, set the default one.
1466 */
1467 if (kvm_vcpu_has_pmu(vcpu) && !kvm->arch.arm_pmu)
1468 ret = kvm_arm_set_default_pmu(kvm);
1469
1470 /* Prepare for nested if required */
1471 if (!ret && vcpu_has_nv(vcpu))
1472 ret = kvm_vcpu_init_nested(vcpu);
1473
1474 return ret;
1475 }
1476
__kvm_vcpu_set_target(struct kvm_vcpu * vcpu,const struct kvm_vcpu_init * init)1477 static int __kvm_vcpu_set_target(struct kvm_vcpu *vcpu,
1478 const struct kvm_vcpu_init *init)
1479 {
1480 unsigned long features = init->features[0];
1481 struct kvm *kvm = vcpu->kvm;
1482 int ret = -EINVAL;
1483
1484 mutex_lock(&kvm->arch.config_lock);
1485
1486 if (test_bit(KVM_ARCH_FLAG_VCPU_FEATURES_CONFIGURED, &kvm->arch.flags) &&
1487 kvm_vcpu_init_changed(vcpu, init))
1488 goto out_unlock;
1489
1490 bitmap_copy(kvm->arch.vcpu_features, &features, KVM_VCPU_MAX_FEATURES);
1491
1492 ret = kvm_setup_vcpu(vcpu);
1493 if (ret)
1494 goto out_unlock;
1495
1496 /* Now we know what it is, we can reset it. */
1497 kvm_reset_vcpu(vcpu);
1498
1499 set_bit(KVM_ARCH_FLAG_VCPU_FEATURES_CONFIGURED, &kvm->arch.flags);
1500 vcpu_set_flag(vcpu, VCPU_INITIALIZED);
1501 ret = 0;
1502 out_unlock:
1503 mutex_unlock(&kvm->arch.config_lock);
1504 return ret;
1505 }
1506
kvm_vcpu_set_target(struct kvm_vcpu * vcpu,const struct kvm_vcpu_init * init)1507 static int kvm_vcpu_set_target(struct kvm_vcpu *vcpu,
1508 const struct kvm_vcpu_init *init)
1509 {
1510 int ret;
1511
1512 if (init->target != KVM_ARM_TARGET_GENERIC_V8 &&
1513 init->target != kvm_target_cpu())
1514 return -EINVAL;
1515
1516 ret = kvm_vcpu_init_check_features(vcpu, init);
1517 if (ret)
1518 return ret;
1519
1520 if (!kvm_vcpu_initialized(vcpu))
1521 return __kvm_vcpu_set_target(vcpu, init);
1522
1523 if (kvm_vcpu_init_changed(vcpu, init))
1524 return -EINVAL;
1525
1526 kvm_reset_vcpu(vcpu);
1527 return 0;
1528 }
1529
kvm_arch_vcpu_ioctl_vcpu_init(struct kvm_vcpu * vcpu,struct kvm_vcpu_init * init)1530 static int kvm_arch_vcpu_ioctl_vcpu_init(struct kvm_vcpu *vcpu,
1531 struct kvm_vcpu_init *init)
1532 {
1533 bool power_off = false;
1534 int ret;
1535
1536 /*
1537 * Treat the power-off vCPU feature as ephemeral. Clear the bit to avoid
1538 * reflecting it in the finalized feature set, thus limiting its scope
1539 * to a single KVM_ARM_VCPU_INIT call.
1540 */
1541 if (init->features[0] & BIT(KVM_ARM_VCPU_POWER_OFF)) {
1542 init->features[0] &= ~BIT(KVM_ARM_VCPU_POWER_OFF);
1543 power_off = true;
1544 }
1545
1546 ret = kvm_vcpu_set_target(vcpu, init);
1547 if (ret)
1548 return ret;
1549
1550 /*
1551 * Ensure a rebooted VM will fault in RAM pages and detect if the
1552 * guest MMU is turned off and flush the caches as needed.
1553 *
1554 * S2FWB enforces all memory accesses to RAM being cacheable,
1555 * ensuring that the data side is always coherent. We still
1556 * need to invalidate the I-cache though, as FWB does *not*
1557 * imply CTR_EL0.DIC.
1558 */
1559 if (vcpu_has_run_once(vcpu)) {
1560 if (!cpus_have_final_cap(ARM64_HAS_STAGE2_FWB))
1561 stage2_unmap_vm(vcpu->kvm);
1562 else
1563 icache_inval_all_pou();
1564 }
1565
1566 vcpu_reset_hcr(vcpu);
1567
1568 /*
1569 * Handle the "start in power-off" case.
1570 */
1571 spin_lock(&vcpu->arch.mp_state_lock);
1572
1573 if (power_off)
1574 __kvm_arm_vcpu_power_off(vcpu);
1575 else
1576 WRITE_ONCE(vcpu->arch.mp_state.mp_state, KVM_MP_STATE_RUNNABLE);
1577
1578 spin_unlock(&vcpu->arch.mp_state_lock);
1579
1580 return 0;
1581 }
1582
kvm_arm_vcpu_set_attr(struct kvm_vcpu * vcpu,struct kvm_device_attr * attr)1583 static int kvm_arm_vcpu_set_attr(struct kvm_vcpu *vcpu,
1584 struct kvm_device_attr *attr)
1585 {
1586 int ret = -ENXIO;
1587
1588 switch (attr->group) {
1589 default:
1590 ret = kvm_arm_vcpu_arch_set_attr(vcpu, attr);
1591 break;
1592 }
1593
1594 return ret;
1595 }
1596
kvm_arm_vcpu_get_attr(struct kvm_vcpu * vcpu,struct kvm_device_attr * attr)1597 static int kvm_arm_vcpu_get_attr(struct kvm_vcpu *vcpu,
1598 struct kvm_device_attr *attr)
1599 {
1600 int ret = -ENXIO;
1601
1602 switch (attr->group) {
1603 default:
1604 ret = kvm_arm_vcpu_arch_get_attr(vcpu, attr);
1605 break;
1606 }
1607
1608 return ret;
1609 }
1610
kvm_arm_vcpu_has_attr(struct kvm_vcpu * vcpu,struct kvm_device_attr * attr)1611 static int kvm_arm_vcpu_has_attr(struct kvm_vcpu *vcpu,
1612 struct kvm_device_attr *attr)
1613 {
1614 int ret = -ENXIO;
1615
1616 switch (attr->group) {
1617 default:
1618 ret = kvm_arm_vcpu_arch_has_attr(vcpu, attr);
1619 break;
1620 }
1621
1622 return ret;
1623 }
1624
kvm_arm_vcpu_get_events(struct kvm_vcpu * vcpu,struct kvm_vcpu_events * events)1625 static int kvm_arm_vcpu_get_events(struct kvm_vcpu *vcpu,
1626 struct kvm_vcpu_events *events)
1627 {
1628 memset(events, 0, sizeof(*events));
1629
1630 return __kvm_arm_vcpu_get_events(vcpu, events);
1631 }
1632
kvm_arm_vcpu_set_events(struct kvm_vcpu * vcpu,struct kvm_vcpu_events * events)1633 static int kvm_arm_vcpu_set_events(struct kvm_vcpu *vcpu,
1634 struct kvm_vcpu_events *events)
1635 {
1636 int i;
1637
1638 /* check whether the reserved field is zero */
1639 for (i = 0; i < ARRAY_SIZE(events->reserved); i++)
1640 if (events->reserved[i])
1641 return -EINVAL;
1642
1643 /* check whether the pad field is zero */
1644 for (i = 0; i < ARRAY_SIZE(events->exception.pad); i++)
1645 if (events->exception.pad[i])
1646 return -EINVAL;
1647
1648 return __kvm_arm_vcpu_set_events(vcpu, events);
1649 }
1650
kvm_arch_vcpu_ioctl(struct file * filp,unsigned int ioctl,unsigned long arg)1651 long kvm_arch_vcpu_ioctl(struct file *filp,
1652 unsigned int ioctl, unsigned long arg)
1653 {
1654 struct kvm_vcpu *vcpu = filp->private_data;
1655 void __user *argp = (void __user *)arg;
1656 struct kvm_device_attr attr;
1657 long r;
1658
1659 switch (ioctl) {
1660 case KVM_ARM_VCPU_INIT: {
1661 struct kvm_vcpu_init init;
1662
1663 r = -EFAULT;
1664 if (copy_from_user(&init, argp, sizeof(init)))
1665 break;
1666
1667 r = kvm_arch_vcpu_ioctl_vcpu_init(vcpu, &init);
1668 break;
1669 }
1670 case KVM_SET_ONE_REG:
1671 case KVM_GET_ONE_REG: {
1672 struct kvm_one_reg reg;
1673
1674 r = -ENOEXEC;
1675 if (unlikely(!kvm_vcpu_initialized(vcpu)))
1676 break;
1677
1678 r = -EFAULT;
1679 if (copy_from_user(®, argp, sizeof(reg)))
1680 break;
1681
1682 /*
1683 * We could owe a reset due to PSCI. Handle the pending reset
1684 * here to ensure userspace register accesses are ordered after
1685 * the reset.
1686 */
1687 if (kvm_check_request(KVM_REQ_VCPU_RESET, vcpu))
1688 kvm_reset_vcpu(vcpu);
1689
1690 if (ioctl == KVM_SET_ONE_REG)
1691 r = kvm_arm_set_reg(vcpu, ®);
1692 else
1693 r = kvm_arm_get_reg(vcpu, ®);
1694 break;
1695 }
1696 case KVM_GET_REG_LIST: {
1697 struct kvm_reg_list __user *user_list = argp;
1698 struct kvm_reg_list reg_list;
1699 unsigned n;
1700
1701 r = -ENOEXEC;
1702 if (unlikely(!kvm_vcpu_initialized(vcpu)))
1703 break;
1704
1705 r = -EPERM;
1706 if (!kvm_arm_vcpu_is_finalized(vcpu))
1707 break;
1708
1709 r = -EFAULT;
1710 if (copy_from_user(®_list, user_list, sizeof(reg_list)))
1711 break;
1712 n = reg_list.n;
1713 reg_list.n = kvm_arm_num_regs(vcpu);
1714 if (copy_to_user(user_list, ®_list, sizeof(reg_list)))
1715 break;
1716 r = -E2BIG;
1717 if (n < reg_list.n)
1718 break;
1719 r = kvm_arm_copy_reg_indices(vcpu, user_list->reg);
1720 break;
1721 }
1722 case KVM_SET_DEVICE_ATTR: {
1723 r = -EFAULT;
1724 if (copy_from_user(&attr, argp, sizeof(attr)))
1725 break;
1726 r = kvm_arm_vcpu_set_attr(vcpu, &attr);
1727 break;
1728 }
1729 case KVM_GET_DEVICE_ATTR: {
1730 r = -EFAULT;
1731 if (copy_from_user(&attr, argp, sizeof(attr)))
1732 break;
1733 r = kvm_arm_vcpu_get_attr(vcpu, &attr);
1734 break;
1735 }
1736 case KVM_HAS_DEVICE_ATTR: {
1737 r = -EFAULT;
1738 if (copy_from_user(&attr, argp, sizeof(attr)))
1739 break;
1740 r = kvm_arm_vcpu_has_attr(vcpu, &attr);
1741 break;
1742 }
1743 case KVM_GET_VCPU_EVENTS: {
1744 struct kvm_vcpu_events events;
1745
1746 if (kvm_arm_vcpu_get_events(vcpu, &events))
1747 return -EINVAL;
1748
1749 if (copy_to_user(argp, &events, sizeof(events)))
1750 return -EFAULT;
1751
1752 return 0;
1753 }
1754 case KVM_SET_VCPU_EVENTS: {
1755 struct kvm_vcpu_events events;
1756
1757 if (copy_from_user(&events, argp, sizeof(events)))
1758 return -EFAULT;
1759
1760 return kvm_arm_vcpu_set_events(vcpu, &events);
1761 }
1762 case KVM_ARM_VCPU_FINALIZE: {
1763 int what;
1764
1765 if (!kvm_vcpu_initialized(vcpu))
1766 return -ENOEXEC;
1767
1768 if (get_user(what, (const int __user *)argp))
1769 return -EFAULT;
1770
1771 return kvm_arm_vcpu_finalize(vcpu, what);
1772 }
1773 default:
1774 r = -EINVAL;
1775 }
1776
1777 return r;
1778 }
1779
kvm_arch_sync_dirty_log(struct kvm * kvm,struct kvm_memory_slot * memslot)1780 void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
1781 {
1782
1783 }
1784
kvm_vm_ioctl_set_device_addr(struct kvm * kvm,struct kvm_arm_device_addr * dev_addr)1785 static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm,
1786 struct kvm_arm_device_addr *dev_addr)
1787 {
1788 switch (FIELD_GET(KVM_ARM_DEVICE_ID_MASK, dev_addr->id)) {
1789 case KVM_ARM_DEVICE_VGIC_V2:
1790 if (!vgic_present)
1791 return -ENXIO;
1792 return kvm_set_legacy_vgic_v2_addr(kvm, dev_addr);
1793 default:
1794 return -ENODEV;
1795 }
1796 }
1797
kvm_vm_has_attr(struct kvm * kvm,struct kvm_device_attr * attr)1798 static int kvm_vm_has_attr(struct kvm *kvm, struct kvm_device_attr *attr)
1799 {
1800 switch (attr->group) {
1801 case KVM_ARM_VM_SMCCC_CTRL:
1802 return kvm_vm_smccc_has_attr(kvm, attr);
1803 default:
1804 return -ENXIO;
1805 }
1806 }
1807
kvm_vm_set_attr(struct kvm * kvm,struct kvm_device_attr * attr)1808 static int kvm_vm_set_attr(struct kvm *kvm, struct kvm_device_attr *attr)
1809 {
1810 switch (attr->group) {
1811 case KVM_ARM_VM_SMCCC_CTRL:
1812 return kvm_vm_smccc_set_attr(kvm, attr);
1813 default:
1814 return -ENXIO;
1815 }
1816 }
1817
kvm_arch_vm_ioctl(struct file * filp,unsigned int ioctl,unsigned long arg)1818 int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
1819 {
1820 struct kvm *kvm = filp->private_data;
1821 void __user *argp = (void __user *)arg;
1822 struct kvm_device_attr attr;
1823
1824 switch (ioctl) {
1825 case KVM_CREATE_IRQCHIP: {
1826 int ret;
1827 if (!vgic_present)
1828 return -ENXIO;
1829 mutex_lock(&kvm->lock);
1830 ret = kvm_vgic_create(kvm, KVM_DEV_TYPE_ARM_VGIC_V2);
1831 mutex_unlock(&kvm->lock);
1832 return ret;
1833 }
1834 case KVM_ARM_SET_DEVICE_ADDR: {
1835 struct kvm_arm_device_addr dev_addr;
1836
1837 if (copy_from_user(&dev_addr, argp, sizeof(dev_addr)))
1838 return -EFAULT;
1839 return kvm_vm_ioctl_set_device_addr(kvm, &dev_addr);
1840 }
1841 case KVM_ARM_PREFERRED_TARGET: {
1842 struct kvm_vcpu_init init = {
1843 .target = KVM_ARM_TARGET_GENERIC_V8,
1844 };
1845
1846 if (copy_to_user(argp, &init, sizeof(init)))
1847 return -EFAULT;
1848
1849 return 0;
1850 }
1851 case KVM_ARM_MTE_COPY_TAGS: {
1852 struct kvm_arm_copy_mte_tags copy_tags;
1853
1854 if (copy_from_user(©_tags, argp, sizeof(copy_tags)))
1855 return -EFAULT;
1856 return kvm_vm_ioctl_mte_copy_tags(kvm, ©_tags);
1857 }
1858 case KVM_ARM_SET_COUNTER_OFFSET: {
1859 struct kvm_arm_counter_offset offset;
1860
1861 if (copy_from_user(&offset, argp, sizeof(offset)))
1862 return -EFAULT;
1863 return kvm_vm_ioctl_set_counter_offset(kvm, &offset);
1864 }
1865 case KVM_HAS_DEVICE_ATTR: {
1866 if (copy_from_user(&attr, argp, sizeof(attr)))
1867 return -EFAULT;
1868
1869 return kvm_vm_has_attr(kvm, &attr);
1870 }
1871 case KVM_SET_DEVICE_ATTR: {
1872 if (copy_from_user(&attr, argp, sizeof(attr)))
1873 return -EFAULT;
1874
1875 return kvm_vm_set_attr(kvm, &attr);
1876 }
1877 case KVM_ARM_GET_REG_WRITABLE_MASKS: {
1878 struct reg_mask_range range;
1879
1880 if (copy_from_user(&range, argp, sizeof(range)))
1881 return -EFAULT;
1882 return kvm_vm_ioctl_get_reg_writable_masks(kvm, &range);
1883 }
1884 default:
1885 return -EINVAL;
1886 }
1887 }
1888
1889 /* unlocks vcpus from @vcpu_lock_idx and smaller */
unlock_vcpus(struct kvm * kvm,int vcpu_lock_idx)1890 static void unlock_vcpus(struct kvm *kvm, int vcpu_lock_idx)
1891 {
1892 struct kvm_vcpu *tmp_vcpu;
1893
1894 for (; vcpu_lock_idx >= 0; vcpu_lock_idx--) {
1895 tmp_vcpu = kvm_get_vcpu(kvm, vcpu_lock_idx);
1896 mutex_unlock(&tmp_vcpu->mutex);
1897 }
1898 }
1899
unlock_all_vcpus(struct kvm * kvm)1900 void unlock_all_vcpus(struct kvm *kvm)
1901 {
1902 lockdep_assert_held(&kvm->lock);
1903
1904 unlock_vcpus(kvm, atomic_read(&kvm->online_vcpus) - 1);
1905 }
1906
1907 /* Returns true if all vcpus were locked, false otherwise */
lock_all_vcpus(struct kvm * kvm)1908 bool lock_all_vcpus(struct kvm *kvm)
1909 {
1910 struct kvm_vcpu *tmp_vcpu;
1911 unsigned long c;
1912
1913 lockdep_assert_held(&kvm->lock);
1914
1915 /*
1916 * Any time a vcpu is in an ioctl (including running), the
1917 * core KVM code tries to grab the vcpu->mutex.
1918 *
1919 * By grabbing the vcpu->mutex of all VCPUs we ensure that no
1920 * other VCPUs can fiddle with the state while we access it.
1921 */
1922 kvm_for_each_vcpu(c, tmp_vcpu, kvm) {
1923 if (!mutex_trylock(&tmp_vcpu->mutex)) {
1924 unlock_vcpus(kvm, c - 1);
1925 return false;
1926 }
1927 }
1928
1929 return true;
1930 }
1931
nvhe_percpu_size(void)1932 static unsigned long nvhe_percpu_size(void)
1933 {
1934 return (unsigned long)CHOOSE_NVHE_SYM(__per_cpu_end) -
1935 (unsigned long)CHOOSE_NVHE_SYM(__per_cpu_start);
1936 }
1937
nvhe_percpu_order(void)1938 static unsigned long nvhe_percpu_order(void)
1939 {
1940 unsigned long size = nvhe_percpu_size();
1941
1942 return size ? get_order(size) : 0;
1943 }
1944
pkvm_host_sve_state_order(void)1945 static size_t pkvm_host_sve_state_order(void)
1946 {
1947 return get_order(pkvm_host_sve_state_size());
1948 }
1949
1950 /* A lookup table holding the hypervisor VA for each vector slot */
1951 static void *hyp_spectre_vector_selector[BP_HARDEN_EL2_SLOTS];
1952
kvm_init_vector_slot(void * base,enum arm64_hyp_spectre_vector slot)1953 static void kvm_init_vector_slot(void *base, enum arm64_hyp_spectre_vector slot)
1954 {
1955 hyp_spectre_vector_selector[slot] = __kvm_vector_slot2addr(base, slot);
1956 }
1957
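/*
 * Populate hyp_spectre_vector_selector[] with the hyp VA of each vector
 * slot, creating an extra executable mapping when idmapped vectors are
 * needed.
 */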
1958 static int kvm_init_vector_slots(void)
1959 {
1960 int err;
1961 void *base;
1962
1963 base = kern_hyp_va(kvm_ksym_ref(__kvm_hyp_vector));
1964 kvm_init_vector_slot(base, HYP_VECTOR_DIRECT);
1965
1966 base = kern_hyp_va(kvm_ksym_ref(__bp_harden_hyp_vecs));
1967 kvm_init_vector_slot(base, HYP_VECTOR_SPECTRE_DIRECT);
1968
1969 if (kvm_system_needs_idmapped_vectors() &&
1970 !is_protected_kvm_enabled()) {
1971 err = create_hyp_exec_mappings(__pa_symbol(__bp_harden_hyp_vecs),
1972 __BP_HARDEN_HYP_VECS_SZ, &base);
1973 if (err)
1974 return err;
1975 }
1976
1977 kvm_init_vector_slot(base, HYP_VECTOR_INDIRECT);
1978 kvm_init_vector_slot(base, HYP_VECTOR_SPECTRE_INDIRECT);
1979 return 0;
1980 }
1981
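/*
 * Compute the per-CPU EL2 init parameters (TPIDR_EL2 offset, MAIR, TCR,
 * HCR and hyp PGD) that __kvm_hyp_init will consume on this CPU.
 */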
1982 static void __init cpu_prepare_hyp_mode(int cpu, u32 hyp_va_bits)
1983 {
1984 struct kvm_nvhe_init_params *params = per_cpu_ptr_nvhe_sym(kvm_init_params, cpu);
1985 unsigned long tcr;
1986
1987 /*
1988 * Calculate the raw per-cpu offset without a translation from the
1989 * kernel's mapping to the linear mapping, and store it in tpidr_el2
1990 * so that we can use adr_l to access per-cpu variables in EL2.
1991 * Also drop the KASAN tag which gets in the way...
1992 */
1993 params->tpidr_el2 = (unsigned long)kasan_reset_tag(per_cpu_ptr_nvhe_sym(__per_cpu_start, cpu)) -
1994 (unsigned long)kvm_ksym_ref(CHOOSE_NVHE_SYM(__per_cpu_start));
1995
1996 params->mair_el2 = read_sysreg(mair_el1);
1997
1998 tcr = read_sysreg(tcr_el1);
1999 if (cpus_have_final_cap(ARM64_KVM_HVHE)) {
2000 tcr &= ~(TCR_HD | TCR_HA | TCR_A1 | TCR_T0SZ_MASK);
2001 tcr |= TCR_EPD1_MASK;
2002 } else {
2003 unsigned long ips = FIELD_GET(TCR_IPS_MASK, tcr);
2004
2005 tcr &= TCR_EL2_MASK;
2006 tcr |= TCR_EL2_RES1 | FIELD_PREP(TCR_EL2_PS_MASK, ips);
2007 if (lpa2_is_enabled())
2008 tcr |= TCR_EL2_DS;
2009 }
2010 tcr |= TCR_T0SZ(hyp_va_bits);
2011 params->tcr_el2 = tcr;
2012
2013 params->pgd_pa = kvm_mmu_get_httbr();
2014 if (is_protected_kvm_enabled())
2015 params->hcr_el2 = HCR_HOST_NVHE_PROTECTED_FLAGS;
2016 else
2017 params->hcr_el2 = HCR_HOST_NVHE_FLAGS;
2018 if (cpus_have_final_cap(ARM64_KVM_HVHE))
2019 params->hcr_el2 |= HCR_E2H;
2020 params->vttbr = params->vtcr = 0;
2021
2022 /*
2023 * Flush the init params from the data cache because the struct will
2024 * be read while the MMU is off.
2025 */
2026 kvm_flush_dcache_to_poc(params, sizeof(*params));
2027 }
2028
2029 static void hyp_install_host_vector(void)
2030 {
2031 struct kvm_nvhe_init_params *params;
2032 struct arm_smccc_res res;
2033
2034 /* Switch from the HYP stub to our own HYP init vector */
2035 __hyp_set_vectors(kvm_get_idmap_vector());
2036
2037 /*
2038 * Call initialization code, and switch to the full blown HYP code.
2039 * If the cpucaps haven't been finalized yet, something has gone very
2040 * wrong, and hyp will crash and burn when it uses any
2041 * cpus_have_*_cap() wrapper.
2042 */
2043 BUG_ON(!system_capabilities_finalized());
2044 params = this_cpu_ptr_nvhe_sym(kvm_init_params);
2045 arm_smccc_1_1_hvc(KVM_HOST_SMCCC_FUNC(__kvm_hyp_init), virt_to_phys(params), &res);
2046 WARN_ON(res.a0 != SMCCC_RET_SUCCESS);
2047 }
2048
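/* Install the hyp vectors on this CPU and, if SSBD is disabled, enable SSBS at EL2 as well */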
2049 static void cpu_init_hyp_mode(void)
2050 {
2051 hyp_install_host_vector();
2052
2053 /*
2054 * Disabling SSBD on a non-VHE system requires us to enable SSBS
2055 * at EL2.
2056 */
2057 if (this_cpu_has_cap(ARM64_SSBS) &&
2058 arm64_get_spectre_v4_state() == SPECTRE_VULNERABLE) {
2059 kvm_call_hyp_nvhe(__kvm_enable_ssbs);
2060 }
2061 }
2062
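/* Return this CPU's EL2 vectors to the hyp stub (nVHE only) */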
2063 static void cpu_hyp_reset(void)
2064 {
2065 if (!is_kernel_in_hyp_mode())
2066 __hyp_reset_vectors();
2067 }
2068
2069 /*
2070 * EL2 vectors can be mapped and rerouted in a number of ways,
2071 * depending on the kernel configuration and CPU present:
2072 *
2073 * - If the CPU is affected by Spectre-v2, the hardening sequence is
2074 * placed in one of the vector slots, which is executed before jumping
2075 * to the real vectors.
2076 *
2077 * - If the CPU also has the ARM64_SPECTRE_V3A cap, the slot
2078 * containing the hardening sequence is mapped next to the idmap page,
2079 * and executed before jumping to the real vectors.
2080 *
2081 * - If the CPU only has the ARM64_SPECTRE_V3A cap, then an
2082 * empty slot is selected, mapped next to the idmap page, and
2083 * executed before jumping to the real vectors.
2084 *
2085 * Note that ARM64_SPECTRE_V3A is somewhat incompatible with
2086 * VHE, as we don't have hypervisor-specific mappings. If the system
2087 * is VHE and yet selects this capability, it will be ignored.
2088 */
2089 static void cpu_set_hyp_vector(void)
2090 {
2091 struct bp_hardening_data *data = this_cpu_ptr(&bp_hardening_data);
2092 void *vector = hyp_spectre_vector_selector[data->slot];
2093
2094 if (!is_protected_kvm_enabled())
2095 *this_cpu_ptr_hyp_sym(kvm_hyp_vector) = (unsigned long)vector;
2096 else
2097 kvm_call_hyp_nvhe(__pkvm_cpu_set_vector, data->slot);
2098 }
2099
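/*
 * Per-CPU hyp (re)initialisation: host context, debug state, vectors,
 * VHE timer and VGIC hardware.
 */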
2100 static void cpu_hyp_init_context(void)
2101 {
2102 kvm_init_host_cpu_context(host_data_ptr(host_ctxt));
2103 kvm_init_host_debug_data();
2104
2105 if (!is_kernel_in_hyp_mode())
2106 cpu_init_hyp_mode();
2107 }
2108
2109 static void cpu_hyp_init_features(void)
2110 {
2111 cpu_set_hyp_vector();
2112
2113 if (is_kernel_in_hyp_mode())
2114 kvm_timer_init_vhe();
2115
2116 if (vgic_present)
2117 kvm_vgic_init_cpu_hardware();
2118 }
2119
2120 static void cpu_hyp_reinit(void)
2121 {
2122 cpu_hyp_reset();
2123 cpu_hyp_init_context();
2124 cpu_hyp_init_features();
2125 }
2126
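/* Idempotent per-CPU hyp init/teardown, tracked via kvm_hyp_initialized */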
2127 static void cpu_hyp_init(void *discard)
2128 {
2129 if (!__this_cpu_read(kvm_hyp_initialized)) {
2130 cpu_hyp_reinit();
2131 __this_cpu_write(kvm_hyp_initialized, 1);
2132 }
2133 }
2134
2135 static void cpu_hyp_uninit(void *discard)
2136 {
2137 if (__this_cpu_read(kvm_hyp_initialized)) {
2138 cpu_hyp_reset();
2139 __this_cpu_write(kvm_hyp_initialized, 0);
2140 }
2141 }
2142
2143 int kvm_arch_enable_virtualization_cpu(void)
2144 {
2145 /*
2146 * Most calls to this function are made with migration
2147 * disabled, but not with preemption disabled. The former is
2148 * enough to ensure correctness, but most of the helpers
2149 	 * expect the latter and will throw a tantrum otherwise.
2150 */
2151 preempt_disable();
2152
2153 cpu_hyp_init(NULL);
2154
2155 kvm_vgic_cpu_up();
2156 kvm_timer_cpu_up();
2157
2158 preempt_enable();
2159
2160 return 0;
2161 }
2162
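/* Tear down the timer and VGIC on this CPU; hyp stays up when protected mode owns EL2 */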
2163 void kvm_arch_disable_virtualization_cpu(void)
2164 {
2165 kvm_timer_cpu_down();
2166 kvm_vgic_cpu_down();
2167
2168 if (!is_protected_kvm_enabled())
2169 cpu_hyp_uninit(NULL);
2170 }
2171
2172 #ifdef CONFIG_CPU_PM
2173 static int hyp_init_cpu_pm_notifier(struct notifier_block *self,
2174 unsigned long cmd,
2175 void *v)
2176 {
2177 /*
2178 * kvm_hyp_initialized is left with its old value over
2179 * PM_ENTER->PM_EXIT. It is used to indicate PM_EXIT should
2180 * re-enable hyp.
2181 */
2182 switch (cmd) {
2183 case CPU_PM_ENTER:
2184 if (__this_cpu_read(kvm_hyp_initialized))
2185 /*
2186 * don't update kvm_hyp_initialized here
2187 * so that the hyp will be re-enabled
2188 * when we resume. See below.
2189 */
2190 cpu_hyp_reset();
2191
2192 return NOTIFY_OK;
2193 case CPU_PM_ENTER_FAILED:
2194 case CPU_PM_EXIT:
2195 if (__this_cpu_read(kvm_hyp_initialized))
2196 /* The hyp was enabled before suspend. */
2197 cpu_hyp_reinit();
2198
2199 return NOTIFY_OK;
2200
2201 default:
2202 return NOTIFY_DONE;
2203 }
2204 }
2205
2206 static struct notifier_block hyp_init_cpu_pm_nb = {
2207 .notifier_call = hyp_init_cpu_pm_notifier,
2208 };
2209
2210 static void __init hyp_cpu_pm_init(void)
2211 {
2212 if (!is_protected_kvm_enabled())
2213 cpu_pm_register_notifier(&hyp_init_cpu_pm_nb);
2214 }
2215 static void __init hyp_cpu_pm_exit(void)
2216 {
2217 if (!is_protected_kvm_enabled())
2218 cpu_pm_unregister_notifier(&hyp_init_cpu_pm_nb);
2219 }
2220 #else
2221 static inline void __init hyp_cpu_pm_init(void)
2222 {
2223 }
2224 static inline void __init hyp_cpu_pm_exit(void)
2225 {
2226 }
2227 #endif
2228
2229 static void __init init_cpu_logical_map(void)
2230 {
2231 unsigned int cpu;
2232
2233 /*
2234 * Copy the MPIDR <-> logical CPU ID mapping to hyp.
2235 * Only copy the set of online CPUs whose features have been checked
2236 * against the finalized system capabilities. The hypervisor will not
2237 * allow any other CPUs from the `possible` set to boot.
2238 */
2239 for_each_online_cpu(cpu)
2240 hyp_cpu_logical_map[cpu] = cpu_logical_map(cpu);
2241 }
2242
2243 #define init_psci_0_1_impl_state(config, what) \
2244 config.psci_0_1_ ## what ## _implemented = psci_ops.what
2245
2246 static bool __init init_psci_relay(void)
2247 {
2248 /*
2249 * If PSCI has not been initialized, protected KVM cannot install
2250 * itself on newly booted CPUs.
2251 */
2252 if (!psci_ops.get_version) {
2253 kvm_err("Cannot initialize protected mode without PSCI\n");
2254 return false;
2255 }
2256
2257 kvm_host_psci_config.version = psci_ops.get_version();
2258 kvm_host_psci_config.smccc_version = arm_smccc_get_version();
2259
2260 if (kvm_host_psci_config.version == PSCI_VERSION(0, 1)) {
2261 kvm_host_psci_config.function_ids_0_1 = get_psci_0_1_function_ids();
2262 init_psci_0_1_impl_state(kvm_host_psci_config, cpu_suspend);
2263 init_psci_0_1_impl_state(kvm_host_psci_config, cpu_on);
2264 init_psci_0_1_impl_state(kvm_host_psci_config, cpu_off);
2265 init_psci_0_1_impl_state(kvm_host_psci_config, migrate);
2266 }
2267 return true;
2268 }
2269
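/*
 * Bring up the subsystems that need EL2 access: register the CPU PM
 * notifier, initialise the VGIC and the architected timer, and register
 * the perf callbacks.
 */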
2270 static int __init init_subsystems(void)
2271 {
2272 int err = 0;
2273
2274 /*
2275 * Enable hardware so that subsystem initialisation can access EL2.
2276 */
2277 on_each_cpu(cpu_hyp_init, NULL, 1);
2278
2279 /*
2280 	 * Register CPU low-power notifier
2281 */
2282 hyp_cpu_pm_init();
2283
2284 /*
2285 * Init HYP view of VGIC
2286 */
2287 err = kvm_vgic_hyp_init();
2288 switch (err) {
2289 case 0:
2290 vgic_present = true;
2291 break;
2292 case -ENODEV:
2293 case -ENXIO:
2294 /*
2295 * No VGIC? No pKVM for you.
2296 *
2297 * Protected mode assumes that VGICv3 is present, so no point
2298 * in trying to hobble along if vgic initialization fails.
2299 */
2300 if (is_protected_kvm_enabled())
2301 goto out;
2302
2303 /*
2304 * Otherwise, userspace could choose to implement a GIC for its
2305 * guest on non-cooperative hardware.
2306 */
2307 vgic_present = false;
2308 err = 0;
2309 break;
2310 default:
2311 goto out;
2312 }
2313
2314 /*
2315 * Init HYP architected timer support
2316 */
2317 err = kvm_timer_hyp_init(vgic_present);
2318 if (err)
2319 goto out;
2320
2321 kvm_register_perf_callbacks(NULL);
2322
2323 out:
2324 if (err)
2325 hyp_cpu_pm_exit();
2326
2327 if (err || !is_protected_kvm_enabled())
2328 on_each_cpu(cpu_hyp_uninit, NULL, 1);
2329
2330 return err;
2331 }
2332
2333 static void __init teardown_subsystems(void)
2334 {
2335 kvm_unregister_perf_callbacks();
2336 hyp_cpu_pm_exit();
2337 }
2338
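/*
 * Undo init_hyp_mode(): free the hyp page tables, the per-CPU stacks and
 * per-CPU regions, and (in protected mode) the host SVE state.
 */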
2339 static void __init teardown_hyp_mode(void)
2340 {
2341 bool free_sve = system_supports_sve() && is_protected_kvm_enabled();
2342 int cpu;
2343
2344 free_hyp_pgds();
2345 for_each_possible_cpu(cpu) {
2346 free_pages(per_cpu(kvm_arm_hyp_stack_base, cpu), NVHE_STACK_SHIFT - PAGE_SHIFT);
2347 free_pages(kvm_nvhe_sym(kvm_arm_hyp_percpu_base)[cpu], nvhe_percpu_order());
2348
2349 if (free_sve) {
2350 struct cpu_sve_state *sve_state;
2351
2352 sve_state = per_cpu_ptr_nvhe_sym(kvm_host_data, cpu)->sve_state;
2353 free_pages((unsigned long) sve_state, pkvm_host_sve_state_order());
2354 }
2355 }
2356 }
2357
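/*
 * Hand the reserved hyp memory pool and per-CPU bases over to EL2 via
 * __pkvm_init; the stub hypercalls are unavailable from this point on.
 */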
2358 static int __init do_pkvm_init(u32 hyp_va_bits)
2359 {
2360 void *per_cpu_base = kvm_ksym_ref(kvm_nvhe_sym(kvm_arm_hyp_percpu_base));
2361 int ret;
2362
2363 preempt_disable();
2364 cpu_hyp_init_context();
2365 ret = kvm_call_hyp_nvhe(__pkvm_init, hyp_mem_base, hyp_mem_size,
2366 num_possible_cpus(), kern_hyp_va(per_cpu_base),
2367 hyp_va_bits);
2368 cpu_hyp_init_features();
2369
2370 /*
2371 * The stub hypercalls are now disabled, so set our local flag to
2372 * prevent a later re-init attempt in kvm_arch_enable_virtualization_cpu().
2373 */
2374 __this_cpu_write(kvm_hyp_initialized, 1);
2375 preempt_enable();
2376
2377 return ret;
2378 }
2379
2380 static u64 get_hyp_id_aa64pfr0_el1(void)
2381 {
2382 /*
2383 * Track whether the system isn't affected by spectre/meltdown in the
2384 * hypervisor's view of id_aa64pfr0_el1, used for protected VMs.
2385 * Although this is per-CPU, we make it global for simplicity, e.g., not
2386 * to have to worry about vcpu migration.
2387 *
2388 * Unlike for non-protected VMs, userspace cannot override this for
2389 * protected VMs.
2390 */
2391 u64 val = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1);
2392
2393 val &= ~(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV2) |
2394 ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV3));
2395
2396 val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV2),
2397 arm64_get_spectre_v2_state() == SPECTRE_UNAFFECTED);
2398 val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV3),
2399 arm64_get_meltdown_state() == SPECTRE_UNAFFECTED);
2400
2401 return val;
2402 }
2403
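/*
 * Copy the sanitised ID register values and other globals into the nVHE
 * hyp's view and flush them to the point of coherency.
 */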
2404 static void kvm_hyp_init_symbols(void)
2405 {
2406 kvm_nvhe_sym(id_aa64pfr0_el1_sys_val) = get_hyp_id_aa64pfr0_el1();
2407 kvm_nvhe_sym(id_aa64pfr1_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64PFR1_EL1);
2408 kvm_nvhe_sym(id_aa64isar0_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64ISAR0_EL1);
2409 kvm_nvhe_sym(id_aa64isar1_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64ISAR1_EL1);
2410 kvm_nvhe_sym(id_aa64isar2_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64ISAR2_EL1);
2411 kvm_nvhe_sym(id_aa64mmfr0_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1);
2412 kvm_nvhe_sym(id_aa64mmfr1_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1);
2413 kvm_nvhe_sym(id_aa64mmfr2_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64MMFR2_EL1);
2414 kvm_nvhe_sym(id_aa64smfr0_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64SMFR0_EL1);
2415 kvm_nvhe_sym(__icache_flags) = __icache_flags;
2416 kvm_nvhe_sym(kvm_arm_vmid_bits) = kvm_arm_vmid_bits;
2417
2418 /*
2419 * Flush entire BSS since part of its data containing init symbols is read
2420 * while the MMU is off.
2421 */
2422 kvm_flush_dcache_to_poc(kvm_ksym_ref(__hyp_bss_start),
2423 kvm_ksym_ref(__hyp_bss_end) - kvm_ksym_ref(__hyp_bss_start));
2424 }
2425
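/* Map the reserved hyp memory pool at EL2 and finish pKVM initialisation */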
2426 static int __init kvm_hyp_init_protection(u32 hyp_va_bits)
2427 {
2428 void *addr = phys_to_virt(hyp_mem_base);
2429 int ret;
2430
2431 ret = create_hyp_mappings(addr, addr + hyp_mem_size, PAGE_HYP);
2432 if (ret)
2433 return ret;
2434
2435 ret = do_pkvm_init(hyp_va_bits);
2436 if (ret)
2437 return ret;
2438
2439 free_hyp_pgds();
2440
2441 return 0;
2442 }
2443
2444 static int init_pkvm_host_sve_state(void)
2445 {
2446 int cpu;
2447
2448 if (!system_supports_sve())
2449 return 0;
2450
2451 /* Allocate pages for host sve state in protected mode. */
2452 for_each_possible_cpu(cpu) {
2453 struct page *page = alloc_pages(GFP_KERNEL, pkvm_host_sve_state_order());
2454
2455 if (!page)
2456 return -ENOMEM;
2457
2458 per_cpu_ptr_nvhe_sym(kvm_host_data, cpu)->sve_state = page_address(page);
2459 }
2460
2461 /*
2462 * Don't map the pages in hyp since these are only used in protected
2463 * mode, which will (re)create its own mapping when initialized.
2464 */
2465
2466 return 0;
2467 }
2468
2469 /*
2470 * Finalizes the initialization of hyp mode, once everything else is initialized
2471  * and the initialization process cannot fail.
2472 */
2473 static void finalize_init_hyp_mode(void)
2474 {
2475 int cpu;
2476
2477 if (system_supports_sve() && is_protected_kvm_enabled()) {
2478 for_each_possible_cpu(cpu) {
2479 struct cpu_sve_state *sve_state;
2480
2481 sve_state = per_cpu_ptr_nvhe_sym(kvm_host_data, cpu)->sve_state;
2482 per_cpu_ptr_nvhe_sym(kvm_host_data, cpu)->sve_state =
2483 kern_hyp_va(sve_state);
2484 }
2485 }
2486 }
2487
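/* Seed the hyp context's pointer authentication keys with random values */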
2488 static void pkvm_hyp_init_ptrauth(void)
2489 {
2490 struct kvm_cpu_context *hyp_ctxt;
2491 int cpu;
2492
2493 for_each_possible_cpu(cpu) {
2494 hyp_ctxt = per_cpu_ptr_nvhe_sym(kvm_hyp_ctxt, cpu);
2495 hyp_ctxt->sys_regs[APIAKEYLO_EL1] = get_random_long();
2496 hyp_ctxt->sys_regs[APIAKEYHI_EL1] = get_random_long();
2497 hyp_ctxt->sys_regs[APIBKEYLO_EL1] = get_random_long();
2498 hyp_ctxt->sys_regs[APIBKEYHI_EL1] = get_random_long();
2499 hyp_ctxt->sys_regs[APDAKEYLO_EL1] = get_random_long();
2500 hyp_ctxt->sys_regs[APDAKEYHI_EL1] = get_random_long();
2501 hyp_ctxt->sys_regs[APDBKEYLO_EL1] = get_random_long();
2502 hyp_ctxt->sys_regs[APDBKEYHI_EL1] = get_random_long();
2503 hyp_ctxt->sys_regs[APGAKEYLO_EL1] = get_random_long();
2504 hyp_ctxt->sys_regs[APGAKEYHI_EL1] = get_random_long();
2505 }
2506 }
2507
2508 /* Inits Hyp-mode on all online CPUs */
2509 static int __init init_hyp_mode(void)
2510 {
2511 u32 hyp_va_bits;
2512 int cpu;
2513 int err = -ENOMEM;
2514
2515 /*
2516 * The protected Hyp-mode cannot be initialized if the memory pool
2517 * allocation has failed.
2518 */
2519 if (is_protected_kvm_enabled() && !hyp_mem_base)
2520 goto out_err;
2521
2522 /*
2523 * Allocate Hyp PGD and setup Hyp identity mapping
2524 */
2525 err = kvm_mmu_init(&hyp_va_bits);
2526 if (err)
2527 goto out_err;
2528
2529 /*
2530 * Allocate stack pages for Hypervisor-mode
2531 */
2532 for_each_possible_cpu(cpu) {
2533 unsigned long stack_base;
2534
2535 stack_base = __get_free_pages(GFP_KERNEL, NVHE_STACK_SHIFT - PAGE_SHIFT);
2536 if (!stack_base) {
2537 err = -ENOMEM;
2538 goto out_err;
2539 }
2540
2541 per_cpu(kvm_arm_hyp_stack_base, cpu) = stack_base;
2542 }
2543
2544 /*
2545 * Allocate and initialize pages for Hypervisor-mode percpu regions.
2546 */
2547 for_each_possible_cpu(cpu) {
2548 struct page *page;
2549 void *page_addr;
2550
2551 page = alloc_pages(GFP_KERNEL, nvhe_percpu_order());
2552 if (!page) {
2553 err = -ENOMEM;
2554 goto out_err;
2555 }
2556
2557 page_addr = page_address(page);
2558 memcpy(page_addr, CHOOSE_NVHE_SYM(__per_cpu_start), nvhe_percpu_size());
2559 kvm_nvhe_sym(kvm_arm_hyp_percpu_base)[cpu] = (unsigned long)page_addr;
2560 }
2561
2562 /*
2563 * Map the Hyp-code called directly from the host
2564 */
2565 err = create_hyp_mappings(kvm_ksym_ref(__hyp_text_start),
2566 kvm_ksym_ref(__hyp_text_end), PAGE_HYP_EXEC);
2567 if (err) {
2568 kvm_err("Cannot map world-switch code\n");
2569 goto out_err;
2570 }
2571
2572 err = create_hyp_mappings(kvm_ksym_ref(__hyp_rodata_start),
2573 kvm_ksym_ref(__hyp_rodata_end), PAGE_HYP_RO);
2574 if (err) {
2575 kvm_err("Cannot map .hyp.rodata section\n");
2576 goto out_err;
2577 }
2578
2579 err = create_hyp_mappings(kvm_ksym_ref(__start_rodata),
2580 kvm_ksym_ref(__end_rodata), PAGE_HYP_RO);
2581 if (err) {
2582 kvm_err("Cannot map rodata section\n");
2583 goto out_err;
2584 }
2585
2586 /*
2587 * .hyp.bss is guaranteed to be placed at the beginning of the .bss
2588 * section thanks to an assertion in the linker script. Map it RW and
2589 * the rest of .bss RO.
2590 */
2591 err = create_hyp_mappings(kvm_ksym_ref(__hyp_bss_start),
2592 kvm_ksym_ref(__hyp_bss_end), PAGE_HYP);
2593 if (err) {
2594 kvm_err("Cannot map hyp bss section: %d\n", err);
2595 goto out_err;
2596 }
2597
2598 err = create_hyp_mappings(kvm_ksym_ref(__hyp_bss_end),
2599 kvm_ksym_ref(__bss_stop), PAGE_HYP_RO);
2600 if (err) {
2601 kvm_err("Cannot map bss section\n");
2602 goto out_err;
2603 }
2604
2605 /*
2606 * Map the Hyp stack pages
2607 */
2608 for_each_possible_cpu(cpu) {
2609 struct kvm_nvhe_init_params *params = per_cpu_ptr_nvhe_sym(kvm_init_params, cpu);
2610 char *stack_base = (char *)per_cpu(kvm_arm_hyp_stack_base, cpu);
2611
2612 		err = create_hyp_stack(__pa(stack_base), &params->stack_hyp_va);
2613 if (err) {
2614 kvm_err("Cannot map hyp stack\n");
2615 goto out_err;
2616 }
2617
2618 /*
2619 * Save the stack PA in nvhe_init_params. This will be needed
2620 * to recreate the stack mapping in protected nVHE mode.
2621 * __hyp_pa() won't do the right thing there, since the stack
2622 * has been mapped in the flexible private VA space.
2623 */
2624 params->stack_pa = __pa(stack_base);
2625 }
2626
2627 for_each_possible_cpu(cpu) {
2628 char *percpu_begin = (char *)kvm_nvhe_sym(kvm_arm_hyp_percpu_base)[cpu];
2629 char *percpu_end = percpu_begin + nvhe_percpu_size();
2630
2631 /* Map Hyp percpu pages */
2632 err = create_hyp_mappings(percpu_begin, percpu_end, PAGE_HYP);
2633 if (err) {
2634 kvm_err("Cannot map hyp percpu region\n");
2635 goto out_err;
2636 }
2637
2638 /* Prepare the CPU initialization parameters */
2639 cpu_prepare_hyp_mode(cpu, hyp_va_bits);
2640 }
2641
2642 kvm_hyp_init_symbols();
2643
2644 if (is_protected_kvm_enabled()) {
2645 if (IS_ENABLED(CONFIG_ARM64_PTR_AUTH_KERNEL) &&
2646 cpus_have_final_cap(ARM64_HAS_ADDRESS_AUTH))
2647 pkvm_hyp_init_ptrauth();
2648
2649 init_cpu_logical_map();
2650
2651 if (!init_psci_relay()) {
2652 err = -ENODEV;
2653 goto out_err;
2654 }
2655
2656 err = init_pkvm_host_sve_state();
2657 if (err)
2658 goto out_err;
2659
2660 err = kvm_hyp_init_protection(hyp_va_bits);
2661 if (err) {
2662 kvm_err("Failed to init hyp memory protection\n");
2663 goto out_err;
2664 }
2665 }
2666
2667 return 0;
2668
2669 out_err:
2670 teardown_hyp_mode();
2671 kvm_err("error initializing Hyp mode: %d\n", err);
2672 return err;
2673 }
2674
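/*
 * Resolve an MPIDR to its vcpu: try the cached mpidr_data index first,
 * then fall back to a linear scan of the vcpus.
 */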
2675 struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr)
2676 {
2677 struct kvm_vcpu *vcpu = NULL;
2678 struct kvm_mpidr_data *data;
2679 unsigned long i;
2680
2681 mpidr &= MPIDR_HWID_BITMASK;
2682
2683 rcu_read_lock();
2684 data = rcu_dereference(kvm->arch.mpidr_data);
2685
2686 if (data) {
2687 u16 idx = kvm_mpidr_index(data, mpidr);
2688
2689 vcpu = kvm_get_vcpu(kvm, data->cmpidr_to_idx[idx]);
2690 if (mpidr != kvm_vcpu_get_mpidr_aff(vcpu))
2691 vcpu = NULL;
2692 }
2693
2694 rcu_read_unlock();
2695
2696 if (vcpu)
2697 return vcpu;
2698
2699 kvm_for_each_vcpu(i, vcpu, kvm) {
2700 if (mpidr == kvm_vcpu_get_mpidr_aff(vcpu))
2701 return vcpu;
2702 }
2703 return NULL;
2704 }
2705
2706 bool kvm_arch_irqchip_in_kernel(struct kvm *kvm)
2707 {
2708 return irqchip_in_kernel(kvm);
2709 }
2710
2711 bool kvm_arch_has_irq_bypass(void)
2712 {
2713 return true;
2714 }
2715
2716 int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons,
2717 struct irq_bypass_producer *prod)
2718 {
2719 struct kvm_kernel_irqfd *irqfd =
2720 container_of(cons, struct kvm_kernel_irqfd, consumer);
2721
2722 return kvm_vgic_v4_set_forwarding(irqfd->kvm, prod->irq,
2723 &irqfd->irq_entry);
2724 }
2725 void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
2726 struct irq_bypass_producer *prod)
2727 {
2728 struct kvm_kernel_irqfd *irqfd =
2729 container_of(cons, struct kvm_kernel_irqfd, consumer);
2730
2731 kvm_vgic_v4_unset_forwarding(irqfd->kvm, prod->irq,
2732 &irqfd->irq_entry);
2733 }
2734
2735 void kvm_arch_irq_bypass_stop(struct irq_bypass_consumer *cons)
2736 {
2737 struct kvm_kernel_irqfd *irqfd =
2738 container_of(cons, struct kvm_kernel_irqfd, consumer);
2739
2740 kvm_arm_halt_guest(irqfd->kvm);
2741 }
2742
2743 void kvm_arch_irq_bypass_start(struct irq_bypass_consumer *cons)
2744 {
2745 struct kvm_kernel_irqfd *irqfd =
2746 container_of(cons, struct kvm_kernel_irqfd, consumer);
2747
2748 kvm_arm_resume_guest(irqfd->kvm);
2749 }
2750
2751 /* Initialize Hyp-mode and memory mappings on all CPUs */
2752 static __init int kvm_arm_init(void)
2753 {
2754 int err;
2755 bool in_hyp_mode;
2756
2757 if (!is_hyp_mode_available()) {
2758 kvm_info("HYP mode not available\n");
2759 return -ENODEV;
2760 }
2761
2762 if (kvm_get_mode() == KVM_MODE_NONE) {
2763 kvm_info("KVM disabled from command line\n");
2764 return -ENODEV;
2765 }
2766
2767 err = kvm_sys_reg_table_init();
2768 if (err) {
2769 kvm_info("Error initializing system register tables");
2770 return err;
2771 }
2772
2773 in_hyp_mode = is_kernel_in_hyp_mode();
2774
2775 if (cpus_have_final_cap(ARM64_WORKAROUND_DEVICE_LOAD_ACQUIRE) ||
2776 cpus_have_final_cap(ARM64_WORKAROUND_1508412))
2777 kvm_info("Guests without required CPU erratum workarounds can deadlock system!\n" \
2778 "Only trusted guests should be used on this system.\n");
2779
2780 err = kvm_set_ipa_limit();
2781 if (err)
2782 return err;
2783
2784 err = kvm_arm_init_sve();
2785 if (err)
2786 return err;
2787
2788 err = kvm_arm_vmid_alloc_init();
2789 if (err) {
2790 kvm_err("Failed to initialize VMID allocator.\n");
2791 return err;
2792 }
2793
2794 if (!in_hyp_mode) {
2795 err = init_hyp_mode();
2796 if (err)
2797 goto out_err;
2798 }
2799
2800 err = kvm_init_vector_slots();
2801 if (err) {
2802 kvm_err("Cannot initialise vector slots\n");
2803 goto out_hyp;
2804 }
2805
2806 err = init_subsystems();
2807 if (err)
2808 goto out_hyp;
2809
2810 kvm_info("%s%sVHE mode initialized successfully\n",
2811 in_hyp_mode ? "" : (is_protected_kvm_enabled() ?
2812 "Protected " : "Hyp "),
2813 in_hyp_mode ? "" : (cpus_have_final_cap(ARM64_KVM_HVHE) ?
2814 "h" : "n"));
2815
2816 /*
2817 * FIXME: Do something reasonable if kvm_init() fails after pKVM
2818 * hypervisor protection is finalized.
2819 */
2820 err = kvm_init(sizeof(struct kvm_vcpu), 0, THIS_MODULE);
2821 if (err)
2822 goto out_subs;
2823
2824 /*
2825 * This should be called after initialization is done and failure isn't
2826 * possible anymore.
2827 */
2828 if (!in_hyp_mode)
2829 finalize_init_hyp_mode();
2830
2831 kvm_arm_initialised = true;
2832
2833 return 0;
2834
2835 out_subs:
2836 teardown_subsystems();
2837 out_hyp:
2838 if (!in_hyp_mode)
2839 teardown_hyp_mode();
2840 out_err:
2841 kvm_arm_vmid_alloc_free();
2842 return err;
2843 }
2844
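/* Parse the kvm-arm.mode early parameter: "none", "protected", "nvhe" or "nested" */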
2845 static int __init early_kvm_mode_cfg(char *arg)
2846 {
2847 if (!arg)
2848 return -EINVAL;
2849
2850 if (strcmp(arg, "none") == 0) {
2851 kvm_mode = KVM_MODE_NONE;
2852 return 0;
2853 }
2854
2855 if (!is_hyp_mode_available()) {
2856 pr_warn_once("KVM is not available. Ignoring kvm-arm.mode\n");
2857 return 0;
2858 }
2859
2860 if (strcmp(arg, "protected") == 0) {
2861 if (!is_kernel_in_hyp_mode())
2862 kvm_mode = KVM_MODE_PROTECTED;
2863 else
2864 pr_warn_once("Protected KVM not available with VHE\n");
2865
2866 return 0;
2867 }
2868
2869 if (strcmp(arg, "nvhe") == 0 && !WARN_ON(is_kernel_in_hyp_mode())) {
2870 kvm_mode = KVM_MODE_DEFAULT;
2871 return 0;
2872 }
2873
2874 if (strcmp(arg, "nested") == 0 && !WARN_ON(!is_kernel_in_hyp_mode())) {
2875 kvm_mode = KVM_MODE_NV;
2876 return 0;
2877 }
2878
2879 return -EINVAL;
2880 }
2881 early_param("kvm-arm.mode", early_kvm_mode_cfg);
2882
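/* Parse "trap" / "notrap" for the kvm-arm.wfi_trap_policy and kvm-arm.wfe_trap_policy early parameters */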
2883 static int __init early_kvm_wfx_trap_policy_cfg(char *arg, enum kvm_wfx_trap_policy *p)
2884 {
2885 if (!arg)
2886 return -EINVAL;
2887
2888 if (strcmp(arg, "trap") == 0) {
2889 *p = KVM_WFX_TRAP;
2890 return 0;
2891 }
2892
2893 if (strcmp(arg, "notrap") == 0) {
2894 *p = KVM_WFX_NOTRAP;
2895 return 0;
2896 }
2897
2898 return -EINVAL;
2899 }
2900
2901 static int __init early_kvm_wfi_trap_policy_cfg(char *arg)
2902 {
2903 return early_kvm_wfx_trap_policy_cfg(arg, &kvm_wfi_trap_policy);
2904 }
2905 early_param("kvm-arm.wfi_trap_policy", early_kvm_wfi_trap_policy_cfg);
2906
2907 static int __init early_kvm_wfe_trap_policy_cfg(char *arg)
2908 {
2909 return early_kvm_wfx_trap_policy_cfg(arg, &kvm_wfe_trap_policy);
2910 }
2911 early_param("kvm-arm.wfe_trap_policy", early_kvm_wfe_trap_policy_cfg);
2912
2913 enum kvm_mode kvm_get_mode(void)
2914 {
2915 return kvm_mode;
2916 }
2917
2918 module_init(kvm_arm_init);
2919