1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3 * Virtio-mem device driver.
4 *
5 * Copyright Red Hat, Inc. 2020
6 *
7 * Author(s): David Hildenbrand <[email protected]>
8 */
9
10 #include <linux/virtio.h>
11 #include <linux/virtio_mem.h>
12 #include <linux/workqueue.h>
13 #include <linux/slab.h>
14 #include <linux/module.h>
15 #include <linux/mm.h>
16 #include <linux/memory_hotplug.h>
17 #include <linux/memory.h>
18 #include <linux/hrtimer.h>
19 #include <linux/crash_dump.h>
20 #include <linux/mutex.h>
21 #include <linux/bitmap.h>
22 #include <linux/lockdep.h>
23 #include <linux/log2.h>
24 #include <linux/vmalloc.h>
25 #include <linux/suspend.h>
26
27 #include <acpi/acpi_numa.h>
28
29 static bool unplug_online = true;
30 module_param(unplug_online, bool, 0644);
31 MODULE_PARM_DESC(unplug_online, "Try to unplug online memory");
32
33 static bool force_bbm;
34 module_param(force_bbm, bool, 0444);
35 MODULE_PARM_DESC(force_bbm,
36 "Force Big Block Mode. Default is 0 (auto-selection)");
37
38 static unsigned long bbm_block_size;
39 module_param(bbm_block_size, ulong, 0444);
40 MODULE_PARM_DESC(bbm_block_size,
41 "Big Block size in bytes. Default is 0 (auto-detection).");
42
43 /*
44 * virtio-mem currently supports the following modes of operation:
45 *
46 * * Sub Block Mode (SBM): A Linux memory block spans 2..X subblocks (SB). The
47 * size of a Sub Block (SB) is determined based on the device block size, the
48 * pageblock size, and the maximum allocation granularity of the buddy.
49 * Subblocks within a Linux memory block might either be plugged or unplugged.
50 * Memory is added to/removed from Linux MM in Linux memory block granularity.
51 *
52 * * Big Block Mode (BBM): A Big Block (BB) spans 1..X Linux memory blocks.
53 * Memory is added to/removed from Linux MM in Big Block granularity.
54 *
55 * The mode is determined automatically based on the Linux memory block size
56 * and the device block size.
57 *
58 * User space / core MM (auto onlining) is responsible for onlining added
59 * Linux memory blocks - and for selecting a zone. Linux memory blocks are
60 * always onlined separately, and all memory within a Linux memory block is
61 * onlined to the same zone - virtio-mem relies on this behavior.
62 */
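/*
 * Example (illustrative, assuming typical x86-64 values): with 128 MiB Linux
 * memory blocks and a 2 MiB device block size, the driver runs in SBM with a
 * subblock size of a few MiB (the largest of the device block size, the
 * pageblock size and the maximum buddy allocation size). With a 1 GiB device
 * block size, SBM is not possible and the driver falls back to BBM with
 * 1 GiB big blocks.
 */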
63
64 /*
65 * State of a Linux memory block in SBM.
66 */
67 enum virtio_mem_sbm_mb_state {
68 /* Unplugged, not added to Linux. Can be reused later. */
69 VIRTIO_MEM_SBM_MB_UNUSED = 0,
70 /* (Partially) plugged, not added to Linux. Error on add_memory(). */
71 VIRTIO_MEM_SBM_MB_PLUGGED,
72 /* Fully plugged, fully added to Linux, offline. */
73 VIRTIO_MEM_SBM_MB_OFFLINE,
74 /* Partially plugged, fully added to Linux, offline. */
75 VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL,
76 /* Fully plugged, fully added to Linux, onlined to a kernel zone. */
77 VIRTIO_MEM_SBM_MB_KERNEL,
78 /* Partially plugged, fully added to Linux, onlined to a kernel zone. */
79 VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL,
80 /* Fully plugged, fully added to Linux, onlined to ZONE_MOVABLE. */
81 VIRTIO_MEM_SBM_MB_MOVABLE,
82 /* Partially plugged, fully added to Linux, onlined to ZONE_MOVABLE. */
83 VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL,
84 VIRTIO_MEM_SBM_MB_COUNT
85 };
86
87 /*
88 * State of a Big Block (BB) in BBM, covering 1..X Linux memory blocks.
89 */
90 enum virtio_mem_bbm_bb_state {
91 /* Unplugged, not added to Linux. Can be reused later. */
92 VIRTIO_MEM_BBM_BB_UNUSED = 0,
93 /* Plugged, not added to Linux. Error on add_memory(). */
94 VIRTIO_MEM_BBM_BB_PLUGGED,
95 /* Plugged and added to Linux. */
96 VIRTIO_MEM_BBM_BB_ADDED,
97 /* All online parts are fake-offline, ready to remove. */
98 VIRTIO_MEM_BBM_BB_FAKE_OFFLINE,
99 VIRTIO_MEM_BBM_BB_COUNT
100 };
101
102 struct virtio_mem {
103 struct virtio_device *vdev;
104
105 /* We might first have to unplug all memory when starting up. */
106 bool unplug_all_required;
107
108 /* Workqueue that processes the plug/unplug requests. */
109 struct work_struct wq;
110 atomic_t wq_active;
111 atomic_t config_changed;
112
113 /* Virtqueue for guest->host requests. */
114 struct virtqueue *vq;
115
116 /* Wait for a host response to a guest request. */
117 wait_queue_head_t host_resp;
118
119 /* Space for one guest request and the host response. */
120 struct virtio_mem_req req;
121 struct virtio_mem_resp resp;
122
123 /* The current size of the device. */
124 uint64_t plugged_size;
125 /* The requested size of the device. */
126 uint64_t requested_size;
127
128 /* The device block size (for communicating with the device). */
129 uint64_t device_block_size;
130 /* The determined node id for all memory of the device. */
131 int nid;
132 /* Physical start address of the memory region. */
133 uint64_t addr;
134 /* Maximum region size in bytes. */
135 uint64_t region_size;
136 /* Usable region size in bytes. */
137 uint64_t usable_region_size;
138
139 /* The parent resource for all memory added via this device. */
140 struct resource *parent_resource;
141 /*
142 * Copy of "System RAM (virtio_mem)" to be used for
143 * add_memory_driver_managed().
144 */
145 const char *resource_name;
146 /* Memory group identification. */
147 int mgid;
148
149 /*
150 * We don't want to add too much memory if it's not getting onlined,
151 * to avoid running OOM. Besides this threshold, we always allow at
152 * least two offline blocks at a time (whichever is bigger).
153 */
154 #define VIRTIO_MEM_DEFAULT_OFFLINE_THRESHOLD (1024 * 1024 * 1024)
155 atomic64_t offline_size;
156 uint64_t offline_threshold;
157
158 /* If set, the driver is in SBM, otherwise in BBM. */
159 bool in_sbm;
160
161 union {
162 struct {
163 /* Id of the first memory block of this device. */
164 unsigned long first_mb_id;
165 /* Id of the last usable memory block of this device. */
166 unsigned long last_usable_mb_id;
167 /* Id of the next memory block to prepare when needed. */
168 unsigned long next_mb_id;
169
170 /* The subblock size. */
171 uint64_t sb_size;
172 /* The number of subblocks per Linux memory block. */
173 uint32_t sbs_per_mb;
174
175 /*
176 * Some of the Linux memory blocks tracked as "partially
177 * plugged" are completely unplugged and can be offlined
178 * and removed -- which previously failed.
179 */
180 bool have_unplugged_mb;
181
182 /* Summary of all memory block states. */
183 unsigned long mb_count[VIRTIO_MEM_SBM_MB_COUNT];
184
185 /*
186 * One byte state per memory block. Allocated via
187 * vmalloc(). Resized (alloc+copy+free) on demand.
188 *
189 * With 128 MiB memory blocks, we have states for 512
190 * GiB of memory in one 4 KiB page.
191 */
192 uint8_t *mb_states;
193
194 /*
195 * Bitmap: one bit per subblock. Allocated similar to
196 * sbm.mb_states.
197 *
198 * A set bit means the corresponding subblock is
199 * plugged, otherwise it's unplugged.
200 *
201 * With 4 MiB subblocks, we manage 128 GiB of memory
202 * in one 4 KiB page.
203 */
204 unsigned long *sb_states;
205 } sbm;
206
207 struct {
208 /* Id of the first big block of this device. */
209 unsigned long first_bb_id;
210 /* Id of the last usable big block of this device. */
211 unsigned long last_usable_bb_id;
212 /* Id of the next big block to prepare when needed. */
213 unsigned long next_bb_id;
214
215 /* Summary of all big block states. */
216 unsigned long bb_count[VIRTIO_MEM_BBM_BB_COUNT];
217
218 /* One byte state per big block. See sbm.mb_states. */
219 uint8_t *bb_states;
220
221 /* The block size used for plugging/adding/removing. */
222 uint64_t bb_size;
223 } bbm;
224 };
225
226 /*
227 * Mutex that protects the sbm.mb_count, sbm.mb_states,
228 * sbm.sb_states, bbm.bb_count, and bbm.bb_states
229 *
230 * When this lock is held the pointers can't change, ONLINE and
231 * OFFLINE blocks can't change the state and no subblocks will get
232 * plugged/unplugged.
233 *
234 * In kdump mode, used to serialize requests, last_block_addr and
235 * last_block_plugged.
236 */
237 struct mutex hotplug_mutex;
238 bool hotplug_active;
239
240 /* An error occurred we cannot handle - stop processing requests. */
241 bool broken;
242
243 /* Cached value of is_kdump_kernel() when the device was probed. */
244 bool in_kdump;
245
246 /* The driver is being removed. */
247 spinlock_t removal_lock;
248 bool removing;
249
250 /* Timer for retrying to plug/unplug memory. */
251 struct hrtimer retry_timer;
252 unsigned int retry_timer_ms;
253 #define VIRTIO_MEM_RETRY_TIMER_MIN_MS 50000
254 #define VIRTIO_MEM_RETRY_TIMER_MAX_MS 300000
255
256 /* Memory notifier (online/offline events). */
257 struct notifier_block memory_notifier;
258
259 /* Notifier to block hibernation image storing/reloading. */
260 struct notifier_block pm_notifier;
261
262 #ifdef CONFIG_PROC_VMCORE
263 /* vmcore callback for /proc/vmcore handling in kdump mode */
264 struct vmcore_cb vmcore_cb;
265 uint64_t last_block_addr;
266 bool last_block_plugged;
267 #endif /* CONFIG_PROC_VMCORE */
268
269 /* Next device in the list of virtio-mem devices. */
270 struct list_head next;
271 };
272
273 /*
274 * We have to share a single online_page callback among all virtio-mem
275 * devices. We use RCU to iterate the list in the callback.
276 */
277 static DEFINE_MUTEX(virtio_mem_mutex);
278 static LIST_HEAD(virtio_mem_devices);
279
280 static void virtio_mem_online_page_cb(struct page *page, unsigned int order);
281 static void virtio_mem_fake_offline_going_offline(unsigned long pfn,
282 unsigned long nr_pages);
283 static void virtio_mem_fake_offline_cancel_offline(unsigned long pfn,
284 unsigned long nr_pages);
285 static void virtio_mem_retry(struct virtio_mem *vm);
286 static int virtio_mem_create_resource(struct virtio_mem *vm);
287 static void virtio_mem_delete_resource(struct virtio_mem *vm);
288
289 /*
290 * Register a virtio-mem device so it will be considered for the online_page
291 * callback.
292 */
293 static int register_virtio_mem_device(struct virtio_mem *vm)
294 {
295 int rc = 0;
296
297 /* First device registers the callback. */
298 mutex_lock(&virtio_mem_mutex);
299 if (list_empty(&virtio_mem_devices))
300 rc = set_online_page_callback(&virtio_mem_online_page_cb);
301 if (!rc)
302 list_add_rcu(&vm->next, &virtio_mem_devices);
303 mutex_unlock(&virtio_mem_mutex);
304
305 return rc;
306 }
307
308 /*
309 * Unregister a virtio-mem device so it will no longer be considered for the
310 * online_page callback.
311 */
312 static void unregister_virtio_mem_device(struct virtio_mem *vm)
313 {
314 /* Last device unregisters the callback. */
315 mutex_lock(&virtio_mem_mutex);
316 list_del_rcu(&vm->next);
317 if (list_empty(&virtio_mem_devices))
318 restore_online_page_callback(&virtio_mem_online_page_cb);
319 mutex_unlock(&virtio_mem_mutex);
320
321 synchronize_rcu();
322 }
323
324 /*
325 * Calculate the memory block id of a given address.
326 */
327 static unsigned long virtio_mem_phys_to_mb_id(unsigned long addr)
328 {
329 return addr / memory_block_size_bytes();
330 }
331
332 /*
333 * Calculate the physical start address of a given memory block id.
334 */
335 static unsigned long virtio_mem_mb_id_to_phys(unsigned long mb_id)
336 {
337 return mb_id * memory_block_size_bytes();
338 }
339
340 /*
341 * Calculate the big block id of a given address.
342 */
343 static unsigned long virtio_mem_phys_to_bb_id(struct virtio_mem *vm,
344 uint64_t addr)
345 {
346 return addr / vm->bbm.bb_size;
347 }
348
349 /*
350 * Calculate the physical start address of a given big block id.
351 */
352 static uint64_t virtio_mem_bb_id_to_phys(struct virtio_mem *vm,
353 unsigned long bb_id)
354 {
355 return bb_id * vm->bbm.bb_size;
356 }
357
358 /*
359 * Calculate the subblock id of a given address.
360 */
361 static unsigned long virtio_mem_phys_to_sb_id(struct virtio_mem *vm,
362 unsigned long addr)
363 {
364 const unsigned long mb_id = virtio_mem_phys_to_mb_id(addr);
365 const unsigned long mb_addr = virtio_mem_mb_id_to_phys(mb_id);
366
367 return (addr - mb_addr) / vm->sbm.sb_size;
368 }
369
370 /*
371 * Set the state of a big block, taking care of the state counter.
372 */
373 static void virtio_mem_bbm_set_bb_state(struct virtio_mem *vm,
374 unsigned long bb_id,
375 enum virtio_mem_bbm_bb_state state)
376 {
377 const unsigned long idx = bb_id - vm->bbm.first_bb_id;
378 enum virtio_mem_bbm_bb_state old_state;
379
380 old_state = vm->bbm.bb_states[idx];
381 vm->bbm.bb_states[idx] = state;
382
383 BUG_ON(vm->bbm.bb_count[old_state] == 0);
384 vm->bbm.bb_count[old_state]--;
385 vm->bbm.bb_count[state]++;
386 }
387
388 /*
389 * Get the state of a big block.
390 */
391 static enum virtio_mem_bbm_bb_state virtio_mem_bbm_get_bb_state(struct virtio_mem *vm,
392 unsigned long bb_id)
393 {
394 return vm->bbm.bb_states[bb_id - vm->bbm.first_bb_id];
395 }
396
397 /*
398 * Prepare the big block state array for the next big block.
399 */
400 static int virtio_mem_bbm_bb_states_prepare_next_bb(struct virtio_mem *vm)
401 {
402 unsigned long old_bytes = vm->bbm.next_bb_id - vm->bbm.first_bb_id;
403 unsigned long new_bytes = old_bytes + 1;
404 int old_pages = PFN_UP(old_bytes);
405 int new_pages = PFN_UP(new_bytes);
406 uint8_t *new_array;
407
408 if (vm->bbm.bb_states && old_pages == new_pages)
409 return 0;
410
411 new_array = vzalloc(new_pages * PAGE_SIZE);
412 if (!new_array)
413 return -ENOMEM;
414
415 mutex_lock(&vm->hotplug_mutex);
416 if (vm->bbm.bb_states)
417 memcpy(new_array, vm->bbm.bb_states, old_pages * PAGE_SIZE);
418 vfree(vm->bbm.bb_states);
419 vm->bbm.bb_states = new_array;
420 mutex_unlock(&vm->hotplug_mutex);
421
422 return 0;
423 }
424
425 #define virtio_mem_bbm_for_each_bb(_vm, _bb_id, _state) \
426 for (_bb_id = _vm->bbm.first_bb_id; \
427 _bb_id < _vm->bbm.next_bb_id && _vm->bbm.bb_count[_state]; \
428 _bb_id++) \
429 if (virtio_mem_bbm_get_bb_state(_vm, _bb_id) == _state)
430
431 #define virtio_mem_bbm_for_each_bb_rev(_vm, _bb_id, _state) \
432 for (_bb_id = _vm->bbm.next_bb_id - 1; \
433 _bb_id >= _vm->bbm.first_bb_id && _vm->bbm.bb_count[_state]; \
434 _bb_id--) \
435 if (virtio_mem_bbm_get_bb_state(_vm, _bb_id) == _state)
436
437 /*
438 * Set the state of a memory block, taking care of the state counter.
439 */
440 static void virtio_mem_sbm_set_mb_state(struct virtio_mem *vm,
441 unsigned long mb_id, uint8_t state)
442 {
443 const unsigned long idx = mb_id - vm->sbm.first_mb_id;
444 uint8_t old_state;
445
446 old_state = vm->sbm.mb_states[idx];
447 vm->sbm.mb_states[idx] = state;
448
449 BUG_ON(vm->sbm.mb_count[old_state] == 0);
450 vm->sbm.mb_count[old_state]--;
451 vm->sbm.mb_count[state]++;
452 }
453
454 /*
455 * Get the state of a memory block.
456 */
457 static uint8_t virtio_mem_sbm_get_mb_state(struct virtio_mem *vm,
458 unsigned long mb_id)
459 {
460 const unsigned long idx = mb_id - vm->sbm.first_mb_id;
461
462 return vm->sbm.mb_states[idx];
463 }
464
465 /*
466 * Prepare the state array for the next memory block.
467 */
468 static int virtio_mem_sbm_mb_states_prepare_next_mb(struct virtio_mem *vm)
469 {
470 int old_pages = PFN_UP(vm->sbm.next_mb_id - vm->sbm.first_mb_id);
471 int new_pages = PFN_UP(vm->sbm.next_mb_id - vm->sbm.first_mb_id + 1);
472 uint8_t *new_array;
473
474 if (vm->sbm.mb_states && old_pages == new_pages)
475 return 0;
476
477 new_array = vzalloc(new_pages * PAGE_SIZE);
478 if (!new_array)
479 return -ENOMEM;
480
481 mutex_lock(&vm->hotplug_mutex);
482 if (vm->sbm.mb_states)
483 memcpy(new_array, vm->sbm.mb_states, old_pages * PAGE_SIZE);
484 vfree(vm->sbm.mb_states);
485 vm->sbm.mb_states = new_array;
486 mutex_unlock(&vm->hotplug_mutex);
487
488 return 0;
489 }
490
491 #define virtio_mem_sbm_for_each_mb(_vm, _mb_id, _state) \
492 for (_mb_id = _vm->sbm.first_mb_id; \
493 _mb_id < _vm->sbm.next_mb_id && _vm->sbm.mb_count[_state]; \
494 _mb_id++) \
495 if (virtio_mem_sbm_get_mb_state(_vm, _mb_id) == _state)
496
497 #define virtio_mem_sbm_for_each_mb_rev(_vm, _mb_id, _state) \
498 for (_mb_id = _vm->sbm.next_mb_id - 1; \
499 _mb_id >= _vm->sbm.first_mb_id && _vm->sbm.mb_count[_state]; \
500 _mb_id--) \
501 if (virtio_mem_sbm_get_mb_state(_vm, _mb_id) == _state)
502
503 /*
504 * Calculate the bit number in the subblock bitmap for the given subblock
505 * inside the given memory block.
506 */
507 static int virtio_mem_sbm_sb_state_bit_nr(struct virtio_mem *vm,
508 unsigned long mb_id, int sb_id)
509 {
510 return (mb_id - vm->sbm.first_mb_id) * vm->sbm.sbs_per_mb + sb_id;
511 }
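/*
 * Example (illustrative): with vm->sbm.sbs_per_mb == 32, subblock 3 of the
 * second tracked memory block (mb_id == vm->sbm.first_mb_id + 1) maps to
 * bit 1 * 32 + 3 == 35 in vm->sbm.sb_states.
 */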
512
513 /*
514 * Mark all selected subblocks plugged.
515 *
516 * Will not modify the state of the memory block.
517 */
518 static void virtio_mem_sbm_set_sb_plugged(struct virtio_mem *vm,
519 unsigned long mb_id, int sb_id,
520 int count)
521 {
522 const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id);
523
524 __bitmap_set(vm->sbm.sb_states, bit, count);
525 }
526
527 /*
528 * Mark all selected subblocks unplugged.
529 *
530 * Will not modify the state of the memory block.
531 */
532 static void virtio_mem_sbm_set_sb_unplugged(struct virtio_mem *vm,
533 unsigned long mb_id, int sb_id,
534 int count)
535 {
536 const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id);
537
538 __bitmap_clear(vm->sbm.sb_states, bit, count);
539 }
540
541 /*
542 * Test if all selected subblocks are plugged.
543 */
544 static bool virtio_mem_sbm_test_sb_plugged(struct virtio_mem *vm,
545 unsigned long mb_id, int sb_id,
546 int count)
547 {
548 const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id);
549
550 if (count == 1)
551 return test_bit(bit, vm->sbm.sb_states);
552
553 /* TODO: Helper similar to bitmap_set() */
554 return find_next_zero_bit(vm->sbm.sb_states, bit + count, bit) >=
555 bit + count;
556 }
557
558 /*
559 * Test if all selected subblocks are unplugged.
560 */
561 static bool virtio_mem_sbm_test_sb_unplugged(struct virtio_mem *vm,
562 unsigned long mb_id, int sb_id,
563 int count)
564 {
565 const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id);
566
567 /* TODO: Helper similar to bitmap_set() */
568 return find_next_bit(vm->sbm.sb_states, bit + count, bit) >=
569 bit + count;
570 }
571
572 /*
573 * Find the first unplugged subblock. Returns vm->sbm.sbs_per_mb in case there is
574 * none.
575 */
576 static int virtio_mem_sbm_first_unplugged_sb(struct virtio_mem *vm,
577 unsigned long mb_id)
578 {
579 const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, 0);
580
581 return find_next_zero_bit(vm->sbm.sb_states,
582 bit + vm->sbm.sbs_per_mb, bit) - bit;
583 }
584
585 /*
586 * Prepare the subblock bitmap for the next memory block.
587 */
588 static int virtio_mem_sbm_sb_states_prepare_next_mb(struct virtio_mem *vm)
589 {
590 const unsigned long old_nb_mb = vm->sbm.next_mb_id - vm->sbm.first_mb_id;
591 const unsigned long old_nb_bits = old_nb_mb * vm->sbm.sbs_per_mb;
592 const unsigned long new_nb_bits = (old_nb_mb + 1) * vm->sbm.sbs_per_mb;
593 int old_pages = PFN_UP(BITS_TO_LONGS(old_nb_bits) * sizeof(long));
594 int new_pages = PFN_UP(BITS_TO_LONGS(new_nb_bits) * sizeof(long));
595 unsigned long *new_bitmap, *old_bitmap;
596
597 if (vm->sbm.sb_states && old_pages == new_pages)
598 return 0;
599
600 new_bitmap = vzalloc(new_pages * PAGE_SIZE);
601 if (!new_bitmap)
602 return -ENOMEM;
603
604 mutex_lock(&vm->hotplug_mutex);
605 if (vm->sbm.sb_states)
606 memcpy(new_bitmap, vm->sbm.sb_states, old_pages * PAGE_SIZE);
607
608 old_bitmap = vm->sbm.sb_states;
609 vm->sbm.sb_states = new_bitmap;
610 mutex_unlock(&vm->hotplug_mutex);
611
612 vfree(old_bitmap);
613 return 0;
614 }
615
616 /*
617 * Test if we could add memory without creating too much offline memory -
618 * to avoid running OOM if memory is getting onlined deferred.
619 */
620 static bool virtio_mem_could_add_memory(struct virtio_mem *vm, uint64_t size)
621 {
622 if (WARN_ON_ONCE(size > vm->offline_threshold))
623 return false;
624
625 return atomic64_read(&vm->offline_size) + size <= vm->offline_threshold;
626 }
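/*
 * Example (illustrative): with the default 1 GiB offline threshold and
 * 128 MiB Linux memory blocks, up to 8 added-but-not-yet-onlined memory
 * blocks are tolerated before the driver stops adding memory and waits for
 * onlining to catch up.
 */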
627
628 /*
629 * Try adding memory to Linux. Will usually only fail if out of memory.
630 *
631 * Must not be called with the vm->hotplug_mutex held (possible deadlock with
632 * onlining code).
633 *
634 * Will not modify the state of memory blocks in virtio-mem.
635 */
636 static int virtio_mem_add_memory(struct virtio_mem *vm, uint64_t addr,
637 uint64_t size)
638 {
639 int rc;
640
641 /*
642 * When force-unloading the driver and we still have memory added to
643 * Linux, the resource name has to stay.
644 */
645 if (!vm->resource_name) {
646 vm->resource_name = kstrdup_const("System RAM (virtio_mem)",
647 GFP_KERNEL);
648 if (!vm->resource_name)
649 return -ENOMEM;
650 }
651
652 dev_dbg(&vm->vdev->dev, "adding memory: 0x%llx - 0x%llx\n", addr,
653 addr + size - 1);
654 /* Memory might get onlined immediately. */
655 atomic64_add(size, &vm->offline_size);
656 rc = add_memory_driver_managed(vm->mgid, addr, size, vm->resource_name,
657 MHP_MERGE_RESOURCE | MHP_NID_IS_MGID);
658 if (rc) {
659 atomic64_sub(size, &vm->offline_size);
660 dev_warn(&vm->vdev->dev, "adding memory failed: %d\n", rc);
661 /*
662 * TODO: Linux MM does not properly clean up yet in all cases
663 * where adding of memory failed - especially on -ENOMEM.
664 */
665 }
666 return rc;
667 }
668
669 /*
670 * See virtio_mem_add_memory(): Try adding a single Linux memory block.
671 */
672 static int virtio_mem_sbm_add_mb(struct virtio_mem *vm, unsigned long mb_id)
673 {
674 const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id);
675 const uint64_t size = memory_block_size_bytes();
676
677 return virtio_mem_add_memory(vm, addr, size);
678 }
679
680 /*
681 * See virtio_mem_add_memory(): Try adding a big block.
682 */
683 static int virtio_mem_bbm_add_bb(struct virtio_mem *vm, unsigned long bb_id)
684 {
685 const uint64_t addr = virtio_mem_bb_id_to_phys(vm, bb_id);
686 const uint64_t size = vm->bbm.bb_size;
687
688 return virtio_mem_add_memory(vm, addr, size);
689 }
690
691 /*
692 * Try removing memory from Linux. Will only fail if memory blocks aren't
693 * offline.
694 *
695 * Must not be called with the vm->hotplug_mutex held (possible deadlock with
696 * onlining code).
697 *
698 * Will not modify the state of memory blocks in virtio-mem.
699 */
700 static int virtio_mem_remove_memory(struct virtio_mem *vm, uint64_t addr,
701 uint64_t size)
702 {
703 int rc;
704
705 dev_dbg(&vm->vdev->dev, "removing memory: 0x%llx - 0x%llx\n", addr,
706 addr + size - 1);
707 rc = remove_memory(addr, size);
708 if (!rc) {
709 atomic64_sub(size, &vm->offline_size);
710 /*
711 * We might have freed up memory we can now unplug, retry
712 * immediately instead of waiting.
713 */
714 virtio_mem_retry(vm);
715 } else {
716 dev_dbg(&vm->vdev->dev, "removing memory failed: %d\n", rc);
717 }
718 return rc;
719 }
720
721 /*
722 * See virtio_mem_remove_memory(): Try removing a single Linux memory block.
723 */
724 static int virtio_mem_sbm_remove_mb(struct virtio_mem *vm, unsigned long mb_id)
725 {
726 const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id);
727 const uint64_t size = memory_block_size_bytes();
728
729 return virtio_mem_remove_memory(vm, addr, size);
730 }
731
732 /*
733 * Try offlining and removing memory from Linux.
734 *
735 * Must not be called with the vm->hotplug_mutex held (possible deadlock with
736 * onlining code).
737 *
738 * Will not modify the state of memory blocks in virtio-mem.
739 */
740 static int virtio_mem_offline_and_remove_memory(struct virtio_mem *vm,
741 uint64_t addr,
742 uint64_t size)
743 {
744 int rc;
745
746 dev_dbg(&vm->vdev->dev,
747 "offlining and removing memory: 0x%llx - 0x%llx\n", addr,
748 addr + size - 1);
749
750 rc = offline_and_remove_memory(addr, size);
751 if (!rc) {
752 atomic64_sub(size, &vm->offline_size);
753 /*
754 * We might have freed up memory we can now unplug, retry
755 * immediately instead of waiting.
756 */
757 virtio_mem_retry(vm);
758 return 0;
759 }
760 dev_dbg(&vm->vdev->dev, "offlining and removing memory failed: %d\n", rc);
761 /*
762 * We don't really expect this to fail, because we fake-offlined all
763 * memory already. But it could fail in corner cases.
764 */
765 WARN_ON_ONCE(rc != -ENOMEM && rc != -EBUSY);
766 return rc == -ENOMEM ? -ENOMEM : -EBUSY;
767 }
768
769 /*
770 * See virtio_mem_offline_and_remove_memory(): Try offlining and removing
771 * a single Linux memory block.
772 */
773 static int virtio_mem_sbm_offline_and_remove_mb(struct virtio_mem *vm,
774 unsigned long mb_id)
775 {
776 const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id);
777 const uint64_t size = memory_block_size_bytes();
778
779 return virtio_mem_offline_and_remove_memory(vm, addr, size);
780 }
781
782 /*
783 * Try (offlining and) removing memory from Linux in case all subblocks are
784 * unplugged. Can be called on online and offline memory blocks.
785 *
786 * May modify the state of memory blocks in virtio-mem.
787 */
788 static int virtio_mem_sbm_try_remove_unplugged_mb(struct virtio_mem *vm,
789 unsigned long mb_id)
790 {
791 int rc;
792
793 /*
794 * Once all subblocks of a memory block were unplugged, offline and
795 * remove it.
796 */
797 if (!virtio_mem_sbm_test_sb_unplugged(vm, mb_id, 0, vm->sbm.sbs_per_mb))
798 return 0;
799
800 /* offline_and_remove_memory() works for online and offline memory. */
801 mutex_unlock(&vm->hotplug_mutex);
802 rc = virtio_mem_sbm_offline_and_remove_mb(vm, mb_id);
803 mutex_lock(&vm->hotplug_mutex);
804 if (!rc)
805 virtio_mem_sbm_set_mb_state(vm, mb_id,
806 VIRTIO_MEM_SBM_MB_UNUSED);
807 return rc;
808 }
809
810 /*
811 * See virtio_mem_offline_and_remove_memory(): Try to offline and remove
812 * all Linux memory blocks covered by the big block.
813 */
814 static int virtio_mem_bbm_offline_and_remove_bb(struct virtio_mem *vm,
815 unsigned long bb_id)
816 {
817 const uint64_t addr = virtio_mem_bb_id_to_phys(vm, bb_id);
818 const uint64_t size = vm->bbm.bb_size;
819
820 return virtio_mem_offline_and_remove_memory(vm, addr, size);
821 }
822
823 /*
824 * Trigger the workqueue so the device can perform its magic.
825 */
826 static void virtio_mem_retry(struct virtio_mem *vm)
827 {
828 unsigned long flags;
829
830 spin_lock_irqsave(&vm->removal_lock, flags);
831 if (!vm->removing)
832 queue_work(system_freezable_wq, &vm->wq);
833 spin_unlock_irqrestore(&vm->removal_lock, flags);
834 }
835
836 static int virtio_mem_translate_node_id(struct virtio_mem *vm, uint16_t node_id)
837 {
838 int node = NUMA_NO_NODE;
839
840 #if defined(CONFIG_ACPI_NUMA)
841 if (virtio_has_feature(vm->vdev, VIRTIO_MEM_F_ACPI_PXM))
842 node = pxm_to_node(node_id);
843 #endif
844 return node;
845 }
846
847 /*
848 * Test if a virtio-mem device overlaps with the given range. Can be called
849 * from (notifier) callbacks lockless.
850 */
851 static bool virtio_mem_overlaps_range(struct virtio_mem *vm, uint64_t start,
852 uint64_t size)
853 {
854 return start < vm->addr + vm->region_size && vm->addr < start + size;
855 }
856
857 /*
858 * Test if a virtio-mem device contains a given range. Can be called from
859 * (notifier) callbacks lockless.
860 */
861 static bool virtio_mem_contains_range(struct virtio_mem *vm, uint64_t start,
862 uint64_t size)
863 {
864 return start >= vm->addr && start + size <= vm->addr + vm->region_size;
865 }
866
867 static int virtio_mem_sbm_notify_going_online(struct virtio_mem *vm,
868 unsigned long mb_id)
869 {
870 switch (virtio_mem_sbm_get_mb_state(vm, mb_id)) {
871 case VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL:
872 case VIRTIO_MEM_SBM_MB_OFFLINE:
873 return NOTIFY_OK;
874 default:
875 break;
876 }
877 dev_warn_ratelimited(&vm->vdev->dev,
878 "memory block onlining denied\n");
879 return NOTIFY_BAD;
880 }
881
882 static void virtio_mem_sbm_notify_offline(struct virtio_mem *vm,
883 unsigned long mb_id)
884 {
885 switch (virtio_mem_sbm_get_mb_state(vm, mb_id)) {
886 case VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL:
887 case VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL:
888 virtio_mem_sbm_set_mb_state(vm, mb_id,
889 VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL);
890 break;
891 case VIRTIO_MEM_SBM_MB_KERNEL:
892 case VIRTIO_MEM_SBM_MB_MOVABLE:
893 virtio_mem_sbm_set_mb_state(vm, mb_id,
894 VIRTIO_MEM_SBM_MB_OFFLINE);
895 break;
896 default:
897 BUG();
898 break;
899 }
900 }
901
902 static void virtio_mem_sbm_notify_online(struct virtio_mem *vm,
903 unsigned long mb_id,
904 unsigned long start_pfn)
905 {
906 const bool is_movable = is_zone_movable_page(pfn_to_page(start_pfn));
907 int new_state;
908
909 switch (virtio_mem_sbm_get_mb_state(vm, mb_id)) {
910 case VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL:
911 new_state = VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL;
912 if (is_movable)
913 new_state = VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL;
914 break;
915 case VIRTIO_MEM_SBM_MB_OFFLINE:
916 new_state = VIRTIO_MEM_SBM_MB_KERNEL;
917 if (is_movable)
918 new_state = VIRTIO_MEM_SBM_MB_MOVABLE;
919 break;
920 default:
921 BUG();
922 break;
923 }
924 virtio_mem_sbm_set_mb_state(vm, mb_id, new_state);
925 }
926
927 static void virtio_mem_sbm_notify_going_offline(struct virtio_mem *vm,
928 unsigned long mb_id)
929 {
930 const unsigned long nr_pages = PFN_DOWN(vm->sbm.sb_size);
931 unsigned long pfn;
932 int sb_id;
933
934 for (sb_id = 0; sb_id < vm->sbm.sbs_per_mb; sb_id++) {
935 if (virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id, 1))
936 continue;
937 pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) +
938 sb_id * vm->sbm.sb_size);
939 virtio_mem_fake_offline_going_offline(pfn, nr_pages);
940 }
941 }
942
943 static void virtio_mem_sbm_notify_cancel_offline(struct virtio_mem *vm,
944 unsigned long mb_id)
945 {
946 const unsigned long nr_pages = PFN_DOWN(vm->sbm.sb_size);
947 unsigned long pfn;
948 int sb_id;
949
950 for (sb_id = 0; sb_id < vm->sbm.sbs_per_mb; sb_id++) {
951 if (virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id, 1))
952 continue;
953 pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) +
954 sb_id * vm->sbm.sb_size);
955 virtio_mem_fake_offline_cancel_offline(pfn, nr_pages);
956 }
957 }
958
959 static void virtio_mem_bbm_notify_going_offline(struct virtio_mem *vm,
960 unsigned long bb_id,
961 unsigned long pfn,
962 unsigned long nr_pages)
963 {
964 /*
965 * When marked as "fake-offline", all online memory of this device block
966 * is allocated by us. Otherwise, we don't have any memory allocated.
967 */
968 if (virtio_mem_bbm_get_bb_state(vm, bb_id) !=
969 VIRTIO_MEM_BBM_BB_FAKE_OFFLINE)
970 return;
971 virtio_mem_fake_offline_going_offline(pfn, nr_pages);
972 }
973
974 static void virtio_mem_bbm_notify_cancel_offline(struct virtio_mem *vm,
975 unsigned long bb_id,
976 unsigned long pfn,
977 unsigned long nr_pages)
978 {
979 if (virtio_mem_bbm_get_bb_state(vm, bb_id) !=
980 VIRTIO_MEM_BBM_BB_FAKE_OFFLINE)
981 return;
982 virtio_mem_fake_offline_cancel_offline(pfn, nr_pages);
983 }
984
985 /*
986 * This callback will either be called synchronously from add_memory() or
987 * asynchronously (e.g., triggered via user space). We have to be careful
988 * with locking when calling add_memory().
989 */
990 static int virtio_mem_memory_notifier_cb(struct notifier_block *nb,
991 unsigned long action, void *arg)
992 {
993 struct virtio_mem *vm = container_of(nb, struct virtio_mem,
994 memory_notifier);
995 struct memory_notify *mhp = arg;
996 const unsigned long start = PFN_PHYS(mhp->start_pfn);
997 const unsigned long size = PFN_PHYS(mhp->nr_pages);
998 int rc = NOTIFY_OK;
999 unsigned long id;
1000
1001 if (!virtio_mem_overlaps_range(vm, start, size))
1002 return NOTIFY_DONE;
1003
1004 if (vm->in_sbm) {
1005 id = virtio_mem_phys_to_mb_id(start);
1006 /*
1007 * In SBM, we add memory in separate memory blocks - we expect
1008 * it to be onlined/offlined in the same granularity. Bail out
1009 * if this ever changes.
1010 */
1011 if (WARN_ON_ONCE(size != memory_block_size_bytes() ||
1012 !IS_ALIGNED(start, memory_block_size_bytes())))
1013 return NOTIFY_BAD;
1014 } else {
1015 id = virtio_mem_phys_to_bb_id(vm, start);
1016 /*
1017 * In BBM, we only care about onlining/offlining happening
1018 * within a single big block, we don't care about the
1019 * actual granularity as we don't track individual Linux
1020 * memory blocks.
1021 */
1022 if (WARN_ON_ONCE(id != virtio_mem_phys_to_bb_id(vm, start + size - 1)))
1023 return NOTIFY_BAD;
1024 }
1025
1026 /*
1027 * Avoid circular locking lockdep warnings. We lock the mutex
1028 * e.g., in MEM_GOING_ONLINE and unlock it in MEM_ONLINE. The
1029 * blocking_notifier_call_chain() has its own lock, which gets unlocked
1030 * between both notifier calls and will bail out. False positive.
1031 */
1032 lockdep_off();
1033
1034 switch (action) {
1035 case MEM_GOING_OFFLINE:
1036 mutex_lock(&vm->hotplug_mutex);
1037 if (vm->removing) {
1038 rc = notifier_from_errno(-EBUSY);
1039 mutex_unlock(&vm->hotplug_mutex);
1040 break;
1041 }
1042 vm->hotplug_active = true;
1043 if (vm->in_sbm)
1044 virtio_mem_sbm_notify_going_offline(vm, id);
1045 else
1046 virtio_mem_bbm_notify_going_offline(vm, id,
1047 mhp->start_pfn,
1048 mhp->nr_pages);
1049 break;
1050 case MEM_GOING_ONLINE:
1051 mutex_lock(&vm->hotplug_mutex);
1052 if (vm->removing) {
1053 rc = notifier_from_errno(-EBUSY);
1054 mutex_unlock(&vm->hotplug_mutex);
1055 break;
1056 }
1057 vm->hotplug_active = true;
1058 if (vm->in_sbm)
1059 rc = virtio_mem_sbm_notify_going_online(vm, id);
1060 break;
1061 case MEM_OFFLINE:
1062 if (vm->in_sbm)
1063 virtio_mem_sbm_notify_offline(vm, id);
1064
1065 atomic64_add(size, &vm->offline_size);
1066 /*
1067 * Trigger the workqueue. Now that we have some offline memory,
1068 * maybe we can handle pending unplug requests.
1069 */
1070 if (!unplug_online)
1071 virtio_mem_retry(vm);
1072
1073 vm->hotplug_active = false;
1074 mutex_unlock(&vm->hotplug_mutex);
1075 break;
1076 case MEM_ONLINE:
1077 if (vm->in_sbm)
1078 virtio_mem_sbm_notify_online(vm, id, mhp->start_pfn);
1079
1080 atomic64_sub(size, &vm->offline_size);
1081 /*
1082 * Start adding more memory once we onlined half of our
1083 * threshold. Don't trigger if it's possibly due to our action
1084 * (e.g., us adding memory which gets onlined immediately from
1085 * the core).
1086 */
1087 if (!atomic_read(&vm->wq_active) &&
1088 virtio_mem_could_add_memory(vm, vm->offline_threshold / 2))
1089 virtio_mem_retry(vm);
1090
1091 vm->hotplug_active = false;
1092 mutex_unlock(&vm->hotplug_mutex);
1093 break;
1094 case MEM_CANCEL_OFFLINE:
1095 if (!vm->hotplug_active)
1096 break;
1097 if (vm->in_sbm)
1098 virtio_mem_sbm_notify_cancel_offline(vm, id);
1099 else
1100 virtio_mem_bbm_notify_cancel_offline(vm, id,
1101 mhp->start_pfn,
1102 mhp->nr_pages);
1103 vm->hotplug_active = false;
1104 mutex_unlock(&vm->hotplug_mutex);
1105 break;
1106 case MEM_CANCEL_ONLINE:
1107 if (!vm->hotplug_active)
1108 break;
1109 vm->hotplug_active = false;
1110 mutex_unlock(&vm->hotplug_mutex);
1111 break;
1112 default:
1113 break;
1114 }
1115
1116 lockdep_on();
1117
1118 return rc;
1119 }
1120
1121 static int virtio_mem_pm_notifier_cb(struct notifier_block *nb,
1122 unsigned long action, void *arg)
1123 {
1124 struct virtio_mem *vm = container_of(nb, struct virtio_mem,
1125 pm_notifier);
1126 switch (action) {
1127 case PM_HIBERNATION_PREPARE:
1128 case PM_RESTORE_PREPARE:
1129 /*
1130 * When restarting the VM, all memory is unplugged. Don't allow
1131 * hibernating or restoring from an image.
1132 */
1133 dev_err(&vm->vdev->dev, "hibernation is not supported.\n");
1134 return NOTIFY_BAD;
1135 default:
1136 return NOTIFY_OK;
1137 }
1138 }
1139
1140 /*
1141 * Set a range of pages PG_offline. Remember pages that were never onlined
1142 * (via generic_online_page()) using PageDirty().
1143 */
1144 static void virtio_mem_set_fake_offline(unsigned long pfn,
1145 unsigned long nr_pages, bool onlined)
1146 {
1147 page_offline_begin();
1148 for (; nr_pages--; pfn++) {
1149 struct page *page = pfn_to_page(pfn);
1150
1151 if (!onlined)
1152 /*
1153 * Pages that have not been onlined yet were initialized
1154 * to PageOffline(). Remember that we have to route them
1155 * through generic_online_page().
1156 */
1157 SetPageDirty(page);
1158 else
1159 __SetPageOffline(page);
1160 VM_WARN_ON_ONCE(!PageOffline(page));
1161 }
1162 page_offline_end();
1163 }
1164
1165 /*
1166 * Clear PG_offline from a range of pages. If the pages were never onlined
1167 * (via generic_online_page()), clear PageDirty().
1168 */
1169 static void virtio_mem_clear_fake_offline(unsigned long pfn,
1170 unsigned long nr_pages, bool onlined)
1171 {
1172 for (; nr_pages--; pfn++) {
1173 struct page *page = pfn_to_page(pfn);
1174
1175 if (!onlined)
1176 /* generic_online_page() will clear PageOffline(). */
1177 ClearPageDirty(page);
1178 else
1179 __ClearPageOffline(page);
1180 }
1181 }
1182
1183 /*
1184 * Release a range of fake-offline pages to the buddy, effectively
1185 * fake-onlining them.
1186 */
1187 static void virtio_mem_fake_online(unsigned long pfn, unsigned long nr_pages)
1188 {
1189 unsigned long order = MAX_PAGE_ORDER;
1190 unsigned long i;
1191
1192 /*
1193 * We might get called for ranges that don't cover properly aligned
1194 * MAX_PAGE_ORDER pages; however, we can only online properly aligned
1195 * pages with an order of MAX_PAGE_ORDER at maximum.
1196 */
1197 while (!IS_ALIGNED(pfn | nr_pages, 1 << order))
1198 order--;
1199
1200 for (i = 0; i < nr_pages; i += 1 << order) {
1201 struct page *page = pfn_to_page(pfn + i);
1202
1203 /*
1204 * If the page is PageDirty(), it was kept fake-offline when
1205 * onlining the memory block. Otherwise, it was allocated
1206 * using alloc_contig_range(). All pages in a subblock are
1207 * alike.
1208 */
1209 if (PageDirty(page)) {
1210 virtio_mem_clear_fake_offline(pfn + i, 1 << order, false);
1211 generic_online_page(page, order);
1212 } else {
1213 virtio_mem_clear_fake_offline(pfn + i, 1 << order, true);
1214 free_contig_range(pfn + i, 1 << order);
1215 adjust_managed_page_count(page, 1 << order);
1216 }
1217 }
1218 }
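/*
 * Example (illustrative, assuming MAX_PAGE_ORDER == 10 and 4 KiB pages): a
 * naturally aligned 4 MiB subblock (1024 pages) is released by
 * virtio_mem_fake_online() in one MAX_PAGE_ORDER chunk, whereas a range that
 * is only 2 MiB aligned is released in order-9 (2 MiB) chunks.
 */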
1219
1220 /*
1221 * Try to allocate a range, marking pages fake-offline, effectively
1222 * fake-offlining them.
1223 */
1224 static int virtio_mem_fake_offline(struct virtio_mem *vm, unsigned long pfn,
1225 unsigned long nr_pages)
1226 {
1227 const bool is_movable = is_zone_movable_page(pfn_to_page(pfn));
1228 int rc, retry_count;
1229
1230 /*
1231 * TODO: We want an alloc_contig_range() mode that tries to allocate
1232 * harder (e.g., dealing with temporarily pinned pages, PCP), especially
1233 * with ZONE_MOVABLE. So for now, retry a couple of times with
1234 * ZONE_MOVABLE before giving up - because that zone is supposed to give
1235 * some guarantees.
1236 */
1237 for (retry_count = 0; retry_count < 5; retry_count++) {
1238 /*
1239 * If the config changed, stop immediately and go back to the
1240 * main loop: avoid trying to keep unplugging if the device
1241 * might have decided to not remove any more memory.
1242 */
1243 if (atomic_read(&vm->config_changed))
1244 return -EAGAIN;
1245
1246 rc = alloc_contig_range(pfn, pfn + nr_pages, MIGRATE_MOVABLE,
1247 GFP_KERNEL);
1248 if (rc == -ENOMEM)
1249 /* whoops, out of memory */
1250 return rc;
1251 else if (rc && !is_movable)
1252 break;
1253 else if (rc)
1254 continue;
1255
1256 virtio_mem_set_fake_offline(pfn, nr_pages, true);
1257 adjust_managed_page_count(pfn_to_page(pfn), -nr_pages);
1258 return 0;
1259 }
1260
1261 return -EBUSY;
1262 }
1263
1264 /*
1265 * Handle fake-offline pages when memory is going offline - such that the
1266 * pages can be skipped by mm-core when offlining.
1267 */
1268 static void virtio_mem_fake_offline_going_offline(unsigned long pfn,
1269 unsigned long nr_pages)
1270 {
1271 struct page *page;
1272 unsigned long i;
1273
1274 /* Drop our reference to the pages so the memory can get offlined. */
1275 for (i = 0; i < nr_pages; i++) {
1276 page = pfn_to_page(pfn + i);
1277 if (WARN_ON(!page_ref_dec_and_test(page)))
1278 dump_page(page, "fake-offline page referenced");
1279 }
1280 }
1281
1282 /*
1283 * Handle fake-offline pages when memory offlining is canceled - to undo
1284 * what we did in virtio_mem_fake_offline_going_offline().
1285 */
1286 static void virtio_mem_fake_offline_cancel_offline(unsigned long pfn,
1287 unsigned long nr_pages)
1288 {
1289 unsigned long i;
1290
1291 /*
1292 * Get the reference again that we dropped via page_ref_dec_and_test()
1293 * when going offline.
1294 */
1295 for (i = 0; i < nr_pages; i++)
1296 page_ref_inc(pfn_to_page(pfn + i));
1297 }
1298
1299 static void virtio_mem_online_page(struct virtio_mem *vm,
1300 struct page *page, unsigned int order)
1301 {
1302 const unsigned long start = page_to_phys(page);
1303 const unsigned long end = start + PFN_PHYS(1 << order);
1304 unsigned long addr, next, id, sb_id, count;
1305 bool do_online;
1306
1307 /*
1308 * We can get called with any order up to MAX_PAGE_ORDER. If our subblock
1309 * size is smaller than that and we have a mixture of plugged and
1310 * unplugged subblocks within such a page, we have to process in
1311 * smaller granularity. In that case we'll adjust the order exactly once
1312 * within the loop.
1313 */
1314 for (addr = start; addr < end; ) {
1315 next = addr + PFN_PHYS(1 << order);
1316
1317 if (vm->in_sbm) {
1318 id = virtio_mem_phys_to_mb_id(addr);
1319 sb_id = virtio_mem_phys_to_sb_id(vm, addr);
1320 count = virtio_mem_phys_to_sb_id(vm, next - 1) - sb_id + 1;
1321
1322 if (virtio_mem_sbm_test_sb_plugged(vm, id, sb_id, count)) {
1323 /* Fully plugged. */
1324 do_online = true;
1325 } else if (count == 1 ||
1326 virtio_mem_sbm_test_sb_unplugged(vm, id, sb_id, count)) {
1327 /* Fully unplugged. */
1328 do_online = false;
1329 } else {
1330 /*
1331 * Mixture, process sub-blocks instead. This
1332 * will be at least the size of a pageblock.
1333 * We'll run into this case exactly once.
1334 */
1335 order = ilog2(vm->sbm.sb_size) - PAGE_SHIFT;
1336 do_online = virtio_mem_sbm_test_sb_plugged(vm, id, sb_id, 1);
1337 continue;
1338 }
1339 } else {
1340 /*
1341 * If the whole block is marked fake offline, keep
1342 * everything that way.
1343 */
1344 id = virtio_mem_phys_to_bb_id(vm, addr);
1345 do_online = virtio_mem_bbm_get_bb_state(vm, id) !=
1346 VIRTIO_MEM_BBM_BB_FAKE_OFFLINE;
1347 }
1348
1349 if (do_online)
1350 generic_online_page(pfn_to_page(PFN_DOWN(addr)), order);
1351 else
1352 virtio_mem_set_fake_offline(PFN_DOWN(addr), 1 << order,
1353 false);
1354 addr = next;
1355 }
1356 }
1357
1358 static void virtio_mem_online_page_cb(struct page *page, unsigned int order)
1359 {
1360 const unsigned long addr = page_to_phys(page);
1361 struct virtio_mem *vm;
1362
1363 rcu_read_lock();
1364 list_for_each_entry_rcu(vm, &virtio_mem_devices, next) {
1365 /*
1366 * Pages we're onlining will never cross memory blocks and,
1367 * therefore, not virtio-mem devices.
1368 */
1369 if (!virtio_mem_contains_range(vm, addr, PFN_PHYS(1 << order)))
1370 continue;
1371
1372 /*
1373 * virtio_mem_set_fake_offline() might sleep. We can safely
1374 * drop the RCU lock at this point because the device
1375 * cannot go away. See virtio_mem_remove() how races
1376 * between memory onlining and device removal are handled.
1377 */
1378 rcu_read_unlock();
1379
1380 virtio_mem_online_page(vm, page, order);
1381 return;
1382 }
1383 rcu_read_unlock();
1384
1385 /* not virtio-mem memory, but e.g., a DIMM. online it */
1386 generic_online_page(page, order);
1387 }
1388
1389 static uint64_t virtio_mem_send_request(struct virtio_mem *vm,
1390 const struct virtio_mem_req *req)
1391 {
1392 struct scatterlist *sgs[2], sg_req, sg_resp;
1393 unsigned int len;
1394 int rc;
1395
1396 /* don't use the request residing on the stack (vaddr) */
1397 vm->req = *req;
1398
1399 /* out: buffer for request */
1400 sg_init_one(&sg_req, &vm->req, sizeof(vm->req));
1401 sgs[0] = &sg_req;
1402
1403 /* in: buffer for response */
1404 sg_init_one(&sg_resp, &vm->resp, sizeof(vm->resp));
1405 sgs[1] = &sg_resp;
1406
1407 rc = virtqueue_add_sgs(vm->vq, sgs, 1, 1, vm, GFP_KERNEL);
1408 if (rc < 0)
1409 return rc;
1410
1411 virtqueue_kick(vm->vq);
1412
1413 /* wait for a response */
1414 wait_event(vm->host_resp, virtqueue_get_buf(vm->vq, &len));
1415
1416 return virtio16_to_cpu(vm->vdev, vm->resp.type);
1417 }
1418
1419 static int virtio_mem_send_plug_request(struct virtio_mem *vm, uint64_t addr,
1420 uint64_t size)
1421 {
1422 const uint64_t nb_vm_blocks = size / vm->device_block_size;
1423 const struct virtio_mem_req req = {
1424 .type = cpu_to_virtio16(vm->vdev, VIRTIO_MEM_REQ_PLUG),
1425 .u.plug.addr = cpu_to_virtio64(vm->vdev, addr),
1426 .u.plug.nb_blocks = cpu_to_virtio16(vm->vdev, nb_vm_blocks),
1427 };
1428 int rc = -ENOMEM;
1429
1430 if (atomic_read(&vm->config_changed))
1431 return -EAGAIN;
1432
1433 dev_dbg(&vm->vdev->dev, "plugging memory: 0x%llx - 0x%llx\n", addr,
1434 addr + size - 1);
1435
1436 switch (virtio_mem_send_request(vm, &req)) {
1437 case VIRTIO_MEM_RESP_ACK:
1438 vm->plugged_size += size;
1439 return 0;
1440 case VIRTIO_MEM_RESP_NACK:
1441 rc = -EAGAIN;
1442 break;
1443 case VIRTIO_MEM_RESP_BUSY:
1444 rc = -ETXTBSY;
1445 break;
1446 case VIRTIO_MEM_RESP_ERROR:
1447 rc = -EINVAL;
1448 break;
1449 default:
1450 break;
1451 }
1452
1453 dev_dbg(&vm->vdev->dev, "plugging memory failed: %d\n", rc);
1454 return rc;
1455 }
1456
1457 static int virtio_mem_send_unplug_request(struct virtio_mem *vm, uint64_t addr,
1458 uint64_t size)
1459 {
1460 const uint64_t nb_vm_blocks = size / vm->device_block_size;
1461 const struct virtio_mem_req req = {
1462 .type = cpu_to_virtio16(vm->vdev, VIRTIO_MEM_REQ_UNPLUG),
1463 .u.unplug.addr = cpu_to_virtio64(vm->vdev, addr),
1464 .u.unplug.nb_blocks = cpu_to_virtio16(vm->vdev, nb_vm_blocks),
1465 };
1466 int rc = -ENOMEM;
1467
1468 if (atomic_read(&vm->config_changed))
1469 return -EAGAIN;
1470
1471 dev_dbg(&vm->vdev->dev, "unplugging memory: 0x%llx - 0x%llx\n", addr,
1472 addr + size - 1);
1473
1474 switch (virtio_mem_send_request(vm, &req)) {
1475 case VIRTIO_MEM_RESP_ACK:
1476 vm->plugged_size -= size;
1477 return 0;
1478 case VIRTIO_MEM_RESP_BUSY:
1479 rc = -ETXTBSY;
1480 break;
1481 case VIRTIO_MEM_RESP_ERROR:
1482 rc = -EINVAL;
1483 break;
1484 default:
1485 break;
1486 }
1487
1488 dev_dbg(&vm->vdev->dev, "unplugging memory failed: %d\n", rc);
1489 return rc;
1490 }
1491
1492 static int virtio_mem_send_unplug_all_request(struct virtio_mem *vm)
1493 {
1494 const struct virtio_mem_req req = {
1495 .type = cpu_to_virtio16(vm->vdev, VIRTIO_MEM_REQ_UNPLUG_ALL),
1496 };
1497 int rc = -ENOMEM;
1498
1499 dev_dbg(&vm->vdev->dev, "unplugging all memory");
1500
1501 switch (virtio_mem_send_request(vm, &req)) {
1502 case VIRTIO_MEM_RESP_ACK:
1503 vm->unplug_all_required = false;
1504 vm->plugged_size = 0;
1505 /* usable region might have shrunk */
1506 atomic_set(&vm->config_changed, 1);
1507 return 0;
1508 case VIRTIO_MEM_RESP_BUSY:
1509 rc = -ETXTBSY;
1510 break;
1511 default:
1512 break;
1513 }
1514
1515 dev_dbg(&vm->vdev->dev, "unplugging all memory failed: %d\n", rc);
1516 return rc;
1517 }
1518
1519 /*
1520 * Plug selected subblocks. Updates the plugged state, but not the state
1521 * of the memory block.
1522 */
1523 static int virtio_mem_sbm_plug_sb(struct virtio_mem *vm, unsigned long mb_id,
1524 int sb_id, int count)
1525 {
1526 const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id) +
1527 sb_id * vm->sbm.sb_size;
1528 const uint64_t size = count * vm->sbm.sb_size;
1529 int rc;
1530
1531 rc = virtio_mem_send_plug_request(vm, addr, size);
1532 if (!rc)
1533 virtio_mem_sbm_set_sb_plugged(vm, mb_id, sb_id, count);
1534 return rc;
1535 }
1536
1537 /*
1538 * Unplug selected subblocks. Updates the plugged state, but not the state
1539 * of the memory block.
1540 */
1541 static int virtio_mem_sbm_unplug_sb(struct virtio_mem *vm, unsigned long mb_id,
1542 int sb_id, int count)
1543 {
1544 const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id) +
1545 sb_id * vm->sbm.sb_size;
1546 const uint64_t size = count * vm->sbm.sb_size;
1547 int rc;
1548
1549 rc = virtio_mem_send_unplug_request(vm, addr, size);
1550 if (!rc)
1551 virtio_mem_sbm_set_sb_unplugged(vm, mb_id, sb_id, count);
1552 return rc;
1553 }
1554
1555 /*
1556 * Request to unplug a big block.
1557 *
1558 * Will not modify the state of the big block.
1559 */
1560 static int virtio_mem_bbm_unplug_bb(struct virtio_mem *vm, unsigned long bb_id)
1561 {
1562 const uint64_t addr = virtio_mem_bb_id_to_phys(vm, bb_id);
1563 const uint64_t size = vm->bbm.bb_size;
1564
1565 return virtio_mem_send_unplug_request(vm, addr, size);
1566 }
1567
1568 /*
1569 * Request to plug a big block.
1570 *
1571 * Will not modify the state of the big block.
1572 */
1573 static int virtio_mem_bbm_plug_bb(struct virtio_mem *vm, unsigned long bb_id)
1574 {
1575 const uint64_t addr = virtio_mem_bb_id_to_phys(vm, bb_id);
1576 const uint64_t size = vm->bbm.bb_size;
1577
1578 return virtio_mem_send_plug_request(vm, addr, size);
1579 }
1580
1581 /*
1582 * Unplug the desired number of plugged subblocks of an offline or not-added
1583 * memory block. Will fail if any subblock cannot get unplugged (instead of
1584 * skipping it).
1585 *
1586 * Will not modify the state of the memory block.
1587 *
1588 * Note: can fail after some subblocks were unplugged.
1589 */
1590 static int virtio_mem_sbm_unplug_any_sb_raw(struct virtio_mem *vm,
1591 unsigned long mb_id, uint64_t *nb_sb)
1592 {
1593 int sb_id, count;
1594 int rc;
1595
1596 sb_id = vm->sbm.sbs_per_mb - 1;
1597 while (*nb_sb) {
1598 /* Find the next candidate subblock */
1599 while (sb_id >= 0 &&
1600 virtio_mem_sbm_test_sb_unplugged(vm, mb_id, sb_id, 1))
1601 sb_id--;
1602 if (sb_id < 0)
1603 break;
1604 /* Try to unplug multiple subblocks at a time */
1605 count = 1;
1606 while (count < *nb_sb && sb_id > 0 &&
1607 virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id - 1, 1)) {
1608 count++;
1609 sb_id--;
1610 }
1611
1612 rc = virtio_mem_sbm_unplug_sb(vm, mb_id, sb_id, count);
1613 if (rc)
1614 return rc;
1615 *nb_sb -= count;
1616 sb_id--;
1617 }
1618
1619 return 0;
1620 }
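/*
 * Example (illustrative): with vm->sbm.sbs_per_mb == 8 and only subblocks
 * 2..5 plugged, a request for *nb_sb == 4 scans downwards from subblock 7,
 * skips the already-unplugged subblocks 7 and 6, and unplugs subblocks 2..5
 * with a single request (sb_id == 2, count == 4).
 */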
1621
1622 /*
1623 * Unplug all plugged subblocks of an offline or not-added memory block.
1624 *
1625 * Will not modify the state of the memory block.
1626 *
1627 * Note: can fail after some subblocks were unplugged.
1628 */
1629 static int virtio_mem_sbm_unplug_mb(struct virtio_mem *vm, unsigned long mb_id)
1630 {
1631 uint64_t nb_sb = vm->sbm.sbs_per_mb;
1632
1633 return virtio_mem_sbm_unplug_any_sb_raw(vm, mb_id, &nb_sb);
1634 }
1635
1636 /*
1637 * Prepare tracking data for the next memory block.
1638 */
1639 static int virtio_mem_sbm_prepare_next_mb(struct virtio_mem *vm,
1640 unsigned long *mb_id)
1641 {
1642 int rc;
1643
1644 if (vm->sbm.next_mb_id > vm->sbm.last_usable_mb_id)
1645 return -ENOSPC;
1646
1647 /* Resize the state array if required. */
1648 rc = virtio_mem_sbm_mb_states_prepare_next_mb(vm);
1649 if (rc)
1650 return rc;
1651
1652 /* Resize the subblock bitmap if required. */
1653 rc = virtio_mem_sbm_sb_states_prepare_next_mb(vm);
1654 if (rc)
1655 return rc;
1656
1657 vm->sbm.mb_count[VIRTIO_MEM_SBM_MB_UNUSED]++;
1658 *mb_id = vm->sbm.next_mb_id++;
1659 return 0;
1660 }
1661
1662 /*
1663 * Try to plug the desired number of subblocks and add the memory block
1664 * to Linux.
1665 *
1666 * Will modify the state of the memory block.
1667 */
1668 static int virtio_mem_sbm_plug_and_add_mb(struct virtio_mem *vm,
1669 unsigned long mb_id, uint64_t *nb_sb)
1670 {
1671 const int count = min_t(int, *nb_sb, vm->sbm.sbs_per_mb);
1672 int rc;
1673
1674 if (WARN_ON_ONCE(!count))
1675 return -EINVAL;
1676
1677 /*
1678 * Plug the requested number of subblocks before adding it to linux,
1679 * so that onlining will directly online all plugged subblocks.
1680 */
1681 rc = virtio_mem_sbm_plug_sb(vm, mb_id, 0, count);
1682 if (rc)
1683 return rc;
1684
1685 /*
1686 * Mark the block properly offline before adding it to Linux,
1687 * so the memory notifiers will find the block in the right state.
1688 */
1689 if (count == vm->sbm.sbs_per_mb)
1690 virtio_mem_sbm_set_mb_state(vm, mb_id,
1691 VIRTIO_MEM_SBM_MB_OFFLINE);
1692 else
1693 virtio_mem_sbm_set_mb_state(vm, mb_id,
1694 VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL);
1695
1696 /* Add the memory block to linux - if that fails, try to unplug. */
1697 rc = virtio_mem_sbm_add_mb(vm, mb_id);
1698 if (rc) {
1699 int new_state = VIRTIO_MEM_SBM_MB_UNUSED;
1700
1701 if (virtio_mem_sbm_unplug_sb(vm, mb_id, 0, count))
1702 new_state = VIRTIO_MEM_SBM_MB_PLUGGED;
1703 virtio_mem_sbm_set_mb_state(vm, mb_id, new_state);
1704 return rc;
1705 }
1706
1707 *nb_sb -= count;
1708 return 0;
1709 }
1710
1711 /*
1712 * Try to plug the desired number of subblocks of a memory block that
1713 * is already added to Linux.
1714 *
1715 * Will modify the state of the memory block.
1716 *
1717 * Note: Can fail after some subblocks were successfully plugged.
1718 */
1719 static int virtio_mem_sbm_plug_any_sb(struct virtio_mem *vm,
1720 unsigned long mb_id, uint64_t *nb_sb)
1721 {
1722 const int old_state = virtio_mem_sbm_get_mb_state(vm, mb_id);
1723 unsigned long pfn, nr_pages;
1724 int sb_id, count;
1725 int rc;
1726
1727 if (WARN_ON_ONCE(!*nb_sb))
1728 return -EINVAL;
1729
1730 while (*nb_sb) {
1731 sb_id = virtio_mem_sbm_first_unplugged_sb(vm, mb_id);
1732 if (sb_id >= vm->sbm.sbs_per_mb)
1733 break;
1734 count = 1;
1735 while (count < *nb_sb &&
1736 sb_id + count < vm->sbm.sbs_per_mb &&
1737 !virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id + count, 1))
1738 count++;
1739
1740 rc = virtio_mem_sbm_plug_sb(vm, mb_id, sb_id, count);
1741 if (rc)
1742 return rc;
1743 *nb_sb -= count;
1744 if (old_state == VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL)
1745 continue;
1746
1747 /* fake-online the pages if the memory block is online */
1748 pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) +
1749 sb_id * vm->sbm.sb_size);
1750 nr_pages = PFN_DOWN(count * vm->sbm.sb_size);
1751 virtio_mem_fake_online(pfn, nr_pages);
1752 }
1753
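	/*
	 * Fully plugged now? "old_state - 1" below maps a partially plugged
	 * state to its fully plugged counterpart (e.g.,
	 * VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL -> VIRTIO_MEM_SBM_MB_KERNEL).
	 */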
1754 if (virtio_mem_sbm_test_sb_plugged(vm, mb_id, 0, vm->sbm.sbs_per_mb))
1755 virtio_mem_sbm_set_mb_state(vm, mb_id, old_state - 1);
1756
1757 return 0;
1758 }
1759
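/*
 * Try to plug the requested number of subblocks: first fill up partially
 * plugged memory blocks, then plug and add unused memory blocks, and finally
 * prepare, plug and add completely new memory blocks.
 */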
1760 static int virtio_mem_sbm_plug_request(struct virtio_mem *vm, uint64_t diff)
1761 {
1762 const int mb_states[] = {
1763 VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL,
1764 VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL,
1765 VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL,
1766 };
1767 uint64_t nb_sb = diff / vm->sbm.sb_size;
1768 unsigned long mb_id;
1769 int rc, i;
1770
1771 if (!nb_sb)
1772 return 0;
1773
1774 /* Don't race with onlining/offlining */
1775 mutex_lock(&vm->hotplug_mutex);
1776
1777 for (i = 0; i < ARRAY_SIZE(mb_states); i++) {
1778 virtio_mem_sbm_for_each_mb(vm, mb_id, mb_states[i]) {
1779 rc = virtio_mem_sbm_plug_any_sb(vm, mb_id, &nb_sb);
1780 if (rc || !nb_sb)
1781 goto out_unlock;
1782 cond_resched();
1783 }
1784 }
1785
1786 /*
1787 * We won't be working on online/offline memory blocks from this point,
1788 * so we can't race with memory onlining/offlining. Drop the mutex.
1789 */
1790 mutex_unlock(&vm->hotplug_mutex);
1791
1792 /* Try to plug and add unused blocks */
1793 virtio_mem_sbm_for_each_mb(vm, mb_id, VIRTIO_MEM_SBM_MB_UNUSED) {
1794 if (!virtio_mem_could_add_memory(vm, memory_block_size_bytes()))
1795 return -ENOSPC;
1796
1797 rc = virtio_mem_sbm_plug_and_add_mb(vm, mb_id, &nb_sb);
1798 if (rc || !nb_sb)
1799 return rc;
1800 cond_resched();
1801 }
1802
1803 /* Try to prepare, plug and add new blocks */
1804 while (nb_sb) {
1805 if (!virtio_mem_could_add_memory(vm, memory_block_size_bytes()))
1806 return -ENOSPC;
1807
1808 rc = virtio_mem_sbm_prepare_next_mb(vm, &mb_id);
1809 if (rc)
1810 return rc;
1811 rc = virtio_mem_sbm_plug_and_add_mb(vm, mb_id, &nb_sb);
1812 if (rc)
1813 return rc;
1814 cond_resched();
1815 }
1816
1817 return 0;
1818 out_unlock:
1819 mutex_unlock(&vm->hotplug_mutex);
1820 return rc;
1821 }
1822
1823 /*
1824 * Plug a big block and add it to Linux.
1825 *
1826 * Will modify the state of the big block.
1827 */
1828 static int virtio_mem_bbm_plug_and_add_bb(struct virtio_mem *vm,
1829 unsigned long bb_id)
1830 {
1831 int rc;
1832
1833 if (WARN_ON_ONCE(virtio_mem_bbm_get_bb_state(vm, bb_id) !=
1834 VIRTIO_MEM_BBM_BB_UNUSED))
1835 return -EINVAL;
1836
1837 rc = virtio_mem_bbm_plug_bb(vm, bb_id);
1838 if (rc)
1839 return rc;
1840 virtio_mem_bbm_set_bb_state(vm, bb_id, VIRTIO_MEM_BBM_BB_ADDED);
1841
1842 rc = virtio_mem_bbm_add_bb(vm, bb_id);
1843 if (rc) {
1844 if (!virtio_mem_bbm_unplug_bb(vm, bb_id))
1845 virtio_mem_bbm_set_bb_state(vm, bb_id,
1846 VIRTIO_MEM_BBM_BB_UNUSED);
1847 else
1848 /* Retry from the main loop. */
1849 virtio_mem_bbm_set_bb_state(vm, bb_id,
1850 VIRTIO_MEM_BBM_BB_PLUGGED);
1851 return rc;
1852 }
1853 return 0;
1854 }
1855
1856 /*
1857 * Prepare tracking data for the next big block.
1858 */
1859 static int virtio_mem_bbm_prepare_next_bb(struct virtio_mem *vm,
1860 unsigned long *bb_id)
1861 {
1862 int rc;
1863
1864 if (vm->bbm.next_bb_id > vm->bbm.last_usable_bb_id)
1865 return -ENOSPC;
1866
1867 /* Resize the big block state array if required. */
1868 rc = virtio_mem_bbm_bb_states_prepare_next_bb(vm);
1869 if (rc)
1870 return rc;
1871
1872 vm->bbm.bb_count[VIRTIO_MEM_BBM_BB_UNUSED]++;
1873 *bb_id = vm->bbm.next_bb_id;
1874 vm->bbm.next_bb_id++;
1875 return 0;
1876 }
1877
1878 static int virtio_mem_bbm_plug_request(struct virtio_mem *vm, uint64_t diff)
1879 {
1880 uint64_t nb_bb = diff / vm->bbm.bb_size;
1881 unsigned long bb_id;
1882 int rc;
1883
1884 if (!nb_bb)
1885 return 0;
1886
1887 /* Try to plug and add unused big blocks */
1888 virtio_mem_bbm_for_each_bb(vm, bb_id, VIRTIO_MEM_BBM_BB_UNUSED) {
1889 if (!virtio_mem_could_add_memory(vm, vm->bbm.bb_size))
1890 return -ENOSPC;
1891
1892 rc = virtio_mem_bbm_plug_and_add_bb(vm, bb_id);
1893 if (!rc)
1894 nb_bb--;
1895 if (rc || !nb_bb)
1896 return rc;
1897 cond_resched();
1898 }
1899
1900 /* Try to prepare, plug and add new big blocks */
1901 while (nb_bb) {
1902 if (!virtio_mem_could_add_memory(vm, vm->bbm.bb_size))
1903 return -ENOSPC;
1904
1905 rc = virtio_mem_bbm_prepare_next_bb(vm, &bb_id);
1906 if (rc)
1907 return rc;
1908 rc = virtio_mem_bbm_plug_and_add_bb(vm, bb_id);
1909 if (!rc)
1910 nb_bb--;
1911 if (rc)
1912 return rc;
1913 cond_resched();
1914 }
1915
1916 return 0;
1917 }
1918
1919 /*
1920 * Try to plug the requested amount of memory.
1921 */
1922 static int virtio_mem_plug_request(struct virtio_mem *vm, uint64_t diff)
1923 {
1924 if (vm->in_sbm)
1925 return virtio_mem_sbm_plug_request(vm, diff);
1926 return virtio_mem_bbm_plug_request(vm, diff);
1927 }
1928
1929 /*
1930 * Unplug the desired number of plugged subblocks of an offline memory block.
1931 * Will fail if any subblock cannot get unplugged (instead of skipping it).
1932 *
1933 * Will modify the state of the memory block. Might temporarily drop the
1934 * hotplug_mutex.
1935 *
1936 * Note: Can fail after some subblocks were successfully unplugged.
1937 */
1938 static int virtio_mem_sbm_unplug_any_sb_offline(struct virtio_mem *vm,
1939 unsigned long mb_id,
1940 uint64_t *nb_sb)
1941 {
1942 int rc;
1943
1944 rc = virtio_mem_sbm_unplug_any_sb_raw(vm, mb_id, nb_sb);
1945
1946 /* some subblocks might have been unplugged even on failure */
1947 if (!virtio_mem_sbm_test_sb_plugged(vm, mb_id, 0, vm->sbm.sbs_per_mb))
1948 virtio_mem_sbm_set_mb_state(vm, mb_id,
1949 VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL);
1950 if (rc)
1951 return rc;
1952
1953 if (virtio_mem_sbm_test_sb_unplugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) {
1954 /*
1955 * Remove the block from Linux - this should never fail.
1956 * Hinder the block from getting onlined by marking it
1957 * unplugged. Temporarily drop the mutex, so
1958 * any pending GOING_ONLINE requests can be serviced/rejected.
1959 */
1960 virtio_mem_sbm_set_mb_state(vm, mb_id,
1961 VIRTIO_MEM_SBM_MB_UNUSED);
1962
1963 mutex_unlock(&vm->hotplug_mutex);
1964 rc = virtio_mem_sbm_remove_mb(vm, mb_id);
1965 BUG_ON(rc);
1966 mutex_lock(&vm->hotplug_mutex);
1967 }
1968 return 0;
1969 }
1970
1971 /*
1972 * Unplug the given plugged subblocks of an online memory block.
1973 *
1974 * Will modify the state of the memory block.
1975 */
1976 static int virtio_mem_sbm_unplug_sb_online(struct virtio_mem *vm,
1977 unsigned long mb_id, int sb_id,
1978 int count)
1979 {
1980 const unsigned long nr_pages = PFN_DOWN(vm->sbm.sb_size) * count;
1981 const int old_state = virtio_mem_sbm_get_mb_state(vm, mb_id);
1982 unsigned long start_pfn;
1983 int rc;
1984
1985 start_pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) +
1986 sb_id * vm->sbm.sb_size);
1987
1988 rc = virtio_mem_fake_offline(vm, start_pfn, nr_pages);
1989 if (rc)
1990 return rc;
1991
1992 /* Try to unplug the allocated memory */
1993 rc = virtio_mem_sbm_unplug_sb(vm, mb_id, sb_id, count);
1994 if (rc) {
1995 /* Return the memory to the buddy. */
1996 virtio_mem_fake_online(start_pfn, nr_pages);
1997 return rc;
1998 }
1999
2000 switch (old_state) {
2001 case VIRTIO_MEM_SBM_MB_KERNEL:
2002 virtio_mem_sbm_set_mb_state(vm, mb_id,
2003 VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL);
2004 break;
2005 case VIRTIO_MEM_SBM_MB_MOVABLE:
2006 virtio_mem_sbm_set_mb_state(vm, mb_id,
2007 VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL);
2008 break;
2009 }
2010
2011 return 0;
2012 }
2013
2014 /*
2015 * Unplug the desired number of plugged subblocks of an online memory block.
2016 * Will skip subblocks that are busy.
2017 *
2018 * Will modify the state of the memory block. Might temporarily drop the
2019 * hotplug_mutex.
2020 *
2021 * Note: Can fail after some subblocks were successfully unplugged. Can
2022 * return 0 even if subblocks were busy and could not get unplugged.
2023 */
2024 static int virtio_mem_sbm_unplug_any_sb_online(struct virtio_mem *vm,
2025 unsigned long mb_id,
2026 uint64_t *nb_sb)
2027 {
2028 int rc, sb_id;
2029
2030 /* If possible, try to unplug the complete block in one shot. */
2031 if (*nb_sb >= vm->sbm.sbs_per_mb &&
2032 virtio_mem_sbm_test_sb_plugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) {
2033 rc = virtio_mem_sbm_unplug_sb_online(vm, mb_id, 0,
2034 vm->sbm.sbs_per_mb);
2035 if (!rc) {
2036 *nb_sb -= vm->sbm.sbs_per_mb;
2037 goto unplugged;
2038 } else if (rc != -EBUSY)
2039 return rc;
2040 }
2041
2042 /* Fallback to single subblocks. */
2043 for (sb_id = vm->sbm.sbs_per_mb - 1; sb_id >= 0 && *nb_sb; sb_id--) {
2044 /* Find the next candidate subblock */
2045 while (sb_id >= 0 &&
2046 !virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id, 1))
2047 sb_id--;
2048 if (sb_id < 0)
2049 break;
2050
2051 rc = virtio_mem_sbm_unplug_sb_online(vm, mb_id, sb_id, 1);
2052 if (rc == -EBUSY)
2053 continue;
2054 else if (rc)
2055 return rc;
2056 *nb_sb -= 1;
2057 }
2058
2059 unplugged:
2060 rc = virtio_mem_sbm_try_remove_unplugged_mb(vm, mb_id);
2061 if (rc)
2062 vm->sbm.have_unplugged_mb = 1;
2063 /* Ignore errors, this is not critical. We'll retry later. */
2064 return 0;
2065 }
2066
2067 /*
2068 * Unplug the desired number of plugged subblocks of a memory block that is
2069 * already added to Linux. Will skip subblocks of online memory blocks that are
2070 * busy (by the OS). Will fail if any subblock that's not busy cannot get
2071 * unplugged.
2072 *
2073 * Will modify the state of the memory block. Might temporarily drop the
2074 * hotplug_mutex.
2075 *
2076 * Note: Can fail after some subblocks were successfully unplugged. Can
2077 * return 0 even if subblocks were busy and could not get unplugged.
2078 */
2079 static int virtio_mem_sbm_unplug_any_sb(struct virtio_mem *vm,
2080 unsigned long mb_id,
2081 uint64_t *nb_sb)
2082 {
2083 const int old_state = virtio_mem_sbm_get_mb_state(vm, mb_id);
2084
2085 switch (old_state) {
2086 case VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL:
2087 case VIRTIO_MEM_SBM_MB_KERNEL:
2088 case VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL:
2089 case VIRTIO_MEM_SBM_MB_MOVABLE:
2090 return virtio_mem_sbm_unplug_any_sb_online(vm, mb_id, nb_sb);
2091 case VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL:
2092 case VIRTIO_MEM_SBM_MB_OFFLINE:
2093 return virtio_mem_sbm_unplug_any_sb_offline(vm, mb_id, nb_sb);
2094 }
2095 return -EINVAL;
2096 }
2097
2098 static int virtio_mem_sbm_unplug_request(struct virtio_mem *vm, uint64_t diff)
2099 {
2100 const int mb_states[] = {
2101 VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL,
2102 VIRTIO_MEM_SBM_MB_OFFLINE,
2103 VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL,
2104 VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL,
2105 VIRTIO_MEM_SBM_MB_MOVABLE,
2106 VIRTIO_MEM_SBM_MB_KERNEL,
2107 };
2108 uint64_t nb_sb = diff / vm->sbm.sb_size;
2109 unsigned long mb_id;
2110 int rc, i;
2111
2112 if (!nb_sb)
2113 return 0;
2114
2115 /*
2116 * We'll drop the mutex a couple of times when it is safe to do so.
2117	 * This might result in some blocks switching state (online/offline)
2118 * and we could miss them in this run - we will retry again later.
2119 */
2120 mutex_lock(&vm->hotplug_mutex);
2121
2122 /*
2123 * We try unplug from partially plugged blocks first, to try removing
2124 * whole memory blocks along with metadata. We prioritize ZONE_MOVABLE
2125 * as it's more reliable to unplug memory and remove whole memory
2126	 * blocks, and we don't want to trigger a zone imbalance by
2127	 * accidentally removing too much kernel memory.
2128 */
2129 for (i = 0; i < ARRAY_SIZE(mb_states); i++) {
2130 virtio_mem_sbm_for_each_mb_rev(vm, mb_id, mb_states[i]) {
2131 rc = virtio_mem_sbm_unplug_any_sb(vm, mb_id, &nb_sb);
2132 if (rc || !nb_sb)
2133 goto out_unlock;
2134 mutex_unlock(&vm->hotplug_mutex);
2135 cond_resched();
2136 mutex_lock(&vm->hotplug_mutex);
2137 }
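		/*
		 * The first two entries of mb_states[] cover offline blocks
		 * only; if we are not allowed to unplug online memory, stop
		 * after processing them.
		 */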
2138 if (!unplug_online && i == 1) {
2139 mutex_unlock(&vm->hotplug_mutex);
2140 return 0;
2141 }
2142 }
2143
2144 mutex_unlock(&vm->hotplug_mutex);
2145 return nb_sb ? -EBUSY : 0;
2146 out_unlock:
2147 mutex_unlock(&vm->hotplug_mutex);
2148 return rc;
2149 }
2150
2151 /*
2152 * Try to offline and remove a big block from Linux and unplug it. Will fail
2153 * with -EBUSY if some memory is busy and cannot get unplugged.
2154 *
2155 * Will modify the state of the memory block. Might temporarily drop the
2156 * hotplug_mutex.
2157 */
2158 static int virtio_mem_bbm_offline_remove_and_unplug_bb(struct virtio_mem *vm,
2159 unsigned long bb_id)
2160 {
2161 const unsigned long start_pfn = PFN_DOWN(virtio_mem_bb_id_to_phys(vm, bb_id));
2162 const unsigned long nr_pages = PFN_DOWN(vm->bbm.bb_size);
2163 unsigned long end_pfn = start_pfn + nr_pages;
2164 unsigned long pfn;
2165 struct page *page;
2166 int rc;
2167
2168 if (WARN_ON_ONCE(virtio_mem_bbm_get_bb_state(vm, bb_id) !=
2169 VIRTIO_MEM_BBM_BB_ADDED))
2170 return -EINVAL;
2171
2172 /*
2173 * Start by fake-offlining all memory. Once we marked the device
2174 * block as fake-offline, all newly onlined memory will
2175 * automatically be kept fake-offline. Protect from concurrent
2176 * onlining/offlining until we have a consistent state.
2177 */
2178 mutex_lock(&vm->hotplug_mutex);
2179 virtio_mem_bbm_set_bb_state(vm, bb_id, VIRTIO_MEM_BBM_BB_FAKE_OFFLINE);
2180
2181 for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
2182 page = pfn_to_online_page(pfn);
2183 if (!page)
2184 continue;
2185
2186 rc = virtio_mem_fake_offline(vm, pfn, PAGES_PER_SECTION);
2187 if (rc) {
2188 end_pfn = pfn;
2189 goto rollback;
2190 }
2191 }
2192 mutex_unlock(&vm->hotplug_mutex);
2193
2194 rc = virtio_mem_bbm_offline_and_remove_bb(vm, bb_id);
2195 if (rc) {
2196 mutex_lock(&vm->hotplug_mutex);
2197 goto rollback;
2198 }
2199
2200 rc = virtio_mem_bbm_unplug_bb(vm, bb_id);
2201 if (rc)
2202 virtio_mem_bbm_set_bb_state(vm, bb_id,
2203 VIRTIO_MEM_BBM_BB_PLUGGED);
2204 else
2205 virtio_mem_bbm_set_bb_state(vm, bb_id,
2206 VIRTIO_MEM_BBM_BB_UNUSED);
2207 return rc;
2208
2209 rollback:
2210 for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
2211 page = pfn_to_online_page(pfn);
2212 if (!page)
2213 continue;
2214 virtio_mem_fake_online(pfn, PAGES_PER_SECTION);
2215 }
2216 virtio_mem_bbm_set_bb_state(vm, bb_id, VIRTIO_MEM_BBM_BB_ADDED);
2217 mutex_unlock(&vm->hotplug_mutex);
2218 return rc;
2219 }
2220
2221 /*
2222 * Test if a big block is completely offline.
2223 */
2224 static bool virtio_mem_bbm_bb_is_offline(struct virtio_mem *vm,
2225 unsigned long bb_id)
2226 {
2227 const unsigned long start_pfn = PFN_DOWN(virtio_mem_bb_id_to_phys(vm, bb_id));
2228 const unsigned long nr_pages = PFN_DOWN(vm->bbm.bb_size);
2229 unsigned long pfn;
2230
2231 for (pfn = start_pfn; pfn < start_pfn + nr_pages;
2232 pfn += PAGES_PER_SECTION) {
2233 if (pfn_to_online_page(pfn))
2234 return false;
2235 }
2236
2237 return true;
2238 }
2239
2240 /*
2241 * Test if a big block is completely onlined to ZONE_MOVABLE (or offline).
2242 */
2243 static bool virtio_mem_bbm_bb_is_movable(struct virtio_mem *vm,
2244 unsigned long bb_id)
2245 {
2246 const unsigned long start_pfn = PFN_DOWN(virtio_mem_bb_id_to_phys(vm, bb_id));
2247 const unsigned long nr_pages = PFN_DOWN(vm->bbm.bb_size);
2248 struct page *page;
2249 unsigned long pfn;
2250
2251 for (pfn = start_pfn; pfn < start_pfn + nr_pages;
2252 pfn += PAGES_PER_SECTION) {
2253 page = pfn_to_online_page(pfn);
2254 if (!page)
2255 continue;
2256 if (page_zonenum(page) != ZONE_MOVABLE)
2257 return false;
2258 }
2259
2260 return true;
2261 }
2262
2263 static int virtio_mem_bbm_unplug_request(struct virtio_mem *vm, uint64_t diff)
2264 {
2265 uint64_t nb_bb = diff / vm->bbm.bb_size;
2266 uint64_t bb_id;
2267 int rc, i;
2268
2269 if (!nb_bb)
2270 return 0;
2271
2272 /*
2273 * Try to unplug big blocks. Similar to SBM, start with offline
2274 * big blocks.
2275 */
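	/*
	 * Pass 0 only considers offline big blocks, pass 1 only big blocks
	 * fully onlined to ZONE_MOVABLE (or already offline), and pass 2 any
	 * added big block.
	 */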
2276 for (i = 0; i < 3; i++) {
2277 virtio_mem_bbm_for_each_bb_rev(vm, bb_id, VIRTIO_MEM_BBM_BB_ADDED) {
2278 cond_resched();
2279
2280 /*
2281 * As we're holding no locks, these checks are racy,
2282 * but we don't care.
2283 */
2284 if (i == 0 && !virtio_mem_bbm_bb_is_offline(vm, bb_id))
2285 continue;
2286 if (i == 1 && !virtio_mem_bbm_bb_is_movable(vm, bb_id))
2287 continue;
2288 rc = virtio_mem_bbm_offline_remove_and_unplug_bb(vm, bb_id);
2289 if (rc == -EBUSY)
2290 continue;
2291 if (!rc)
2292 nb_bb--;
2293 if (rc || !nb_bb)
2294 return rc;
2295 }
2296 if (i == 0 && !unplug_online)
2297 return 0;
2298 }
2299
2300 return nb_bb ? -EBUSY : 0;
2301 }
2302
2303 /*
2304 * Try to unplug the requested amount of memory.
2305 */
2306 static int virtio_mem_unplug_request(struct virtio_mem *vm, uint64_t diff)
2307 {
2308 if (vm->in_sbm)
2309 return virtio_mem_sbm_unplug_request(vm, diff);
2310 return virtio_mem_bbm_unplug_request(vm, diff);
2311 }
2312
2313 /*
2314 * Try to unplug all blocks that couldn't be unplugged before, for example,
2315 * because the hypervisor was busy. Further, offline and remove any memory
2316 * blocks where we previously failed.
2317 */
2318 static int virtio_mem_cleanup_pending_mb(struct virtio_mem *vm)
2319 {
2320 unsigned long id;
2321 int rc = 0;
2322
2323 if (!vm->in_sbm) {
2324 virtio_mem_bbm_for_each_bb(vm, id,
2325 VIRTIO_MEM_BBM_BB_PLUGGED) {
2326 rc = virtio_mem_bbm_unplug_bb(vm, id);
2327 if (rc)
2328 return rc;
2329 virtio_mem_bbm_set_bb_state(vm, id,
2330 VIRTIO_MEM_BBM_BB_UNUSED);
2331 }
2332 return 0;
2333 }
2334
2335 virtio_mem_sbm_for_each_mb(vm, id, VIRTIO_MEM_SBM_MB_PLUGGED) {
2336 rc = virtio_mem_sbm_unplug_mb(vm, id);
2337 if (rc)
2338 return rc;
2339 virtio_mem_sbm_set_mb_state(vm, id,
2340 VIRTIO_MEM_SBM_MB_UNUSED);
2341 }
2342
2343 if (!vm->sbm.have_unplugged_mb)
2344 return 0;
2345
2346 /*
2347 * Let's retry (offlining and) removing completely unplugged Linux
2348 * memory blocks.
2349 */
2350 vm->sbm.have_unplugged_mb = false;
2351
2352 mutex_lock(&vm->hotplug_mutex);
2353 virtio_mem_sbm_for_each_mb(vm, id, VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL)
2354 rc |= virtio_mem_sbm_try_remove_unplugged_mb(vm, id);
2355 virtio_mem_sbm_for_each_mb(vm, id, VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL)
2356 rc |= virtio_mem_sbm_try_remove_unplugged_mb(vm, id);
2357 virtio_mem_sbm_for_each_mb(vm, id, VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL)
2358 rc |= virtio_mem_sbm_try_remove_unplugged_mb(vm, id);
2359 mutex_unlock(&vm->hotplug_mutex);
2360
2361 if (rc)
2362 vm->sbm.have_unplugged_mb = true;
2363 /* Ignore errors, this is not critical. We'll retry later. */
2364 return 0;
2365 }
2366
2367 /*
2368 * Update all parts of the config that could have changed.
2369 */
2370 static void virtio_mem_refresh_config(struct virtio_mem *vm)
2371 {
2372 const struct range pluggable_range = mhp_get_pluggable_range(true);
2373 uint64_t new_plugged_size, end_addr;
2374
2375 /* the plugged_size is just a reflection of what _we_ did previously */
2376 virtio_cread_le(vm->vdev, struct virtio_mem_config, plugged_size,
2377 &new_plugged_size);
2378 if (WARN_ON_ONCE(new_plugged_size != vm->plugged_size))
2379 vm->plugged_size = new_plugged_size;
2380
2381 /* calculate the last usable memory block id */
2382 virtio_cread_le(vm->vdev, struct virtio_mem_config,
2383 usable_region_size, &vm->usable_region_size);
2384 end_addr = min(vm->addr + vm->usable_region_size - 1,
2385 pluggable_range.end);
2386
2387 if (vm->in_sbm) {
2388 vm->sbm.last_usable_mb_id = virtio_mem_phys_to_mb_id(end_addr);
2389 if (!IS_ALIGNED(end_addr + 1, memory_block_size_bytes()))
2390 vm->sbm.last_usable_mb_id--;
2391 } else {
2392 vm->bbm.last_usable_bb_id = virtio_mem_phys_to_bb_id(vm,
2393 end_addr);
2394 if (!IS_ALIGNED(end_addr + 1, vm->bbm.bb_size))
2395 vm->bbm.last_usable_bb_id--;
2396 }
2397 /*
2398 * If we cannot plug any of our device memory (e.g., nothing in the
2399 * usable region is addressable), the last usable memory block id will
2400 * be smaller than the first usable memory block id. We'll stop
2401 * attempting to add memory with -ENOSPC from our main loop.
2402 */
2403
2404 /* see if there is a request to change the size */
2405 virtio_cread_le(vm->vdev, struct virtio_mem_config, requested_size,
2406 &vm->requested_size);
2407
2408 dev_info(&vm->vdev->dev, "plugged size: 0x%llx", vm->plugged_size);
2409 dev_info(&vm->vdev->dev, "requested size: 0x%llx", vm->requested_size);
2410 }
2411
2412 /*
2413 * Workqueue function for handling plug/unplug requests and config updates.
2414 */
2415 static void virtio_mem_run_wq(struct work_struct *work)
2416 {
2417 struct virtio_mem *vm = container_of(work, struct virtio_mem, wq);
2418 uint64_t diff;
2419 int rc;
2420
2421 if (unlikely(vm->in_kdump)) {
2422 dev_warn_once(&vm->vdev->dev,
2423 "unexpected workqueue run in kdump kernel\n");
2424 return;
2425 }
2426
2427 hrtimer_cancel(&vm->retry_timer);
2428
2429 if (vm->broken)
2430 return;
2431
2432 atomic_set(&vm->wq_active, 1);
2433 retry:
2434 rc = 0;
2435
2436 /* Make sure we start with a clean state if there are leftovers. */
2437 if (unlikely(vm->unplug_all_required))
2438 rc = virtio_mem_send_unplug_all_request(vm);
2439
2440 if (atomic_read(&vm->config_changed)) {
2441 atomic_set(&vm->config_changed, 0);
2442 virtio_mem_refresh_config(vm);
2443 }
2444
2445 /* Cleanup any leftovers from previous runs */
2446 if (!rc)
2447 rc = virtio_mem_cleanup_pending_mb(vm);
2448
2449 if (!rc && vm->requested_size != vm->plugged_size) {
2450 if (vm->requested_size > vm->plugged_size) {
2451 diff = vm->requested_size - vm->plugged_size;
2452 rc = virtio_mem_plug_request(vm, diff);
2453 } else {
2454 diff = vm->plugged_size - vm->requested_size;
2455 rc = virtio_mem_unplug_request(vm, diff);
2456 }
2457 }
2458
2459 /*
2460 * Keep retrying to offline and remove completely unplugged Linux
2461 * memory blocks.
2462 */
2463 if (!rc && vm->in_sbm && vm->sbm.have_unplugged_mb)
2464 rc = -EBUSY;
2465
2466 switch (rc) {
2467 case 0:
2468 vm->retry_timer_ms = VIRTIO_MEM_RETRY_TIMER_MIN_MS;
2469 break;
2470 case -ENOSPC:
2471 /*
2472 * We cannot add any more memory (alignment, physical limit)
2473 * or we have too many offline memory blocks.
2474 */
2475 break;
2476 case -ETXTBSY:
2477 /*
2478 * The hypervisor cannot process our request right now
2479		 * (e.g., out of memory, migrating).
2480 */
2481 case -EBUSY:
2482 /*
2483 * We cannot free up any memory to unplug it (all plugged memory
2484 * is busy).
2485 */
2486 case -ENOMEM:
2487 /* Out of memory, try again later. */
2488 hrtimer_start(&vm->retry_timer, ms_to_ktime(vm->retry_timer_ms),
2489 HRTIMER_MODE_REL);
2490 break;
2491 case -EAGAIN:
2492 /* Retry immediately (e.g., the config changed). */
2493 goto retry;
2494 default:
2495 /* Unknown error, mark as broken */
2496 dev_err(&vm->vdev->dev,
2497 "unknown error, marking device broken: %d\n", rc);
2498 vm->broken = true;
2499 }
2500
2501 atomic_set(&vm->wq_active, 0);
2502 }
2503
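/*
 * Retry timer: trigger the workqueue again and double the timeout for the
 * next retry (exponential backoff), capped at VIRTIO_MEM_RETRY_TIMER_MAX_MS.
 */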
2504 static enum hrtimer_restart virtio_mem_timer_expired(struct hrtimer *timer)
2505 {
2506 struct virtio_mem *vm = container_of(timer, struct virtio_mem,
2507 retry_timer);
2508
2509 virtio_mem_retry(vm);
2510 vm->retry_timer_ms = min_t(unsigned int, vm->retry_timer_ms * 2,
2511 VIRTIO_MEM_RETRY_TIMER_MAX_MS);
2512 return HRTIMER_NORESTART;
2513 }
2514
2515 static void virtio_mem_handle_response(struct virtqueue *vq)
2516 {
2517 struct virtio_mem *vm = vq->vdev->priv;
2518
2519 wake_up(&vm->host_resp);
2520 }
2521
2522 static int virtio_mem_init_vq(struct virtio_mem *vm)
2523 {
2524 struct virtqueue *vq;
2525
2526 vq = virtio_find_single_vq(vm->vdev, virtio_mem_handle_response,
2527 "guest-request");
2528 if (IS_ERR(vq))
2529 return PTR_ERR(vq);
2530 vm->vq = vq;
2531
2532 return 0;
2533 }
2534
2535 static int virtio_mem_init_hotplug(struct virtio_mem *vm)
2536 {
2537 const struct range pluggable_range = mhp_get_pluggable_range(true);
2538 uint64_t unit_pages, sb_size, addr;
2539 int rc;
2540
2541 /* bad device setup - warn only */
2542 if (!IS_ALIGNED(vm->addr, memory_block_size_bytes()))
2543 dev_warn(&vm->vdev->dev,
2544 "The alignment of the physical start address can make some memory unusable.\n");
2545 if (!IS_ALIGNED(vm->addr + vm->region_size, memory_block_size_bytes()))
2546 dev_warn(&vm->vdev->dev,
2547 "The alignment of the physical end address can make some memory unusable.\n");
2548 if (vm->addr < pluggable_range.start ||
2549 vm->addr + vm->region_size - 1 > pluggable_range.end)
2550 dev_warn(&vm->vdev->dev,
2551 "Some device memory is not addressable/pluggable. This can make some memory unusable.\n");
2552
2553 /* Prepare the offline threshold - make sure we can add two blocks. */
2554 vm->offline_threshold = max_t(uint64_t, 2 * memory_block_size_bytes(),
2555 VIRTIO_MEM_DEFAULT_OFFLINE_THRESHOLD);
2556
2557 /*
2558 * alloc_contig_range() works reliably with pageblock
2559 * granularity on ZONE_NORMAL, use pageblock_nr_pages.
2560 */
2561 sb_size = PAGE_SIZE * pageblock_nr_pages;
2562 sb_size = max_t(uint64_t, vm->device_block_size, sb_size);
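	/*
	 * Illustrative example (assuming typical x86-64 values: 2 MiB
	 * pageblocks, a 2 MiB device block size and 128 MiB Linux memory
	 * blocks): sb_size ends up being 2 MiB, SBM is selected below and
	 * sbs_per_mb == 64.
	 */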
2563
2564 if (sb_size < memory_block_size_bytes() && !force_bbm) {
2565 /* SBM: At least two subblocks per Linux memory block. */
2566 vm->in_sbm = true;
2567 vm->sbm.sb_size = sb_size;
2568 vm->sbm.sbs_per_mb = memory_block_size_bytes() /
2569 vm->sbm.sb_size;
2570
2571 /* Round up to the next full memory block */
2572 addr = max_t(uint64_t, vm->addr, pluggable_range.start) +
2573 memory_block_size_bytes() - 1;
2574 vm->sbm.first_mb_id = virtio_mem_phys_to_mb_id(addr);
2575 vm->sbm.next_mb_id = vm->sbm.first_mb_id;
2576 } else {
2577 /* BBM: At least one Linux memory block. */
2578 vm->bbm.bb_size = max_t(uint64_t, vm->device_block_size,
2579 memory_block_size_bytes());
2580
2581 if (bbm_block_size) {
2582 if (!is_power_of_2(bbm_block_size)) {
2583 dev_warn(&vm->vdev->dev,
2584 "bbm_block_size is not a power of 2");
2585 } else if (bbm_block_size < vm->bbm.bb_size) {
2586 dev_warn(&vm->vdev->dev,
2587 "bbm_block_size is too small");
2588 } else {
2589 vm->bbm.bb_size = bbm_block_size;
2590 }
2591 }
2592
2593 /* Round up to the next aligned big block */
2594 addr = max_t(uint64_t, vm->addr, pluggable_range.start) +
2595 vm->bbm.bb_size - 1;
2596 vm->bbm.first_bb_id = virtio_mem_phys_to_bb_id(vm, addr);
2597 vm->bbm.next_bb_id = vm->bbm.first_bb_id;
2598
2599 /* Make sure we can add two big blocks. */
2600 vm->offline_threshold = max_t(uint64_t, 2 * vm->bbm.bb_size,
2601 vm->offline_threshold);
2602 }
2603
2604 dev_info(&vm->vdev->dev, "memory block size: 0x%lx",
2605 memory_block_size_bytes());
2606 if (vm->in_sbm)
2607 dev_info(&vm->vdev->dev, "subblock size: 0x%llx",
2608 (unsigned long long)vm->sbm.sb_size);
2609 else
2610 dev_info(&vm->vdev->dev, "big block size: 0x%llx",
2611 (unsigned long long)vm->bbm.bb_size);
2612
2613 /* create the parent resource for all memory */
2614 rc = virtio_mem_create_resource(vm);
2615 if (rc)
2616 return rc;
2617
2618 /* use a single dynamic memory group to cover the whole memory device */
2619 if (vm->in_sbm)
2620 unit_pages = PHYS_PFN(memory_block_size_bytes());
2621 else
2622 unit_pages = PHYS_PFN(vm->bbm.bb_size);
2623 rc = memory_group_register_dynamic(vm->nid, unit_pages);
2624 if (rc < 0)
2625 goto out_del_resource;
2626 vm->mgid = rc;
2627
2628 /*
2629 * If we still have memory plugged, we have to unplug all memory first.
2630 * Registering our parent resource makes sure that this memory isn't
2631 * actually in use (e.g., trying to reload the driver).
2632 */
2633 if (vm->plugged_size) {
2634 vm->unplug_all_required = true;
2635 dev_info(&vm->vdev->dev, "unplugging all memory is required\n");
2636 }
2637
2638 /* register callbacks */
2639 vm->memory_notifier.notifier_call = virtio_mem_memory_notifier_cb;
2640 rc = register_memory_notifier(&vm->memory_notifier);
2641 if (rc)
2642 goto out_unreg_group;
2643 /* Block hibernation as early as possible. */
2644 vm->pm_notifier.priority = INT_MAX;
2645 vm->pm_notifier.notifier_call = virtio_mem_pm_notifier_cb;
2646 rc = register_pm_notifier(&vm->pm_notifier);
2647 if (rc)
2648 goto out_unreg_mem;
2649 rc = register_virtio_mem_device(vm);
2650 if (rc)
2651 goto out_unreg_pm;
2652
2653 virtio_device_ready(vm->vdev);
2654 return 0;
2655 out_unreg_pm:
2656 unregister_pm_notifier(&vm->pm_notifier);
2657 out_unreg_mem:
2658 unregister_memory_notifier(&vm->memory_notifier);
2659 out_unreg_group:
2660 memory_group_unregister(vm->mgid);
2661 out_del_resource:
2662 virtio_mem_delete_resource(vm);
2663 return rc;
2664 }
2665
2666 #ifdef CONFIG_PROC_VMCORE
2667 static int virtio_mem_send_state_request(struct virtio_mem *vm, uint64_t addr,
2668 uint64_t size)
2669 {
2670 const uint64_t nb_vm_blocks = size / vm->device_block_size;
2671 const struct virtio_mem_req req = {
2672 .type = cpu_to_virtio16(vm->vdev, VIRTIO_MEM_REQ_STATE),
2673 .u.state.addr = cpu_to_virtio64(vm->vdev, addr),
2674 .u.state.nb_blocks = cpu_to_virtio16(vm->vdev, nb_vm_blocks),
2675 };
2676 int rc = -ENOMEM;
2677
2678 dev_dbg(&vm->vdev->dev, "requesting state: 0x%llx - 0x%llx\n", addr,
2679 addr + size - 1);
2680
2681 switch (virtio_mem_send_request(vm, &req)) {
2682 case VIRTIO_MEM_RESP_ACK:
2683 return virtio16_to_cpu(vm->vdev, vm->resp.u.state.state);
2684 case VIRTIO_MEM_RESP_ERROR:
2685 rc = -EINVAL;
2686 break;
2687 default:
2688 break;
2689 }
2690
2691 dev_dbg(&vm->vdev->dev, "requesting state failed: %d\n", rc);
2692 return rc;
2693 }
2694
2695 static bool virtio_mem_vmcore_pfn_is_ram(struct vmcore_cb *cb,
2696 unsigned long pfn)
2697 {
2698 struct virtio_mem *vm = container_of(cb, struct virtio_mem,
2699 vmcore_cb);
2700 uint64_t addr = PFN_PHYS(pfn);
2701 bool is_ram;
2702 int rc;
2703
2704 if (!virtio_mem_contains_range(vm, addr, PAGE_SIZE))
2705 return true;
2706 if (!vm->plugged_size)
2707 return false;
2708
2709 /*
2710 * We have to serialize device requests and access to the information
2711 * about the block queried last.
2712 */
2713 mutex_lock(&vm->hotplug_mutex);
2714
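	/*
	 * Cache the state of the device block queried last, so repeated
	 * pfn_is_ram() calls for PFNs within the same device block only
	 * trigger a single state request.
	 */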
2715 addr = ALIGN_DOWN(addr, vm->device_block_size);
2716 if (addr != vm->last_block_addr) {
2717 rc = virtio_mem_send_state_request(vm, addr,
2718 vm->device_block_size);
2719 /* On any kind of error, we're going to signal !ram. */
2720 if (rc == VIRTIO_MEM_STATE_PLUGGED)
2721 vm->last_block_plugged = true;
2722 else
2723 vm->last_block_plugged = false;
2724 vm->last_block_addr = addr;
2725 }
2726
2727 is_ram = vm->last_block_plugged;
2728 mutex_unlock(&vm->hotplug_mutex);
2729 return is_ram;
2730 }
2731
2732 #ifdef CONFIG_PROC_VMCORE_DEVICE_RAM
2733 static int virtio_mem_vmcore_add_device_ram(struct virtio_mem *vm,
2734 struct list_head *list, uint64_t start, uint64_t end)
2735 {
2736 int rc;
2737
2738 rc = vmcore_alloc_add_range(list, start, end - start);
2739 if (rc)
2740 dev_err(&vm->vdev->dev,
2741 "Error adding device RAM range: %d\n", rc);
2742 return rc;
2743 }
2744
2745 static int virtio_mem_vmcore_get_device_ram(struct vmcore_cb *cb,
2746 struct list_head *list)
2747 {
2748 struct virtio_mem *vm = container_of(cb, struct virtio_mem,
2749 vmcore_cb);
2750 const uint64_t device_start = vm->addr;
2751 const uint64_t device_end = vm->addr + vm->usable_region_size;
2752 uint64_t chunk_size, cur_start, cur_end, plugged_range_start = 0;
2753 LIST_HEAD(tmp_list);
2754 int rc;
2755
2756 if (!vm->plugged_size)
2757 return 0;
2758
2759 /* Process memory sections, unless the device block size is bigger. */
2760 chunk_size = max_t(uint64_t, PFN_PHYS(PAGES_PER_SECTION),
2761 vm->device_block_size);
2762
2763 mutex_lock(&vm->hotplug_mutex);
2764
2765 /*
2766 * We process larger chunks and indicate the complete chunk if any
2767 * block in there is plugged. This reduces the number of pfn_is_ram()
2768	 * callbacks and mimics what is effectively being done when the old
2769 * kernel would add complete memory sections/blocks to the elfcore hdr.
2770 */
2771 cur_start = device_start;
2772 for (cur_start = device_start; cur_start < device_end; cur_start = cur_end) {
2773 cur_end = ALIGN_DOWN(cur_start + chunk_size, chunk_size);
2774 cur_end = min_t(uint64_t, cur_end, device_end);
2775
2776 rc = virtio_mem_send_state_request(vm, cur_start,
2777 cur_end - cur_start);
2778
2779 if (rc < 0) {
2780 dev_err(&vm->vdev->dev,
2781 "Error querying block states: %d\n", rc);
2782 goto out;
2783 } else if (rc != VIRTIO_MEM_STATE_UNPLUGGED) {
2784 /* Merge ranges with plugged memory. */
2785 if (!plugged_range_start)
2786 plugged_range_start = cur_start;
2787 continue;
2788 }
2789
2790 /* Flush any plugged range. */
2791 if (plugged_range_start) {
2792 rc = virtio_mem_vmcore_add_device_ram(vm, &tmp_list,
2793 plugged_range_start,
2794 cur_start);
2795 if (rc)
2796 goto out;
2797 plugged_range_start = 0;
2798 }
2799 }
2800
2801 /* Flush any plugged range. */
2802 if (plugged_range_start)
2803 rc = virtio_mem_vmcore_add_device_ram(vm, &tmp_list,
2804 plugged_range_start,
2805 cur_start);
2806 out:
2807 mutex_unlock(&vm->hotplug_mutex);
2808 if (rc < 0) {
2809 vmcore_free_ranges(&tmp_list);
2810 return rc;
2811 }
2812 list_splice_tail(&tmp_list, list);
2813 return 0;
2814 }
2815 #endif /* CONFIG_PROC_VMCORE_DEVICE_RAM */
2816 #endif /* CONFIG_PROC_VMCORE */
2817
2818 static int virtio_mem_init_kdump(struct virtio_mem *vm)
2819 {
2820 /* We must be prepared to receive a callback immediately. */
2821 virtio_device_ready(vm->vdev);
2822 #ifdef CONFIG_PROC_VMCORE
2823 dev_info(&vm->vdev->dev, "memory hot(un)plug disabled in kdump kernel\n");
2824 vm->vmcore_cb.pfn_is_ram = virtio_mem_vmcore_pfn_is_ram;
2825 #ifdef CONFIG_PROC_VMCORE_DEVICE_RAM
2826 vm->vmcore_cb.get_device_ram = virtio_mem_vmcore_get_device_ram;
2827 #endif /* CONFIG_PROC_VMCORE_DEVICE_RAM */
2828 register_vmcore_cb(&vm->vmcore_cb);
2829 return 0;
2830 #else /* CONFIG_PROC_VMCORE */
2831 dev_warn(&vm->vdev->dev, "disabled in kdump kernel without vmcore\n");
2832 return -EBUSY;
2833 #endif /* CONFIG_PROC_VMCORE */
2834 }
2835
2836 static int virtio_mem_init(struct virtio_mem *vm)
2837 {
2838 uint16_t node_id;
2839
2840 if (!vm->vdev->config->get) {
2841 dev_err(&vm->vdev->dev, "config access disabled\n");
2842 return -EINVAL;
2843 }
2844
2845 /* Fetch all properties that can't change. */
2846 virtio_cread_le(vm->vdev, struct virtio_mem_config, plugged_size,
2847 &vm->plugged_size);
2848 virtio_cread_le(vm->vdev, struct virtio_mem_config, block_size,
2849 &vm->device_block_size);
2850 virtio_cread_le(vm->vdev, struct virtio_mem_config, node_id,
2851 &node_id);
2852 vm->nid = virtio_mem_translate_node_id(vm, node_id);
2853 virtio_cread_le(vm->vdev, struct virtio_mem_config, addr, &vm->addr);
2854 virtio_cread_le(vm->vdev, struct virtio_mem_config, region_size,
2855 &vm->region_size);
2856 virtio_cread_le(vm->vdev, struct virtio_mem_config, usable_region_size,
2857 &vm->usable_region_size);
2858
2859 /* Determine the nid for the device based on the lowest address. */
2860 if (vm->nid == NUMA_NO_NODE)
2861 vm->nid = memory_add_physaddr_to_nid(vm->addr);
2862
2863 dev_info(&vm->vdev->dev, "start address: 0x%llx", vm->addr);
2864 dev_info(&vm->vdev->dev, "region size: 0x%llx", vm->region_size);
2865 dev_info(&vm->vdev->dev, "device block size: 0x%llx",
2866 (unsigned long long)vm->device_block_size);
2867 if (vm->nid != NUMA_NO_NODE && IS_ENABLED(CONFIG_NUMA))
2868 dev_info(&vm->vdev->dev, "nid: %d", vm->nid);
2869
2870 /*
2871 * We don't want to (un)plug or reuse any memory when in kdump. The
2872 * memory is still accessible (but not exposed to Linux).
2873 */
2874 if (vm->in_kdump)
2875 return virtio_mem_init_kdump(vm);
2876 return virtio_mem_init_hotplug(vm);
2877 }
2878
2879 static int virtio_mem_create_resource(struct virtio_mem *vm)
2880 {
2881 /*
2882 * When force-unloading the driver and removing the device, we
2883 * could have a garbage pointer. Duplicate the string.
2884 */
2885 const char *name = kstrdup(dev_name(&vm->vdev->dev), GFP_KERNEL);
2886
2887 if (!name)
2888 return -ENOMEM;
2889
2890 /* Disallow mapping device memory via /dev/mem completely. */
2891 vm->parent_resource = __request_mem_region(vm->addr, vm->region_size,
2892 name, IORESOURCE_SYSTEM_RAM |
2893 IORESOURCE_EXCLUSIVE);
2894 if (!vm->parent_resource) {
2895 kfree(name);
2896 dev_warn(&vm->vdev->dev, "could not reserve device region\n");
2897 dev_info(&vm->vdev->dev,
2898 "reloading the driver is not supported\n");
2899 return -EBUSY;
2900 }
2901
2902 /* The memory is not actually busy - make add_memory() work. */
2903 vm->parent_resource->flags &= ~IORESOURCE_BUSY;
2904 return 0;
2905 }
2906
2907 static void virtio_mem_delete_resource(struct virtio_mem *vm)
2908 {
2909 const char *name;
2910
2911 if (!vm->parent_resource)
2912 return;
2913
2914 name = vm->parent_resource->name;
2915 release_resource(vm->parent_resource);
2916 kfree(vm->parent_resource);
2917 kfree(name);
2918 vm->parent_resource = NULL;
2919 }
2920
2921 static int virtio_mem_range_has_system_ram(struct resource *res, void *arg)
2922 {
2923 return 1;
2924 }
2925
2926 static bool virtio_mem_has_memory_added(struct virtio_mem *vm)
2927 {
2928 const unsigned long flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
2929
2930 return walk_iomem_res_desc(IORES_DESC_NONE, flags, vm->addr,
2931 vm->addr + vm->region_size, NULL,
2932 virtio_mem_range_has_system_ram) == 1;
2933 }
2934
2935 static int virtio_mem_probe(struct virtio_device *vdev)
2936 {
2937 struct virtio_mem *vm;
2938 int rc;
2939
2940 BUILD_BUG_ON(sizeof(struct virtio_mem_req) != 24);
2941 BUILD_BUG_ON(sizeof(struct virtio_mem_resp) != 10);
2942
2943 vdev->priv = vm = kzalloc(sizeof(*vm), GFP_KERNEL);
2944 if (!vm)
2945 return -ENOMEM;
2946
2947 init_waitqueue_head(&vm->host_resp);
2948 vm->vdev = vdev;
2949 INIT_WORK(&vm->wq, virtio_mem_run_wq);
2950 mutex_init(&vm->hotplug_mutex);
2951 INIT_LIST_HEAD(&vm->next);
2952 spin_lock_init(&vm->removal_lock);
2953 hrtimer_init(&vm->retry_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
2954 vm->retry_timer.function = virtio_mem_timer_expired;
2955 vm->retry_timer_ms = VIRTIO_MEM_RETRY_TIMER_MIN_MS;
2956 vm->in_kdump = is_kdump_kernel();
2957
2958 /* register the virtqueue */
2959 rc = virtio_mem_init_vq(vm);
2960 if (rc)
2961 goto out_free_vm;
2962
2963 /* initialize the device by querying the config */
2964 rc = virtio_mem_init(vm);
2965 if (rc)
2966 goto out_del_vq;
2967
2968 /* trigger a config update to start processing the requested_size */
2969 if (!vm->in_kdump) {
2970 atomic_set(&vm->config_changed, 1);
2971 queue_work(system_freezable_wq, &vm->wq);
2972 }
2973
2974 return 0;
2975 out_del_vq:
2976 vdev->config->del_vqs(vdev);
2977 out_free_vm:
2978 kfree(vm);
2979 vdev->priv = NULL;
2980
2981 return rc;
2982 }
2983
2984 static void virtio_mem_deinit_hotplug(struct virtio_mem *vm)
2985 {
2986 unsigned long mb_id;
2987 int rc;
2988
2989 /*
2990 * Make sure the workqueue won't be triggered anymore and no memory
2991 * blocks can be onlined/offlined until we're finished here.
2992 */
2993 mutex_lock(&vm->hotplug_mutex);
2994 spin_lock_irq(&vm->removal_lock);
2995 vm->removing = true;
2996 spin_unlock_irq(&vm->removal_lock);
2997 mutex_unlock(&vm->hotplug_mutex);
2998
2999 /* wait until the workqueue stopped */
3000 cancel_work_sync(&vm->wq);
3001 hrtimer_cancel(&vm->retry_timer);
3002
3003 if (vm->in_sbm) {
3004 /*
3005 * After we unregistered our callbacks, user space can online
3006 * partially plugged offline blocks. Make sure to remove them.
3007 */
3008 virtio_mem_sbm_for_each_mb(vm, mb_id,
3009 VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL) {
3010 rc = virtio_mem_sbm_remove_mb(vm, mb_id);
3011 BUG_ON(rc);
3012 virtio_mem_sbm_set_mb_state(vm, mb_id,
3013 VIRTIO_MEM_SBM_MB_UNUSED);
3014 }
3015 /*
3016 * After we unregistered our callbacks, user space can no longer
3017 * offline partially plugged online memory blocks. No need to
3018 * worry about them.
3019 */
3020 }
3021
3022 /* unregister callbacks */
3023 unregister_virtio_mem_device(vm);
3024 unregister_pm_notifier(&vm->pm_notifier);
3025 unregister_memory_notifier(&vm->memory_notifier);
3026
3027 /*
3028 * There is no way we could reliably remove all memory we have added to
3029 * the system. And there is no way to stop the driver/device from going
3030 * away. Warn at least.
3031 */
3032 if (virtio_mem_has_memory_added(vm)) {
3033 dev_warn(&vm->vdev->dev,
3034 "device still has system memory added\n");
3035 } else {
3036 virtio_mem_delete_resource(vm);
3037 kfree_const(vm->resource_name);
3038 memory_group_unregister(vm->mgid);
3039 }
3040
3041 /* remove all tracking data - no locking needed */
3042 if (vm->in_sbm) {
3043 vfree(vm->sbm.mb_states);
3044 vfree(vm->sbm.sb_states);
3045 } else {
3046 vfree(vm->bbm.bb_states);
3047 }
3048 }
3049
3050 static void virtio_mem_deinit_kdump(struct virtio_mem *vm)
3051 {
3052 #ifdef CONFIG_PROC_VMCORE
3053 unregister_vmcore_cb(&vm->vmcore_cb);
3054 #endif /* CONFIG_PROC_VMCORE */
3055 }
3056
3057 static void virtio_mem_remove(struct virtio_device *vdev)
3058 {
3059 struct virtio_mem *vm = vdev->priv;
3060
3061 if (vm->in_kdump)
3062 virtio_mem_deinit_kdump(vm);
3063 else
3064 virtio_mem_deinit_hotplug(vm);
3065
3066 /* reset the device and cleanup the queues */
3067 virtio_reset_device(vdev);
3068 vdev->config->del_vqs(vdev);
3069
3070 kfree(vm);
3071 vdev->priv = NULL;
3072 }
3073
3074 static void virtio_mem_config_changed(struct virtio_device *vdev)
3075 {
3076 struct virtio_mem *vm = vdev->priv;
3077
3078 if (unlikely(vm->in_kdump))
3079 return;
3080
3081 atomic_set(&vm->config_changed, 1);
3082 virtio_mem_retry(vm);
3083 }
3084
3085 #ifdef CONFIG_PM_SLEEP
3086 static int virtio_mem_freeze(struct virtio_device *vdev)
3087 {
3088 struct virtio_mem *vm = vdev->priv;
3089
3090 /*
3091 * We block hibernation using the PM notifier completely. The workqueue
3092 * is already frozen by the PM core at this point, so we simply
3093 * reset the device and cleanup the queues.
3094 */
3095 if (pm_suspend_target_state != PM_SUSPEND_TO_IDLE &&
3096 vm->plugged_size &&
3097 !virtio_has_feature(vm->vdev, VIRTIO_MEM_F_PERSISTENT_SUSPEND)) {
3098 dev_err(&vm->vdev->dev,
3099 "suspending with plugged memory is not supported\n");
3100 return -EPERM;
3101 }
3102
3103 virtio_reset_device(vdev);
3104 vdev->config->del_vqs(vdev);
3105 vm->vq = NULL;
3106 return 0;
3107 }
3108
3109 static int virtio_mem_restore(struct virtio_device *vdev)
3110 {
3111 struct virtio_mem *vm = vdev->priv;
3112 int ret;
3113
3114 ret = virtio_mem_init_vq(vm);
3115 if (ret)
3116 return ret;
3117 virtio_device_ready(vdev);
3118
3119 /* Let's check if anything changed. */
3120 virtio_mem_config_changed(vdev);
3121 return 0;
3122 }
3123 #endif
3124
3125 static unsigned int virtio_mem_features[] = {
3126 #if defined(CONFIG_NUMA) && defined(CONFIG_ACPI_NUMA)
3127 VIRTIO_MEM_F_ACPI_PXM,
3128 #endif
3129 VIRTIO_MEM_F_UNPLUGGED_INACCESSIBLE,
3130 VIRTIO_MEM_F_PERSISTENT_SUSPEND,
3131 };
3132
3133 static const struct virtio_device_id virtio_mem_id_table[] = {
3134 { VIRTIO_ID_MEM, VIRTIO_DEV_ANY_ID },
3135 { 0 },
3136 };
3137
3138 static struct virtio_driver virtio_mem_driver = {
3139 .feature_table = virtio_mem_features,
3140 .feature_table_size = ARRAY_SIZE(virtio_mem_features),
3141 .driver.name = KBUILD_MODNAME,
3142 .id_table = virtio_mem_id_table,
3143 .probe = virtio_mem_probe,
3144 .remove = virtio_mem_remove,
3145 .config_changed = virtio_mem_config_changed,
3146 #ifdef CONFIG_PM_SLEEP
3147 .freeze = virtio_mem_freeze,
3148 .restore = virtio_mem_restore,
3149 #endif
3150 };
3151
3152 module_virtio_driver(virtio_mem_driver);
3153 MODULE_DEVICE_TABLE(virtio, virtio_mem_id_table);
3154 MODULE_AUTHOR("David Hildenbrand <[email protected]>");
3155 MODULE_DESCRIPTION("Virtio-mem driver");
3156 MODULE_LICENSE("GPL");
3157