* [PATCH 3/3] virtio_balloon: Introduce memory recover
@ 2022-05-20 7:06 ` zhenwei pi
0 siblings, 0 replies; 37+ messages in thread
From: zhenwei pi @ 2022-05-20 7:06 UTC (permalink / raw)
To: akpm, naoya.horiguchi, mst, david
Cc: qemu-devel, linux-kernel, virtualization, linux-mm, zhenwei pi, pbonzini
Introduce a new queue 'recover VQ', this allows guest to recover
hardware corrupted page:
Guest 5.MF -> 6.RVQ FE 10.Unpoison page
/ \ /
-------------------+-------------+----------+-----------
| | |
4.MCE 7.RVQ BE 9.RVQ Event
QEMU / \ /
3.SIGBUS 8.Remap
/
----------------+------------------------------------
|
+--2.MF
Host /
1.HW error
The workflow:
1, HardWare page error occurs randomly.
2, host side handles corrupted page by Memory Failure mechanism, sends
SIGBUS to the user process if early-kill is enabled.
3, QEMU handles SIGBUS, if the address belongs to guest RAM, then:
4, QEMU tries to inject MCE into guest.
5, guest handles memory failure again.
1-5 is already supported for a long time, the next steps are supported
in this patch(also related driver patch):
6, guest balloon driver gets noticed of the corrupted PFN, and sends
request to host side by Recover VQ FrontEnd.
7, QEMU handles request from Recover VQ BackEnd, then:
8, QEMU remaps the corrupted HVA fo fix the memory failure, then:
9, QEMU acks the guest side the result by Recover VQ.
10, guest unpoisons the page if the corrupted page gets recoverd
successfully.
Then the guest fixes the HW page error dynamiclly without rebooting.
Emulate MCE by QEMU, the guest works fine:
mce: [Hardware Error]: Machine check events logged
Memory failure: 0x61646: recovery action for dirty LRU page: Recovered
virtio_balloon virtio5: recovered pfn 0x61646
Unpoison: Unpoisoned page 0x61646 by virtio-balloon
MCE: Killing stress:24502 due to hardware memory corruption fault at 7f5be2e5a010
The 'HardwareCorrupted' in /proc/meminfo also shows 0 kB.
Signed-off-by: zhenwei pi <pizhenwei@bytedance.com>
---
drivers/virtio/virtio_balloon.c | 243 ++++++++++++++++++++++++++++
include/uapi/linux/virtio_balloon.h | 16 ++
2 files changed, 259 insertions(+)
diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index f4c34a2a6b8e..f9d95d1d8a4d 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -52,6 +52,7 @@ enum virtio_balloon_vq {
VIRTIO_BALLOON_VQ_STATS,
VIRTIO_BALLOON_VQ_FREE_PAGE,
VIRTIO_BALLOON_VQ_REPORTING,
+ VIRTIO_BALLOON_VQ_RECOVER,
VIRTIO_BALLOON_VQ_MAX
};
@@ -59,6 +60,12 @@ enum virtio_balloon_config_read {
VIRTIO_BALLOON_CONFIG_READ_CMD_ID = 0,
};
+/* the request body to commucate with host side */
+struct __virtio_balloon_recover {
+ struct virtio_balloon_recover vbr;
+ __virtio32 pfns[VIRTIO_BALLOON_PAGES_PER_PAGE];
+};
+
struct virtio_balloon {
struct virtio_device *vdev;
struct virtqueue *inflate_vq, *deflate_vq, *stats_vq, *free_page_vq;
@@ -126,6 +133,16 @@ struct virtio_balloon {
/* Free page reporting device */
struct virtqueue *reporting_vq;
struct page_reporting_dev_info pr_dev_info;
+
+ /* Memory recover VQ - VIRTIO_BALLOON_F_RECOVER */
+ struct virtqueue *recover_vq;
+ spinlock_t recover_vq_lock;
+ struct notifier_block memory_failure_nb;
+ struct list_head corrupted_page_list;
+ struct list_head recovered_page_list;
+ spinlock_t recover_page_list_lock;
+ struct __virtio_balloon_recover in_vbr;
+ struct work_struct unpoison_memory_work;
};
static const struct virtio_device_id id_table[] = {
@@ -494,6 +511,198 @@ static void update_balloon_size_func(struct work_struct *work)
queue_work(system_freezable_wq, work);
}
+/*
+ * virtballoon_memory_failure - notified by memory failure, try to fix the
+ * corrupted page.
+ * The memory failure notifier is designed to call back when the kernel handled
+ * successfully only, WARN_ON_ONCE on the unlikely condition to find out any
+ * error(memory error handling is a best effort, not 100% coverd).
+ */
+static int virtballoon_memory_failure(struct notifier_block *notifier,
+ unsigned long pfn, void *parm)
+{
+ struct virtio_balloon *vb = container_of(notifier, struct virtio_balloon,
+ memory_failure_nb);
+ struct page *page;
+ struct __virtio_balloon_recover *out_vbr;
+ struct scatterlist sg;
+ unsigned long flags;
+ int err;
+
+ page = pfn_to_online_page(pfn);
+ if (WARN_ON_ONCE(!page))
+ return NOTIFY_DONE;
+
+ if (PageHuge(page))
+ return NOTIFY_DONE;
+
+ if (WARN_ON_ONCE(!PageHWPoison(page)))
+ return NOTIFY_DONE;
+
+ if (WARN_ON_ONCE(page_count(page) != 1))
+ return NOTIFY_DONE;
+
+ get_page(page); /* balloon reference */
+
+ out_vbr = kzalloc(sizeof(*out_vbr), GFP_KERNEL);
+ if (WARN_ON_ONCE(!out_vbr))
+ return NOTIFY_BAD;
+
+ spin_lock(&vb->recover_page_list_lock);
+ balloon_page_push(&vb->corrupted_page_list, page);
+ spin_unlock(&vb->recover_page_list_lock);
+
+ out_vbr->vbr.cmd = VIRTIO_BALLOON_R_CMD_RECOVER;
+ set_page_pfns(vb, out_vbr->pfns, page);
+ sg_init_one(&sg, out_vbr, sizeof(*out_vbr));
+
+ spin_lock_irqsave(&vb->recover_vq_lock, flags);
+ err = virtqueue_add_outbuf(vb->recover_vq, &sg, 1, out_vbr, GFP_KERNEL);
+ if (unlikely(err)) {
+ spin_unlock_irqrestore(&vb->recover_vq_lock, flags);
+ return NOTIFY_DONE;
+ }
+ virtqueue_kick(vb->recover_vq);
+ spin_unlock_irqrestore(&vb->recover_vq_lock, flags);
+
+ return NOTIFY_OK;
+}
+
+static int recover_vq_get_response(struct virtio_balloon *vb)
+{
+ struct __virtio_balloon_recover *in_vbr;
+ struct scatterlist sg;
+ unsigned long flags;
+ int err;
+
+ spin_lock_irqsave(&vb->recover_vq_lock, flags);
+ in_vbr = &vb->in_vbr;
+ memset(in_vbr, 0x00, sizeof(*in_vbr));
+ sg_init_one(&sg, in_vbr, sizeof(*in_vbr));
+ err = virtqueue_add_inbuf(vb->recover_vq, &sg, 1, in_vbr, GFP_KERNEL);
+ if (unlikely(err)) {
+ spin_unlock_irqrestore(&vb->recover_vq_lock, flags);
+ return err;
+ }
+
+ virtqueue_kick(vb->recover_vq);
+ spin_unlock_irqrestore(&vb->recover_vq_lock, flags);
+
+ return 0;
+}
+
+static void recover_vq_handle_response(struct virtio_balloon *vb, unsigned int len)
+{
+ struct __virtio_balloon_recover *in_vbr;
+ struct virtio_balloon_recover *vbr;
+ struct page *page;
+ unsigned int pfns;
+ u32 pfn0, pfn1;
+ __u8 status;
+
+ /* the response is not expected */
+ if (unlikely(len != sizeof(struct __virtio_balloon_recover)))
+ return;
+
+ in_vbr = &vb->in_vbr;
+ vbr = &in_vbr->vbr;
+ if (unlikely(vbr->cmd != VIRTIO_BALLOON_R_CMD_RESPONSE))
+ return;
+
+ /* to make sure the contiguous balloon PFNs */
+ for (pfns = 1; pfns < VIRTIO_BALLOON_PAGES_PER_PAGE; pfns++) {
+ pfn0 = virtio32_to_cpu(vb->vdev, in_vbr->pfns[pfns - 1]);
+ pfn1 = virtio32_to_cpu(vb->vdev, in_vbr->pfns[pfns]);
+ if (pfn1 - pfn0 != 1)
+ return;
+ }
+
+ pfn0 = virtio32_to_cpu(vb->vdev, in_vbr->pfns[0]);
+ if (!pfn_valid(pfn0))
+ return;
+
+ pfn1 = -1;
+ spin_lock(&vb->recover_page_list_lock);
+ list_for_each_entry(page, &vb->corrupted_page_list, lru) {
+ pfn1 = page_to_pfn(page);
+ if (pfn1 == pfn0)
+ break;
+ }
+ spin_unlock(&vb->recover_page_list_lock);
+
+ status = vbr->status;
+ switch (status) {
+ case VIRTIO_BALLOON_R_STATUS_RECOVERED:
+ if (pfn1 == pfn0) {
+ spin_lock(&vb->recover_page_list_lock);
+ list_del(&page->lru);
+ balloon_page_push(&vb->recovered_page_list, page);
+ spin_unlock(&vb->recover_page_list_lock);
+ queue_work(system_freezable_wq, &vb->unpoison_memory_work);
+ dev_info_ratelimited(&vb->vdev->dev, "recovered pfn 0x%x", pfn0);
+ }
+ break;
+ case VIRTIO_BALLOON_R_STATUS_FAILED:
+ /* the hypervisor can't fix this corrupted page, balloon puts page */
+ if (pfn1 == pfn0) {
+ spin_lock(&vb->recover_page_list_lock);
+ list_del(&page->lru);
+ spin_unlock(&vb->recover_page_list_lock);
+ put_page(page);
+ dev_info_ratelimited(&vb->vdev->dev, "failed to recover pfn 0x%x", pfn0);
+ }
+ default:
+ break;
+ };
+
+ /* continue to get response from host side if the response gets handled successfully */
+ recover_vq_get_response(vb);
+}
+
+static void unpoison_memory_func(struct work_struct *work)
+{
+ struct virtio_balloon *vb;
+ struct page *page;
+
+ vb = container_of(work, struct virtio_balloon, unpoison_memory_work);
+
+ do {
+ spin_lock(&vb->recover_page_list_lock);
+ page = list_first_entry_or_null(&vb->recovered_page_list,
+ struct page, lru);
+ if (page)
+ list_del(&page->lru);
+ spin_unlock(&vb->recover_page_list_lock);
+
+ if (page) {
+ put_page(page);
+ unpoison_memory(page_to_pfn(page), true, "virtio-balloon");
+ }
+ } while (page);
+}
+
+static void recover_vq_cb(struct virtqueue *vq)
+{
+ struct virtio_balloon *vb = vq->vdev->priv;
+ struct __virtio_balloon_recover *vbr;
+ unsigned long flags;
+ unsigned int len;
+
+ spin_lock_irqsave(&vb->recover_vq_lock, flags);
+ do {
+ virtqueue_disable_cb(vq);
+ while ((vbr = virtqueue_get_buf(vq, &len)) != NULL) {
+ spin_unlock_irqrestore(&vb->recover_vq_lock, flags);
+ if (vbr == &vb->in_vbr)
+ recover_vq_handle_response(vb, len);
+ else
+ kfree(vbr); /* just free the memory for out vbr request */
+ spin_lock_irqsave(&vb->recover_vq_lock, flags);
+ }
+ } while (!virtqueue_enable_cb(vq));
+ spin_unlock_irqrestore(&vb->recover_vq_lock, flags);
+}
+
static int init_vqs(struct virtio_balloon *vb)
{
struct virtqueue *vqs[VIRTIO_BALLOON_VQ_MAX];
@@ -515,6 +724,7 @@ static int init_vqs(struct virtio_balloon *vb)
callbacks[VIRTIO_BALLOON_VQ_FREE_PAGE] = NULL;
names[VIRTIO_BALLOON_VQ_FREE_PAGE] = NULL;
names[VIRTIO_BALLOON_VQ_REPORTING] = NULL;
+ names[VIRTIO_BALLOON_VQ_RECOVER] = NULL;
if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ)) {
names[VIRTIO_BALLOON_VQ_STATS] = "stats";
@@ -531,6 +741,11 @@ static int init_vqs(struct virtio_balloon *vb)
callbacks[VIRTIO_BALLOON_VQ_REPORTING] = balloon_ack;
}
+ if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_RECOVER)) {
+ names[VIRTIO_BALLOON_VQ_RECOVER] = "recover_vq";
+ callbacks[VIRTIO_BALLOON_VQ_RECOVER] = recover_vq_cb;
+ }
+
err = virtio_find_vqs(vb->vdev, VIRTIO_BALLOON_VQ_MAX, vqs,
callbacks, names, NULL);
if (err)
@@ -566,6 +781,9 @@ static int init_vqs(struct virtio_balloon *vb)
if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_REPORTING))
vb->reporting_vq = vqs[VIRTIO_BALLOON_VQ_REPORTING];
+ if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_RECOVER))
+ vb->recover_vq = vqs[VIRTIO_BALLOON_VQ_RECOVER];
+
return 0;
}
@@ -1015,12 +1233,31 @@ static int virtballoon_probe(struct virtio_device *vdev)
goto out_unregister_oom;
}
+ if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_RECOVER)) {
+ err = recover_vq_get_response(vb);
+ if (err)
+ goto out_unregister_reporting;
+
+ vb->memory_failure_nb.notifier_call = virtballoon_memory_failure;
+ spin_lock_init(&vb->recover_page_list_lock);
+ spin_lock_init(&vb->recover_vq_lock);
+ INIT_LIST_HEAD(&vb->corrupted_page_list);
+ INIT_LIST_HEAD(&vb->recovered_page_list);
+ INIT_WORK(&vb->unpoison_memory_work, unpoison_memory_func);
+ err = register_memory_failure_notifier(&vb->memory_failure_nb);
+ if (err)
+ goto out_unregister_reporting;
+ }
+
virtio_device_ready(vdev);
if (towards_target(vb))
virtballoon_changed(vdev);
return 0;
+out_unregister_reporting:
+ if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_REPORTING))
+ page_reporting_unregister(&vb->pr_dev_info);
out_unregister_oom:
if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_DEFLATE_ON_OOM))
unregister_oom_notifier(&vb->oom_nb);
@@ -1082,6 +1319,11 @@ static void virtballoon_remove(struct virtio_device *vdev)
destroy_workqueue(vb->balloon_wq);
}
+ if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_RECOVER)) {
+ unregister_memory_failure_notifier(&vb->memory_failure_nb);
+ cancel_work_sync(&vb->unpoison_memory_work);
+ }
+
remove_common(vb);
#ifdef CONFIG_BALLOON_COMPACTION
if (vb->vb_dev_info.inode)
@@ -1147,6 +1389,7 @@ static unsigned int features[] = {
VIRTIO_BALLOON_F_FREE_PAGE_HINT,
VIRTIO_BALLOON_F_PAGE_POISON,
VIRTIO_BALLOON_F_REPORTING,
+ VIRTIO_BALLOON_F_RECOVER,
};
static struct virtio_driver virtio_balloon_driver = {
diff --git a/include/uapi/linux/virtio_balloon.h b/include/uapi/linux/virtio_balloon.h
index ddaa45e723c4..41d0ffa2fb54 100644
--- a/include/uapi/linux/virtio_balloon.h
+++ b/include/uapi/linux/virtio_balloon.h
@@ -37,6 +37,7 @@
#define VIRTIO_BALLOON_F_FREE_PAGE_HINT 3 /* VQ to report free pages */
#define VIRTIO_BALLOON_F_PAGE_POISON 4 /* Guest is using page poisoning */
#define VIRTIO_BALLOON_F_REPORTING 5 /* Page reporting virtqueue */
+#define VIRTIO_BALLOON_F_RECOVER 6 /* Memory recover virtqueue */
/* Size of a PFN in the balloon interface. */
#define VIRTIO_BALLOON_PFN_SHIFT 12
@@ -59,6 +60,8 @@ struct virtio_balloon_config {
};
/* Stores PAGE_POISON if page poisoning is in use */
__le32 poison_val;
+ /* Number of hardware corrupted pages, guest read only */
+ __le32 corrupted_pages;
};
#define VIRTIO_BALLOON_S_SWAP_IN 0 /* Amount of memory swapped in */
@@ -116,4 +119,17 @@ struct virtio_balloon_stat {
__virtio64 val;
} __attribute__((packed));
+#define VIRTIO_BALLOON_R_CMD_RECOVER 0
+#define VIRTIO_BALLOON_R_CMD_RESPONSE 0x80
+
+#define VIRTIO_BALLOON_R_STATUS_CORRUPTED 0
+#define VIRTIO_BALLOON_R_STATUS_RECOVERED 1
+#define VIRTIO_BALLOON_R_STATUS_FAILED 2
+
+struct virtio_balloon_recover {
+ __u8 cmd;
+ __u8 status;
+ __u8 padding[6];
+};
+
#endif /* _LINUX_VIRTIO_BALLOON_H */
--
2.20.1
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization
^ permalink raw reply related [flat|nested] 37+ messages in thread
* Re: [PATCH 3/3] virtio_balloon: Introduce memory recover
2022-05-20 7:06 ` zhenwei pi
@ 2022-05-20 12:48 ` kernel test robot
-1 siblings, 0 replies; 37+ messages in thread
From: kernel test robot @ 2022-05-20 12:48 UTC (permalink / raw)
To: zhenwei pi, akpm, naoya.horiguchi, mst, david
Cc: llvm, kbuild-all, linux-mm, linux-kernel, jasowang,
virtualization, pbonzini, peterx, qemu-devel, zhenwei pi
Hi zhenwei,
Thank you for the patch! Perhaps something to improve:
[auto build test WARNING on akpm-mm/mm-everything]
[also build test WARNING on next-20220519]
[cannot apply to linux/master linus/master v5.18-rc7]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]
url: https://github.com/intel-lab-lkp/linux/commits/zhenwei-pi/recover-hardware-corrupted-page-by-virtio-balloon/20220520-151328
base: https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-everything
config: hexagon-randconfig-r041-20220519 (https://download.01.org/0day-ci/archive/20220520/202205202014.mgqgBrKd-lkp@intel.com/config)
compiler: clang version 15.0.0 (https://github.com/llvm/llvm-project e00cbbec06c08dc616a0d52a20f678b8fbd4e304)
reproduce (this is a W=1 build):
wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# https://github.com/intel-lab-lkp/linux/commit/a42127073dd4adb6354649c8235c5cde033d01f2
git remote add linux-review https://github.com/intel-lab-lkp/linux
git fetch --no-tags linux-review zhenwei-pi/recover-hardware-corrupted-page-by-virtio-balloon/20220520-151328
git checkout a42127073dd4adb6354649c8235c5cde033d01f2
# save the config file
mkdir build_dir && cp config build_dir/.config
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=hexagon SHELL=/bin/bash drivers/virtio/
If you fix the issue, kindly add following tag where applicable
Reported-by: kernel test robot <lkp@intel.com>
All warnings (new ones prefixed by >>):
>> drivers/virtio/virtio_balloon.c:654:2: warning: unannotated fall-through between switch labels [-Wimplicit-fallthrough]
default:
^
drivers/virtio/virtio_balloon.c:654:2: note: insert 'break;' to avoid fall-through
default:
^
break;
1 warning generated.
vim +654 drivers/virtio/virtio_balloon.c
593
594 static void recover_vq_handle_response(struct virtio_balloon *vb, unsigned int len)
595 {
596 struct __virtio_balloon_recover *in_vbr;
597 struct virtio_balloon_recover *vbr;
598 struct page *page;
599 unsigned int pfns;
600 u32 pfn0, pfn1;
601 __u8 status;
602
603 /* the response is not expected */
604 if (unlikely(len != sizeof(struct __virtio_balloon_recover)))
605 return;
606
607 in_vbr = &vb->in_vbr;
608 vbr = &in_vbr->vbr;
609 if (unlikely(vbr->cmd != VIRTIO_BALLOON_R_CMD_RESPONSE))
610 return;
611
612 /* to make sure the contiguous balloon PFNs */
613 for (pfns = 1; pfns < VIRTIO_BALLOON_PAGES_PER_PAGE; pfns++) {
614 pfn0 = virtio32_to_cpu(vb->vdev, in_vbr->pfns[pfns - 1]);
615 pfn1 = virtio32_to_cpu(vb->vdev, in_vbr->pfns[pfns]);
616 if (pfn1 - pfn0 != 1)
617 return;
618 }
619
620 pfn0 = virtio32_to_cpu(vb->vdev, in_vbr->pfns[0]);
621 if (!pfn_valid(pfn0))
622 return;
623
624 pfn1 = -1;
625 spin_lock(&vb->recover_page_list_lock);
626 list_for_each_entry(page, &vb->corrupted_page_list, lru) {
627 pfn1 = page_to_pfn(page);
628 if (pfn1 == pfn0)
629 break;
630 }
631 spin_unlock(&vb->recover_page_list_lock);
632
633 status = vbr->status;
634 switch (status) {
635 case VIRTIO_BALLOON_R_STATUS_RECOVERED:
636 if (pfn1 == pfn0) {
637 spin_lock(&vb->recover_page_list_lock);
638 list_del(&page->lru);
639 balloon_page_push(&vb->recovered_page_list, page);
640 spin_unlock(&vb->recover_page_list_lock);
641 queue_work(system_freezable_wq, &vb->unpoison_memory_work);
642 dev_info_ratelimited(&vb->vdev->dev, "recovered pfn 0x%x", pfn0);
643 }
644 break;
645 case VIRTIO_BALLOON_R_STATUS_FAILED:
646 /* the hypervisor can't fix this corrupted page, balloon puts page */
647 if (pfn1 == pfn0) {
648 spin_lock(&vb->recover_page_list_lock);
649 list_del(&page->lru);
650 spin_unlock(&vb->recover_page_list_lock);
651 put_page(page);
652 dev_info_ratelimited(&vb->vdev->dev, "failed to recover pfn 0x%x", pfn0);
653 }
> 654 default:
655 break;
656 };
657
658 /* continue to get response from host side if the response gets handled successfully */
659 recover_vq_get_response(vb);
660 }
661
--
0-DAY CI Kernel Test Service
https://01.org/lkp
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [PATCH 3/3] virtio_balloon: Introduce memory recover
@ 2022-05-20 12:48 ` kernel test robot
0 siblings, 0 replies; 37+ messages in thread
From: kernel test robot @ 2022-05-20 12:48 UTC (permalink / raw)
To: zhenwei pi, akpm, naoya.horiguchi, mst, david
Cc: kbuild-all, qemu-devel, llvm, linux-kernel, virtualization,
linux-mm, zhenwei pi, pbonzini
Hi zhenwei,
Thank you for the patch! Perhaps something to improve:
[auto build test WARNING on akpm-mm/mm-everything]
[also build test WARNING on next-20220519]
[cannot apply to linux/master linus/master v5.18-rc7]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]
url: https://github.com/intel-lab-lkp/linux/commits/zhenwei-pi/recover-hardware-corrupted-page-by-virtio-balloon/20220520-151328
base: https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-everything
config: hexagon-randconfig-r041-20220519 (https://download.01.org/0day-ci/archive/20220520/202205202014.mgqgBrKd-lkp@intel.com/config)
compiler: clang version 15.0.0 (https://github.com/llvm/llvm-project e00cbbec06c08dc616a0d52a20f678b8fbd4e304)
reproduce (this is a W=1 build):
wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# https://github.com/intel-lab-lkp/linux/commit/a42127073dd4adb6354649c8235c5cde033d01f2
git remote add linux-review https://github.com/intel-lab-lkp/linux
git fetch --no-tags linux-review zhenwei-pi/recover-hardware-corrupted-page-by-virtio-balloon/20220520-151328
git checkout a42127073dd4adb6354649c8235c5cde033d01f2
# save the config file
mkdir build_dir && cp config build_dir/.config
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=hexagon SHELL=/bin/bash drivers/virtio/
If you fix the issue, kindly add following tag where applicable
Reported-by: kernel test robot <lkp@intel.com>
All warnings (new ones prefixed by >>):
>> drivers/virtio/virtio_balloon.c:654:2: warning: unannotated fall-through between switch labels [-Wimplicit-fallthrough]
default:
^
drivers/virtio/virtio_balloon.c:654:2: note: insert 'break;' to avoid fall-through
default:
^
break;
1 warning generated.
vim +654 drivers/virtio/virtio_balloon.c
593
594 static void recover_vq_handle_response(struct virtio_balloon *vb, unsigned int len)
595 {
596 struct __virtio_balloon_recover *in_vbr;
597 struct virtio_balloon_recover *vbr;
598 struct page *page;
599 unsigned int pfns;
600 u32 pfn0, pfn1;
601 __u8 status;
602
603 /* the response is not expected */
604 if (unlikely(len != sizeof(struct __virtio_balloon_recover)))
605 return;
606
607 in_vbr = &vb->in_vbr;
608 vbr = &in_vbr->vbr;
609 if (unlikely(vbr->cmd != VIRTIO_BALLOON_R_CMD_RESPONSE))
610 return;
611
612 /* to make sure the contiguous balloon PFNs */
613 for (pfns = 1; pfns < VIRTIO_BALLOON_PAGES_PER_PAGE; pfns++) {
614 pfn0 = virtio32_to_cpu(vb->vdev, in_vbr->pfns[pfns - 1]);
615 pfn1 = virtio32_to_cpu(vb->vdev, in_vbr->pfns[pfns]);
616 if (pfn1 - pfn0 != 1)
617 return;
618 }
619
620 pfn0 = virtio32_to_cpu(vb->vdev, in_vbr->pfns[0]);
621 if (!pfn_valid(pfn0))
622 return;
623
624 pfn1 = -1;
625 spin_lock(&vb->recover_page_list_lock);
626 list_for_each_entry(page, &vb->corrupted_page_list, lru) {
627 pfn1 = page_to_pfn(page);
628 if (pfn1 == pfn0)
629 break;
630 }
631 spin_unlock(&vb->recover_page_list_lock);
632
633 status = vbr->status;
634 switch (status) {
635 case VIRTIO_BALLOON_R_STATUS_RECOVERED:
636 if (pfn1 == pfn0) {
637 spin_lock(&vb->recover_page_list_lock);
638 list_del(&page->lru);
639 balloon_page_push(&vb->recovered_page_list, page);
640 spin_unlock(&vb->recover_page_list_lock);
641 queue_work(system_freezable_wq, &vb->unpoison_memory_work);
642 dev_info_ratelimited(&vb->vdev->dev, "recovered pfn 0x%x", pfn0);
643 }
644 break;
645 case VIRTIO_BALLOON_R_STATUS_FAILED:
646 /* the hypervisor can't fix this corrupted page, balloon puts page */
647 if (pfn1 == pfn0) {
648 spin_lock(&vb->recover_page_list_lock);
649 list_del(&page->lru);
650 spin_unlock(&vb->recover_page_list_lock);
651 put_page(page);
652 dev_info_ratelimited(&vb->vdev->dev, "failed to recover pfn 0x%x", pfn0);
653 }
> 654 default:
655 break;
656 };
657
658 /* continue to get response from host side if the response gets handled successfully */
659 recover_vq_get_response(vb);
660 }
661
--
0-DAY CI Kernel Test Service
https://01.org/lkp
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [PATCH 3/3] virtio_balloon: Introduce memory recover
2022-05-20 7:06 ` zhenwei pi
@ 2022-05-20 13:39 ` kernel test robot
-1 siblings, 0 replies; 37+ messages in thread
From: kernel test robot @ 2022-05-20 13:39 UTC (permalink / raw)
To: zhenwei pi, akpm, naoya.horiguchi, mst, david
Cc: kbuild-all, linux-mm, linux-kernel, jasowang, virtualization,
pbonzini, peterx, qemu-devel, zhenwei pi
Hi zhenwei,
Thank you for the patch! Yet something to improve:
[auto build test ERROR on akpm-mm/mm-everything]
[also build test ERROR on next-20220519]
[cannot apply to linux/master linus/master v5.18-rc7]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]
url: https://github.com/intel-lab-lkp/linux/commits/zhenwei-pi/recover-hardware-corrupted-page-by-virtio-balloon/20220520-151328
base: https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-everything
config: nios2-randconfig-r002-20220519 (https://download.01.org/0day-ci/archive/20220520/202205202151.7T3K7Szf-lkp@intel.com/config)
compiler: nios2-linux-gcc (GCC) 11.3.0
reproduce (this is a W=1 build):
wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# https://github.com/intel-lab-lkp/linux/commit/a42127073dd4adb6354649c8235c5cde033d01f2
git remote add linux-review https://github.com/intel-lab-lkp/linux
git fetch --no-tags linux-review zhenwei-pi/recover-hardware-corrupted-page-by-virtio-balloon/20220520-151328
git checkout a42127073dd4adb6354649c8235c5cde033d01f2
# save the config file
mkdir build_dir && cp config build_dir/.config
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-11.3.0 make.cross W=1 O=build_dir ARCH=nios2 SHELL=/bin/bash
If you fix the issue, kindly add following tag where applicable
Reported-by: kernel test robot <lkp@intel.com>
All errors (new ones prefixed by >>):
nios2-linux-ld: drivers/virtio/virtio_balloon.o: in function `unpoison_memory_func':
>> drivers/virtio/virtio_balloon.c:679: undefined reference to `unpoison_memory'
drivers/virtio/virtio_balloon.c:679:(.text+0xc00): relocation truncated to fit: R_NIOS2_CALL26 against `unpoison_memory'
nios2-linux-ld: drivers/virtio/virtio_balloon.o: in function `virtballoon_probe':
>> drivers/virtio/virtio_balloon.c:1247: undefined reference to `register_memory_failure_notifier'
drivers/virtio/virtio_balloon.c:1247:(.text+0x1940): relocation truncated to fit: R_NIOS2_CALL26 against `register_memory_failure_notifier'
nios2-linux-ld: drivers/virtio/virtio_balloon.o: in function `virtballoon_remove':
>> drivers/virtio/virtio_balloon.c:1323: undefined reference to `unregister_memory_failure_notifier'
drivers/virtio/virtio_balloon.c:1323:(.text+0x1bcc): relocation truncated to fit: R_NIOS2_CALL26 against `unregister_memory_failure_notifier'
vim +679 drivers/virtio/virtio_balloon.c
661
662 static void unpoison_memory_func(struct work_struct *work)
663 {
664 struct virtio_balloon *vb;
665 struct page *page;
666
667 vb = container_of(work, struct virtio_balloon, unpoison_memory_work);
668
669 do {
670 spin_lock(&vb->recover_page_list_lock);
671 page = list_first_entry_or_null(&vb->recovered_page_list,
672 struct page, lru);
673 if (page)
674 list_del(&page->lru);
675 spin_unlock(&vb->recover_page_list_lock);
676
677 if (page) {
678 put_page(page);
> 679 unpoison_memory(page_to_pfn(page), true, "virtio-balloon");
680 }
681 } while (page);
682 }
683
--
0-DAY CI Kernel Test Service
https://01.org/lkp
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [PATCH 3/3] virtio_balloon: Introduce memory recover
@ 2022-05-20 13:39 ` kernel test robot
0 siblings, 0 replies; 37+ messages in thread
From: kernel test robot @ 2022-05-20 13:39 UTC (permalink / raw)
To: zhenwei pi, akpm, naoya.horiguchi, mst, david
Cc: kbuild-all, qemu-devel, linux-kernel, virtualization, linux-mm,
zhenwei pi, pbonzini
Hi zhenwei,
Thank you for the patch! Yet something to improve:
[auto build test ERROR on akpm-mm/mm-everything]
[also build test ERROR on next-20220519]
[cannot apply to linux/master linus/master v5.18-rc7]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]
url: https://github.com/intel-lab-lkp/linux/commits/zhenwei-pi/recover-hardware-corrupted-page-by-virtio-balloon/20220520-151328
base: https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-everything
config: nios2-randconfig-r002-20220519 (https://download.01.org/0day-ci/archive/20220520/202205202151.7T3K7Szf-lkp@intel.com/config)
compiler: nios2-linux-gcc (GCC) 11.3.0
reproduce (this is a W=1 build):
wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# https://github.com/intel-lab-lkp/linux/commit/a42127073dd4adb6354649c8235c5cde033d01f2
git remote add linux-review https://github.com/intel-lab-lkp/linux
git fetch --no-tags linux-review zhenwei-pi/recover-hardware-corrupted-page-by-virtio-balloon/20220520-151328
git checkout a42127073dd4adb6354649c8235c5cde033d01f2
# save the config file
mkdir build_dir && cp config build_dir/.config
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-11.3.0 make.cross W=1 O=build_dir ARCH=nios2 SHELL=/bin/bash
If you fix the issue, kindly add following tag where applicable
Reported-by: kernel test robot <lkp@intel.com>
All errors (new ones prefixed by >>):
nios2-linux-ld: drivers/virtio/virtio_balloon.o: in function `unpoison_memory_func':
>> drivers/virtio/virtio_balloon.c:679: undefined reference to `unpoison_memory'
drivers/virtio/virtio_balloon.c:679:(.text+0xc00): relocation truncated to fit: R_NIOS2_CALL26 against `unpoison_memory'
nios2-linux-ld: drivers/virtio/virtio_balloon.o: in function `virtballoon_probe':
>> drivers/virtio/virtio_balloon.c:1247: undefined reference to `register_memory_failure_notifier'
drivers/virtio/virtio_balloon.c:1247:(.text+0x1940): relocation truncated to fit: R_NIOS2_CALL26 against `register_memory_failure_notifier'
nios2-linux-ld: drivers/virtio/virtio_balloon.o: in function `virtballoon_remove':
>> drivers/virtio/virtio_balloon.c:1323: undefined reference to `unregister_memory_failure_notifier'
drivers/virtio/virtio_balloon.c:1323:(.text+0x1bcc): relocation truncated to fit: R_NIOS2_CALL26 against `unregister_memory_failure_notifier'
vim +679 drivers/virtio/virtio_balloon.c
661
662 static void unpoison_memory_func(struct work_struct *work)
663 {
664 struct virtio_balloon *vb;
665 struct page *page;
666
667 vb = container_of(work, struct virtio_balloon, unpoison_memory_work);
668
669 do {
670 spin_lock(&vb->recover_page_list_lock);
671 page = list_first_entry_or_null(&vb->recovered_page_list,
672 struct page, lru);
673 if (page)
674 list_del(&page->lru);
675 spin_unlock(&vb->recover_page_list_lock);
676
677 if (page) {
678 put_page(page);
> 679 unpoison_memory(page_to_pfn(page), true, "virtio-balloon");
680 }
681 } while (page);
682 }
683
--
0-DAY CI Kernel Test Service
https://01.org/lkp
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [PATCH 3/3] virtio_balloon: Introduce memory recover
2022-05-20 7:06 ` zhenwei pi
@ 2022-05-20 15:28 ` kernel test robot
-1 siblings, 0 replies; 37+ messages in thread
From: kernel test robot @ 2022-05-20 15:28 UTC (permalink / raw)
To: zhenwei pi, akpm, naoya.horiguchi, mst, david
Cc: kbuild-all, linux-mm, linux-kernel, jasowang, virtualization,
pbonzini, peterx, qemu-devel, zhenwei pi
Hi zhenwei,
Thank you for the patch! Yet something to improve:
[auto build test ERROR on akpm-mm/mm-everything]
[also build test ERROR on next-20220519]
[cannot apply to linux/master linus/master v5.18-rc7]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]
url: https://github.com/intel-lab-lkp/linux/commits/zhenwei-pi/recover-hardware-corrupted-page-by-virtio-balloon/20220520-151328
base: https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-everything
config: arm-randconfig-r026-20220519 (https://download.01.org/0day-ci/archive/20220520/202205202330.u0vQWiWG-lkp@intel.com/config)
compiler: arm-linux-gnueabi-gcc (GCC) 11.3.0
reproduce (this is a W=1 build):
wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# https://github.com/intel-lab-lkp/linux/commit/a42127073dd4adb6354649c8235c5cde033d01f2
git remote add linux-review https://github.com/intel-lab-lkp/linux
git fetch --no-tags linux-review zhenwei-pi/recover-hardware-corrupted-page-by-virtio-balloon/20220520-151328
git checkout a42127073dd4adb6354649c8235c5cde033d01f2
# save the config file
mkdir build_dir && cp config build_dir/.config
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-11.3.0 make.cross W=1 O=build_dir ARCH=arm SHELL=/bin/bash
If you fix the issue, kindly add following tag where applicable
Reported-by: kernel test robot <lkp@intel.com>
All errors (new ones prefixed by >>):
arm-linux-gnueabi-ld: drivers/virtio/virtio_balloon.o: in function `unpoison_memory_func':
>> virtio_balloon.c:(.text+0x89a): undefined reference to `unpoison_memory'
arm-linux-gnueabi-ld: drivers/virtio/virtio_balloon.o: in function `virtballoon_probe':
>> virtio_balloon.c:(.text+0x111a): undefined reference to `register_memory_failure_notifier'
arm-linux-gnueabi-ld: drivers/virtio/virtio_balloon.o: in function `virtballoon_remove':
>> virtio_balloon.c:(.text+0x12a2): undefined reference to `unregister_memory_failure_notifier'
--
0-DAY CI Kernel Test Service
https://01.org/lkp
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [PATCH 3/3] virtio_balloon: Introduce memory recover
@ 2022-05-20 15:28 ` kernel test robot
0 siblings, 0 replies; 37+ messages in thread
From: kernel test robot @ 2022-05-20 15:28 UTC (permalink / raw)
To: zhenwei pi, akpm, naoya.horiguchi, mst, david
Cc: kbuild-all, qemu-devel, linux-kernel, virtualization, linux-mm,
zhenwei pi, pbonzini
Hi zhenwei,
Thank you for the patch! Yet something to improve:
[auto build test ERROR on akpm-mm/mm-everything]
[also build test ERROR on next-20220519]
[cannot apply to linux/master linus/master v5.18-rc7]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]
url: https://github.com/intel-lab-lkp/linux/commits/zhenwei-pi/recover-hardware-corrupted-page-by-virtio-balloon/20220520-151328
base: https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-everything
config: arm-randconfig-r026-20220519 (https://download.01.org/0day-ci/archive/20220520/202205202330.u0vQWiWG-lkp@intel.com/config)
compiler: arm-linux-gnueabi-gcc (GCC) 11.3.0
reproduce (this is a W=1 build):
wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# https://github.com/intel-lab-lkp/linux/commit/a42127073dd4adb6354649c8235c5cde033d01f2
git remote add linux-review https://github.com/intel-lab-lkp/linux
git fetch --no-tags linux-review zhenwei-pi/recover-hardware-corrupted-page-by-virtio-balloon/20220520-151328
git checkout a42127073dd4adb6354649c8235c5cde033d01f2
# save the config file
mkdir build_dir && cp config build_dir/.config
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-11.3.0 make.cross W=1 O=build_dir ARCH=arm SHELL=/bin/bash
If you fix the issue, kindly add following tag where applicable
Reported-by: kernel test robot <lkp@intel.com>
All errors (new ones prefixed by >>):
arm-linux-gnueabi-ld: drivers/virtio/virtio_balloon.o: in function `unpoison_memory_func':
>> virtio_balloon.c:(.text+0x89a): undefined reference to `unpoison_memory'
arm-linux-gnueabi-ld: drivers/virtio/virtio_balloon.o: in function `virtballoon_probe':
>> virtio_balloon.c:(.text+0x111a): undefined reference to `register_memory_failure_notifier'
arm-linux-gnueabi-ld: drivers/virtio/virtio_balloon.o: in function `virtballoon_remove':
>> virtio_balloon.c:(.text+0x12a2): undefined reference to `unregister_memory_failure_notifier'
--
0-DAY CI Kernel Test Service
https://01.org/lkp
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [PATCH 3/3] virtio_balloon: Introduce memory recover
2022-05-20 7:06 ` zhenwei pi
` (3 preceding siblings ...)
(?)
@ 2022-05-24 19:35 ` Sean Christopherson
2022-05-24 23:32 ` zhenwei pi
-1 siblings, 1 reply; 37+ messages in thread
From: Sean Christopherson @ 2022-05-24 19:35 UTC (permalink / raw)
To: zhenwei pi
Cc: akpm, naoya.horiguchi, mst, david, linux-mm, linux-kernel,
jasowang, virtualization, pbonzini, peterx, qemu-devel
On Fri, May 20, 2022, zhenwei pi wrote:
> @@ -59,6 +60,12 @@ enum virtio_balloon_config_read {
> VIRTIO_BALLOON_CONFIG_READ_CMD_ID = 0,
> };
>
> +/* the request body to commucate with host side */
> +struct __virtio_balloon_recover {
> + struct virtio_balloon_recover vbr;
> + __virtio32 pfns[VIRTIO_BALLOON_PAGES_PER_PAGE];
I assume this is copied from virtio_balloon.pfns, which also uses __virtio32, but
isn't that horribly broken? PFNs are 'unsigned long', i.e. 64 bits on 64-bit kernels.
x86-64 at least most definitely generates 64-bit PFNs. Unless there's magic I'm
missing, page_to_balloon_pfn() will truncate PFNs and feed the host bad info.
> @@ -494,6 +511,198 @@ static void update_balloon_size_func(struct work_struct *work)
> queue_work(system_freezable_wq, work);
> }
>
> +/*
> + * virtballoon_memory_failure - notified by memory failure, try to fix the
> + * corrupted page.
> + * The memory failure notifier is designed to call back when the kernel handled
> + * successfully only, WARN_ON_ONCE on the unlikely condition to find out any
> + * error(memory error handling is a best effort, not 100% coverd).
> + */
> +static int virtballoon_memory_failure(struct notifier_block *notifier,
> + unsigned long pfn, void *parm)
> +{
> + struct virtio_balloon *vb = container_of(notifier, struct virtio_balloon,
> + memory_failure_nb);
> + struct page *page;
> + struct __virtio_balloon_recover *out_vbr;
> + struct scatterlist sg;
> + unsigned long flags;
> + int err;
> +
> + page = pfn_to_online_page(pfn);
> + if (WARN_ON_ONCE(!page))
> + return NOTIFY_DONE;
> +
> + if (PageHuge(page))
> + return NOTIFY_DONE;
> +
> + if (WARN_ON_ONCE(!PageHWPoison(page)))
> + return NOTIFY_DONE;
> +
> + if (WARN_ON_ONCE(page_count(page) != 1))
> + return NOTIFY_DONE;
> +
> + get_page(page); /* balloon reference */
> +
> + out_vbr = kzalloc(sizeof(*out_vbr), GFP_KERNEL);
> + if (WARN_ON_ONCE(!out_vbr))
> + return NOTIFY_BAD;
Not that it truly matters, but won't failure at this point leak the poisoned page?
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: Re: [PATCH 3/3] virtio_balloon: Introduce memory recover
2022-05-24 19:35 ` Sean Christopherson
@ 2022-05-24 23:32 ` zhenwei pi
0 siblings, 0 replies; 37+ messages in thread
From: zhenwei pi @ 2022-05-24 23:32 UTC (permalink / raw)
To: Sean Christopherson, david
Cc: akpm, naoya.horiguchi, mst, linux-mm, linux-kernel, jasowang,
virtualization, pbonzini, peterx, qemu-devel
On 5/25/22 03:35, Sean Christopherson wrote:
> On Fri, May 20, 2022, zhenwei pi wrote:
>> @@ -59,6 +60,12 @@ enum virtio_balloon_config_read {
>> VIRTIO_BALLOON_CONFIG_READ_CMD_ID = 0,
>> };
>>
>> +/* the request body to commucate with host side */
>> +struct __virtio_balloon_recover {
>> + struct virtio_balloon_recover vbr;
>> + __virtio32 pfns[VIRTIO_BALLOON_PAGES_PER_PAGE];
>
> I assume this is copied from virtio_balloon.pfns, which also uses __virtio32, but
> isn't that horribly broken? PFNs are 'unsigned long', i.e. 64 bits on 64-bit kernels.
> x86-64 at least most definitely generates 64-bit PFNs. Unless there's magic I'm
> missing, page_to_balloon_pfn() will truncate PFNs and feed the host bad info.
>
Yes, I also noticed this point, I suppose the balloon device can not
work on a virtual machine which has physical address larger than 16T.
I still let the recover VQ keep aligned with the inflate VQ and deflate
VQ. I prefer the recover VQ to be workable/unworkable with
inflate/deflate VQ together. So I leave this to the virtio balloon
maintainer to decide ...
>> @@ -494,6 +511,198 @@ static void update_balloon_size_func(struct work_struct *work)
>> queue_work(system_freezable_wq, work);
>> }
>>
>> +/*
>> + * virtballoon_memory_failure - notified by memory failure, try to fix the
>> + * corrupted page.
>> + * The memory failure notifier is designed to call back when the kernel handled
>> + * successfully only, WARN_ON_ONCE on the unlikely condition to find out any
>> + * error(memory error handling is a best effort, not 100% coverd).
>> + */
>> +static int virtballoon_memory_failure(struct notifier_block *notifier,
>> + unsigned long pfn, void *parm)
>> +{
>> + struct virtio_balloon *vb = container_of(notifier, struct virtio_balloon,
>> + memory_failure_nb);
>> + struct page *page;
>> + struct __virtio_balloon_recover *out_vbr;
>> + struct scatterlist sg;
>> + unsigned long flags;
>> + int err;
>> +
>> + page = pfn_to_online_page(pfn);
>> + if (WARN_ON_ONCE(!page))
>> + return NOTIFY_DONE;
>> +
>> + if (PageHuge(page))
>> + return NOTIFY_DONE;
>> +
>> + if (WARN_ON_ONCE(!PageHWPoison(page)))
>> + return NOTIFY_DONE;
>> +
>> + if (WARN_ON_ONCE(page_count(page) != 1))
>> + return NOTIFY_DONE;
>> +
>> + get_page(page); /* balloon reference */
>> +
>> + out_vbr = kzalloc(sizeof(*out_vbr), GFP_KERNEL);
>> + if (WARN_ON_ONCE(!out_vbr))
>> + return NOTIFY_BAD;
>
> Not that it truly matters, but won't failure at this point leak the poisoned page?
I'll fix this, thanks!
--
zhenwei pi
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: Re: [PATCH 3/3] virtio_balloon: Introduce memory recover
@ 2022-05-24 23:32 ` zhenwei pi
0 siblings, 0 replies; 37+ messages in thread
From: zhenwei pi @ 2022-05-24 23:32 UTC (permalink / raw)
To: Sean Christopherson, david
Cc: mst, qemu-devel, naoya.horiguchi, linux-kernel, virtualization,
linux-mm, pbonzini, akpm
On 5/25/22 03:35, Sean Christopherson wrote:
> On Fri, May 20, 2022, zhenwei pi wrote:
>> @@ -59,6 +60,12 @@ enum virtio_balloon_config_read {
>> VIRTIO_BALLOON_CONFIG_READ_CMD_ID = 0,
>> };
>>
>> +/* the request body to commucate with host side */
>> +struct __virtio_balloon_recover {
>> + struct virtio_balloon_recover vbr;
>> + __virtio32 pfns[VIRTIO_BALLOON_PAGES_PER_PAGE];
>
> I assume this is copied from virtio_balloon.pfns, which also uses __virtio32, but
> isn't that horribly broken? PFNs are 'unsigned long', i.e. 64 bits on 64-bit kernels.
> x86-64 at least most definitely generates 64-bit PFNs. Unless there's magic I'm
> missing, page_to_balloon_pfn() will truncate PFNs and feed the host bad info.
>
Yes, I also noticed this point, I suppose the balloon device can not
work on a virtual machine which has physical address larger than 16T.
I still let the recover VQ keep aligned with the inflate VQ and deflate
VQ. I prefer the recover VQ to be workable/unworkable with
inflate/deflate VQ together. So I leave this to the virtio balloon
maintainer to decide ...
>> @@ -494,6 +511,198 @@ static void update_balloon_size_func(struct work_struct *work)
>> queue_work(system_freezable_wq, work);
>> }
>>
>> +/*
>> + * virtballoon_memory_failure - notified by memory failure, try to fix the
>> + * corrupted page.
>> + * The memory failure notifier is designed to call back when the kernel handled
>> + * successfully only, WARN_ON_ONCE on the unlikely condition to find out any
>> + * error(memory error handling is a best effort, not 100% coverd).
>> + */
>> +static int virtballoon_memory_failure(struct notifier_block *notifier,
>> + unsigned long pfn, void *parm)
>> +{
>> + struct virtio_balloon *vb = container_of(notifier, struct virtio_balloon,
>> + memory_failure_nb);
>> + struct page *page;
>> + struct __virtio_balloon_recover *out_vbr;
>> + struct scatterlist sg;
>> + unsigned long flags;
>> + int err;
>> +
>> + page = pfn_to_online_page(pfn);
>> + if (WARN_ON_ONCE(!page))
>> + return NOTIFY_DONE;
>> +
>> + if (PageHuge(page))
>> + return NOTIFY_DONE;
>> +
>> + if (WARN_ON_ONCE(!PageHWPoison(page)))
>> + return NOTIFY_DONE;
>> +
>> + if (WARN_ON_ONCE(page_count(page) != 1))
>> + return NOTIFY_DONE;
>> +
>> + get_page(page); /* balloon reference */
>> +
>> + out_vbr = kzalloc(sizeof(*out_vbr), GFP_KERNEL);
>> + if (WARN_ON_ONCE(!out_vbr))
>> + return NOTIFY_BAD;
>
> Not that it truly matters, but won't failure at this point leak the poisoned page?
I'll fix this, thanks!
--
zhenwei pi
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [PATCH 3/3] virtio_balloon: Introduce memory recover
2022-05-24 23:32 ` zhenwei pi
@ 2022-05-30 7:53 ` David Hildenbrand
-1 siblings, 0 replies; 37+ messages in thread
From: David Hildenbrand @ 2022-05-30 7:53 UTC (permalink / raw)
To: zhenwei pi, Sean Christopherson
Cc: akpm, naoya.horiguchi, mst, linux-mm, linux-kernel, jasowang,
virtualization, pbonzini, peterx, qemu-devel
On 25.05.22 01:32, zhenwei pi wrote:
>
>
> On 5/25/22 03:35, Sean Christopherson wrote:
>> On Fri, May 20, 2022, zhenwei pi wrote:
>>> @@ -59,6 +60,12 @@ enum virtio_balloon_config_read {
>>> VIRTIO_BALLOON_CONFIG_READ_CMD_ID = 0,
>>> };
>>>
>>> +/* the request body to commucate with host side */
>>> +struct __virtio_balloon_recover {
>>> + struct virtio_balloon_recover vbr;
>>> + __virtio32 pfns[VIRTIO_BALLOON_PAGES_PER_PAGE];
>>
>> I assume this is copied from virtio_balloon.pfns, which also uses __virtio32, but
>> isn't that horribly broken? PFNs are 'unsigned long', i.e. 64 bits on 64-bit kernels.
>> x86-64 at least most definitely generates 64-bit PFNs. Unless there's magic I'm
>> missing, page_to_balloon_pfn() will truncate PFNs and feed the host bad info.
>>
>
> Yes, I also noticed this point, I suppose the balloon device can not
> work on a virtual machine which has physical address larger than 16T.
Yes, that's a historical artifact and we never ran into it in practice
-- because 16TB VMs are still rare, especially when paired with
virtio-balloon inflation/deflation. Most probably the guest should just
stop inflating when hitting such a big PFN. In the future, we might want
a proper sg interface instead.
--
Thanks,
David / dhildenb
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [PATCH 3/3] virtio_balloon: Introduce memory recover
@ 2022-05-30 7:53 ` David Hildenbrand
0 siblings, 0 replies; 37+ messages in thread
From: David Hildenbrand @ 2022-05-30 7:53 UTC (permalink / raw)
To: zhenwei pi, Sean Christopherson
Cc: mst, qemu-devel, naoya.horiguchi, linux-kernel, virtualization,
linux-mm, pbonzini, akpm
On 25.05.22 01:32, zhenwei pi wrote:
>
>
> On 5/25/22 03:35, Sean Christopherson wrote:
>> On Fri, May 20, 2022, zhenwei pi wrote:
>>> @@ -59,6 +60,12 @@ enum virtio_balloon_config_read {
>>> VIRTIO_BALLOON_CONFIG_READ_CMD_ID = 0,
>>> };
>>>
>>> +/* the request body to commucate with host side */
>>> +struct __virtio_balloon_recover {
>>> + struct virtio_balloon_recover vbr;
>>> + __virtio32 pfns[VIRTIO_BALLOON_PAGES_PER_PAGE];
>>
>> I assume this is copied from virtio_balloon.pfns, which also uses __virtio32, but
>> isn't that horribly broken? PFNs are 'unsigned long', i.e. 64 bits on 64-bit kernels.
>> x86-64 at least most definitely generates 64-bit PFNs. Unless there's magic I'm
>> missing, page_to_balloon_pfn() will truncate PFNs and feed the host bad info.
>>
>
> Yes, I also noticed this point, I suppose the balloon device can not
> work on a virtual machine which has physical address larger than 16T.
Yes, that's a historical artifact and we never ran into it in practice
-- because 16TB VMs are still rare, especially when paired with
virtio-balloon inflation/deflation. Most probably the guest should just
stop inflating when hitting such a big PFN. In the future, we might want
a proper sg interface instead.
--
Thanks,
David / dhildenb
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [PATCH 3/3] virtio_balloon: Introduce memory recover
2022-05-20 7:06 ` zhenwei pi
@ 2022-05-26 19:18 ` Michael S. Tsirkin
-1 siblings, 0 replies; 37+ messages in thread
From: Michael S. Tsirkin @ 2022-05-26 19:18 UTC (permalink / raw)
To: zhenwei pi
Cc: akpm, naoya.horiguchi, david, linux-mm, linux-kernel, jasowang,
virtualization, pbonzini, peterx, qemu-devel
On Fri, May 20, 2022 at 03:06:48PM +0800, zhenwei pi wrote:
> Introduce a new queue 'recover VQ', this allows guest to recover
> hardware corrupted page:
>
> Guest 5.MF -> 6.RVQ FE 10.Unpoison page
> / \ /
> -------------------+-------------+----------+-----------
> | | |
> 4.MCE 7.RVQ BE 9.RVQ Event
> QEMU / \ /
> 3.SIGBUS 8.Remap
> /
> ----------------+------------------------------------
> |
> +--2.MF
> Host /
> 1.HW error
>
> The workflow:
> 1, HardWare page error occurs randomly.
> 2, host side handles corrupted page by Memory Failure mechanism, sends
> SIGBUS to the user process if early-kill is enabled.
> 3, QEMU handles SIGBUS, if the address belongs to guest RAM, then:
> 4, QEMU tries to inject MCE into guest.
> 5, guest handles memory failure again.
>
> 1-5 is already supported for a long time, the next steps are supported
> in this patch(also related driver patch):
> 6, guest balloon driver gets noticed of the corrupted PFN, and sends
> request to host side by Recover VQ FrontEnd.
> 7, QEMU handles request from Recover VQ BackEnd, then:
> 8, QEMU remaps the corrupted HVA fo fix the memory failure, then:
> 9, QEMU acks the guest side the result by Recover VQ.
> 10, guest unpoisons the page if the corrupted page gets recoverd
> successfully.
>
> Then the guest fixes the HW page error dynamiclly without rebooting.
>
> Emulate MCE by QEMU, the guest works fine:
> mce: [Hardware Error]: Machine check events logged
> Memory failure: 0x61646: recovery action for dirty LRU page: Recovered
> virtio_balloon virtio5: recovered pfn 0x61646
> Unpoison: Unpoisoned page 0x61646 by virtio-balloon
> MCE: Killing stress:24502 due to hardware memory corruption fault at 7f5be2e5a010
>
> The 'HardwareCorrupted' in /proc/meminfo also shows 0 kB.
>
> Signed-off-by: zhenwei pi <pizhenwei@bytedance.com>
> ---
> drivers/virtio/virtio_balloon.c | 243 ++++++++++++++++++++++++++++
> include/uapi/linux/virtio_balloon.h | 16 ++
> 2 files changed, 259 insertions(+)
>
> diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
> index f4c34a2a6b8e..f9d95d1d8a4d 100644
> --- a/drivers/virtio/virtio_balloon.c
> +++ b/drivers/virtio/virtio_balloon.c
> @@ -52,6 +52,7 @@ enum virtio_balloon_vq {
> VIRTIO_BALLOON_VQ_STATS,
> VIRTIO_BALLOON_VQ_FREE_PAGE,
> VIRTIO_BALLOON_VQ_REPORTING,
> + VIRTIO_BALLOON_VQ_RECOVER,
> VIRTIO_BALLOON_VQ_MAX
> };
>
> @@ -59,6 +60,12 @@ enum virtio_balloon_config_read {
> VIRTIO_BALLOON_CONFIG_READ_CMD_ID = 0,
> };
>
> +/* the request body to commucate with host side */
> +struct __virtio_balloon_recover {
> + struct virtio_balloon_recover vbr;
> + __virtio32 pfns[VIRTIO_BALLOON_PAGES_PER_PAGE];
> +};
> +
I don't think this idea of passing 32 bit pfns is going to fly.
What is wrong with just passing the pages normally as a s/g list?
this is what is done for the hints at the moment.
neither should you use __virtio types for new functionality
(should all be __le), nor use __virtio for the struct name.
> struct virtio_balloon {
> struct virtio_device *vdev;
> struct virtqueue *inflate_vq, *deflate_vq, *stats_vq, *free_page_vq;
> @@ -126,6 +133,16 @@ struct virtio_balloon {
> /* Free page reporting device */
> struct virtqueue *reporting_vq;
> struct page_reporting_dev_info pr_dev_info;
> +
> + /* Memory recover VQ - VIRTIO_BALLOON_F_RECOVER */
> + struct virtqueue *recover_vq;
> + spinlock_t recover_vq_lock;
> + struct notifier_block memory_failure_nb;
> + struct list_head corrupted_page_list;
> + struct list_head recovered_page_list;
> + spinlock_t recover_page_list_lock;
> + struct __virtio_balloon_recover in_vbr;
> + struct work_struct unpoison_memory_work;
> };
>
> static const struct virtio_device_id id_table[] = {
> @@ -494,6 +511,198 @@ static void update_balloon_size_func(struct work_struct *work)
> queue_work(system_freezable_wq, work);
> }
>
> +/*
> + * virtballoon_memory_failure - notified by memory failure, try to fix the
> + * corrupted page.
> + * The memory failure notifier is designed to call back when the kernel handled
> + * successfully only, WARN_ON_ONCE on the unlikely condition to find out any
> + * error(memory error handling is a best effort, not 100% coverd).
covered
> + */
> +static int virtballoon_memory_failure(struct notifier_block *notifier,
> + unsigned long pfn, void *parm)
> +{
> + struct virtio_balloon *vb = container_of(notifier, struct virtio_balloon,
> + memory_failure_nb);
> + struct page *page;
> + struct __virtio_balloon_recover *out_vbr;
> + struct scatterlist sg;
> + unsigned long flags;
> + int err;
> +
> + page = pfn_to_online_page(pfn);
> + if (WARN_ON_ONCE(!page))
> + return NOTIFY_DONE;
> +
> + if (PageHuge(page))
> + return NOTIFY_DONE;
> +
> + if (WARN_ON_ONCE(!PageHWPoison(page)))
> + return NOTIFY_DONE;
> +
> + if (WARN_ON_ONCE(page_count(page) != 1))
> + return NOTIFY_DONE;
> +
> + get_page(page); /* balloon reference */
> +
> + out_vbr = kzalloc(sizeof(*out_vbr), GFP_KERNEL);
> + if (WARN_ON_ONCE(!out_vbr))
> + return NOTIFY_BAD;
> +
> + spin_lock(&vb->recover_page_list_lock);
> + balloon_page_push(&vb->corrupted_page_list, page);
> + spin_unlock(&vb->recover_page_list_lock);
> +
> + out_vbr->vbr.cmd = VIRTIO_BALLOON_R_CMD_RECOVER;
> + set_page_pfns(vb, out_vbr->pfns, page);
> + sg_init_one(&sg, out_vbr, sizeof(*out_vbr));
> +
> + spin_lock_irqsave(&vb->recover_vq_lock, flags);
> + err = virtqueue_add_outbuf(vb->recover_vq, &sg, 1, out_vbr, GFP_KERNEL);
> + if (unlikely(err)) {
> + spin_unlock_irqrestore(&vb->recover_vq_lock, flags);
> + return NOTIFY_DONE;
> + }
> + virtqueue_kick(vb->recover_vq);
> + spin_unlock_irqrestore(&vb->recover_vq_lock, flags);
> +
> + return NOTIFY_OK;
> +}
> +
> +static int recover_vq_get_response(struct virtio_balloon *vb)
> +{
> + struct __virtio_balloon_recover *in_vbr;
> + struct scatterlist sg;
> + unsigned long flags;
> + int err;
> +
> + spin_lock_irqsave(&vb->recover_vq_lock, flags);
> + in_vbr = &vb->in_vbr;
> + memset(in_vbr, 0x00, sizeof(*in_vbr));
> + sg_init_one(&sg, in_vbr, sizeof(*in_vbr));
> + err = virtqueue_add_inbuf(vb->recover_vq, &sg, 1, in_vbr, GFP_KERNEL);
> + if (unlikely(err)) {
> + spin_unlock_irqrestore(&vb->recover_vq_lock, flags);
> + return err;
> + }
> +
> + virtqueue_kick(vb->recover_vq);
> + spin_unlock_irqrestore(&vb->recover_vq_lock, flags);
> +
> + return 0;
> +}
> +
> +static void recover_vq_handle_response(struct virtio_balloon *vb, unsigned int len)
> +{
> + struct __virtio_balloon_recover *in_vbr;
> + struct virtio_balloon_recover *vbr;
> + struct page *page;
> + unsigned int pfns;
> + u32 pfn0, pfn1;
> + __u8 status;
> +
> + /* the response is not expected */
> + if (unlikely(len != sizeof(struct __virtio_balloon_recover)))
> + return;
> +
> + in_vbr = &vb->in_vbr;
> + vbr = &in_vbr->vbr;
> + if (unlikely(vbr->cmd != VIRTIO_BALLOON_R_CMD_RESPONSE))
> + return;
> +
> + /* to make sure the contiguous balloon PFNs */
> + for (pfns = 1; pfns < VIRTIO_BALLOON_PAGES_PER_PAGE; pfns++) {
> + pfn0 = virtio32_to_cpu(vb->vdev, in_vbr->pfns[pfns - 1]);
> + pfn1 = virtio32_to_cpu(vb->vdev, in_vbr->pfns[pfns]);
> + if (pfn1 - pfn0 != 1)
> + return;
> + }
> +
> + pfn0 = virtio32_to_cpu(vb->vdev, in_vbr->pfns[0]);
> + if (!pfn_valid(pfn0))
> + return;
> +
> + pfn1 = -1;
> + spin_lock(&vb->recover_page_list_lock);
> + list_for_each_entry(page, &vb->corrupted_page_list, lru) {
> + pfn1 = page_to_pfn(page);
> + if (pfn1 == pfn0)
> + break;
> + }
> + spin_unlock(&vb->recover_page_list_lock);
> +
> + status = vbr->status;
> + switch (status) {
> + case VIRTIO_BALLOON_R_STATUS_RECOVERED:
> + if (pfn1 == pfn0) {
> + spin_lock(&vb->recover_page_list_lock);
> + list_del(&page->lru);
> + balloon_page_push(&vb->recovered_page_list, page);
> + spin_unlock(&vb->recover_page_list_lock);
> + queue_work(system_freezable_wq, &vb->unpoison_memory_work);
> + dev_info_ratelimited(&vb->vdev->dev, "recovered pfn 0x%x", pfn0);
> + }
> + break;
> + case VIRTIO_BALLOON_R_STATUS_FAILED:
> + /* the hypervisor can't fix this corrupted page, balloon puts page */
> + if (pfn1 == pfn0) {
> + spin_lock(&vb->recover_page_list_lock);
> + list_del(&page->lru);
> + spin_unlock(&vb->recover_page_list_lock);
> + put_page(page);
> + dev_info_ratelimited(&vb->vdev->dev, "failed to recover pfn 0x%x", pfn0);
> + }
> + default:
> + break;
> + };
> +
> + /* continue to get response from host side if the response gets handled successfully */
> + recover_vq_get_response(vb);
> +}
> +
> +static void unpoison_memory_func(struct work_struct *work)
> +{
> + struct virtio_balloon *vb;
> + struct page *page;
> +
> + vb = container_of(work, struct virtio_balloon, unpoison_memory_work);
> +
> + do {
> + spin_lock(&vb->recover_page_list_lock);
> + page = list_first_entry_or_null(&vb->recovered_page_list,
> + struct page, lru);
> + if (page)
> + list_del(&page->lru);
> + spin_unlock(&vb->recover_page_list_lock);
> +
> + if (page) {
> + put_page(page);
> + unpoison_memory(page_to_pfn(page), true, "virtio-balloon");
> + }
> + } while (page);
> +}
> +
> +static void recover_vq_cb(struct virtqueue *vq)
> +{
> + struct virtio_balloon *vb = vq->vdev->priv;
> + struct __virtio_balloon_recover *vbr;
> + unsigned long flags;
> + unsigned int len;
> +
> + spin_lock_irqsave(&vb->recover_vq_lock, flags);
> + do {
> + virtqueue_disable_cb(vq);
> + while ((vbr = virtqueue_get_buf(vq, &len)) != NULL) {
> + spin_unlock_irqrestore(&vb->recover_vq_lock, flags);
> + if (vbr == &vb->in_vbr)
> + recover_vq_handle_response(vb, len);
> + else
> + kfree(vbr); /* just free the memory for out vbr request */
> + spin_lock_irqsave(&vb->recover_vq_lock, flags);
> + }
> + } while (!virtqueue_enable_cb(vq));
> + spin_unlock_irqrestore(&vb->recover_vq_lock, flags);
> +}
> +
> static int init_vqs(struct virtio_balloon *vb)
> {
> struct virtqueue *vqs[VIRTIO_BALLOON_VQ_MAX];
> @@ -515,6 +724,7 @@ static int init_vqs(struct virtio_balloon *vb)
> callbacks[VIRTIO_BALLOON_VQ_FREE_PAGE] = NULL;
> names[VIRTIO_BALLOON_VQ_FREE_PAGE] = NULL;
> names[VIRTIO_BALLOON_VQ_REPORTING] = NULL;
> + names[VIRTIO_BALLOON_VQ_RECOVER] = NULL;
>
> if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ)) {
> names[VIRTIO_BALLOON_VQ_STATS] = "stats";
> @@ -531,6 +741,11 @@ static int init_vqs(struct virtio_balloon *vb)
> callbacks[VIRTIO_BALLOON_VQ_REPORTING] = balloon_ack;
> }
>
> + if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_RECOVER)) {
> + names[VIRTIO_BALLOON_VQ_RECOVER] = "recover_vq";
> + callbacks[VIRTIO_BALLOON_VQ_RECOVER] = recover_vq_cb;
> + }
> +
> err = virtio_find_vqs(vb->vdev, VIRTIO_BALLOON_VQ_MAX, vqs,
> callbacks, names, NULL);
> if (err)
> @@ -566,6 +781,9 @@ static int init_vqs(struct virtio_balloon *vb)
> if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_REPORTING))
> vb->reporting_vq = vqs[VIRTIO_BALLOON_VQ_REPORTING];
>
> + if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_RECOVER))
> + vb->recover_vq = vqs[VIRTIO_BALLOON_VQ_RECOVER];
> +
> return 0;
> }
>
> @@ -1015,12 +1233,31 @@ static int virtballoon_probe(struct virtio_device *vdev)
> goto out_unregister_oom;
> }
>
> + if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_RECOVER)) {
> + err = recover_vq_get_response(vb);
> + if (err)
> + goto out_unregister_reporting;
> +
> + vb->memory_failure_nb.notifier_call = virtballoon_memory_failure;
> + spin_lock_init(&vb->recover_page_list_lock);
> + spin_lock_init(&vb->recover_vq_lock);
> + INIT_LIST_HEAD(&vb->corrupted_page_list);
> + INIT_LIST_HEAD(&vb->recovered_page_list);
> + INIT_WORK(&vb->unpoison_memory_work, unpoison_memory_func);
> + err = register_memory_failure_notifier(&vb->memory_failure_nb);
> + if (err)
> + goto out_unregister_reporting;
> + }
> +
> virtio_device_ready(vdev);
>
> if (towards_target(vb))
> virtballoon_changed(vdev);
> return 0;
>
> +out_unregister_reporting:
> + if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_REPORTING))
> + page_reporting_unregister(&vb->pr_dev_info);
> out_unregister_oom:
> if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_DEFLATE_ON_OOM))
> unregister_oom_notifier(&vb->oom_nb);
> @@ -1082,6 +1319,11 @@ static void virtballoon_remove(struct virtio_device *vdev)
> destroy_workqueue(vb->balloon_wq);
> }
>
> + if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_RECOVER)) {
> + unregister_memory_failure_notifier(&vb->memory_failure_nb);
> + cancel_work_sync(&vb->unpoison_memory_work);
> + }
> +
> remove_common(vb);
> #ifdef CONFIG_BALLOON_COMPACTION
> if (vb->vb_dev_info.inode)
> @@ -1147,6 +1389,7 @@ static unsigned int features[] = {
> VIRTIO_BALLOON_F_FREE_PAGE_HINT,
> VIRTIO_BALLOON_F_PAGE_POISON,
> VIRTIO_BALLOON_F_REPORTING,
> + VIRTIO_BALLOON_F_RECOVER,
> };
>
> static struct virtio_driver virtio_balloon_driver = {
> diff --git a/include/uapi/linux/virtio_balloon.h b/include/uapi/linux/virtio_balloon.h
> index ddaa45e723c4..41d0ffa2fb54 100644
> --- a/include/uapi/linux/virtio_balloon.h
> +++ b/include/uapi/linux/virtio_balloon.h
> @@ -37,6 +37,7 @@
> #define VIRTIO_BALLOON_F_FREE_PAGE_HINT 3 /* VQ to report free pages */
> #define VIRTIO_BALLOON_F_PAGE_POISON 4 /* Guest is using page poisoning */
> #define VIRTIO_BALLOON_F_REPORTING 5 /* Page reporting virtqueue */
> +#define VIRTIO_BALLOON_F_RECOVER 6 /* Memory recover virtqueue */
>
> /* Size of a PFN in the balloon interface. */
> #define VIRTIO_BALLOON_PFN_SHIFT 12
Please get this feature recorded in the spec with the virtio TC.
They will also ask you to supply minimal documentation.
> @@ -59,6 +60,8 @@ struct virtio_balloon_config {
> };
> /* Stores PAGE_POISON if page poisoning is in use */
> __le32 poison_val;
> + /* Number of hardware corrupted pages, guest read only */
> + __le32 corrupted_pages;
> };
>
> #define VIRTIO_BALLOON_S_SWAP_IN 0 /* Amount of memory swapped in */
> @@ -116,4 +119,17 @@ struct virtio_balloon_stat {
> __virtio64 val;
> } __attribute__((packed));
>
> +#define VIRTIO_BALLOON_R_CMD_RECOVER 0
> +#define VIRTIO_BALLOON_R_CMD_RESPONSE 0x80
> +
> +#define VIRTIO_BALLOON_R_STATUS_CORRUPTED 0
> +#define VIRTIO_BALLOON_R_STATUS_RECOVERED 1
> +#define VIRTIO_BALLOON_R_STATUS_FAILED 2
> +
> +struct virtio_balloon_recover {
> + __u8 cmd;
> + __u8 status;
> + __u8 padding[6];
> +};
> +
> #endif /* _LINUX_VIRTIO_BALLOON_H */
> --
> 2.20.1
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [PATCH 3/3] virtio_balloon: Introduce memory recover
@ 2022-05-26 19:18 ` Michael S. Tsirkin
0 siblings, 0 replies; 37+ messages in thread
From: Michael S. Tsirkin @ 2022-05-26 19:18 UTC (permalink / raw)
To: zhenwei pi
Cc: qemu-devel, naoya.horiguchi, linux-kernel, virtualization,
linux-mm, pbonzini, akpm
On Fri, May 20, 2022 at 03:06:48PM +0800, zhenwei pi wrote:
> Introduce a new queue 'recover VQ', this allows guest to recover
> hardware corrupted page:
>
> Guest 5.MF -> 6.RVQ FE 10.Unpoison page
> / \ /
> -------------------+-------------+----------+-----------
> | | |
> 4.MCE 7.RVQ BE 9.RVQ Event
> QEMU / \ /
> 3.SIGBUS 8.Remap
> /
> ----------------+------------------------------------
> |
> +--2.MF
> Host /
> 1.HW error
>
> The workflow:
> 1, HardWare page error occurs randomly.
> 2, host side handles corrupted page by Memory Failure mechanism, sends
> SIGBUS to the user process if early-kill is enabled.
> 3, QEMU handles SIGBUS, if the address belongs to guest RAM, then:
> 4, QEMU tries to inject MCE into guest.
> 5, guest handles memory failure again.
>
> 1-5 is already supported for a long time, the next steps are supported
> in this patch(also related driver patch):
> 6, guest balloon driver gets noticed of the corrupted PFN, and sends
> request to host side by Recover VQ FrontEnd.
> 7, QEMU handles request from Recover VQ BackEnd, then:
> 8, QEMU remaps the corrupted HVA fo fix the memory failure, then:
> 9, QEMU acks the guest side the result by Recover VQ.
> 10, guest unpoisons the page if the corrupted page gets recoverd
> successfully.
>
> Then the guest fixes the HW page error dynamiclly without rebooting.
>
> Emulate MCE by QEMU, the guest works fine:
> mce: [Hardware Error]: Machine check events logged
> Memory failure: 0x61646: recovery action for dirty LRU page: Recovered
> virtio_balloon virtio5: recovered pfn 0x61646
> Unpoison: Unpoisoned page 0x61646 by virtio-balloon
> MCE: Killing stress:24502 due to hardware memory corruption fault at 7f5be2e5a010
>
> The 'HardwareCorrupted' in /proc/meminfo also shows 0 kB.
>
> Signed-off-by: zhenwei pi <pizhenwei@bytedance.com>
> ---
> drivers/virtio/virtio_balloon.c | 243 ++++++++++++++++++++++++++++
> include/uapi/linux/virtio_balloon.h | 16 ++
> 2 files changed, 259 insertions(+)
>
> diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
> index f4c34a2a6b8e..f9d95d1d8a4d 100644
> --- a/drivers/virtio/virtio_balloon.c
> +++ b/drivers/virtio/virtio_balloon.c
> @@ -52,6 +52,7 @@ enum virtio_balloon_vq {
> VIRTIO_BALLOON_VQ_STATS,
> VIRTIO_BALLOON_VQ_FREE_PAGE,
> VIRTIO_BALLOON_VQ_REPORTING,
> + VIRTIO_BALLOON_VQ_RECOVER,
> VIRTIO_BALLOON_VQ_MAX
> };
>
> @@ -59,6 +60,12 @@ enum virtio_balloon_config_read {
> VIRTIO_BALLOON_CONFIG_READ_CMD_ID = 0,
> };
>
> +/* the request body to commucate with host side */
> +struct __virtio_balloon_recover {
> + struct virtio_balloon_recover vbr;
> + __virtio32 pfns[VIRTIO_BALLOON_PAGES_PER_PAGE];
> +};
> +
I don't think this idea of passing 32 bit pfns is going to fly.
What is wrong with just passing the pages normally as a s/g list?
this is what is done for the hints at the moment.
neither should you use __virtio types for new functionality
(should all be __le), nor use __virtio for the struct name.
> struct virtio_balloon {
> struct virtio_device *vdev;
> struct virtqueue *inflate_vq, *deflate_vq, *stats_vq, *free_page_vq;
> @@ -126,6 +133,16 @@ struct virtio_balloon {
> /* Free page reporting device */
> struct virtqueue *reporting_vq;
> struct page_reporting_dev_info pr_dev_info;
> +
> + /* Memory recover VQ - VIRTIO_BALLOON_F_RECOVER */
> + struct virtqueue *recover_vq;
> + spinlock_t recover_vq_lock;
> + struct notifier_block memory_failure_nb;
> + struct list_head corrupted_page_list;
> + struct list_head recovered_page_list;
> + spinlock_t recover_page_list_lock;
> + struct __virtio_balloon_recover in_vbr;
> + struct work_struct unpoison_memory_work;
> };
>
> static const struct virtio_device_id id_table[] = {
> @@ -494,6 +511,198 @@ static void update_balloon_size_func(struct work_struct *work)
> queue_work(system_freezable_wq, work);
> }
>
> +/*
> + * virtballoon_memory_failure - notified by memory failure, try to fix the
> + * corrupted page.
> + * The memory failure notifier is designed to call back when the kernel handled
> + * successfully only, WARN_ON_ONCE on the unlikely condition to find out any
> + * error(memory error handling is a best effort, not 100% coverd).
covered
> + */
> +static int virtballoon_memory_failure(struct notifier_block *notifier,
> + unsigned long pfn, void *parm)
> +{
> + struct virtio_balloon *vb = container_of(notifier, struct virtio_balloon,
> + memory_failure_nb);
> + struct page *page;
> + struct __virtio_balloon_recover *out_vbr;
> + struct scatterlist sg;
> + unsigned long flags;
> + int err;
> +
> + page = pfn_to_online_page(pfn);
> + if (WARN_ON_ONCE(!page))
> + return NOTIFY_DONE;
> +
> + if (PageHuge(page))
> + return NOTIFY_DONE;
> +
> + if (WARN_ON_ONCE(!PageHWPoison(page)))
> + return NOTIFY_DONE;
> +
> + if (WARN_ON_ONCE(page_count(page) != 1))
> + return NOTIFY_DONE;
> +
> + get_page(page); /* balloon reference */
> +
> + out_vbr = kzalloc(sizeof(*out_vbr), GFP_KERNEL);
> + if (WARN_ON_ONCE(!out_vbr))
> + return NOTIFY_BAD;
> +
> + spin_lock(&vb->recover_page_list_lock);
> + balloon_page_push(&vb->corrupted_page_list, page);
> + spin_unlock(&vb->recover_page_list_lock);
> +
> + out_vbr->vbr.cmd = VIRTIO_BALLOON_R_CMD_RECOVER;
> + set_page_pfns(vb, out_vbr->pfns, page);
> + sg_init_one(&sg, out_vbr, sizeof(*out_vbr));
> +
> + spin_lock_irqsave(&vb->recover_vq_lock, flags);
> + err = virtqueue_add_outbuf(vb->recover_vq, &sg, 1, out_vbr, GFP_KERNEL);
> + if (unlikely(err)) {
> + spin_unlock_irqrestore(&vb->recover_vq_lock, flags);
> + return NOTIFY_DONE;
> + }
> + virtqueue_kick(vb->recover_vq);
> + spin_unlock_irqrestore(&vb->recover_vq_lock, flags);
> +
> + return NOTIFY_OK;
> +}
> +
> +static int recover_vq_get_response(struct virtio_balloon *vb)
> +{
> + struct __virtio_balloon_recover *in_vbr;
> + struct scatterlist sg;
> + unsigned long flags;
> + int err;
> +
> + spin_lock_irqsave(&vb->recover_vq_lock, flags);
> + in_vbr = &vb->in_vbr;
> + memset(in_vbr, 0x00, sizeof(*in_vbr));
> + sg_init_one(&sg, in_vbr, sizeof(*in_vbr));
> + err = virtqueue_add_inbuf(vb->recover_vq, &sg, 1, in_vbr, GFP_KERNEL);
> + if (unlikely(err)) {
> + spin_unlock_irqrestore(&vb->recover_vq_lock, flags);
> + return err;
> + }
> +
> + virtqueue_kick(vb->recover_vq);
> + spin_unlock_irqrestore(&vb->recover_vq_lock, flags);
> +
> + return 0;
> +}
> +
> +static void recover_vq_handle_response(struct virtio_balloon *vb, unsigned int len)
> +{
> + struct __virtio_balloon_recover *in_vbr;
> + struct virtio_balloon_recover *vbr;
> + struct page *page;
> + unsigned int pfns;
> + u32 pfn0, pfn1;
> + __u8 status;
> +
> + /* the response is not expected */
> + if (unlikely(len != sizeof(struct __virtio_balloon_recover)))
> + return;
> +
> + in_vbr = &vb->in_vbr;
> + vbr = &in_vbr->vbr;
> + if (unlikely(vbr->cmd != VIRTIO_BALLOON_R_CMD_RESPONSE))
> + return;
> +
> + /* to make sure the contiguous balloon PFNs */
> + for (pfns = 1; pfns < VIRTIO_BALLOON_PAGES_PER_PAGE; pfns++) {
> + pfn0 = virtio32_to_cpu(vb->vdev, in_vbr->pfns[pfns - 1]);
> + pfn1 = virtio32_to_cpu(vb->vdev, in_vbr->pfns[pfns]);
> + if (pfn1 - pfn0 != 1)
> + return;
> + }
> +
> + pfn0 = virtio32_to_cpu(vb->vdev, in_vbr->pfns[0]);
> + if (!pfn_valid(pfn0))
> + return;
> +
> + pfn1 = -1;
> + spin_lock(&vb->recover_page_list_lock);
> + list_for_each_entry(page, &vb->corrupted_page_list, lru) {
> + pfn1 = page_to_pfn(page);
> + if (pfn1 == pfn0)
> + break;
> + }
> + spin_unlock(&vb->recover_page_list_lock);
> +
> + status = vbr->status;
> + switch (status) {
> + case VIRTIO_BALLOON_R_STATUS_RECOVERED:
> + if (pfn1 == pfn0) {
> + spin_lock(&vb->recover_page_list_lock);
> + list_del(&page->lru);
> + balloon_page_push(&vb->recovered_page_list, page);
> + spin_unlock(&vb->recover_page_list_lock);
> + queue_work(system_freezable_wq, &vb->unpoison_memory_work);
> + dev_info_ratelimited(&vb->vdev->dev, "recovered pfn 0x%x", pfn0);
> + }
> + break;
> + case VIRTIO_BALLOON_R_STATUS_FAILED:
> + /* the hypervisor can't fix this corrupted page, balloon puts page */
> + if (pfn1 == pfn0) {
> + spin_lock(&vb->recover_page_list_lock);
> + list_del(&page->lru);
> + spin_unlock(&vb->recover_page_list_lock);
> + put_page(page);
> + dev_info_ratelimited(&vb->vdev->dev, "failed to recover pfn 0x%x", pfn0);
> + }
> + default:
> + break;
> + };
> +
> + /* continue to get response from host side if the response gets handled successfully */
> + recover_vq_get_response(vb);
> +}
> +
> +static void unpoison_memory_func(struct work_struct *work)
> +{
> + struct virtio_balloon *vb;
> + struct page *page;
> +
> + vb = container_of(work, struct virtio_balloon, unpoison_memory_work);
> +
> + do {
> + spin_lock(&vb->recover_page_list_lock);
> + page = list_first_entry_or_null(&vb->recovered_page_list,
> + struct page, lru);
> + if (page)
> + list_del(&page->lru);
> + spin_unlock(&vb->recover_page_list_lock);
> +
> + if (page) {
> + put_page(page);
> + unpoison_memory(page_to_pfn(page), true, "virtio-balloon");
> + }
> + } while (page);
> +}
> +
> +static void recover_vq_cb(struct virtqueue *vq)
> +{
> + struct virtio_balloon *vb = vq->vdev->priv;
> + struct __virtio_balloon_recover *vbr;
> + unsigned long flags;
> + unsigned int len;
> +
> + spin_lock_irqsave(&vb->recover_vq_lock, flags);
> + do {
> + virtqueue_disable_cb(vq);
> + while ((vbr = virtqueue_get_buf(vq, &len)) != NULL) {
> + spin_unlock_irqrestore(&vb->recover_vq_lock, flags);
> + if (vbr == &vb->in_vbr)
> + recover_vq_handle_response(vb, len);
> + else
> + kfree(vbr); /* just free the memory for out vbr request */
> + spin_lock_irqsave(&vb->recover_vq_lock, flags);
> + }
> + } while (!virtqueue_enable_cb(vq));
> + spin_unlock_irqrestore(&vb->recover_vq_lock, flags);
> +}
> +
> static int init_vqs(struct virtio_balloon *vb)
> {
> struct virtqueue *vqs[VIRTIO_BALLOON_VQ_MAX];
> @@ -515,6 +724,7 @@ static int init_vqs(struct virtio_balloon *vb)
> callbacks[VIRTIO_BALLOON_VQ_FREE_PAGE] = NULL;
> names[VIRTIO_BALLOON_VQ_FREE_PAGE] = NULL;
> names[VIRTIO_BALLOON_VQ_REPORTING] = NULL;
> + names[VIRTIO_BALLOON_VQ_RECOVER] = NULL;
>
> if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ)) {
> names[VIRTIO_BALLOON_VQ_STATS] = "stats";
> @@ -531,6 +741,11 @@ static int init_vqs(struct virtio_balloon *vb)
> callbacks[VIRTIO_BALLOON_VQ_REPORTING] = balloon_ack;
> }
>
> + if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_RECOVER)) {
> + names[VIRTIO_BALLOON_VQ_RECOVER] = "recover_vq";
> + callbacks[VIRTIO_BALLOON_VQ_RECOVER] = recover_vq_cb;
> + }
> +
> err = virtio_find_vqs(vb->vdev, VIRTIO_BALLOON_VQ_MAX, vqs,
> callbacks, names, NULL);
> if (err)
> @@ -566,6 +781,9 @@ static int init_vqs(struct virtio_balloon *vb)
> if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_REPORTING))
> vb->reporting_vq = vqs[VIRTIO_BALLOON_VQ_REPORTING];
>
> + if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_RECOVER))
> + vb->recover_vq = vqs[VIRTIO_BALLOON_VQ_RECOVER];
> +
> return 0;
> }
>
> @@ -1015,12 +1233,31 @@ static int virtballoon_probe(struct virtio_device *vdev)
> goto out_unregister_oom;
> }
>
> + if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_RECOVER)) {
> + err = recover_vq_get_response(vb);
> + if (err)
> + goto out_unregister_reporting;
> +
> + vb->memory_failure_nb.notifier_call = virtballoon_memory_failure;
> + spin_lock_init(&vb->recover_page_list_lock);
> + spin_lock_init(&vb->recover_vq_lock);
> + INIT_LIST_HEAD(&vb->corrupted_page_list);
> + INIT_LIST_HEAD(&vb->recovered_page_list);
> + INIT_WORK(&vb->unpoison_memory_work, unpoison_memory_func);
> + err = register_memory_failure_notifier(&vb->memory_failure_nb);
> + if (err)
> + goto out_unregister_reporting;
> + }
> +
> virtio_device_ready(vdev);
>
> if (towards_target(vb))
> virtballoon_changed(vdev);
> return 0;
>
> +out_unregister_reporting:
> + if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_REPORTING))
> + page_reporting_unregister(&vb->pr_dev_info);
> out_unregister_oom:
> if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_DEFLATE_ON_OOM))
> unregister_oom_notifier(&vb->oom_nb);
> @@ -1082,6 +1319,11 @@ static void virtballoon_remove(struct virtio_device *vdev)
> destroy_workqueue(vb->balloon_wq);
> }
>
> + if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_RECOVER)) {
> + unregister_memory_failure_notifier(&vb->memory_failure_nb);
> + cancel_work_sync(&vb->unpoison_memory_work);
> + }
> +
> remove_common(vb);
> #ifdef CONFIG_BALLOON_COMPACTION
> if (vb->vb_dev_info.inode)
> @@ -1147,6 +1389,7 @@ static unsigned int features[] = {
> VIRTIO_BALLOON_F_FREE_PAGE_HINT,
> VIRTIO_BALLOON_F_PAGE_POISON,
> VIRTIO_BALLOON_F_REPORTING,
> + VIRTIO_BALLOON_F_RECOVER,
> };
>
> static struct virtio_driver virtio_balloon_driver = {
> diff --git a/include/uapi/linux/virtio_balloon.h b/include/uapi/linux/virtio_balloon.h
> index ddaa45e723c4..41d0ffa2fb54 100644
> --- a/include/uapi/linux/virtio_balloon.h
> +++ b/include/uapi/linux/virtio_balloon.h
> @@ -37,6 +37,7 @@
> #define VIRTIO_BALLOON_F_FREE_PAGE_HINT 3 /* VQ to report free pages */
> #define VIRTIO_BALLOON_F_PAGE_POISON 4 /* Guest is using page poisoning */
> #define VIRTIO_BALLOON_F_REPORTING 5 /* Page reporting virtqueue */
> +#define VIRTIO_BALLOON_F_RECOVER 6 /* Memory recover virtqueue */
>
> /* Size of a PFN in the balloon interface. */
> #define VIRTIO_BALLOON_PFN_SHIFT 12
Please get this feature recorded in the spec with the virtio TC.
They will also ask you to supply minimal documentation.
> @@ -59,6 +60,8 @@ struct virtio_balloon_config {
> };
> /* Stores PAGE_POISON if page poisoning is in use */
> __le32 poison_val;
> + /* Number of hardware corrupted pages, guest read only */
> + __le32 corrupted_pages;
> };
>
> #define VIRTIO_BALLOON_S_SWAP_IN 0 /* Amount of memory swapped in */
> @@ -116,4 +119,17 @@ struct virtio_balloon_stat {
> __virtio64 val;
> } __attribute__((packed));
>
> +#define VIRTIO_BALLOON_R_CMD_RECOVER 0
> +#define VIRTIO_BALLOON_R_CMD_RESPONSE 0x80
> +
> +#define VIRTIO_BALLOON_R_STATUS_CORRUPTED 0
> +#define VIRTIO_BALLOON_R_STATUS_RECOVERED 1
> +#define VIRTIO_BALLOON_R_STATUS_FAILED 2
> +
> +struct virtio_balloon_recover {
> + __u8 cmd;
> + __u8 status;
> + __u8 padding[6];
> +};
> +
> #endif /* _LINUX_VIRTIO_BALLOON_H */
> --
> 2.20.1
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: Re: [PATCH 3/3] virtio_balloon: Introduce memory recover
2022-05-26 19:18 ` Michael S. Tsirkin
@ 2022-05-27 2:22 ` zhenwei pi
-1 siblings, 0 replies; 37+ messages in thread
From: zhenwei pi @ 2022-05-27 2:22 UTC (permalink / raw)
To: Michael S. Tsirkin, akpm, naoya.horiguchi
Cc: david, linux-mm, linux-kernel, jasowang, virtualization,
pbonzini, peterx, qemu-devel
On 5/27/22 03:18, Michael S. Tsirkin wrote:
> On Fri, May 20, 2022 at 03:06:48PM +0800, zhenwei pi wrote:
>> Introduce a new queue 'recover VQ', this allows guest to recover
>> hardware corrupted page:
>>
>> Guest 5.MF -> 6.RVQ FE 10.Unpoison page
>> / \ /
>> -------------------+-------------+----------+-----------
>> | | |
>> 4.MCE 7.RVQ BE 9.RVQ Event
>> QEMU / \ /
>> 3.SIGBUS 8.Remap
>> /
>> ----------------+------------------------------------
>> |
>> +--2.MF
>> Host /
>> 1.HW error
>>
>> The workflow:
>> 1, HardWare page error occurs randomly.
>> 2, host side handles corrupted page by Memory Failure mechanism, sends
>> SIGBUS to the user process if early-kill is enabled.
>> 3, QEMU handles SIGBUS, if the address belongs to guest RAM, then:
>> 4, QEMU tries to inject MCE into guest.
>> 5, guest handles memory failure again.
>>
>> 1-5 is already supported for a long time, the next steps are supported
>> in this patch(also related driver patch):
>> 6, guest balloon driver gets noticed of the corrupted PFN, and sends
>> request to host side by Recover VQ FrontEnd.
>> 7, QEMU handles request from Recover VQ BackEnd, then:
>> 8, QEMU remaps the corrupted HVA fo fix the memory failure, then:
>> 9, QEMU acks the guest side the result by Recover VQ.
>> 10, guest unpoisons the page if the corrupted page gets recoverd
>> successfully.
>>
>> Then the guest fixes the HW page error dynamiclly without rebooting.
>>
>> Emulate MCE by QEMU, the guest works fine:
>> mce: [Hardware Error]: Machine check events logged
>> Memory failure: 0x61646: recovery action for dirty LRU page: Recovered
>> virtio_balloon virtio5: recovered pfn 0x61646
>> Unpoison: Unpoisoned page 0x61646 by virtio-balloon
>> MCE: Killing stress:24502 due to hardware memory corruption fault at 7f5be2e5a010
>>
>> The 'HardwareCorrupted' in /proc/meminfo also shows 0 kB.
>>
>> Signed-off-by: zhenwei pi <pizhenwei@bytedance.com>
>> ---
>> drivers/virtio/virtio_balloon.c | 243 ++++++++++++++++++++++++++++
>> include/uapi/linux/virtio_balloon.h | 16 ++
>> 2 files changed, 259 insertions(+)
>>
>> diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
>> index f4c34a2a6b8e..f9d95d1d8a4d 100644
>> --- a/drivers/virtio/virtio_balloon.c
>> +++ b/drivers/virtio/virtio_balloon.c
>> @@ -52,6 +52,7 @@ enum virtio_balloon_vq {
>> VIRTIO_BALLOON_VQ_STATS,
>> VIRTIO_BALLOON_VQ_FREE_PAGE,
>> VIRTIO_BALLOON_VQ_REPORTING,
>> + VIRTIO_BALLOON_VQ_RECOVER,
>> VIRTIO_BALLOON_VQ_MAX
>> };
>>
>> @@ -59,6 +60,12 @@ enum virtio_balloon_config_read {
>> VIRTIO_BALLOON_CONFIG_READ_CMD_ID = 0,
>> };
>>
>> +/* the request body to commucate with host side */
>> +struct __virtio_balloon_recover {
>> + struct virtio_balloon_recover vbr;
>> + __virtio32 pfns[VIRTIO_BALLOON_PAGES_PER_PAGE];
>> +};
>> +
>
>
> I don't think this idea of passing 32 bit pfns is going to fly.
> What is wrong with just passing the pages normally as a s/g list?
> this is what is done for the hints at the moment.
>
> neither should you use __virtio types for new functionality
> (should all be __le), nor use __virtio for the struct name.
>
>
Guest side sends GPA/PFN to host side by passing the pages normally as a
s/g list, this is OK.
But in this scenario, guest also needs to get
status(recovered?corrupted?failed to recover?) of page from the host side.
For a normal page(Ex, 4K), the host could return the status quite
immediately. But for the 2M hugetlb of guest RAM, the host should be
pending until the guest requests 512*4K to recover. Once the 2M hugepage
gets recovered(or failed to recover), the host returns 512 PFNs with
status to guest. There are at most 512 recover requests of a single 2M
huge page.
For example, the guest tries to recover a corrupted page:
struct scatterlist status_sg, page_sg, *sgs[2];
sg_init_one(&status_sg, status, sizeof(*status));
sgs[0] = &status_sg;
p = page_address(page);
sg_init_one(&page_sg, p, PAGE_SIZE);
sgs[1] = &page_sg;
virtqueue_add_sgs(recover_vq, sgs, 1, 1, NULL, GFP_ATOMIC);
The host handles 4K recover request on 2M hugepage, this request is
pending until the full 2M huge page gets recovered(or failed).
To avoid too many pending request in virt queue, I designed as this
patch(should use __le), passing PFN in request body, using a single IN
request only.
...
>> --- a/include/uapi/linux/virtio_balloon.h
>> +++ b/include/uapi/linux/virtio_balloon.h
>> @@ -37,6 +37,7 @@
>> #define VIRTIO_BALLOON_F_FREE_PAGE_HINT 3 /* VQ to report free pages */
>> #define VIRTIO_BALLOON_F_PAGE_POISON 4 /* Guest is using page poisoning */
>> #define VIRTIO_BALLOON_F_REPORTING 5 /* Page reporting virtqueue */
>> +#define VIRTIO_BALLOON_F_RECOVER 6 /* Memory recover virtqueue */
>>
>> /* Size of a PFN in the balloon interface. */
>> #define VIRTIO_BALLOON_PFN_SHIFT 12
>
> Please get this feature recorded in the spec with the virtio TC.
> They will also ask you to supply minimal documentation.
>
Sure!
By the way, this feature depends on the memory&&memory-failure
mechanism, what about sending the change of spec to virtio TC after
Andrew and Naoya ack?
--
zhenwei pi
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: Re: [PATCH 3/3] virtio_balloon: Introduce memory recover
@ 2022-05-27 2:22 ` zhenwei pi
0 siblings, 0 replies; 37+ messages in thread
From: zhenwei pi @ 2022-05-27 2:22 UTC (permalink / raw)
To: Michael S. Tsirkin, akpm, naoya.horiguchi
Cc: qemu-devel, linux-kernel, virtualization, linux-mm, pbonzini
On 5/27/22 03:18, Michael S. Tsirkin wrote:
> On Fri, May 20, 2022 at 03:06:48PM +0800, zhenwei pi wrote:
>> Introduce a new queue 'recover VQ', this allows guest to recover
>> hardware corrupted page:
>>
>> Guest 5.MF -> 6.RVQ FE 10.Unpoison page
>> / \ /
>> -------------------+-------------+----------+-----------
>> | | |
>> 4.MCE 7.RVQ BE 9.RVQ Event
>> QEMU / \ /
>> 3.SIGBUS 8.Remap
>> /
>> ----------------+------------------------------------
>> |
>> +--2.MF
>> Host /
>> 1.HW error
>>
>> The workflow:
>> 1, HardWare page error occurs randomly.
>> 2, host side handles corrupted page by Memory Failure mechanism, sends
>> SIGBUS to the user process if early-kill is enabled.
>> 3, QEMU handles SIGBUS, if the address belongs to guest RAM, then:
>> 4, QEMU tries to inject MCE into guest.
>> 5, guest handles memory failure again.
>>
>> 1-5 is already supported for a long time, the next steps are supported
>> in this patch(also related driver patch):
>> 6, guest balloon driver gets noticed of the corrupted PFN, and sends
>> request to host side by Recover VQ FrontEnd.
>> 7, QEMU handles request from Recover VQ BackEnd, then:
>> 8, QEMU remaps the corrupted HVA fo fix the memory failure, then:
>> 9, QEMU acks the guest side the result by Recover VQ.
>> 10, guest unpoisons the page if the corrupted page gets recoverd
>> successfully.
>>
>> Then the guest fixes the HW page error dynamiclly without rebooting.
>>
>> Emulate MCE by QEMU, the guest works fine:
>> mce: [Hardware Error]: Machine check events logged
>> Memory failure: 0x61646: recovery action for dirty LRU page: Recovered
>> virtio_balloon virtio5: recovered pfn 0x61646
>> Unpoison: Unpoisoned page 0x61646 by virtio-balloon
>> MCE: Killing stress:24502 due to hardware memory corruption fault at 7f5be2e5a010
>>
>> The 'HardwareCorrupted' in /proc/meminfo also shows 0 kB.
>>
>> Signed-off-by: zhenwei pi <pizhenwei@bytedance.com>
>> ---
>> drivers/virtio/virtio_balloon.c | 243 ++++++++++++++++++++++++++++
>> include/uapi/linux/virtio_balloon.h | 16 ++
>> 2 files changed, 259 insertions(+)
>>
>> diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
>> index f4c34a2a6b8e..f9d95d1d8a4d 100644
>> --- a/drivers/virtio/virtio_balloon.c
>> +++ b/drivers/virtio/virtio_balloon.c
>> @@ -52,6 +52,7 @@ enum virtio_balloon_vq {
>> VIRTIO_BALLOON_VQ_STATS,
>> VIRTIO_BALLOON_VQ_FREE_PAGE,
>> VIRTIO_BALLOON_VQ_REPORTING,
>> + VIRTIO_BALLOON_VQ_RECOVER,
>> VIRTIO_BALLOON_VQ_MAX
>> };
>>
>> @@ -59,6 +60,12 @@ enum virtio_balloon_config_read {
>> VIRTIO_BALLOON_CONFIG_READ_CMD_ID = 0,
>> };
>>
>> +/* the request body to commucate with host side */
>> +struct __virtio_balloon_recover {
>> + struct virtio_balloon_recover vbr;
>> + __virtio32 pfns[VIRTIO_BALLOON_PAGES_PER_PAGE];
>> +};
>> +
>
>
> I don't think this idea of passing 32 bit pfns is going to fly.
> What is wrong with just passing the pages normally as a s/g list?
> this is what is done for the hints at the moment.
>
> neither should you use __virtio types for new functionality
> (should all be __le), nor use __virtio for the struct name.
>
>
Guest side sends GPA/PFN to host side by passing the pages normally as a
s/g list, this is OK.
But in this scenario, guest also needs to get
status(recovered?corrupted?failed to recover?) of page from the host side.
For a normal page(Ex, 4K), the host could return the status quite
immediately. But for the 2M hugetlb of guest RAM, the host should be
pending until the guest requests 512*4K to recover. Once the 2M hugepage
gets recovered(or failed to recover), the host returns 512 PFNs with
status to guest. There are at most 512 recover requests of a single 2M
huge page.
For example, the guest tries to recover a corrupted page:
struct scatterlist status_sg, page_sg, *sgs[2];
sg_init_one(&status_sg, status, sizeof(*status));
sgs[0] = &status_sg;
p = page_address(page);
sg_init_one(&page_sg, p, PAGE_SIZE);
sgs[1] = &page_sg;
virtqueue_add_sgs(recover_vq, sgs, 1, 1, NULL, GFP_ATOMIC);
The host handles 4K recover request on 2M hugepage, this request is
pending until the full 2M huge page gets recovered(or failed).
To avoid too many pending request in virt queue, I designed as this
patch(should use __le), passing PFN in request body, using a single IN
request only.
...
>> --- a/include/uapi/linux/virtio_balloon.h
>> +++ b/include/uapi/linux/virtio_balloon.h
>> @@ -37,6 +37,7 @@
>> #define VIRTIO_BALLOON_F_FREE_PAGE_HINT 3 /* VQ to report free pages */
>> #define VIRTIO_BALLOON_F_PAGE_POISON 4 /* Guest is using page poisoning */
>> #define VIRTIO_BALLOON_F_REPORTING 5 /* Page reporting virtqueue */
>> +#define VIRTIO_BALLOON_F_RECOVER 6 /* Memory recover virtqueue */
>>
>> /* Size of a PFN in the balloon interface. */
>> #define VIRTIO_BALLOON_PFN_SHIFT 12
>
> Please get this feature recorded in the spec with the virtio TC.
> They will also ask you to supply minimal documentation.
>
Sure!
By the way, this feature depends on the memory&&memory-failure
mechanism, what about sending the change of spec to virtio TC after
Andrew and Naoya ack?
--
zhenwei pi
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [PATCH 3/3] virtio_balloon: Introduce memory recover
2022-05-20 7:06 ` zhenwei pi
@ 2022-05-30 7:48 ` David Hildenbrand
-1 siblings, 0 replies; 37+ messages in thread
From: David Hildenbrand @ 2022-05-30 7:48 UTC (permalink / raw)
To: zhenwei pi, akpm, naoya.horiguchi, mst
Cc: linux-mm, linux-kernel, jasowang, virtualization, pbonzini,
peterx, qemu-devel
> +
> struct virtio_balloon {
> struct virtio_device *vdev;
> struct virtqueue *inflate_vq, *deflate_vq, *stats_vq, *free_page_vq;
> @@ -126,6 +133,16 @@ struct virtio_balloon {
> /* Free page reporting device */
> struct virtqueue *reporting_vq;
> struct page_reporting_dev_info pr_dev_info;
> +
> + /* Memory recover VQ - VIRTIO_BALLOON_F_RECOVER */
> + struct virtqueue *recover_vq;
> + spinlock_t recover_vq_lock;
> + struct notifier_block memory_failure_nb;
> + struct list_head corrupted_page_list;
> + struct list_head recovered_page_list;
> + spinlock_t recover_page_list_lock;
> + struct __virtio_balloon_recover in_vbr;
> + struct work_struct unpoison_memory_work;
I assume we want all that only with CONFIG_MEMORY_FAILURE.
> };
>
> static const struct virtio_device_id id_table[] = {
> @@ -494,6 +511,198 @@ static void update_balloon_size_func(struct work_struct *work)
> queue_work(system_freezable_wq, work);
> }
>
> +/*
> + * virtballoon_memory_failure - notified by memory failure, try to fix the
> + * corrupted page.
> + * The memory failure notifier is designed to call back when the kernel handled
> + * successfully only, WARN_ON_ONCE on the unlikely condition to find out any
> + * error(memory error handling is a best effort, not 100% coverd).
> + */
> +static int virtballoon_memory_failure(struct notifier_block *notifier,
> + unsigned long pfn, void *parm)
> +{
> + struct virtio_balloon *vb = container_of(notifier, struct virtio_balloon,
> + memory_failure_nb);
> + struct page *page;
> + struct __virtio_balloon_recover *out_vbr;
> + struct scatterlist sg;
> + unsigned long flags;
> + int err;
> +
> + page = pfn_to_online_page(pfn);
> + if (WARN_ON_ONCE(!page))
> + return NOTIFY_DONE;
> +
> + if (PageHuge(page))
> + return NOTIFY_DONE;
> +
> + if (WARN_ON_ONCE(!PageHWPoison(page)))
> + return NOTIFY_DONE;
> +
> + if (WARN_ON_ONCE(page_count(page) != 1))
> + return NOTIFY_DONE;
Relying on the page_count to be 1 for correctness is usually a bit
shaky, for example, when racing against isolate_movable_page() that
might temporarily bump upo the refcount.
> +
> + get_page(page); /* balloon reference */
> +
> + out_vbr = kzalloc(sizeof(*out_vbr), GFP_KERNEL);
Are we always guaranteed to be able to use GFP_KERNEL out of MCE
context? (IOW, are we never atomic?)
> + if (WARN_ON_ONCE(!out_vbr))
> + return NOTIFY_BAD;
> +
> + spin_lock(&vb->recover_page_list_lock);
> + balloon_page_push(&vb->corrupted_page_list, page);
> + spin_unlock(&vb->recover_page_list_lock);
> +
> + out_vbr->vbr.cmd = VIRTIO_BALLOON_R_CMD_RECOVER;
This makes me wonder if we should have a more generic guest->host
request queue, similar to what e.g., virtio-mem uses, instead of adding
a separate VIRTIO_BALLOON_VQ_RECOVER vq.
> + set_page_pfns(vb, out_vbr->pfns, page);
> + sg_init_one(&sg, out_vbr, sizeof(*out_vbr));
> +
> + spin_lock_irqsave(&vb->recover_vq_lock, flags);
> + err = virtqueue_add_outbuf(vb->recover_vq, &sg, 1, out_vbr, GFP_KERNEL);
> + if (unlikely(err)) {
> + spin_unlock_irqrestore(&vb->recover_vq_lock, flags);
> + return NOTIFY_DONE;
> + }
> + virtqueue_kick(vb->recover_vq);
> + spin_unlock_irqrestore(&vb->recover_vq_lock, flags);
> +
> + return NOTIFY_OK;
> +}
> +
> +static int recover_vq_get_response(struct virtio_balloon *vb)
> +{
> + struct __virtio_balloon_recover *in_vbr;
> + struct scatterlist sg;
> + unsigned long flags;
> + int err;
> +
> + spin_lock_irqsave(&vb->recover_vq_lock, flags);
> + in_vbr = &vb->in_vbr;
> + memset(in_vbr, 0x00, sizeof(*in_vbr));
> + sg_init_one(&sg, in_vbr, sizeof(*in_vbr));
> + err = virtqueue_add_inbuf(vb->recover_vq, &sg, 1, in_vbr, GFP_KERNEL);
> + if (unlikely(err)) {
> + spin_unlock_irqrestore(&vb->recover_vq_lock, flags);
> + return err;
> + }
> +
> + virtqueue_kick(vb->recover_vq);
> + spin_unlock_irqrestore(&vb->recover_vq_lock, flags);
> +
> + return 0;
> +}
> +
> +static void recover_vq_handle_response(struct virtio_balloon *vb, unsigned int len)
> +{
> + struct __virtio_balloon_recover *in_vbr;
> + struct virtio_balloon_recover *vbr;
> + struct page *page;
> + unsigned int pfns;
> + u32 pfn0, pfn1;
> + __u8 status;
> +
> + /* the response is not expected */
> + if (unlikely(len != sizeof(struct __virtio_balloon_recover)))
> + return;
> +
> + in_vbr = &vb->in_vbr;
> + vbr = &in_vbr->vbr;
> + if (unlikely(vbr->cmd != VIRTIO_BALLOON_R_CMD_RESPONSE))
> + return;
> +
> + /* to make sure the contiguous balloon PFNs */
> + for (pfns = 1; pfns < VIRTIO_BALLOON_PAGES_PER_PAGE; pfns++) {
> + pfn0 = virtio32_to_cpu(vb->vdev, in_vbr->pfns[pfns - 1]);
> + pfn1 = virtio32_to_cpu(vb->vdev, in_vbr->pfns[pfns]);
> + if (pfn1 - pfn0 != 1)
> + return;
Yeah, we really shouldn't be dealing with (legacy) 4k PFNs here, but
instead, proper ranges I guess.
> + }
> +
> + pfn0 = virtio32_to_cpu(vb->vdev, in_vbr->pfns[0]);
> + if (!pfn_valid(pfn0))
> + return;
> +
> + pfn1 = -1;
> + spin_lock(&vb->recover_page_list_lock);
> + list_for_each_entry(page, &vb->corrupted_page_list, lru) {
> + pfn1 = page_to_pfn(page);
> + if (pfn1 == pfn0)
> + break;
> + }
> + spin_unlock(&vb->recover_page_list_lock);
> +
> + status = vbr->status;
> + switch (status) {
> + case VIRTIO_BALLOON_R_STATUS_RECOVERED:
> + if (pfn1 == pfn0) {
> + spin_lock(&vb->recover_page_list_lock);
> + list_del(&page->lru);
> + balloon_page_push(&vb->recovered_page_list, page);
We rather not reuse actual balloon functions in !balloon context. Just
move the page to the proper list directly.
> + spin_unlock(&vb->recover_page_list_lock);
> + queue_work(system_freezable_wq, &vb->unpoison_memory_work);
> + dev_info_ratelimited(&vb->vdev->dev, "recovered pfn 0x%x", pfn0);
Well, not yet. Shouldn't this go into unpoison_memory_func() ?
> + }
> + break;
> + case VIRTIO_BALLOON_R_STATUS_FAILED:
> + /* the hypervisor can't fix this corrupted page, balloon puts page */
> + if (pfn1 == pfn0) {
> + spin_lock(&vb->recover_page_list_lock);
> + list_del(&page->lru);
> + spin_unlock(&vb->recover_page_list_lock);
> + put_page(page);
> + dev_info_ratelimited(&vb->vdev->dev, "failed to recover pfn 0x%x", pfn0);
> + }
> + default:
> + break;
> + };
> +
> + /* continue to get response from host side if the response gets handled successfully */
> + recover_vq_get_response(vb);
> +}
> +
> +static void unpoison_memory_func(struct work_struct *work)
> +{
> + struct virtio_balloon *vb;
> + struct page *page;
> +
> + vb = container_of(work, struct virtio_balloon, unpoison_memory_work);
> +
> + do {
> + spin_lock(&vb->recover_page_list_lock);
> + page = list_first_entry_or_null(&vb->recovered_page_list,
> + struct page, lru);
> + if (page)
> + list_del(&page->lru);
> + spin_unlock(&vb->recover_page_list_lock);
> +
> + if (page) {
> + put_page(page);
> + unpoison_memory(page_to_pfn(page), true, "virtio-balloon");
> + }
> + } while (page);
> +}
> +
> +static void recover_vq_cb(struct virtqueue *vq)
> +{
> + struct virtio_balloon *vb = vq->vdev->priv;
> + struct __virtio_balloon_recover *vbr;
> + unsigned long flags;
> + unsigned int len;
> +
> + spin_lock_irqsave(&vb->recover_vq_lock, flags);
> + do {
> + virtqueue_disable_cb(vq);
> + while ((vbr = virtqueue_get_buf(vq, &len)) != NULL) {
> + spin_unlock_irqrestore(&vb->recover_vq_lock, flags);
> + if (vbr == &vb->in_vbr)
> + recover_vq_handle_response(vb, len);
> + else
> + kfree(vbr); /* just free the memory for out vbr request */
> + spin_lock_irqsave(&vb->recover_vq_lock, flags);
> + }
> + } while (!virtqueue_enable_cb(vq));
> + spin_unlock_irqrestore(&vb->recover_vq_lock, flags);
> +}
> +
[...]
>
> +out_unregister_reporting:
> + if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_REPORTING))
> + page_reporting_unregister(&vb->pr_dev_info);
> out_unregister_oom:
> if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_DEFLATE_ON_OOM))
> unregister_oom_notifier(&vb->oom_nb);
> @@ -1082,6 +1319,11 @@ static void virtballoon_remove(struct virtio_device *vdev)
> destroy_workqueue(vb->balloon_wq);
> }
>
> + if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_RECOVER)) {
Could the notifier already have been triggered and we might be using the
device before already fully initialized from the notifier and might end
up leaking memory here that we allocated?
> + unregister_memory_failure_notifier(&vb->memory_failure_nb);
> + cancel_work_sync(&vb->unpoison_memory_work);
> + }
> +
Could we be leaking memory from the virtballoon_remove() path?
--
Thanks,
David / dhildenb
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: [PATCH 3/3] virtio_balloon: Introduce memory recover
@ 2022-05-30 7:48 ` David Hildenbrand
0 siblings, 0 replies; 37+ messages in thread
From: David Hildenbrand @ 2022-05-30 7:48 UTC (permalink / raw)
To: zhenwei pi, akpm, naoya.horiguchi, mst
Cc: qemu-devel, linux-kernel, virtualization, linux-mm, pbonzini
> +
> struct virtio_balloon {
> struct virtio_device *vdev;
> struct virtqueue *inflate_vq, *deflate_vq, *stats_vq, *free_page_vq;
> @@ -126,6 +133,16 @@ struct virtio_balloon {
> /* Free page reporting device */
> struct virtqueue *reporting_vq;
> struct page_reporting_dev_info pr_dev_info;
> +
> + /* Memory recover VQ - VIRTIO_BALLOON_F_RECOVER */
> + struct virtqueue *recover_vq;
> + spinlock_t recover_vq_lock;
> + struct notifier_block memory_failure_nb;
> + struct list_head corrupted_page_list;
> + struct list_head recovered_page_list;
> + spinlock_t recover_page_list_lock;
> + struct __virtio_balloon_recover in_vbr;
> + struct work_struct unpoison_memory_work;
I assume we want all that only with CONFIG_MEMORY_FAILURE.
> };
>
> static const struct virtio_device_id id_table[] = {
> @@ -494,6 +511,198 @@ static void update_balloon_size_func(struct work_struct *work)
> queue_work(system_freezable_wq, work);
> }
>
> +/*
> + * virtballoon_memory_failure - notified by memory failure, try to fix the
> + * corrupted page.
> + * The memory failure notifier is designed to call back when the kernel handled
> + * successfully only, WARN_ON_ONCE on the unlikely condition to find out any
> + * error(memory error handling is a best effort, not 100% coverd).
> + */
> +static int virtballoon_memory_failure(struct notifier_block *notifier,
> + unsigned long pfn, void *parm)
> +{
> + struct virtio_balloon *vb = container_of(notifier, struct virtio_balloon,
> + memory_failure_nb);
> + struct page *page;
> + struct __virtio_balloon_recover *out_vbr;
> + struct scatterlist sg;
> + unsigned long flags;
> + int err;
> +
> + page = pfn_to_online_page(pfn);
> + if (WARN_ON_ONCE(!page))
> + return NOTIFY_DONE;
> +
> + if (PageHuge(page))
> + return NOTIFY_DONE;
> +
> + if (WARN_ON_ONCE(!PageHWPoison(page)))
> + return NOTIFY_DONE;
> +
> + if (WARN_ON_ONCE(page_count(page) != 1))
> + return NOTIFY_DONE;
Relying on the page_count to be 1 for correctness is usually a bit
shaky, for example, when racing against isolate_movable_page() that
might temporarily bump upo the refcount.
> +
> + get_page(page); /* balloon reference */
> +
> + out_vbr = kzalloc(sizeof(*out_vbr), GFP_KERNEL);
Are we always guaranteed to be able to use GFP_KERNEL out of MCE
context? (IOW, are we never atomic?)
> + if (WARN_ON_ONCE(!out_vbr))
> + return NOTIFY_BAD;
> +
> + spin_lock(&vb->recover_page_list_lock);
> + balloon_page_push(&vb->corrupted_page_list, page);
> + spin_unlock(&vb->recover_page_list_lock);
> +
> + out_vbr->vbr.cmd = VIRTIO_BALLOON_R_CMD_RECOVER;
This makes me wonder if we should have a more generic guest->host
request queue, similar to what e.g., virtio-mem uses, instead of adding
a separate VIRTIO_BALLOON_VQ_RECOVER vq.
> + set_page_pfns(vb, out_vbr->pfns, page);
> + sg_init_one(&sg, out_vbr, sizeof(*out_vbr));
> +
> + spin_lock_irqsave(&vb->recover_vq_lock, flags);
> + err = virtqueue_add_outbuf(vb->recover_vq, &sg, 1, out_vbr, GFP_KERNEL);
> + if (unlikely(err)) {
> + spin_unlock_irqrestore(&vb->recover_vq_lock, flags);
> + return NOTIFY_DONE;
> + }
> + virtqueue_kick(vb->recover_vq);
> + spin_unlock_irqrestore(&vb->recover_vq_lock, flags);
> +
> + return NOTIFY_OK;
> +}
> +
> +static int recover_vq_get_response(struct virtio_balloon *vb)
> +{
> + struct __virtio_balloon_recover *in_vbr;
> + struct scatterlist sg;
> + unsigned long flags;
> + int err;
> +
> + spin_lock_irqsave(&vb->recover_vq_lock, flags);
> + in_vbr = &vb->in_vbr;
> + memset(in_vbr, 0x00, sizeof(*in_vbr));
> + sg_init_one(&sg, in_vbr, sizeof(*in_vbr));
> + err = virtqueue_add_inbuf(vb->recover_vq, &sg, 1, in_vbr, GFP_KERNEL);
> + if (unlikely(err)) {
> + spin_unlock_irqrestore(&vb->recover_vq_lock, flags);
> + return err;
> + }
> +
> + virtqueue_kick(vb->recover_vq);
> + spin_unlock_irqrestore(&vb->recover_vq_lock, flags);
> +
> + return 0;
> +}
> +
> +static void recover_vq_handle_response(struct virtio_balloon *vb, unsigned int len)
> +{
> + struct __virtio_balloon_recover *in_vbr;
> + struct virtio_balloon_recover *vbr;
> + struct page *page;
> + unsigned int pfns;
> + u32 pfn0, pfn1;
> + __u8 status;
> +
> + /* the response is not expected */
> + if (unlikely(len != sizeof(struct __virtio_balloon_recover)))
> + return;
> +
> + in_vbr = &vb->in_vbr;
> + vbr = &in_vbr->vbr;
> + if (unlikely(vbr->cmd != VIRTIO_BALLOON_R_CMD_RESPONSE))
> + return;
> +
> + /* to make sure the contiguous balloon PFNs */
> + for (pfns = 1; pfns < VIRTIO_BALLOON_PAGES_PER_PAGE; pfns++) {
> + pfn0 = virtio32_to_cpu(vb->vdev, in_vbr->pfns[pfns - 1]);
> + pfn1 = virtio32_to_cpu(vb->vdev, in_vbr->pfns[pfns]);
> + if (pfn1 - pfn0 != 1)
> + return;
Yeah, we really shouldn't be dealing with (legacy) 4k PFNs here, but
instead, proper ranges I guess.
> + }
> +
> + pfn0 = virtio32_to_cpu(vb->vdev, in_vbr->pfns[0]);
> + if (!pfn_valid(pfn0))
> + return;
> +
> + pfn1 = -1;
> + spin_lock(&vb->recover_page_list_lock);
> + list_for_each_entry(page, &vb->corrupted_page_list, lru) {
> + pfn1 = page_to_pfn(page);
> + if (pfn1 == pfn0)
> + break;
> + }
> + spin_unlock(&vb->recover_page_list_lock);
> +
> + status = vbr->status;
> + switch (status) {
> + case VIRTIO_BALLOON_R_STATUS_RECOVERED:
> + if (pfn1 == pfn0) {
> + spin_lock(&vb->recover_page_list_lock);
> + list_del(&page->lru);
> + balloon_page_push(&vb->recovered_page_list, page);
We rather not reuse actual balloon functions in !balloon context. Just
move the page to the proper list directly.
> + spin_unlock(&vb->recover_page_list_lock);
> + queue_work(system_freezable_wq, &vb->unpoison_memory_work);
> + dev_info_ratelimited(&vb->vdev->dev, "recovered pfn 0x%x", pfn0);
Well, not yet. Shouldn't this go into unpoison_memory_func() ?
> + }
> + break;
> + case VIRTIO_BALLOON_R_STATUS_FAILED:
> + /* the hypervisor can't fix this corrupted page, balloon puts page */
> + if (pfn1 == pfn0) {
> + spin_lock(&vb->recover_page_list_lock);
> + list_del(&page->lru);
> + spin_unlock(&vb->recover_page_list_lock);
> + put_page(page);
> + dev_info_ratelimited(&vb->vdev->dev, "failed to recover pfn 0x%x", pfn0);
> + }
> + default:
> + break;
> + };
> +
> + /* continue to get response from host side if the response gets handled successfully */
> + recover_vq_get_response(vb);
> +}
> +
> +static void unpoison_memory_func(struct work_struct *work)
> +{
> + struct virtio_balloon *vb;
> + struct page *page;
> +
> + vb = container_of(work, struct virtio_balloon, unpoison_memory_work);
> +
> + do {
> + spin_lock(&vb->recover_page_list_lock);
> + page = list_first_entry_or_null(&vb->recovered_page_list,
> + struct page, lru);
> + if (page)
> + list_del(&page->lru);
> + spin_unlock(&vb->recover_page_list_lock);
> +
> + if (page) {
> + put_page(page);
> + unpoison_memory(page_to_pfn(page), true, "virtio-balloon");
> + }
> + } while (page);
> +}
> +
> +static void recover_vq_cb(struct virtqueue *vq)
> +{
> + struct virtio_balloon *vb = vq->vdev->priv;
> + struct __virtio_balloon_recover *vbr;
> + unsigned long flags;
> + unsigned int len;
> +
> + spin_lock_irqsave(&vb->recover_vq_lock, flags);
> + do {
> + virtqueue_disable_cb(vq);
> + while ((vbr = virtqueue_get_buf(vq, &len)) != NULL) {
> + spin_unlock_irqrestore(&vb->recover_vq_lock, flags);
> + if (vbr == &vb->in_vbr)
> + recover_vq_handle_response(vb, len);
> + else
> + kfree(vbr); /* just free the memory for out vbr request */
> + spin_lock_irqsave(&vb->recover_vq_lock, flags);
> + }
> + } while (!virtqueue_enable_cb(vq));
> + spin_unlock_irqrestore(&vb->recover_vq_lock, flags);
> +}
> +
[...]
>
> +out_unregister_reporting:
> + if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_REPORTING))
> + page_reporting_unregister(&vb->pr_dev_info);
> out_unregister_oom:
> if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_DEFLATE_ON_OOM))
> unregister_oom_notifier(&vb->oom_nb);
> @@ -1082,6 +1319,11 @@ static void virtballoon_remove(struct virtio_device *vdev)
> destroy_workqueue(vb->balloon_wq);
> }
>
> + if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_RECOVER)) {
Could the notifier already have been triggered and we might be using the
device before already fully initialized from the notifier and might end
up leaking memory here that we allocated?
> + unregister_memory_failure_notifier(&vb->memory_failure_nb);
> + cancel_work_sync(&vb->unpoison_memory_work);
> + }
> +
Could we be leaking memory from the virtballoon_remove() path?
--
Thanks,
David / dhildenb
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: Re: [PATCH 3/3] virtio_balloon: Introduce memory recover
2022-05-30 7:48 ` David Hildenbrand
@ 2022-05-30 12:47 ` zhenwei pi
-1 siblings, 0 replies; 37+ messages in thread
From: zhenwei pi @ 2022-05-30 12:47 UTC (permalink / raw)
To: David Hildenbrand
Cc: linux-mm, linux-kernel, mst, naoya.horiguchi, akpm, jasowang,
virtualization, pbonzini, peterx, qemu-devel
On 5/30/22 15:48, David Hildenbrand wrote:
>
>> +
>> struct virtio_balloon {
>> struct virtio_device *vdev;
>> struct virtqueue *inflate_vq, *deflate_vq, *stats_vq, *free_page_vq;
>> @@ -126,6 +133,16 @@ struct virtio_balloon {
>> /* Free page reporting device */
>> struct virtqueue *reporting_vq;
>> struct page_reporting_dev_info pr_dev_info;
>> +
>> + /* Memory recover VQ - VIRTIO_BALLOON_F_RECOVER */
>> + struct virtqueue *recover_vq;
>> + spinlock_t recover_vq_lock;
>> + struct notifier_block memory_failure_nb;
>> + struct list_head corrupted_page_list;
>> + struct list_head recovered_page_list;
>> + spinlock_t recover_page_list_lock;
>> + struct __virtio_balloon_recover in_vbr;
>> + struct work_struct unpoison_memory_work;
>
> I assume we want all that only with CONFIG_MEMORY_FAILURE.
>
Sorry, I missed this.
>> };
>>
>> static const struct virtio_device_id id_table[] = {
>> @@ -494,6 +511,198 @@ static void update_balloon_size_func(struct work_struct *work)
>> queue_work(system_freezable_wq, work);
>> }
>>
>> +/*
>> + * virtballoon_memory_failure - notified by memory failure, try to fix the
>> + * corrupted page.
>> + * The memory failure notifier is designed to call back when the kernel handled
>> + * successfully only, WARN_ON_ONCE on the unlikely condition to find out any
>> + * error(memory error handling is a best effort, not 100% coverd).
>> + */
>> +static int virtballoon_memory_failure(struct notifier_block *notifier,
>> + unsigned long pfn, void *parm)
>> +{
>> + struct virtio_balloon *vb = container_of(notifier, struct virtio_balloon,
>> + memory_failure_nb);
>> + struct page *page;
>> + struct __virtio_balloon_recover *out_vbr;
>> + struct scatterlist sg;
>> + unsigned long flags;
>> + int err;
>> +
>> + page = pfn_to_online_page(pfn);
>> + if (WARN_ON_ONCE(!page))
>> + return NOTIFY_DONE;
>> +
>> + if (PageHuge(page))
>> + return NOTIFY_DONE;
>> +
>> + if (WARN_ON_ONCE(!PageHWPoison(page)))
>> + return NOTIFY_DONE;
>> +
>> + if (WARN_ON_ONCE(page_count(page) != 1))
>> + return NOTIFY_DONE;
>
> Relying on the page_count to be 1 for correctness is usually a bit
> shaky, for example, when racing against isolate_movable_page() that
> might temporarily bump upo the refcount.
>
The memory notifier is designed to call the chain if a page gets result
MF_RECOVERED only:
if (result == MF_RECOVERED)
blocking_notifier_call_chain(&mf_notifier_list, pfn, NULL);
>> +
>> + get_page(page); /* balloon reference */
>> +
>> + out_vbr = kzalloc(sizeof(*out_vbr), GFP_KERNEL);
>
> Are we always guaranteed to be able to use GFP_KERNEL out of MCE
> context? (IOW, are we never atomic?)
>
>> + if (WARN_ON_ONCE(!out_vbr))
>> + return NOTIFY_BAD;
>> +
>> + spin_lock(&vb->recover_page_list_lock);
>> + balloon_page_push(&vb->corrupted_page_list, page);
>> + spin_unlock(&vb->recover_page_list_lock);
>> +
>> + out_vbr->vbr.cmd = VIRTIO_BALLOON_R_CMD_RECOVER;
>
> This makes me wonder if we should have a more generic guest->host
> request queue, similar to what e.g., virtio-mem uses, instead of adding
> a separate VIRTIO_BALLOON_VQ_RECOVER vq.
>
I'm OK with either one, I'll follow your decision! :D
>> + set_page_pfns(vb, out_vbr->pfns, page);
>> + sg_init_one(&sg, out_vbr, sizeof(*out_vbr));
>> +
>> + spin_lock_irqsave(&vb->recover_vq_lock, flags);
>> + err = virtqueue_add_outbuf(vb->recover_vq, &sg, 1, out_vbr, GFP_KERNEL);
>> + if (unlikely(err)) {
>> + spin_unlock_irqrestore(&vb->recover_vq_lock, flags);
>> + return NOTIFY_DONE;
>> + }
>> + virtqueue_kick(vb->recover_vq);
>> + spin_unlock_irqrestore(&vb->recover_vq_lock, flags);
>> +
>> + return NOTIFY_OK;
>> +}
>> +
>> +static int recover_vq_get_response(struct virtio_balloon *vb)
>> +{
>> + struct __virtio_balloon_recover *in_vbr;
>> + struct scatterlist sg;
>> + unsigned long flags;
>> + int err;
>> +
>> + spin_lock_irqsave(&vb->recover_vq_lock, flags);
>> + in_vbr = &vb->in_vbr;
>> + memset(in_vbr, 0x00, sizeof(*in_vbr));
>> + sg_init_one(&sg, in_vbr, sizeof(*in_vbr));
>> + err = virtqueue_add_inbuf(vb->recover_vq, &sg, 1, in_vbr, GFP_KERNEL);
>> + if (unlikely(err)) {
>> + spin_unlock_irqrestore(&vb->recover_vq_lock, flags);
>> + return err;
>> + }
>> +
>> + virtqueue_kick(vb->recover_vq);
>> + spin_unlock_irqrestore(&vb->recover_vq_lock, flags);
>> +
>> + return 0;
>> +}
>> +
>> +static void recover_vq_handle_response(struct virtio_balloon *vb, unsigned int len)
>> +{
>> + struct __virtio_balloon_recover *in_vbr;
>> + struct virtio_balloon_recover *vbr;
>> + struct page *page;
>> + unsigned int pfns;
>> + u32 pfn0, pfn1;
>> + __u8 status;
>> +
>> + /* the response is not expected */
>> + if (unlikely(len != sizeof(struct __virtio_balloon_recover)))
>> + return;
>> +
>> + in_vbr = &vb->in_vbr;
>> + vbr = &in_vbr->vbr;
>> + if (unlikely(vbr->cmd != VIRTIO_BALLOON_R_CMD_RESPONSE))
>> + return;
>> +
>> + /* to make sure the contiguous balloon PFNs */
>> + for (pfns = 1; pfns < VIRTIO_BALLOON_PAGES_PER_PAGE; pfns++) {
>> + pfn0 = virtio32_to_cpu(vb->vdev, in_vbr->pfns[pfns - 1]);
>> + pfn1 = virtio32_to_cpu(vb->vdev, in_vbr->pfns[pfns]);
>> + if (pfn1 - pfn0 != 1)
>> + return;
>
> Yeah, we really shouldn't be dealing with (legacy) 4k PFNs here, but
> instead, proper ranges I guess.
>
MST also pointed out this, I explained in this link:
https://lkml.org/lkml/2022/5/26/942
Rather than page reporting style, virtio-mem style should be fine. Ex,
struct virtio_memory_recover {
__virtio64 addr;
__virtio32 length;
__virtio16 padding[2];
};
>> + }
>> +
>> + pfn0 = virtio32_to_cpu(vb->vdev, in_vbr->pfns[0]);
>> + if (!pfn_valid(pfn0))
>> + return;
>> +
>> + pfn1 = -1;
>> + spin_lock(&vb->recover_page_list_lock);
>> + list_for_each_entry(page, &vb->corrupted_page_list, lru) {
>> + pfn1 = page_to_pfn(page);
>> + if (pfn1 == pfn0)
>> + break;
>> + }
>> + spin_unlock(&vb->recover_page_list_lock);
>> +
>> + status = vbr->status;
>> + switch (status) {
>> + case VIRTIO_BALLOON_R_STATUS_RECOVERED:
>> + if (pfn1 == pfn0) {
>> + spin_lock(&vb->recover_page_list_lock);
>> + list_del(&page->lru);
>> + balloon_page_push(&vb->recovered_page_list, page);
>
> We rather not reuse actual balloon functions in !balloon context. Just
> move the page to the proper list directly.
>
OK.
>> + spin_unlock(&vb->recover_page_list_lock);
>> + queue_work(system_freezable_wq, &vb->unpoison_memory_work);
>> + dev_info_ratelimited(&vb->vdev->dev, "recovered pfn 0x%x", pfn0);
>
> Well, not yet. Shouldn't this go into unpoison_memory_func() ?
>
OK.
[...]
>
>>
>> +out_unregister_reporting:
>> + if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_REPORTING))
>> + page_reporting_unregister(&vb->pr_dev_info);
>> out_unregister_oom:
>> if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_DEFLATE_ON_OOM))
>> unregister_oom_notifier(&vb->oom_nb);
>> @@ -1082,6 +1319,11 @@ static void virtballoon_remove(struct virtio_device *vdev)
>> destroy_workqueue(vb->balloon_wq);
>> }
>>
>> + if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_RECOVER)) {
>
> Could the notifier already have been triggered and we might be using the
> device before already fully initialized from the notifier and might end
> up leaking memory here that we allocated?
>
>> + unregister_memory_failure_notifier(&vb->memory_failure_nb);
>> + cancel_work_sync(&vb->unpoison_memory_work);
>> + }
>> +
>
> Could we be leaking memory from the virtballoon_remove() path?
>
Yes, I'll fix the possible memory leak here.
Thanks a lot.
--
zhenwei pi
^ permalink raw reply [flat|nested] 37+ messages in thread
* Re: Re: [PATCH 3/3] virtio_balloon: Introduce memory recover
@ 2022-05-30 12:47 ` zhenwei pi
0 siblings, 0 replies; 37+ messages in thread
From: zhenwei pi @ 2022-05-30 12:47 UTC (permalink / raw)
To: David Hildenbrand
Cc: mst, qemu-devel, naoya.horiguchi, linux-kernel, virtualization,
linux-mm, pbonzini, akpm
On 5/30/22 15:48, David Hildenbrand wrote:
>
>> +
>> struct virtio_balloon {
>> struct virtio_device *vdev;
>> struct virtqueue *inflate_vq, *deflate_vq, *stats_vq, *free_page_vq;
>> @@ -126,6 +133,16 @@ struct virtio_balloon {
>> /* Free page reporting device */
>> struct virtqueue *reporting_vq;
>> struct page_reporting_dev_info pr_dev_info;
>> +
>> + /* Memory recover VQ - VIRTIO_BALLOON_F_RECOVER */
>> + struct virtqueue *recover_vq;
>> + spinlock_t recover_vq_lock;
>> + struct notifier_block memory_failure_nb;
>> + struct list_head corrupted_page_list;
>> + struct list_head recovered_page_list;
>> + spinlock_t recover_page_list_lock;
>> + struct __virtio_balloon_recover in_vbr;
>> + struct work_struct unpoison_memory_work;
>
> I assume we want all that only with CONFIG_MEMORY_FAILURE.
>
Sorry, I missed this.
>> };
>>
>> static const struct virtio_device_id id_table[] = {
>> @@ -494,6 +511,198 @@ static void update_balloon_size_func(struct work_struct *work)
>> queue_work(system_freezable_wq, work);
>> }
>>
>> +/*
>> + * virtballoon_memory_failure - notified by memory failure, try to fix the
>> + * corrupted page.
>> + * The memory failure notifier is designed to call back when the kernel handled
>> + * successfully only, WARN_ON_ONCE on the unlikely condition to find out any
>> + * error(memory error handling is a best effort, not 100% coverd).
>> + */
>> +static int virtballoon_memory_failure(struct notifier_block *notifier,
>> + unsigned long pfn, void *parm)
>> +{
>> + struct virtio_balloon *vb = container_of(notifier, struct virtio_balloon,
>> + memory_failure_nb);
>> + struct page *page;
>> + struct __virtio_balloon_recover *out_vbr;
>> + struct scatterlist sg;
>> + unsigned long flags;
>> + int err;
>> +
>> + page = pfn_to_online_page(pfn);
>> + if (WARN_ON_ONCE(!page))
>> + return NOTIFY_DONE;
>> +
>> + if (PageHuge(page))
>> + return NOTIFY_DONE;
>> +
>> + if (WARN_ON_ONCE(!PageHWPoison(page)))
>> + return NOTIFY_DONE;
>> +
>> + if (WARN_ON_ONCE(page_count(page) != 1))
>> + return NOTIFY_DONE;
>
> Relying on the page_count to be 1 for correctness is usually a bit
> shaky, for example, when racing against isolate_movable_page() that
> might temporarily bump upo the refcount.
>
The memory notifier is designed to call the chain if a page gets result
MF_RECOVERED only:
if (result == MF_RECOVERED)
blocking_notifier_call_chain(&mf_notifier_list, pfn, NULL);
>> +
>> + get_page(page); /* balloon reference */
>> +
>> + out_vbr = kzalloc(sizeof(*out_vbr), GFP_KERNEL);
>
> Are we always guaranteed to be able to use GFP_KERNEL out of MCE
> context? (IOW, are we never atomic?)
>
>> + if (WARN_ON_ONCE(!out_vbr))
>> + return NOTIFY_BAD;
>> +
>> + spin_lock(&vb->recover_page_list_lock);
>> + balloon_page_push(&vb->corrupted_page_list, page);
>> + spin_unlock(&vb->recover_page_list_lock);
>> +
>> + out_vbr->vbr.cmd = VIRTIO_BALLOON_R_CMD_RECOVER;
>
> This makes me wonder if we should have a more generic guest->host
> request queue, similar to what e.g., virtio-mem uses, instead of adding
> a separate VIRTIO_BALLOON_VQ_RECOVER vq.
>
I'm OK with either one, I'll follow your decision! :D
>> + set_page_pfns(vb, out_vbr->pfns, page);
>> + sg_init_one(&sg, out_vbr, sizeof(*out_vbr));
>> +
>> + spin_lock_irqsave(&vb->recover_vq_lock, flags);
>> + err = virtqueue_add_outbuf(vb->recover_vq, &sg, 1, out_vbr, GFP_KERNEL);
>> + if (unlikely(err)) {
>> + spin_unlock_irqrestore(&vb->recover_vq_lock, flags);
>> + return NOTIFY_DONE;
>> + }
>> + virtqueue_kick(vb->recover_vq);
>> + spin_unlock_irqrestore(&vb->recover_vq_lock, flags);
>> +
>> + return NOTIFY_OK;
>> +}
>> +
>> +static int recover_vq_get_response(struct virtio_balloon *vb)
>> +{
>> + struct __virtio_balloon_recover *in_vbr;
>> + struct scatterlist sg;
>> + unsigned long flags;
>> + int err;
>> +
>> + spin_lock_irqsave(&vb->recover_vq_lock, flags);
>> + in_vbr = &vb->in_vbr;
>> + memset(in_vbr, 0x00, sizeof(*in_vbr));
>> + sg_init_one(&sg, in_vbr, sizeof(*in_vbr));
>> + err = virtqueue_add_inbuf(vb->recover_vq, &sg, 1, in_vbr, GFP_KERNEL);
>> + if (unlikely(err)) {
>> + spin_unlock_irqrestore(&vb->recover_vq_lock, flags);
>> + return err;
>> + }
>> +
>> + virtqueue_kick(vb->recover_vq);
>> + spin_unlock_irqrestore(&vb->recover_vq_lock, flags);
>> +
>> + return 0;
>> +}
>> +
>> +static void recover_vq_handle_response(struct virtio_balloon *vb, unsigned int len)
>> +{
>> + struct __virtio_balloon_recover *in_vbr;
>> + struct virtio_balloon_recover *vbr;
>> + struct page *page;
>> + unsigned int pfns;
>> + u32 pfn0, pfn1;
>> + __u8 status;
>> +
>> + /* the response is not expected */
>> + if (unlikely(len != sizeof(struct __virtio_balloon_recover)))
>> + return;
>> +
>> + in_vbr = &vb->in_vbr;
>> + vbr = &in_vbr->vbr;
>> + if (unlikely(vbr->cmd != VIRTIO_BALLOON_R_CMD_RESPONSE))
>> + return;
>> +
>> + /* to make sure the contiguous balloon PFNs */
>> + for (pfns = 1; pfns < VIRTIO_BALLOON_PAGES_PER_PAGE; pfns++) {
>> + pfn0 = virtio32_to_cpu(vb->vdev, in_vbr->pfns[pfns - 1]);
>> + pfn1 = virtio32_to_cpu(vb->vdev, in_vbr->pfns[pfns]);
>> + if (pfn1 - pfn0 != 1)
>> + return;
>
> Yeah, we really shouldn't be dealing with (legacy) 4k PFNs here, but
> instead, proper ranges I guess.
>
MST also pointed out this, I explained in this link:
https://lkml.org/lkml/2022/5/26/942
Rather than page reporting style, virtio-mem style should be fine. Ex,
struct virtio_memory_recover {
__virtio64 addr;
__virtio32 length;
__virtio16 padding[2];
};
>> + }
>> +
>> + pfn0 = virtio32_to_cpu(vb->vdev, in_vbr->pfns[0]);
>> + if (!pfn_valid(pfn0))
>> + return;
>> +
>> + pfn1 = -1;
>> + spin_lock(&vb->recover_page_list_lock);
>> + list_for_each_entry(page, &vb->corrupted_page_list, lru) {
>> + pfn1 = page_to_pfn(page);
>> + if (pfn1 == pfn0)
>> + break;
>> + }
>> + spin_unlock(&vb->recover_page_list_lock);
>> +
>> + status = vbr->status;
>> + switch (status) {
>> + case VIRTIO_BALLOON_R_STATUS_RECOVERED:
>> + if (pfn1 == pfn0) {
>> + spin_lock(&vb->recover_page_list_lock);
>> + list_del(&page->lru);
>> + balloon_page_push(&vb->recovered_page_list, page);
>
> We rather not reuse actual balloon functions in !balloon context. Just
> move the page to the proper list directly.
>
OK.
>> + spin_unlock(&vb->recover_page_list_lock);
>> + queue_work(system_freezable_wq, &vb->unpoison_memory_work);
>> + dev_info_ratelimited(&vb->vdev->dev, "recovered pfn 0x%x", pfn0);
>
> Well, not yet. Shouldn't this go into unpoison_memory_func() ?
>
OK.
[...]
>
>>
>> +out_unregister_reporting:
>> + if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_REPORTING))
>> + page_reporting_unregister(&vb->pr_dev_info);
>> out_unregister_oom:
>> if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_DEFLATE_ON_OOM))
>> unregister_oom_notifier(&vb->oom_nb);
>> @@ -1082,6 +1319,11 @@ static void virtballoon_remove(struct virtio_device *vdev)
>> destroy_workqueue(vb->balloon_wq);
>> }
>>
>> + if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_RECOVER)) {
>
> Could the notifier already have been triggered and we might be using the
> device before already fully initialized from the notifier and might end
> up leaking memory here that we allocated?
>
>> + unregister_memory_failure_notifier(&vb->memory_failure_nb);
>> + cancel_work_sync(&vb->unpoison_memory_work);
>> + }
>> +
>
> Could we be leaking memory from the virtballoon_remove() path?
>
Yes, I'll fix the possible memory leak here.
Thanks a lot.
--
zhenwei pi
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization
^ permalink raw reply [flat|nested] 37+ messages in thread