* Re: [PATCH] vfio-pci: report recovery event after device recovery successful
@ 2022-04-25 1:34 Wangjing(Hogan)
0 siblings, 0 replies; 3+ messages in thread
From: Wangjing(Hogan) @ 2022-04-25 1:34 UTC (permalink / raw)
To: jgg, yishaih, Shameerali Kolothum Thodi, kevin.tian, kvm
Cc: Huangweidong (C), Yechuan
Ping
>
> As you all know, device faults are classified into the following
> types: unrecoverable errors and recoverable errors. The vfio-pci driver
> reports an error event to the user-space process when the device hits a
> hardware error, and reports another error event after the device has
> recovered successfully. So a user-space process such as qemu cannot tell
> whether an event is a hardware error event or a successful device
> recovery event. To solve this problem, add an eventfd named
> recov_trigger to report the successful device recovery event, so that
> the user-space process can decide whether to process the recovery event
> or not.
>
> Signed-off-by: Hogan Wang <hogan.wang@huawei.com>
> ---
> drivers/vfio/pci/vfio_pci_core.c | 13 +++++++++++--
> drivers/vfio/pci/vfio_pci_intrs.c | 19 +++++++++++++++++++
> include/linux/vfio_pci_core.h | 1 +
> include/uapi/linux/vfio.h | 1 +
> 4 files changed, 32 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
> index b7bb16f92ac6..2360cb44aa36 100644
> --- a/drivers/vfio/pci/vfio_pci_core.c
> +++ b/drivers/vfio/pci/vfio_pci_core.c
> @@ -483,6 +483,10 @@ void vfio_pci_core_close_device(struct vfio_device
> *core_vdev)
> eventfd_ctx_put(vdev->err_trigger);
> vdev->err_trigger = NULL;
> }
> + if (vdev->recov_trigger) {
> + eventfd_ctx_put(vdev->recov_trigger);
> + vdev->recov_trigger = NULL;
> + }
> if (vdev->req_trigger) {
> eventfd_ctx_put(vdev->req_trigger);
> vdev->req_trigger = NULL;
> @@ -1922,8 +1926,13 @@ pci_ers_result_t
> vfio_pci_core_aer_err_detected(struct pci_dev *pdev,
>
> mutex_lock(&vdev->igate);
>
> - if (vdev->err_trigger)
> - eventfd_signal(vdev->err_trigger, 1);
> + if (state == pci_channel_io_normal) {
> + if (vdev->recov_trigger)
> + eventfd_signal(vdev->recov_trigger, 1);
> + } else {
> + if (vdev->err_trigger)
> + eventfd_signal(vdev->err_trigger, 1);
> + }
>
> mutex_unlock(&vdev->igate);
>
> diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c
> index 6069a11fb51a..be76ff76c361 100644
> --- a/drivers/vfio/pci/vfio_pci_intrs.c
> +++ b/drivers/vfio/pci/vfio_pci_intrs.c
> @@ -624,6 +624,17 @@ static int vfio_pci_set_err_trigger(struct
> vfio_pci_core_device *vdev,
> count, flags, data);
> }
>
> +static int vfio_pci_set_recov_trigger(struct vfio_pci_core_device *vdev,
> + unsigned index, unsigned start,
> + unsigned count, uint32_t flags, void *data) {
> + if (index != VFIO_PCI_ERR_IRQ_INDEX || start != 0 || count > 1)
> + return -EINVAL;
> +
> + return vfio_pci_set_ctx_trigger_single(&vdev->recov_trigger,
> + count, flags, data);
> +}
> +
> static int vfio_pci_set_req_trigger(struct vfio_pci_core_device *vdev,
> unsigned index, unsigned start,
> unsigned count, uint32_t flags, void *data) @@
> -684,6 +695,14 @@ int vfio_pci_set_irqs_ioctl(struct vfio_pci_core_device
> *vdev, uint32_t flags,
> break;
> }
> break;
> + case VFIO_PCI_RECOV_IRQ_INDEX:
> + switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) {
> + case VFIO_IRQ_SET_ACTION_TRIGGER:
> + if (pci_is_pcie(vdev->pdev))
> + func = vfio_pci_set_recov_trigger;
> + break;
> + }
> + break;
> }
>
> if (!func)
> diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h index
> 74a4a0f17b28..d94addb18118 100644
> --- a/include/linux/vfio_pci_core.h
> +++ b/include/linux/vfio_pci_core.h
> @@ -128,6 +128,7 @@ struct vfio_pci_core_device {
> struct pci_saved_state *pm_save;
> int ioeventfds_nr;
> struct eventfd_ctx *err_trigger;
> + struct eventfd_ctx *recov_trigger;
> struct eventfd_ctx *req_trigger;
> struct list_head dummy_resources_list;
> struct mutex ioeventfds_lock;
> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index
> fea86061b44e..f88a6ca62c49 100644
> --- a/include/uapi/linux/vfio.h
> +++ b/include/uapi/linux/vfio.h
> @@ -621,6 +621,7 @@ enum {
> VFIO_PCI_MSIX_IRQ_INDEX,
> VFIO_PCI_ERR_IRQ_INDEX,
> VFIO_PCI_REQ_IRQ_INDEX,
> + VFIO_PCI_RECOV_IRQ_INDEX,
> VFIO_PCI_NUM_IRQS
> };
>
> --
> 2.33.0
^ permalink raw reply [flat|nested] 3+ messages in thread
* Re: [PATCH] vfio-pci: report recovery event after device recovery successful
2022-04-20 7:16 Hogan Wang
@ 2022-05-11 18:14 ` Alex Williamson
0 siblings, 0 replies; 3+ messages in thread
From: Alex Williamson @ 2022-05-11 18:14 UTC (permalink / raw)
To: Hogan Wang
Cc: jgg, yishaih, shameerali.kolothum.thodi, kevin.tian, kvm,
weidong.huang, yechuan
On Wed, 20 Apr 2022 15:16:01 +0800
Hogan Wang <hogan.wang@huawei.com> wrote:
> As you all know, device faults are classified into the following
> types: unrecoverable errors and recoverable errors. The vfio-pci driver
> reports an error event to the user-space process when the device hits a
> hardware error, and reports another error event after the device has
> recovered successfully. So a user-space process such as qemu cannot tell
> whether an event is a hardware error event or a successful device
> recovery event. To solve this problem, add an eventfd named
> recov_trigger to report the successful device recovery event, so that
> the user-space process can decide whether to process the recovery event
> or not.
>
> Signed-off-by: Hogan Wang <hogan.wang@huawei.com>
> ---
> drivers/vfio/pci/vfio_pci_core.c | 13 +++++++++++--
> drivers/vfio/pci/vfio_pci_intrs.c | 19 +++++++++++++++++++
> include/linux/vfio_pci_core.h | 1 +
> include/uapi/linux/vfio.h | 1 +
> 4 files changed, 32 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
> index b7bb16f92ac6..2360cb44aa36 100644
> --- a/drivers/vfio/pci/vfio_pci_core.c
> +++ b/drivers/vfio/pci/vfio_pci_core.c
> @@ -483,6 +483,10 @@ void vfio_pci_core_close_device(struct vfio_device *core_vdev)
> eventfd_ctx_put(vdev->err_trigger);
> vdev->err_trigger = NULL;
> }
> + if (vdev->recov_trigger) {
> + eventfd_ctx_put(vdev->recov_trigger);
> + vdev->recov_trigger = NULL;
> + }
> if (vdev->req_trigger) {
> eventfd_ctx_put(vdev->req_trigger);
> vdev->req_trigger = NULL;
> @@ -1922,8 +1926,13 @@ pci_ers_result_t vfio_pci_core_aer_err_detected(struct pci_dev *pdev,
>
> mutex_lock(&vdev->igate);
>
> - if (vdev->err_trigger)
> - eventfd_signal(vdev->err_trigger, 1);
> + if (state == pci_channel_io_normal) {
> + if (vdev->recov_trigger)
> + eventfd_signal(vdev->recov_trigger, 1);
> + } else {
> + if (vdev->err_trigger)
> + eventfd_signal(vdev->err_trigger, 1);
> + }
The goal of the existing notification is to signal on any uncorrected
error which requires intervention at the device. Here we're masking
non-fatal, ie. recoverable, errors from that existing mechanism. There
is no userspace that currently handles this new recovery notification,
therefore this is not a backwards compatible proposal.
I also don't see how an asynchronous notification to userspace allows
the device to continue operating, the problem is not as simple as
raising a different interrupt. Thanks,
Alex
>
> mutex_unlock(&vdev->igate);
>
> diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c
> index 6069a11fb51a..be76ff76c361 100644
> --- a/drivers/vfio/pci/vfio_pci_intrs.c
> +++ b/drivers/vfio/pci/vfio_pci_intrs.c
> @@ -624,6 +624,17 @@ static int vfio_pci_set_err_trigger(struct vfio_pci_core_device *vdev,
> count, flags, data);
> }
>
> +static int vfio_pci_set_recov_trigger(struct vfio_pci_core_device *vdev,
> + unsigned index, unsigned start,
> + unsigned count, uint32_t flags, void *data)
> +{
> + if (index != VFIO_PCI_ERR_IRQ_INDEX || start != 0 || count > 1)
> + return -EINVAL;
> +
> + return vfio_pci_set_ctx_trigger_single(&vdev->recov_trigger,
> + count, flags, data);
> +}
> +
> static int vfio_pci_set_req_trigger(struct vfio_pci_core_device *vdev,
> unsigned index, unsigned start,
> unsigned count, uint32_t flags, void *data)
> @@ -684,6 +695,14 @@ int vfio_pci_set_irqs_ioctl(struct vfio_pci_core_device *vdev, uint32_t flags,
> break;
> }
> break;
> + case VFIO_PCI_RECOV_IRQ_INDEX:
> + switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) {
> + case VFIO_IRQ_SET_ACTION_TRIGGER:
> + if (pci_is_pcie(vdev->pdev))
> + func = vfio_pci_set_recov_trigger;
> + break;
> + }
> + break;
> }
>
> if (!func)
> diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h
> index 74a4a0f17b28..d94addb18118 100644
> --- a/include/linux/vfio_pci_core.h
> +++ b/include/linux/vfio_pci_core.h
> @@ -128,6 +128,7 @@ struct vfio_pci_core_device {
> struct pci_saved_state *pm_save;
> int ioeventfds_nr;
> struct eventfd_ctx *err_trigger;
> + struct eventfd_ctx *recov_trigger;
> struct eventfd_ctx *req_trigger;
> struct list_head dummy_resources_list;
> struct mutex ioeventfds_lock;
> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> index fea86061b44e..f88a6ca62c49 100644
> --- a/include/uapi/linux/vfio.h
> +++ b/include/uapi/linux/vfio.h
> @@ -621,6 +621,7 @@ enum {
> VFIO_PCI_MSIX_IRQ_INDEX,
> VFIO_PCI_ERR_IRQ_INDEX,
> VFIO_PCI_REQ_IRQ_INDEX,
> + VFIO_PCI_RECOV_IRQ_INDEX,
> VFIO_PCI_NUM_IRQS
> };
>
^ permalink raw reply [flat|nested] 3+ messages in thread
* [PATCH] vfio-pci: report recovery event after device recovery successful
@ 2022-04-20 7:16 Hogan Wang
2022-05-11 18:14 ` Alex Williamson
0 siblings, 1 reply; 3+ messages in thread
From: Hogan Wang @ 2022-04-20 7:16 UTC (permalink / raw)
To: jgg, yishaih, shameerali.kolothum.thodi, kevin.tian, kvm
Cc: weidong.huang, yechuan, hogan.wang
As you all know, device faults are classified into the following
types: unrecoverable errors and recoverable errors. The vfio-pci driver
reports an error event to the user-space process when the device hits a
hardware error, and reports another error event after the device has
recovered successfully. So a user-space process such as qemu cannot tell
whether an event is a hardware error event or a successful device
recovery event. To solve this problem, add an eventfd named
recov_trigger to report the successful device recovery event, so that
the user-space process can decide whether to process the recovery event
or not.
Signed-off-by: Hogan Wang <hogan.wang@huawei.com>
---
drivers/vfio/pci/vfio_pci_core.c | 13 +++++++++++--
drivers/vfio/pci/vfio_pci_intrs.c | 19 +++++++++++++++++++
include/linux/vfio_pci_core.h | 1 +
include/uapi/linux/vfio.h | 1 +
4 files changed, 32 insertions(+), 2 deletions(-)
diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
index b7bb16f92ac6..2360cb44aa36 100644
--- a/drivers/vfio/pci/vfio_pci_core.c
+++ b/drivers/vfio/pci/vfio_pci_core.c
@@ -483,6 +483,10 @@ void vfio_pci_core_close_device(struct vfio_device *core_vdev)
eventfd_ctx_put(vdev->err_trigger);
vdev->err_trigger = NULL;
}
+ if (vdev->recov_trigger) {
+ eventfd_ctx_put(vdev->recov_trigger);
+ vdev->recov_trigger = NULL;
+ }
if (vdev->req_trigger) {
eventfd_ctx_put(vdev->req_trigger);
vdev->req_trigger = NULL;
@@ -1922,8 +1926,13 @@ pci_ers_result_t vfio_pci_core_aer_err_detected(struct pci_dev *pdev,
mutex_lock(&vdev->igate);
- if (vdev->err_trigger)
- eventfd_signal(vdev->err_trigger, 1);
+ if (state == pci_channel_io_normal) {
+ if (vdev->recov_trigger)
+ eventfd_signal(vdev->recov_trigger, 1);
+ } else {
+ if (vdev->err_trigger)
+ eventfd_signal(vdev->err_trigger, 1);
+ }
mutex_unlock(&vdev->igate);
diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c
index 6069a11fb51a..be76ff76c361 100644
--- a/drivers/vfio/pci/vfio_pci_intrs.c
+++ b/drivers/vfio/pci/vfio_pci_intrs.c
@@ -624,6 +624,17 @@ static int vfio_pci_set_err_trigger(struct vfio_pci_core_device *vdev,
count, flags, data);
}
+static int vfio_pci_set_recov_trigger(struct vfio_pci_core_device *vdev,
+ unsigned index, unsigned start,
+ unsigned count, uint32_t flags, void *data)
+{
+ if (index != VFIO_PCI_ERR_IRQ_INDEX || start != 0 || count > 1)
+ return -EINVAL;
+
+ return vfio_pci_set_ctx_trigger_single(&vdev->recov_trigger,
+ count, flags, data);
+}
+
static int vfio_pci_set_req_trigger(struct vfio_pci_core_device *vdev,
unsigned index, unsigned start,
unsigned count, uint32_t flags, void *data)
@@ -684,6 +695,14 @@ int vfio_pci_set_irqs_ioctl(struct vfio_pci_core_device *vdev, uint32_t flags,
break;
}
break;
+ case VFIO_PCI_RECOV_IRQ_INDEX:
+ switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) {
+ case VFIO_IRQ_SET_ACTION_TRIGGER:
+ if (pci_is_pcie(vdev->pdev))
+ func = vfio_pci_set_recov_trigger;
+ break;
+ }
+ break;
}
if (!func)
diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h
index 74a4a0f17b28..d94addb18118 100644
--- a/include/linux/vfio_pci_core.h
+++ b/include/linux/vfio_pci_core.h
@@ -128,6 +128,7 @@ struct vfio_pci_core_device {
struct pci_saved_state *pm_save;
int ioeventfds_nr;
struct eventfd_ctx *err_trigger;
+ struct eventfd_ctx *recov_trigger;
struct eventfd_ctx *req_trigger;
struct list_head dummy_resources_list;
struct mutex ioeventfds_lock;
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index fea86061b44e..f88a6ca62c49 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -621,6 +621,7 @@ enum {
VFIO_PCI_MSIX_IRQ_INDEX,
VFIO_PCI_ERR_IRQ_INDEX,
VFIO_PCI_REQ_IRQ_INDEX,
+ VFIO_PCI_RECOV_IRQ_INDEX,
VFIO_PCI_NUM_IRQS
};
--
2.33.0
^ permalink raw reply related [flat|nested] 3+ messages in thread
end of thread, other threads:[~2022-05-11 18:14 UTC | newest]
Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-04-25 1:34 [PATCH] vfio-pci: report recovery event after device recovery successful Wangjing(Hogan)
-- strict thread matches above, loose matches on Subject: below --
2022-04-20 7:16 Hogan Wang
2022-05-11 18:14 ` Alex Williamson
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.