All of lore.kernel.org
 help / color / mirror / Atom feed
* [Qemu-devel] [PATCH for-2.6] spapr_drc: enable immediate detach for unsignalled devices
@ 2016-03-31 22:27 Michael Roth
  2016-04-01  2:33 ` David Gibson
  0 siblings, 1 reply; 4+ messages in thread
From: Michael Roth @ 2016-03-31 22:27 UTC (permalink / raw)
  To: qemu-devel; +Cc: qemu-ppc, david, sbhat, Michael Roth, bharata

Currently spapr doesn't support "aborting" hotplug of PCI
devices by allowing device_del to immediately remove the
device if we haven't signalled the presence of the device
to the guest.

In the past this wasn't an issue, since we always immediately
signalled device attach and simply relied on full guest-aware
add->remove path for device removal. However, as of 788d259,
we now defer signalling for PCI functions until function 0
is attached, so now we need to deal with these "abort" operations
for cases where a user hotplugs a non-0 function, then opts to
remove it prior hotplugging function 0. Currently they'd have to
reboot before the unplug completed. PCIe multifunction hotplug
does not have this requirement however, so from a management
implementation perspective it would be good to address this within
the same release as 788d259.

We accomplish this by simply adding a 'signalled' flag to track
whether a device hotplug event has been sent to the guest. If it
hasn't, we allow immediate removal under the assumption that the
guest will not be using the device. Devices present at boot/reset
time are also assumed to be 'signalled'.

For CPU/memory/etc, signalling will still happen immediately
as part of device_add, so only PCI functions should be affected.

Cc: bharata@linux.vnet.ibm.com
Cc: david@gibson.dropbear.id.au
Cc: sbhat@linux.vnet.ibm.com
Cc: qemu-ppc@nongnu.org
Signed-off-by: Michael Roth <mdroth@linux.vnet.ibm.com>
---
 hw/ppc/spapr_drc.c         | 34 ++++++++++++++++++++++++++++++++++
 hw/ppc/spapr_events.c      | 11 +++++++++++
 include/hw/ppc/spapr_drc.h |  2 ++
 3 files changed, 47 insertions(+)

diff --git a/hw/ppc/spapr_drc.c b/hw/ppc/spapr_drc.c
index ef063c0..5568e44 100644
--- a/hw/ppc/spapr_drc.c
+++ b/hw/ppc/spapr_drc.c
@@ -173,6 +173,12 @@ static void set_configured(sPAPRDRConnector *drc)
     drc->configured = true;
 }
 
+/* has the guest been notified of device attachment? */
+static void set_signalled(sPAPRDRConnector *drc)
+{
+    drc->signalled = true;
+}
+
 /*
  * dr-entity-sense sensor value
  * returned via get-sensor-state RTAS calls
@@ -355,6 +361,7 @@ static void attach(sPAPRDRConnector *drc, DeviceState *d, void *fdt,
     drc->fdt = fdt;
     drc->fdt_start_offset = fdt_start_offset;
     drc->configured = coldplug;
+    drc->signalled = coldplug;
 
     object_property_add_link(OBJECT(drc), "device",
                              object_get_typename(OBJECT(drc->dev)),
@@ -371,6 +378,26 @@ static void detach(sPAPRDRConnector *drc, DeviceState *d,
     drc->detach_cb = detach_cb;
     drc->detach_cb_opaque = detach_cb_opaque;
 
+    /* if we've signalled device presence to the guest, or if the guest
+     * has gone ahead and configured the device (via manually-executed
+     * device add via drmgr in guest, namely), we need to wait
+     * for the guest to quiesce the device before completing detach.
+     * Otherwise, we can assume the guest hasn't seen it and complete the
+     * detach immediately. Note that there is a small race window
+     * just before, or during, configuration, which is this context
+     * refers mainly to fetching the device tree via RTAS.
+     * During this window the device access will be arbitrated by
+     * associated DRC, which will simply fail the RTAS calls as invalid.
+     * This is recoverable within guest and current implementations of
+     * drmgr should be able to cope.
+     */
+    if (!drc->signalled && !drc->configured) {
+        /* if the guest hasn't seen the device we can't rely on it to
+         * set it back to an isolated state via RTAS, so do it here manually
+         */
+        drc->isolation_state = SPAPR_DR_ISOLATION_STATE_ISOLATED;
+    }
+
     if (drc->isolation_state != SPAPR_DR_ISOLATION_STATE_ISOLATED) {
         DPRINTFN("awaiting transition to isolated state before removal");
         drc->awaiting_release = true;
@@ -409,6 +436,7 @@ static void reset(DeviceState *d)
 {
     sPAPRDRConnector *drc = SPAPR_DR_CONNECTOR(d);
     sPAPRDRConnectorClass *drck = SPAPR_DR_CONNECTOR_GET_CLASS(drc);
+    sPAPRDREntitySense state;
 
     DPRINTFN("drc reset: %x", drck->get_index(drc));
     /* immediately upon reset we can safely assume DRCs whose devices
@@ -436,6 +464,11 @@ static void reset(DeviceState *d)
             drck->set_allocation_state(drc, SPAPR_DR_ALLOCATION_STATE_UNUSABLE);
         }
     }
+
+    drck->entity_sense(drc, &state);
+    if (state == SPAPR_DR_ENTITY_SENSE_PRESENT) {
+        drck->set_signalled(drc);
+    }
 }
 
 static void realize(DeviceState *d, Error **errp)
@@ -594,6 +627,7 @@ static void spapr_dr_connector_class_init(ObjectClass *k, void *data)
     drck->attach = attach;
     drck->detach = detach;
     drck->release_pending = release_pending;
+    drck->set_signalled = set_signalled;
     /*
      * Reason: it crashes FIXME find and document the real reason
      */
diff --git a/hw/ppc/spapr_events.c b/hw/ppc/spapr_events.c
index 39f4682..be8de63 100644
--- a/hw/ppc/spapr_events.c
+++ b/hw/ppc/spapr_events.c
@@ -387,6 +387,13 @@ static void spapr_powerdown_req(Notifier *n, void *opaque)
     qemu_irq_pulse(xics_get_qirq(spapr->icp, spapr->check_exception_irq));
 }
 
+static void spapr_hotplug_set_signalled(uint32_t drc_index)
+{
+    sPAPRDRConnector *drc = spapr_dr_connector_by_index(drc_index);
+    sPAPRDRConnectorClass *drck = SPAPR_DR_CONNECTOR_GET_CLASS(drc);
+    drck->set_signalled(drc);
+}
+
 static void spapr_hotplug_req_event(uint8_t hp_id, uint8_t hp_action,
                                     sPAPRDRConnectorType drc_type,
                                     uint32_t drc)
@@ -453,6 +460,10 @@ static void spapr_hotplug_req_event(uint8_t hp_id, uint8_t hp_action,
 
     rtas_event_log_queue(RTAS_LOG_TYPE_HOTPLUG, new_hp, true);
 
+    if (hp->hotplug_action == RTAS_LOG_V6_HP_ACTION_ADD) {
+        spapr_hotplug_set_signalled(drc);
+    }
+
     qemu_irq_pulse(xics_get_qirq(spapr->icp, spapr->check_exception_irq));
 }
 
diff --git a/include/hw/ppc/spapr_drc.h b/include/hw/ppc/spapr_drc.h
index 7e56347..fa21ba0 100644
--- a/include/hw/ppc/spapr_drc.h
+++ b/include/hw/ppc/spapr_drc.h
@@ -151,6 +151,7 @@ typedef struct sPAPRDRConnector {
     bool configured;
 
     bool awaiting_release;
+    bool signalled;
 
     /* device pointer, via link property */
     DeviceState *dev;
@@ -188,6 +189,7 @@ typedef struct sPAPRDRConnectorClass {
                    spapr_drc_detach_cb *detach_cb,
                    void *detach_cb_opaque, Error **errp);
     bool (*release_pending)(sPAPRDRConnector *drc);
+    void (*set_signalled)(sPAPRDRConnector *drc);
 } sPAPRDRConnectorClass;
 
 sPAPRDRConnector *spapr_dr_connector_new(Object *owner,
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 4+ messages in thread

* Re: [Qemu-devel] [PATCH for-2.6] spapr_drc: enable immediate detach for unsignalled devices
  2016-03-31 22:27 [Qemu-devel] [PATCH for-2.6] spapr_drc: enable immediate detach for unsignalled devices Michael Roth
@ 2016-04-01  2:33 ` David Gibson
  2016-04-01  3:47   ` Michael Roth
  0 siblings, 1 reply; 4+ messages in thread
From: David Gibson @ 2016-04-01  2:33 UTC (permalink / raw)
  To: Michael Roth; +Cc: qemu-ppc, sbhat, qemu-devel, bharata

[-- Attachment #1: Type: text/plain, Size: 7743 bytes --]

On Thu, Mar 31, 2016 at 05:27:37PM -0500, Michael Roth wrote:
> Currently spapr doesn't support "aborting" hotplug of PCI
> devices by allowing device_del to immediately remove the
> device if we haven't signalled the presence of the device
> to the guest.
> 
> In the past this wasn't an issue, since we always immediately
> signalled device attach and simply relied on full guest-aware
> add->remove path for device removal. However, as of 788d259,
> we now defer signalling for PCI functions until function 0
> is attached, so now we need to deal with these "abort" operations
> for cases where a user hotplugs a non-0 function, then opts to
> remove it prior hotplugging function 0. Currently they'd have to
> reboot before the unplug completed. PCIe multifunction hotplug
> does not have this requirement however, so from a management
> implementation perspective it would be good to address this within
> the same release as 788d259.
> 
> We accomplish this by simply adding a 'signalled' flag to track
> whether a device hotplug event has been sent to the guest. If it
> hasn't, we allow immediate removal under the assumption that the
> guest will not be using the device. Devices present at boot/reset
> time are also assumed to be 'signalled'.
> 
> For CPU/memory/etc, signalling will still happen immediately
> as part of device_add, so only PCI functions should be affected.
> 
> Cc: bharata@linux.vnet.ibm.com
> Cc: david@gibson.dropbear.id.au
> Cc: sbhat@linux.vnet.ibm.com
> Cc: qemu-ppc@nongnu.org
> Signed-off-by: Michael Roth <mdroth@linux.vnet.ibm.com>

So, I'm disinclined to take this during the hard freeze, without some
evidence that it fixes a problem case that's really being hit in
practice.

> ---
>  hw/ppc/spapr_drc.c         | 34 ++++++++++++++++++++++++++++++++++
>  hw/ppc/spapr_events.c      | 11 +++++++++++
>  include/hw/ppc/spapr_drc.h |  2 ++
>  3 files changed, 47 insertions(+)
> 
> diff --git a/hw/ppc/spapr_drc.c b/hw/ppc/spapr_drc.c
> index ef063c0..5568e44 100644
> --- a/hw/ppc/spapr_drc.c
> +++ b/hw/ppc/spapr_drc.c
> @@ -173,6 +173,12 @@ static void set_configured(sPAPRDRConnector *drc)
>      drc->configured = true;
>  }
>  
> +/* has the guest been notified of device attachment? */
> +static void set_signalled(sPAPRDRConnector *drc)
> +{
> +    drc->signalled = true;
> +}
> +
>  /*
>   * dr-entity-sense sensor value
>   * returned via get-sensor-state RTAS calls
> @@ -355,6 +361,7 @@ static void attach(sPAPRDRConnector *drc, DeviceState *d, void *fdt,
>      drc->fdt = fdt;
>      drc->fdt_start_offset = fdt_start_offset;
>      drc->configured = coldplug;
> +    drc->signalled = coldplug;
>  
>      object_property_add_link(OBJECT(drc), "device",
>                               object_get_typename(OBJECT(drc->dev)),
> @@ -371,6 +378,26 @@ static void detach(sPAPRDRConnector *drc, DeviceState *d,
>      drc->detach_cb = detach_cb;
>      drc->detach_cb_opaque = detach_cb_opaque;
>  
> +    /* if we've signalled device presence to the guest, or if the guest
> +     * has gone ahead and configured the device (via manually-executed
> +     * device add via drmgr in guest, namely), we need to wait
> +     * for the guest to quiesce the device before completing detach.
> +     * Otherwise, we can assume the guest hasn't seen it and complete the
> +     * detach immediately. Note that there is a small race window
> +     * just before, or during, configuration, which is this context
> +     * refers mainly to fetching the device tree via RTAS.
> +     * During this window the device access will be arbitrated by
> +     * associated DRC, which will simply fail the RTAS calls as invalid.
> +     * This is recoverable within guest and current implementations of
> +     * drmgr should be able to cope.

Sorry, I'm really not following this description of the race, or
seeing how it relates to allowing removal of "half plugged" devices.

> +     */
> +    if (!drc->signalled && !drc->configured) {
> +        /* if the guest hasn't seen the device we can't rely on it to
> +         * set it back to an isolated state via RTAS, so do it here manually
> +         */
> +        drc->isolation_state = SPAPR_DR_ISOLATION_STATE_ISOLATED;
> +    }
> +
>      if (drc->isolation_state != SPAPR_DR_ISOLATION_STATE_ISOLATED) {
>          DPRINTFN("awaiting transition to isolated state before removal");
>          drc->awaiting_release = true;
> @@ -409,6 +436,7 @@ static void reset(DeviceState *d)
>  {
>      sPAPRDRConnector *drc = SPAPR_DR_CONNECTOR(d);
>      sPAPRDRConnectorClass *drck = SPAPR_DR_CONNECTOR_GET_CLASS(drc);
> +    sPAPRDREntitySense state;
>  
>      DPRINTFN("drc reset: %x", drck->get_index(drc));
>      /* immediately upon reset we can safely assume DRCs whose devices
> @@ -436,6 +464,11 @@ static void reset(DeviceState *d)
>              drck->set_allocation_state(drc, SPAPR_DR_ALLOCATION_STATE_UNUSABLE);
>          }
>      }
> +
> +    drck->entity_sense(drc, &state);
> +    if (state == SPAPR_DR_ENTITY_SENSE_PRESENT) {
> +        drck->set_signalled(drc);
> +    }
>  }
>  
>  static void realize(DeviceState *d, Error **errp)
> @@ -594,6 +627,7 @@ static void spapr_dr_connector_class_init(ObjectClass *k, void *data)
>      drck->attach = attach;
>      drck->detach = detach;
>      drck->release_pending = release_pending;
> +    drck->set_signalled = set_signalled;
>      /*
>       * Reason: it crashes FIXME find and document the real reason
>       */
> diff --git a/hw/ppc/spapr_events.c b/hw/ppc/spapr_events.c
> index 39f4682..be8de63 100644
> --- a/hw/ppc/spapr_events.c
> +++ b/hw/ppc/spapr_events.c
> @@ -387,6 +387,13 @@ static void spapr_powerdown_req(Notifier *n, void *opaque)
>      qemu_irq_pulse(xics_get_qirq(spapr->icp, spapr->check_exception_irq));
>  }
>  
> +static void spapr_hotplug_set_signalled(uint32_t drc_index)
> +{
> +    sPAPRDRConnector *drc = spapr_dr_connector_by_index(drc_index);
> +    sPAPRDRConnectorClass *drck = SPAPR_DR_CONNECTOR_GET_CLASS(drc);
> +    drck->set_signalled(drc);
> +}
> +
>  static void spapr_hotplug_req_event(uint8_t hp_id, uint8_t hp_action,
>                                      sPAPRDRConnectorType drc_type,
>                                      uint32_t drc)
> @@ -453,6 +460,10 @@ static void spapr_hotplug_req_event(uint8_t hp_id, uint8_t hp_action,
>  
>      rtas_event_log_queue(RTAS_LOG_TYPE_HOTPLUG, new_hp, true);
>  
> +    if (hp->hotplug_action == RTAS_LOG_V6_HP_ACTION_ADD) {
> +        spapr_hotplug_set_signalled(drc);
> +    }
> +
>      qemu_irq_pulse(xics_get_qirq(spapr->icp, spapr->check_exception_irq));
>  }
>  
> diff --git a/include/hw/ppc/spapr_drc.h b/include/hw/ppc/spapr_drc.h
> index 7e56347..fa21ba0 100644
> --- a/include/hw/ppc/spapr_drc.h
> +++ b/include/hw/ppc/spapr_drc.h
> @@ -151,6 +151,7 @@ typedef struct sPAPRDRConnector {
>      bool configured;
>  
>      bool awaiting_release;
> +    bool signalled;
>  
>      /* device pointer, via link property */
>      DeviceState *dev;
> @@ -188,6 +189,7 @@ typedef struct sPAPRDRConnectorClass {
>                     spapr_drc_detach_cb *detach_cb,
>                     void *detach_cb_opaque, Error **errp);
>      bool (*release_pending)(sPAPRDRConnector *drc);
> +    void (*set_signalled)(sPAPRDRConnector *drc);
>  } sPAPRDRConnectorClass;
>  
>  sPAPRDRConnector *spapr_dr_connector_new(Object *owner,

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 819 bytes --]

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [Qemu-devel] [PATCH for-2.6] spapr_drc: enable immediate detach for unsignalled devices
  2016-04-01  2:33 ` David Gibson
@ 2016-04-01  3:47   ` Michael Roth
  2016-04-04  4:33     ` David Gibson
  0 siblings, 1 reply; 4+ messages in thread
From: Michael Roth @ 2016-04-01  3:47 UTC (permalink / raw)
  To: David Gibson; +Cc: qemu-ppc, sbhat, qemu-devel, bharata

Quoting David Gibson (2016-03-31 21:33:25)
> On Thu, Mar 31, 2016 at 05:27:37PM -0500, Michael Roth wrote:
> > Currently spapr doesn't support "aborting" hotplug of PCI
> > devices by allowing device_del to immediately remove the
> > device if we haven't signalled the presence of the device
> > to the guest.
> > 
> > In the past this wasn't an issue, since we always immediately
> > signalled device attach and simply relied on full guest-aware
> > add->remove path for device removal. However, as of 788d259,
> > we now defer signalling for PCI functions until function 0
> > is attached, so now we need to deal with these "abort" operations
> > for cases where a user hotplugs a non-0 function, then opts to
> > remove it prior hotplugging function 0. Currently they'd have to
> > reboot before the unplug completed. PCIe multifunction hotplug
> > does not have this requirement however, so from a management
> > implementation perspective it would be good to address this within
> > the same release as 788d259.
> > 
> > We accomplish this by simply adding a 'signalled' flag to track
> > whether a device hotplug event has been sent to the guest. If it
> > hasn't, we allow immediate removal under the assumption that the
> > guest will not be using the device. Devices present at boot/reset
> > time are also assumed to be 'signalled'.
> > 
> > For CPU/memory/etc, signalling will still happen immediately
> > as part of device_add, so only PCI functions should be affected.
> > 
> > Cc: bharata@linux.vnet.ibm.com
> > Cc: david@gibson.dropbear.id.au
> > Cc: sbhat@linux.vnet.ibm.com
> > Cc: qemu-ppc@nongnu.org
> > Signed-off-by: Michael Roth <mdroth@linux.vnet.ibm.com>
> 
> So, I'm disinclined to take this during the hard freeze, without some
> evidence that it fixes a problem case that's really being hit in
> practice.

The basic situation is that previously we had:

device_add virtio-net-pci,id=hp5.2,addr=0x5.2
  a1) plug device into slot
  a2) signal guest of attach
  a3) guest adds device
device_del hp5.2 
  d1) signal guest of detach
  d2) wait for guest to clean up device and signal completion
  d3) unplug device from slot

After 788d259 we have:

device_add virtio-net-pci,id=hp5.2,addr=0x5.2
  a1) plug device into slot
  a2) defer signalling until we see 0x5.0 added
device_del hp5.2 
  d1) defer signalling removal until we see 0x5.0 deleted
  d2) wait for guest to clean up device and signal completion

But d2) never happens because the guest was never signalled that
the device was added in the first place.

A real world situation where this might occur is admittedly a bit
contrived, but in practice I can certainly see it happening:

a) user hotplugs multifunction virtio-net-pci to slot 5, starting
   with function 3 at 0x5.3
b) user notices they made a mistake, for instance, they forget enable
   vhost in the accompanying netdev
c) user attempts to unplug function and "abort" the operation, but
   device does not go away

Shiva (on cc:) is working on the libvirt implementation for this
and hit this in testing. Since it differs in behavior from the PCIe
workflow it was based on (where aborts are explicitly allowed), and
causes a minor regression from 2.5, I thought it might be worth
considering for 2.6, but I can certainly understand if you opt to
hold off till 2.7.

> 
> > ---
> >  hw/ppc/spapr_drc.c         | 34 ++++++++++++++++++++++++++++++++++
> >  hw/ppc/spapr_events.c      | 11 +++++++++++
> >  include/hw/ppc/spapr_drc.h |  2 ++
> >  3 files changed, 47 insertions(+)
> > 
> > diff --git a/hw/ppc/spapr_drc.c b/hw/ppc/spapr_drc.c
> > index ef063c0..5568e44 100644
> > --- a/hw/ppc/spapr_drc.c
> > +++ b/hw/ppc/spapr_drc.c
> > @@ -173,6 +173,12 @@ static void set_configured(sPAPRDRConnector *drc)
> >      drc->configured = true;
> >  }
> >  
> > +/* has the guest been notified of device attachment? */
> > +static void set_signalled(sPAPRDRConnector *drc)
> > +{
> > +    drc->signalled = true;
> > +}
> > +
> >  /*
> >   * dr-entity-sense sensor value
> >   * returned via get-sensor-state RTAS calls
> > @@ -355,6 +361,7 @@ static void attach(sPAPRDRConnector *drc, DeviceState *d, void *fdt,
> >      drc->fdt = fdt;
> >      drc->fdt_start_offset = fdt_start_offset;
> >      drc->configured = coldplug;
> > +    drc->signalled = coldplug;
> >  
> >      object_property_add_link(OBJECT(drc), "device",
> >                               object_get_typename(OBJECT(drc->dev)),
> > @@ -371,6 +378,26 @@ static void detach(sPAPRDRConnector *drc, DeviceState *d,
> >      drc->detach_cb = detach_cb;
> >      drc->detach_cb_opaque = detach_cb_opaque;
> >  
> > +    /* if we've signalled device presence to the guest, or if the guest
> > +     * has gone ahead and configured the device (via manually-executed
> > +     * device add via drmgr in guest, namely), we need to wait
> > +     * for the guest to quiesce the device before completing detach.
> > +     * Otherwise, we can assume the guest hasn't seen it and complete the
> > +     * detach immediately. Note that there is a small race window
> > +     * just before, or during, configuration, which is this context
> > +     * refers mainly to fetching the device tree via RTAS.
> > +     * During this window the device access will be arbitrated by
> > +     * associated DRC, which will simply fail the RTAS calls as invalid.
> > +     * This is recoverable within guest and current implementations of
> > +     * drmgr should be able to cope.
> 
> Sorry, I'm really not following this description of the race, or
> seeing how it relates to allowing removal of "half plugged" devices.
> 

There's 2 stages we enter before the device is fully accessible within
the guest:

1) The 'signalled' state introduced by this patch, which we enter as soon
as we send a hotplug event to the guest.

2) The 'configuration' phase, which consists of RTAS calls against the
DRC and power domain for the slot to modify things like hotplug LED
indicator, allocation/isolation state, and fetching the device's device
tree. Once the device is done fetching the device tree it's considered
to be in 'configured' state according to state diagram in PAPR v2.6 13.4.
(note there's no accesses to the device over PCI bus during this time,
just RTAS calls).

We only allow immediate unplug/abort if we haven't signalled the device
to the guest, *and* if the device hasn't reached the 'configured' state.

Normally we wouldn't reach the configuration phase until after we enter
the signalled state, and since being in the signalled state prevents
immediate unplug, there's no race there.

The only race is if we haven't signalled the guest yet (for instance,
because the above example where we've only hotplugged function 0x5.3
and are waiting for function 0x5.0 before sending the hotplug event),
but someone manually ran 'drmgr -c pci -a -s <drc index> ...' in the
guest (perhaps because they were confused why the device didn't show
up). Then, while that command was running through the process of
bringing the device into the 'configured' state, they or someone else
executed device_del on the device. Since it hasn't been signalled,
and since it hasn't reached the configured state, it would be
immediately unplugged by QEMU. But since drmgr might not have finished
adding it yet, we have a case where the guest is attempting to access
the device while it's being removed. This is the race window. It's
pretty small and unlikely, but I figured it was worth mentioning.

I reason in the comment that since the 'configuration' phase consists
only of RTAS calls, and no actual accesses over the PCI bus, that this
would still result in the guest failing gracefully via an RTAS error
for executing an RTAS call against a DRC that no longer has a device
associated with in. Current implementations of drmgr would simply
abort the add operation and resume normal function without ever
adding the device to PCI subsystem. So during the race window an
'abort' should be fairly safe.

> > +     */
> > +    if (!drc->signalled && !drc->configured) {
> > +        /* if the guest hasn't seen the device we can't rely on it to
> > +         * set it back to an isolated state via RTAS, so do it here manually
> > +         */
> > +        drc->isolation_state = SPAPR_DR_ISOLATION_STATE_ISOLATED;
> > +    }
> > +
> >      if (drc->isolation_state != SPAPR_DR_ISOLATION_STATE_ISOLATED) {
> >          DPRINTFN("awaiting transition to isolated state before removal");
> >          drc->awaiting_release = true;
> > @@ -409,6 +436,7 @@ static void reset(DeviceState *d)
> >  {
> >      sPAPRDRConnector *drc = SPAPR_DR_CONNECTOR(d);
> >      sPAPRDRConnectorClass *drck = SPAPR_DR_CONNECTOR_GET_CLASS(drc);
> > +    sPAPRDREntitySense state;
> >  
> >      DPRINTFN("drc reset: %x", drck->get_index(drc));
> >      /* immediately upon reset we can safely assume DRCs whose devices
> > @@ -436,6 +464,11 @@ static void reset(DeviceState *d)
> >              drck->set_allocation_state(drc, SPAPR_DR_ALLOCATION_STATE_UNUSABLE);
> >          }
> >      }
> > +
> > +    drck->entity_sense(drc, &state);
> > +    if (state == SPAPR_DR_ENTITY_SENSE_PRESENT) {
> > +        drck->set_signalled(drc);
> > +    }
> >  }
> >  
> >  static void realize(DeviceState *d, Error **errp)
> > @@ -594,6 +627,7 @@ static void spapr_dr_connector_class_init(ObjectClass *k, void *data)
> >      drck->attach = attach;
> >      drck->detach = detach;
> >      drck->release_pending = release_pending;
> > +    drck->set_signalled = set_signalled;
> >      /*
> >       * Reason: it crashes FIXME find and document the real reason
> >       */
> > diff --git a/hw/ppc/spapr_events.c b/hw/ppc/spapr_events.c
> > index 39f4682..be8de63 100644
> > --- a/hw/ppc/spapr_events.c
> > +++ b/hw/ppc/spapr_events.c
> > @@ -387,6 +387,13 @@ static void spapr_powerdown_req(Notifier *n, void *opaque)
> >      qemu_irq_pulse(xics_get_qirq(spapr->icp, spapr->check_exception_irq));
> >  }
> >  
> > +static void spapr_hotplug_set_signalled(uint32_t drc_index)
> > +{
> > +    sPAPRDRConnector *drc = spapr_dr_connector_by_index(drc_index);
> > +    sPAPRDRConnectorClass *drck = SPAPR_DR_CONNECTOR_GET_CLASS(drc);
> > +    drck->set_signalled(drc);
> > +}
> > +
> >  static void spapr_hotplug_req_event(uint8_t hp_id, uint8_t hp_action,
> >                                      sPAPRDRConnectorType drc_type,
> >                                      uint32_t drc)
> > @@ -453,6 +460,10 @@ static void spapr_hotplug_req_event(uint8_t hp_id, uint8_t hp_action,
> >  
> >      rtas_event_log_queue(RTAS_LOG_TYPE_HOTPLUG, new_hp, true);
> >  
> > +    if (hp->hotplug_action == RTAS_LOG_V6_HP_ACTION_ADD) {
> > +        spapr_hotplug_set_signalled(drc);
> > +    }
> > +
> >      qemu_irq_pulse(xics_get_qirq(spapr->icp, spapr->check_exception_irq));
> >  }
> >  
> > diff --git a/include/hw/ppc/spapr_drc.h b/include/hw/ppc/spapr_drc.h
> > index 7e56347..fa21ba0 100644
> > --- a/include/hw/ppc/spapr_drc.h
> > +++ b/include/hw/ppc/spapr_drc.h
> > @@ -151,6 +151,7 @@ typedef struct sPAPRDRConnector {
> >      bool configured;
> >  
> >      bool awaiting_release;
> > +    bool signalled;
> >  
> >      /* device pointer, via link property */
> >      DeviceState *dev;
> > @@ -188,6 +189,7 @@ typedef struct sPAPRDRConnectorClass {
> >                     spapr_drc_detach_cb *detach_cb,
> >                     void *detach_cb_opaque, Error **errp);
> >      bool (*release_pending)(sPAPRDRConnector *drc);
> > +    void (*set_signalled)(sPAPRDRConnector *drc);
> >  } sPAPRDRConnectorClass;
> >  
> >  sPAPRDRConnector *spapr_dr_connector_new(Object *owner,
> 
> -- 
> David Gibson                    | I'll have my music baroque, and my code
> david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
>                                 | _way_ _around_!
> http://www.ozlabs.org/~dgibson

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [Qemu-devel] [PATCH for-2.6] spapr_drc: enable immediate detach for unsignalled devices
  2016-04-01  3:47   ` Michael Roth
@ 2016-04-04  4:33     ` David Gibson
  0 siblings, 0 replies; 4+ messages in thread
From: David Gibson @ 2016-04-04  4:33 UTC (permalink / raw)
  To: Michael Roth; +Cc: qemu-ppc, sbhat, qemu-devel, bharata

[-- Attachment #1: Type: text/plain, Size: 12802 bytes --]

On Thu, Mar 31, 2016 at 10:47:24PM -0500, Michael Roth wrote:
> Quoting David Gibson (2016-03-31 21:33:25)
> > On Thu, Mar 31, 2016 at 05:27:37PM -0500, Michael Roth wrote:
> > > Currently spapr doesn't support "aborting" hotplug of PCI
> > > devices by allowing device_del to immediately remove the
> > > device if we haven't signalled the presence of the device
> > > to the guest.
> > > 
> > > In the past this wasn't an issue, since we always immediately
> > > signalled device attach and simply relied on full guest-aware
> > > add->remove path for device removal. However, as of 788d259,
> > > we now defer signalling for PCI functions until function 0
> > > is attached, so now we need to deal with these "abort" operations
> > > for cases where a user hotplugs a non-0 function, then opts to
> > > remove it prior hotplugging function 0. Currently they'd have to
> > > reboot before the unplug completed. PCIe multifunction hotplug
> > > does not have this requirement however, so from a management
> > > implementation perspective it would be good to address this within
> > > the same release as 788d259.
> > > 
> > > We accomplish this by simply adding a 'signalled' flag to track
> > > whether a device hotplug event has been sent to the guest. If it
> > > hasn't, we allow immediate removal under the assumption that the
> > > guest will not be using the device. Devices present at boot/reset
> > > time are also assumed to be 'signalled'.
> > > 
> > > For CPU/memory/etc, signalling will still happen immediately
> > > as part of device_add, so only PCI functions should be affected.
> > > 
> > > Cc: bharata@linux.vnet.ibm.com
> > > Cc: david@gibson.dropbear.id.au
> > > Cc: sbhat@linux.vnet.ibm.com
> > > Cc: qemu-ppc@nongnu.org
> > > Signed-off-by: Michael Roth <mdroth@linux.vnet.ibm.com>
> > 
> > So, I'm disinclined to take this during the hard freeze, without some
> > evidence that it fixes a problem case that's really being hit in
> > practice.
> 
> The basic situation is that previously we had:
> 
> device_add virtio-net-pci,id=hp5.2,addr=0x5.2
>   a1) plug device into slot
>   a2) signal guest of attach
>   a3) guest adds device
> device_del hp5.2 
>   d1) signal guest of detach
>   d2) wait for guest to clean up device and signal completion
>   d3) unplug device from slot
> 
> After 788d259 we have:
> 
> device_add virtio-net-pci,id=hp5.2,addr=0x5.2
>   a1) plug device into slot
>   a2) defer signalling until we see 0x5.0 added
> device_del hp5.2 
>   d1) defer signalling removal until we see 0x5.0 deleted
>   d2) wait for guest to clean up device and signal completion
> 
> But d2) never happens because the guest was never signalled that
> the device was added in the first place.
> 
> A real world situation where this might occur is admittedly a bit
> contrived, but in practice I can certainly see it happening:
> 
> a) user hotplugs multifunction virtio-net-pci to slot 5, starting
>    with function 3 at 0x5.3
> b) user notices they made a mistake, for instance, they forget enable
>    vhost in the accompanying netdev
> c) user attempts to unplug function and "abort" the operation, but
>    device does not go away
> 
> Shiva (on cc:) is working on the libvirt implementation for this
> and hit this in testing. Since it differs in behavior from the PCIe
> workflow it was based on (where aborts are explicitly allowed), and
> causes a minor regression from 2.5, I thought it might be worth
> considering for 2.6, but I can certainly understand if you opt to
> hold off till 2.7.

Ah.. yes, I hadn't registered the fact that this was a regression.

Given that, I think it's probably reasonable for 2.6.

> 
> > 
> > > ---
> > >  hw/ppc/spapr_drc.c         | 34 ++++++++++++++++++++++++++++++++++
> > >  hw/ppc/spapr_events.c      | 11 +++++++++++
> > >  include/hw/ppc/spapr_drc.h |  2 ++
> > >  3 files changed, 47 insertions(+)
> > > 
> > > diff --git a/hw/ppc/spapr_drc.c b/hw/ppc/spapr_drc.c
> > > index ef063c0..5568e44 100644
> > > --- a/hw/ppc/spapr_drc.c
> > > +++ b/hw/ppc/spapr_drc.c
> > > @@ -173,6 +173,12 @@ static void set_configured(sPAPRDRConnector *drc)
> > >      drc->configured = true;
> > >  }
> > >  
> > > +/* has the guest been notified of device attachment? */
> > > +static void set_signalled(sPAPRDRConnector *drc)
> > > +{
> > > +    drc->signalled = true;
> > > +}
> > > +
> > >  /*
> > >   * dr-entity-sense sensor value
> > >   * returned via get-sensor-state RTAS calls
> > > @@ -355,6 +361,7 @@ static void attach(sPAPRDRConnector *drc, DeviceState *d, void *fdt,
> > >      drc->fdt = fdt;
> > >      drc->fdt_start_offset = fdt_start_offset;
> > >      drc->configured = coldplug;
> > > +    drc->signalled = coldplug;
> > >  
> > >      object_property_add_link(OBJECT(drc), "device",
> > >                               object_get_typename(OBJECT(drc->dev)),
> > > @@ -371,6 +378,26 @@ static void detach(sPAPRDRConnector *drc, DeviceState *d,
> > >      drc->detach_cb = detach_cb;
> > >      drc->detach_cb_opaque = detach_cb_opaque;
> > >  
> > > +    /* if we've signalled device presence to the guest, or if the guest
> > > +     * has gone ahead and configured the device (via manually-executed
> > > +     * device add via drmgr in guest, namely), we need to wait
> > > +     * for the guest to quiesce the device before completing detach.
> > > +     * Otherwise, we can assume the guest hasn't seen it and complete the
> > > +     * detach immediately. Note that there is a small race window
> > > +     * just before, or during, configuration, which is this context
> > > +     * refers mainly to fetching the device tree via RTAS.
> > > +     * During this window the device access will be arbitrated by
> > > +     * associated DRC, which will simply fail the RTAS calls as invalid.
> > > +     * This is recoverable within guest and current implementations of
> > > +     * drmgr should be able to cope.
> > 
> > Sorry, I'm really not following this description of the race, or
> > seeing how it relates to allowing removal of "half plugged" devices.
> > 
> 
> There's 2 stages we enter before the device is fully accessible within
> the guest:
> 
> 1) The 'signalled' state introduced by this patch, which we enter as soon
> as we send a hotplug event to the guest.
> 
> 2) The 'configuration' phase, which consists of RTAS calls against the
> DRC and power domain for the slot to modify things like hotplug LED
> indicator, allocation/isolation state, and fetching the device's device
> tree. Once the device is done fetching the device tree it's considered
> to be in 'configured' state according to state diagram in PAPR v2.6 13.4.
> (note there's no accesses to the device over PCI bus during this time,
> just RTAS calls).
> 
> We only allow immediate unplug/abort if we haven't signalled the device
> to the guest, *and* if the device hasn't reached the 'configured' state.
> 
> Normally we wouldn't reach the configuration phase until after we enter
> the signalled state, and since being in the signalled state prevents
> immediate unplug, there's no race there.
> 
> The only race is if we haven't signalled the guest yet (for instance,
> because the above example where we've only hotplugged function 0x5.3
> and are waiting for function 0x5.0 before sending the hotplug event),
> but someone manually ran 'drmgr -c pci -a -s <drc index> ...' in the
> guest (perhaps because they were confused why the device didn't show
> up). Then, while that command was running through the process of
> bringing the device into the 'configured' state, they or someone else
> executed device_del on the device. Since it hasn't been signalled,
> and since it hasn't reached the configured state, it would be
> immediately unplugged by QEMU. But since drmgr might not have finished
> adding it yet, we have a case where the guest is attempting to access
> the device while it's being removed. This is the race window. It's
> pretty small and unlikely, but I figured it was worth mentioning.
> 
> I reason in the comment that since the 'configuration' phase consists
> only of RTAS calls, and no actual accesses over the PCI bus, that this
> would still result in the guest failing gracefully via an RTAS error
> for executing an RTAS call against a DRC that no longer has a device
> associated with in. Current implementations of drmgr would simply
> abort the add operation and resume normal function without ever
> adding the device to PCI subsystem. So during the race window an
> 'abort' should be fairly safe.
> 
> > > +     */
> > > +    if (!drc->signalled && !drc->configured) {
> > > +        /* if the guest hasn't seen the device we can't rely on it to
> > > +         * set it back to an isolated state via RTAS, so do it here manually
> > > +         */
> > > +        drc->isolation_state = SPAPR_DR_ISOLATION_STATE_ISOLATED;
> > > +    }
> > > +
> > >      if (drc->isolation_state != SPAPR_DR_ISOLATION_STATE_ISOLATED) {
> > >          DPRINTFN("awaiting transition to isolated state before removal");
> > >          drc->awaiting_release = true;
> > > @@ -409,6 +436,7 @@ static void reset(DeviceState *d)
> > >  {
> > >      sPAPRDRConnector *drc = SPAPR_DR_CONNECTOR(d);
> > >      sPAPRDRConnectorClass *drck = SPAPR_DR_CONNECTOR_GET_CLASS(drc);
> > > +    sPAPRDREntitySense state;
> > >  
> > >      DPRINTFN("drc reset: %x", drck->get_index(drc));
> > >      /* immediately upon reset we can safely assume DRCs whose devices
> > > @@ -436,6 +464,11 @@ static void reset(DeviceState *d)
> > >              drck->set_allocation_state(drc, SPAPR_DR_ALLOCATION_STATE_UNUSABLE);
> > >          }
> > >      }
> > > +
> > > +    drck->entity_sense(drc, &state);
> > > +    if (state == SPAPR_DR_ENTITY_SENSE_PRESENT) {
> > > +        drck->set_signalled(drc);
> > > +    }
> > >  }
> > >  
> > >  static void realize(DeviceState *d, Error **errp)
> > > @@ -594,6 +627,7 @@ static void spapr_dr_connector_class_init(ObjectClass *k, void *data)
> > >      drck->attach = attach;
> > >      drck->detach = detach;
> > >      drck->release_pending = release_pending;
> > > +    drck->set_signalled = set_signalled;
> > >      /*
> > >       * Reason: it crashes FIXME find and document the real reason
> > >       */
> > > diff --git a/hw/ppc/spapr_events.c b/hw/ppc/spapr_events.c
> > > index 39f4682..be8de63 100644
> > > --- a/hw/ppc/spapr_events.c
> > > +++ b/hw/ppc/spapr_events.c
> > > @@ -387,6 +387,13 @@ static void spapr_powerdown_req(Notifier *n, void *opaque)
> > >      qemu_irq_pulse(xics_get_qirq(spapr->icp, spapr->check_exception_irq));
> > >  }
> > >  
> > > +static void spapr_hotplug_set_signalled(uint32_t drc_index)
> > > +{
> > > +    sPAPRDRConnector *drc = spapr_dr_connector_by_index(drc_index);
> > > +    sPAPRDRConnectorClass *drck = SPAPR_DR_CONNECTOR_GET_CLASS(drc);
> > > +    drck->set_signalled(drc);
> > > +}
> > > +
> > >  static void spapr_hotplug_req_event(uint8_t hp_id, uint8_t hp_action,
> > >                                      sPAPRDRConnectorType drc_type,
> > >                                      uint32_t drc)
> > > @@ -453,6 +460,10 @@ static void spapr_hotplug_req_event(uint8_t hp_id, uint8_t hp_action,
> > >  
> > >      rtas_event_log_queue(RTAS_LOG_TYPE_HOTPLUG, new_hp, true);
> > >  
> > > +    if (hp->hotplug_action == RTAS_LOG_V6_HP_ACTION_ADD) {
> > > +        spapr_hotplug_set_signalled(drc);
> > > +    }
> > > +
> > >      qemu_irq_pulse(xics_get_qirq(spapr->icp, spapr->check_exception_irq));
> > >  }
> > >  
> > > diff --git a/include/hw/ppc/spapr_drc.h b/include/hw/ppc/spapr_drc.h
> > > index 7e56347..fa21ba0 100644
> > > --- a/include/hw/ppc/spapr_drc.h
> > > +++ b/include/hw/ppc/spapr_drc.h
> > > @@ -151,6 +151,7 @@ typedef struct sPAPRDRConnector {
> > >      bool configured;
> > >  
> > >      bool awaiting_release;
> > > +    bool signalled;
> > >  
> > >      /* device pointer, via link property */
> > >      DeviceState *dev;
> > > @@ -188,6 +189,7 @@ typedef struct sPAPRDRConnectorClass {
> > >                     spapr_drc_detach_cb *detach_cb,
> > >                     void *detach_cb_opaque, Error **errp);
> > >      bool (*release_pending)(sPAPRDRConnector *drc);
> > > +    void (*set_signalled)(sPAPRDRConnector *drc);
> > >  } sPAPRDRConnectorClass;
> > >  
> > >  sPAPRDRConnector *spapr_dr_connector_new(Object *owner,
> > 
> 

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 819 bytes --]

^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2016-04-04  6:07 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-03-31 22:27 [Qemu-devel] [PATCH for-2.6] spapr_drc: enable immediate detach for unsignalled devices Michael Roth
2016-04-01  2:33 ` David Gibson
2016-04-01  3:47   ` Michael Roth
2016-04-04  4:33     ` David Gibson

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.