All of lore.kernel.org
 help / color / mirror / Atom feed
From: Klaus Jensen <its@irrelevant.dk>
To: Stefan Hajnoczi <stefanha@redhat.com>
Cc: qemu-block@nongnu.org, "Klaus Jensen" <k.jensen@samsung.com>,
	"Markus Armbruster" <armbru@redhat.com>,
	qemu-devel@nongnu.org, "Hannes Reinecke" <hare@suse.de>,
	"Keith Busch" <kbusch@kernel.org>,
	"Philippe Mathieu-Daudé" <philmd@redhat.com>
Subject: Re: [PATCH 4/4] hw/nvme: fix controller hot unplugging
Date: Wed, 7 Jul 2021 18:47:50 +0200	[thread overview]
Message-ID: <YOXatgr6weL1zy9s@apples.localdomain> (raw)
In-Reply-To: <YOXS4QFXOPXLW5//@stefanha-x1.localdomain>

[-- Attachment #1: Type: text/plain, Size: 11858 bytes --]

On Jul  7 17:14, Stefan Hajnoczi wrote:
>On Wed, Jul 07, 2021 at 12:43:56PM +0200, Hannes Reinecke wrote:
>> On 7/7/21 11:53 AM, Klaus Jensen wrote:
>> > On Jul  7 09:49, Hannes Reinecke wrote:
>> > > On 7/6/21 11:33 AM, Klaus Jensen wrote:
>> > > > From: Klaus Jensen <k.jensen@samsung.com>
>> > > >
>> > > > Prior to this patch the nvme-ns devices are always children of the
>> > > > NvmeBus owned by the NvmeCtrl. This causes the namespaces to be
>> > > > unrealized when the parent device is removed. However, when subsystems
>> > > > are involved, this is not what we want since the namespaces may be
>> > > > attached to other controllers as well.
>> > > >
>> > > > This patch adds an additional NvmeBus on the subsystem device. When
>> > > > nvme-ns devices are realized, if the parent controller device is linked
>> > > > to a subsystem, the parent bus is set to the subsystem one instead. This
>> > > > makes sure that namespaces are kept alive and not unrealized.
>> > > >
>> > > > Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
>> > > > ---
>> > > >  hw/nvme/nvme.h   | 18 ++++++++++--------
>> > > >  hw/nvme/ctrl.c   |  8 +++++---
>> > > >  hw/nvme/ns.c     | 32 +++++++++++++++++++++++++-------
>> > > >  hw/nvme/subsys.c |  4 ++++
>> > > >  4 files changed, 44 insertions(+), 18 deletions(-)
>> > > >
>> > > > diff --git a/hw/nvme/nvme.h b/hw/nvme/nvme.h
>> > > > index c4065467d877..9401e212f9f7 100644
>> > > > --- a/hw/nvme/nvme.h
>> > > > +++ b/hw/nvme/nvme.h
>> > > > @@ -33,12 +33,21 @@ QEMU_BUILD_BUG_ON(NVME_MAX_NAMESPACES >
>> > > > NVME_NSID_BROADCAST - 1);
>> > > >  typedef struct NvmeCtrl NvmeCtrl;
>> > > >  typedef struct NvmeNamespace NvmeNamespace;
>> > > > +#define TYPE_NVME_BUS "nvme-bus"
>> > > > +OBJECT_DECLARE_SIMPLE_TYPE(NvmeBus, NVME_BUS)
>> > > > +
>> > > > +typedef struct NvmeBus {
>> > > > +    BusState parent_bus;
>> > > > +    bool     is_subsys;
>> > > > +} NvmeBus;
>> > > > +
>> > > >  #define TYPE_NVME_SUBSYS "nvme-subsys"
>> > > >  #define NVME_SUBSYS(obj) \
>> > > >      OBJECT_CHECK(NvmeSubsystem, (obj), TYPE_NVME_SUBSYS)
>> > > >  typedef struct NvmeSubsystem {
>> > > >      DeviceState parent_obj;
>> > > > +    NvmeBus     bus;
>> > > >      uint8_t     subnqn[256];
>> > > >      NvmeCtrl      *ctrls[NVME_MAX_CONTROLLERS];
>> > > > @@ -365,13 +374,6 @@ typedef struct NvmeCQueue {
>> > > >      QTAILQ_HEAD(, NvmeRequest) req_list;
>> > > >  } NvmeCQueue;
>> > > > -#define TYPE_NVME_BUS "nvme-bus"
>> > > > -#define NVME_BUS(obj) OBJECT_CHECK(NvmeBus, (obj), TYPE_NVME_BUS)
>> > > > -
>> > > > -typedef struct NvmeBus {
>> > > > -    BusState parent_bus;
>> > > > -} NvmeBus;
>> > > > -
>> > > >  #define TYPE_NVME "nvme"
>> > > >  #define NVME(obj) \
>> > > >          OBJECT_CHECK(NvmeCtrl, (obj), TYPE_NVME)
>> > > > @@ -463,7 +465,7 @@ typedef struct NvmeCtrl {
>> > > >  static inline NvmeNamespace *nvme_ns(NvmeCtrl *n, uint32_t nsid)
>> > > >  {
>> > > > -    if (!nsid || nsid > NVME_MAX_NAMESPACES) {
>> > > > +    if (!n || !nsid || nsid > NVME_MAX_NAMESPACES) {
>> > > >          return NULL;
>> > > >      }
>> > > > diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
>> > > > index 90e3ee2b70ee..7c8fca36d9a5 100644
>> > > > --- a/hw/nvme/ctrl.c
>> > > > +++ b/hw/nvme/ctrl.c
>> > > > @@ -6516,11 +6516,13 @@ static void nvme_exit(PCIDevice *pci_dev)
>> > > >      for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
>> > > >          ns = nvme_ns(n, i);
>> > > > -        if (!ns) {
>> > > > -            continue;
>> > > > +        if (ns) {
>> > > > +            ns->attached--;
>> > > >          }
>> > > > +    }
>> > > > -        nvme_ns_cleanup(ns);
>> > > > +    if (n->subsys) {
>> > > > +        nvme_subsys_unregister_ctrl(n->subsys, n);
>> > > >      }
>> > > >      if (n->subsys) {
>> > > > diff --git a/hw/nvme/ns.c b/hw/nvme/ns.c
>> > > > index 3c4f5b8c714a..612a2786d75d 100644
>> > > > --- a/hw/nvme/ns.c
>> > > > +++ b/hw/nvme/ns.c
>> > > > @@ -444,13 +444,29 @@ void nvme_ns_cleanup(NvmeNamespace *ns)
>> > > >  static void nvme_ns_realize(DeviceState *dev, Error **errp)
>> > > >  {
>> > > >      NvmeNamespace *ns = NVME_NS(dev);
>> > > > -    BusState *s = qdev_get_parent_bus(dev);
>> > > > -    NvmeCtrl *n = NVME(s->parent);
>> > > > -    NvmeSubsystem *subsys = n->subsys;
>> > > > +    BusState *qbus = qdev_get_parent_bus(dev);
>> > > > +    NvmeBus *bus = NVME_BUS(qbus);
>> > > > +    NvmeCtrl *n = NULL;
>> > > > +    NvmeSubsystem *subsys = NULL;
>> > > >      uint32_t nsid = ns->params.nsid;
>> > > >      int i;
>> > > > -    if (!n->subsys) {
>> > > > +    if (bus->is_subsys) {
>> > > > +        subsys = NVME_SUBSYS(qbus->parent);
>> > > > +    } else {
>> > > > +        n = NVME(qbus->parent);
>> > > > +        subsys = n->subsys;
>> > > > +    }
>> > > > +
>> > > > +    if (subsys) {
>> > > > +        /*
>> > > > +         * If this namespace belongs to a subsystem (through a
>> > > > link on the
>> > > > +         * controller device), reparent the device.
>> > > > +         */
>> > > > +        if (!qdev_set_parent_bus(dev, &subsys->bus.parent_bus, errp)) {
>> > > > +            return;
>> > > > +        }
>> > > > +    } else {
>> > > >          if (ns->params.detached) {
>> > > >              error_setg(errp, "detached requires that the nvme
>> > > > device is "
>> > > >                         "linked to an nvme-subsys device");
>> > > > @@ -470,7 +486,7 @@ static void nvme_ns_realize(DeviceState
>> > > > *dev, Error **errp)
>> > > >      if (!nsid) {
>> > > >          for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
>> > > > -            if (nvme_ns(n, i) || nvme_subsys_ns(subsys, i)) {
>> > > > +            if (nvme_subsys_ns(subsys, i) || nvme_ns(n, i)) {
>> > > >                  continue;
>> > > >              }
>> > > > @@ -483,7 +499,7 @@ static void nvme_ns_realize(DeviceState
>> > > > *dev, Error **errp)
>> > > >              return;
>> > > >          }
>> > > >      } else {
>> > > > -        if (nvme_ns(n, nsid) || nvme_subsys_ns(subsys, nsid)) {
>> > > > +        if (nvme_subsys_ns(subsys, nsid) || nvme_ns(n, nsid)) {
>> > > >              error_setg(errp, "namespace id '%d' already
>> > > > allocated", nsid);
>> > > >              return;
>> > > >          }
>> > > > @@ -509,7 +525,9 @@ static void nvme_ns_realize(DeviceState
>> > > > *dev, Error **errp)
>> > > >          }
>> > > >      }
>> > > > -    nvme_attach_ns(n, ns);
>> > > > +    if (n) {
>> > > > +        nvme_attach_ns(n, ns);
>> > > > +    }
>> > > >  }
>> > > >  static Property nvme_ns_props[] = {
>> > > > diff --git a/hw/nvme/subsys.c b/hw/nvme/subsys.c
>> > > > index 92caa604a280..fb7c3a7c55fc 100644
>> > > > --- a/hw/nvme/subsys.c
>> > > > +++ b/hw/nvme/subsys.c
>> > > > @@ -50,6 +50,10 @@ static void nvme_subsys_realize(DeviceState
>> > > > *dev, Error **errp)
>> > > >  {
>> > > >      NvmeSubsystem *subsys = NVME_SUBSYS(dev);
>> > > > +    qbus_create_inplace(&subsys->bus, sizeof(NvmeBus),
>> > > > TYPE_NVME_BUS, dev,
>> > > > +                        dev->id);
>> > > > +    subsys->bus.is_subsys = true;
>> > > > +
>> > > >      nvme_subsys_setup(subsys);
>> > > >  }
>> > > >
>> > > Why not always create a subsystem, and distinguish between 'shared'
>> > > and 'non-shared' subsystems instead of the ->subsys pointer?
>> > > That way all namespaces can be children of the subsystem, we won't
>> > > need any reparenting, and the whole thing will be more in-line with
>> > > qdev, no?
>> > >
>> >
>> > Hi Hannes,
>> >
>> > Thanks for your reviews and comments!
>> >
>> > So, I have actually considered what you suggest.
>> >
>> > The problem is that the nvme-ns device currently expects to plug into a
>> > bus (an 'nvme-bus') and we cannot really get rid of that 'bus' parameter
>> > without breaking compatibility. I experimented with removing the bus
>> > from the nvme device and instead creating it in the nvme-subsys device.
>> > My idea here was to implicitly always create an nvme-subsys if not
>> > explicitly created by the user, but since nvme-subsys does not plug into
>> > any bus itself, it is not exposed in the qtree and thus not reachable
>> > (i.e., when realizing the nvme-ns device, it will complain that there
>> > are no 'nvme-bus' available to plug into). To make this work, I believe
>> > we'd have to have the nvme-subsys plug into the main system bus (or some
>> > other bus rooted at main-system-bus), and I'd prefer not to do that
>> > since this is already a flawed design and I think it would be more
>> > intrusive than what my patch does.
>> >
>> Sigh.
>> _Again_.
>>
>> I seem to trip over issues for which no patch can possibly be accepted as
>> first the infrastructure has to be reworked.
>>
>> ... there is a reason why I'm avoiding posting patches to qemu-devel :-(
>>
>> > I raised these issues on the mailinglist some time ago[1]. We didn't
>> > really find a good solution, but I think the conclusion is that the
>> > bus/device design of subsystems and namespaces is flawed. That's why I
>> > am working on an "objectification" of the two devices as suggested by
>> > Stefan[2] as a proper fix for this mess.
>> >
>> Actually, I _do_ think that it would be good to have an nvme-subsystem bus,
>> providing a list of all namespaces for that subsystem.
>> Creating a controller would then mean that one has to create a controller
>> and a namespace _separately_, as then the namespace would _not_ be a child
>> of the controller.
>> But that's arguably a good thing, as the namespaces _do_ have a separate
>> lifetime from controllers.
>> And the lifetime of the subsystem could be completely self-contained; the
>> controller would be looking up subsystems via the subsystem NQN if present;
>> I guess we'll need to create ad-hoc subsystems for PCIe.
>> But nothing insurmountable.
>> And certainly nothing which require a large-scale rework of qemu internals,
>> I would think.
>
>I don't remember the reason to use --object instead of --device. Maybe
>the --device approach is viable and it makes some sense because this is
>a "device" (like a SCSI LUN) that exposed to the guest. It would be
>necessary to have a nvme-subsys device on the system bus to act as the
>bus for nvme-ns devices as Klaus described.
>

A namespace (-device nvme-ns) does not expose anything on its own to the 
guest. Whatever the namespace provides (basically the association to a 
-drive) is accessed through the pcie controller (-device nvme). 
Similarly, the subsystem is just a concept for allowing multiple 
controllers to access the same namespace. And this is where the model 
clash because the parent/child relationship doesnt fit. Reversing the 
relationship (controller is a child of a namespace) also doesnt work 
since a contrller may have multiple namespace attached. A many-to-many 
relationship is required which cannot be expresseed with QDEV as far as 
I understand.

Talk is cheap, but I have a WIP that shims the -device's and moves the 
core functionality of namespaces and subsystems to -objects. This makes 
the setup much more flexible and fits QDEV/QOM much better.

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 488 bytes --]

  reply	other threads:[~2021-07-07 16:51 UTC|newest]

Thread overview: 16+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-07-06  9:33 [PATCH 0/4] hw/nvme: fix controller hotplugging Klaus Jensen
2021-07-06  9:33 ` [PATCH 1/4] hw/nvme: remove NvmeCtrl parameter from ns setup/check functions Klaus Jensen
2021-07-07  7:45   ` Hannes Reinecke
2021-07-06  9:33 ` [PATCH 2/4] hw/nvme: mark nvme-subsys non-hotpluggable Klaus Jensen
2021-07-07  7:45   ` Hannes Reinecke
2021-07-06  9:33 ` [PATCH 3/4] hw/nvme: unregister controller with subsystem at exit Klaus Jensen
2021-07-07  7:46   ` Hannes Reinecke
2021-07-06  9:33 ` [PATCH 4/4] hw/nvme: fix controller hot unplugging Klaus Jensen
2021-07-07  7:49   ` Hannes Reinecke
2021-07-07  9:53     ` Klaus Jensen
2021-07-07 10:43       ` Hannes Reinecke
2021-07-07 15:41         ` Klaus Jensen
2021-07-07 16:14         ` Stefan Hajnoczi
2021-07-07 16:47           ` Klaus Jensen [this message]
2021-07-08  6:15           ` Hannes Reinecke
2021-07-07 10:02   ` Klaus Jensen

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=YOXatgr6weL1zy9s@apples.localdomain \
    --to=its@irrelevant.dk \
    --cc=armbru@redhat.com \
    --cc=hare@suse.de \
    --cc=k.jensen@samsung.com \
    --cc=kbusch@kernel.org \
    --cc=philmd@redhat.com \
    --cc=qemu-block@nongnu.org \
    --cc=qemu-devel@nongnu.org \
    --cc=stefanha@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.