linux-pci.vger.kernel.org archive mirror
* [PATCH v2] PCI: vmd: Add indirection layer to vmd irq lists
@ 2019-10-31 13:08 Jon Derrick
  2019-10-31 23:11 ` Paul E. McKenney
                   ` (2 more replies)
  0 siblings, 3 replies; 9+ messages in thread
From: Jon Derrick @ 2019-10-31 13:08 UTC (permalink / raw)
  To: Lorenzo Pieralisi; +Cc: linux-pci, Paul McKenney, Bjorn Helgaas, Jon Derrick

With CONFIG_MAXSMP and CONFIG_PROVE_LOCKING, the size of an srcu_struct can
grow quite large. In one compilation instance it produced a 74KiB data
structure. These are embedded in the vmd_irq_list struct, and an N=64
allocation can exceed MAX_ORDER, violating reclaim rules.

  struct srcu_struct {
          struct srcu_node   node[521];                    /*     0 75024 */
          /* --- cacheline 1172 boundary (75008 bytes) was 16 bytes ago --- */
          struct srcu_node *         level[4];             /* 75024    32 */
          struct mutex       srcu_cb_mutex;                /* 75056   128 */
          /* --- cacheline 1174 boundary (75136 bytes) was 48 bytes ago --- */
          spinlock_t                 lock;                 /* 75184    56 */
          /* --- cacheline 1175 boundary (75200 bytes) was 40 bytes ago --- */
          struct mutex       srcu_gp_mutex;                /* 75240   128 */
          /* --- cacheline 1177 boundary (75328 bytes) was 40 bytes ago --- */
          unsigned int               srcu_idx;             /* 75368     4 */

          /* XXX 4 bytes hole, try to pack */

          long unsigned int          srcu_gp_seq;          /* 75376     8 */
          long unsigned int          srcu_gp_seq_needed;   /* 75384     8 */
          /* --- cacheline 1178 boundary (75392 bytes) --- */
          long unsigned int          srcu_gp_seq_needed_exp; /* 75392     8 */
          long unsigned int          srcu_last_gp_end;     /* 75400     8 */
          struct srcu_data *         sda;                  /* 75408     8 */
          long unsigned int          srcu_barrier_seq;     /* 75416     8 */
          struct mutex       srcu_barrier_mutex;           /* 75424   128 */
          /* --- cacheline 1180 boundary (75520 bytes) was 32 bytes ago --- */
          struct completion  srcu_barrier_completion;      /* 75552    80 */
          /* --- cacheline 1181 boundary (75584 bytes) was 48 bytes ago --- */
          atomic_t                   srcu_barrier_cpu_cnt; /* 75632     4 */

          /* XXX 4 bytes hole, try to pack */

          struct delayed_work work;                        /* 75640   152 */

          /* XXX last struct has 4 bytes of padding */

          /* --- cacheline 1184 boundary (75776 bytes) was 16 bytes ago --- */
          struct lockdep_map dep_map;                      /* 75792    32 */

          /* size: 75824, cachelines: 1185, members: 17 */
          /* sum members: 75816, holes: 2, sum holes: 8 */
          /* paddings: 1, sum paddings: 4 */
          /* last cacheline: 48 bytes */
  };

With N=64 VMD IRQ lists, this would allocate 4.6MiB in a single call. This
violates MAX_ORDER reclaim rules when PAGE_SIZE=4096 and
MAX_ORDER_NR_PAGES=1024, and triggers the following warning in mm/page_alloc.c:

  /*
   * There are several places where we assume that the order value is sane
   * so bail out early if the request is out of bound.
   */
  if (unlikely(order >= MAX_ORDER)) {
  	WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN));
  	return NULL;
  }
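
For concreteness, the arithmetic (assuming MAX_ORDER=11, the value implied
by MAX_ORDER_NR_PAGES=1024 above, since MAX_ORDER_NR_PAGES = 1 <<
(MAX_ORDER - 1)):

  64 * 75824 bytes = 4852736 bytes = 1185 pages at PAGE_SIZE=4096
  smallest order covering 1185 pages: 11  (2^10 = 1024 < 1185 <= 2^11)
  order 11 >= MAX_ORDER 11, so the WARN_ON_ONCE() fires and the
  allocation fails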

This patch changes the IRQ list array into an array of pointers to IRQ
lists to avoid allocation failures with larger MSI-X counts.

This patch also reverts commit b31822277abcd7c83d1c1c0af876da9ccdf3b7d6.
The index_from_irqs() helper was added to calculate the IRQ list index
by pointer arithmetic on the array of IRQ lists, in order to shrink
vmd_irq_list for performance. With an array of pointers, that arithmetic
no longer yields the index, so each vmd_irq_list now stores its index
explicitly.

Because the embedded srcu_struct gives vmd_irq_list a size that varies
with the kernel configuration, the struct no longer guarantees an
optimal data structure size and granularity in any case, so little is
lost by growing it by four bytes.

Signed-off-by: Jon Derrick <jonathan.derrick@intel.com>
---
Added Paul to make him aware of srcu_struct size with these options

v1->v2:
Squashed the revert into this commit
Changed the n=1 kcalloc to kzalloc

 drivers/pci/controller/vmd.c | 47 ++++++++++++++++++++++----------------------
 1 file changed, 24 insertions(+), 23 deletions(-)
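
In outline, the resulting allocation pattern is the following sketch (the
devm_kcalloc() of the pointer array itself sits just above the first
vmd_probe() hunk and needs no textual change, since sizeof(*vmd->irqs)
now evaluates to the size of a pointer):

  vmd->irqs = devm_kcalloc(&dev->dev, vmd->msix_count,
                           sizeof(*vmd->irqs), GFP_KERNEL); /* N pointers */
  for (i = 0; i < vmd->msix_count; i++)
          vmd->irqs[i] = devm_kzalloc(&dev->dev, sizeof(**vmd->irqs),
                                      GFP_KERNEL);       /* one list each */

Each per-list allocation is sizeof(struct vmd_irq_list), about 74KiB in
the worst case above (an order-5 request, comfortably below MAX_ORDER).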

diff --git a/drivers/pci/controller/vmd.c b/drivers/pci/controller/vmd.c
index a35d3f3..8bce647 100644
--- a/drivers/pci/controller/vmd.c
+++ b/drivers/pci/controller/vmd.c
@@ -82,6 +82,7 @@ struct vmd_irq_list {
 	struct list_head	irq_list;
 	struct srcu_struct	srcu;
 	unsigned int		count;
+	unsigned int		index;
 };
 
 struct vmd_dev {
@@ -91,7 +92,7 @@ struct vmd_dev {
 	char __iomem		*cfgbar;
 
 	int msix_count;
-	struct vmd_irq_list	*irqs;
+	struct vmd_irq_list	**irqs;
 
 	struct pci_sysdata	sysdata;
 	struct resource		resources[3];
@@ -108,12 +109,6 @@ static inline struct vmd_dev *vmd_from_bus(struct pci_bus *bus)
 	return container_of(bus->sysdata, struct vmd_dev, sysdata);
 }
 
-static inline unsigned int index_from_irqs(struct vmd_dev *vmd,
-					   struct vmd_irq_list *irqs)
-{
-	return irqs - vmd->irqs;
-}
-
 /*
  * Drivers managing a device in a VMD domain allocate their own IRQs as before,
  * but the MSI entry for the hardware it's driving will be programmed with a
@@ -126,11 +121,10 @@ static void vmd_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
 {
 	struct vmd_irq *vmdirq = data->chip_data;
 	struct vmd_irq_list *irq = vmdirq->irq;
-	struct vmd_dev *vmd = irq_data_get_irq_handler_data(data);
 
 	msg->address_hi = MSI_ADDR_BASE_HI;
 	msg->address_lo = MSI_ADDR_BASE_LO |
-			  MSI_ADDR_DEST_ID(index_from_irqs(vmd, irq));
+			  MSI_ADDR_DEST_ID(irq->index);
 	msg->data = 0;
 }
 
@@ -200,7 +194,7 @@ static struct vmd_irq_list *vmd_next_irq(struct vmd_dev *vmd, struct msi_desc *d
 	unsigned long flags;
 
 	if (vmd->msix_count == 1)
-		return &vmd->irqs[0];
+		return vmd->irqs[0];
 
 	/*
 	 * White list for fast-interrupt handlers. All others will share the
@@ -210,17 +204,17 @@ static struct vmd_irq_list *vmd_next_irq(struct vmd_dev *vmd, struct msi_desc *d
 	case PCI_CLASS_STORAGE_EXPRESS:
 		break;
 	default:
-		return &vmd->irqs[0];
+		return vmd->irqs[0];
 	}
 
 	raw_spin_lock_irqsave(&list_lock, flags);
 	for (i = 1; i < vmd->msix_count; i++)
-		if (vmd->irqs[i].count < vmd->irqs[best].count)
+		if (vmd->irqs[i]->count < vmd->irqs[best]->count)
 			best = i;
-	vmd->irqs[best].count++;
+	vmd->irqs[best]->count++;
 	raw_spin_unlock_irqrestore(&list_lock, flags);
 
-	return &vmd->irqs[best];
+	return vmd->irqs[best];
 }
 
 static int vmd_msi_init(struct irq_domain *domain, struct msi_domain_info *info,
@@ -230,7 +224,7 @@ static int vmd_msi_init(struct irq_domain *domain, struct msi_domain_info *info,
 	struct msi_desc *desc = arg->desc;
 	struct vmd_dev *vmd = vmd_from_bus(msi_desc_to_pci_dev(desc)->bus);
 	struct vmd_irq *vmdirq = kzalloc(sizeof(*vmdirq), GFP_KERNEL);
-	unsigned int index, vector;
+	unsigned int vector;
 
 	if (!vmdirq)
 		return -ENOMEM;
@@ -238,8 +232,7 @@ static int vmd_msi_init(struct irq_domain *domain, struct msi_domain_info *info,
 	INIT_LIST_HEAD(&vmdirq->node);
 	vmdirq->irq = vmd_next_irq(vmd, desc);
 	vmdirq->virq = virq;
-	index = index_from_irqs(vmd, vmdirq->irq);
-	vector = pci_irq_vector(vmd->dev, index);
+	vector = pci_irq_vector(vmd->dev, vmdirq->irq->index);
 
 	irq_domain_set_info(domain, virq, vector, info->chip, vmdirq,
 			    handle_untracked_irq, vmd, NULL);
@@ -771,14 +764,22 @@ static int vmd_probe(struct pci_dev *dev, const struct pci_device_id *id)
 		return -ENOMEM;
 
 	for (i = 0; i < vmd->msix_count; i++) {
-		err = init_srcu_struct(&vmd->irqs[i].srcu);
+		vmd->irqs[i] = devm_kzalloc(&dev->dev, sizeof(**vmd->irqs),
+					    GFP_KERNEL);
+		if (!vmd->irqs[i])
+			return -ENOMEM;
+	}
+
+	for (i = 0; i < vmd->msix_count; i++) {
+		err = init_srcu_struct(&vmd->irqs[i]->srcu);
 		if (err)
 			return err;
 
-		INIT_LIST_HEAD(&vmd->irqs[i].irq_list);
+		INIT_LIST_HEAD(&vmd->irqs[i]->irq_list);
+		vmd->irqs[i]->index = i;
 		err = devm_request_irq(&dev->dev, pci_irq_vector(dev, i),
 				       vmd_irq, IRQF_NO_THREAD,
-				       "vmd", &vmd->irqs[i]);
+				       "vmd", vmd->irqs[i]);
 		if (err)
 			return err;
 	}
@@ -799,7 +800,7 @@ static void vmd_cleanup_srcu(struct vmd_dev *vmd)
 	int i;
 
 	for (i = 0; i < vmd->msix_count; i++)
-		cleanup_srcu_struct(&vmd->irqs[i].srcu);
+		cleanup_srcu_struct(&vmd->irqs[i]->srcu);
 }
 
 static void vmd_remove(struct pci_dev *dev)
@@ -823,7 +824,7 @@ static int vmd_suspend(struct device *dev)
 	int i;
 
 	for (i = 0; i < vmd->msix_count; i++)
-                devm_free_irq(dev, pci_irq_vector(pdev, i), &vmd->irqs[i]);
+                devm_free_irq(dev, pci_irq_vector(pdev, i), vmd->irqs[i]);
 
 	pci_save_state(pdev);
 	return 0;
@@ -838,7 +839,7 @@ static int vmd_resume(struct device *dev)
 	for (i = 0; i < vmd->msix_count; i++) {
 		err = devm_request_irq(dev, pci_irq_vector(pdev, i),
 				       vmd_irq, IRQF_NO_THREAD,
-				       "vmd", &vmd->irqs[i]);
+				       "vmd", vmd->irqs[i]);
 		if (err)
 			return err;
 	}
-- 
1.8.3.1



* Re: [PATCH v2] PCI: vmd: Add indirection layer to vmd irq lists
  2019-10-31 13:08 [PATCH v2] PCI: vmd: Add indirection layer to vmd irq lists Jon Derrick
@ 2019-10-31 23:11 ` Paul E. McKenney
  2019-11-06 16:25   ` Derrick, Jonathan
  2022-01-22  0:03 ` Scott Wood
  2022-01-22  0:19 ` Scott Wood
  2 siblings, 1 reply; 9+ messages in thread
From: Paul E. McKenney @ 2019-10-31 23:11 UTC (permalink / raw)
  To: Jon Derrick; +Cc: Lorenzo Pieralisi, linux-pci, Bjorn Helgaas

On Thu, Oct 31, 2019 at 07:08:53AM -0600, Jon Derrick wrote:
> With CONFIG_MAXSMP and CONFIG_PROVE_LOCKING, the size of an srcu_struct can
> grow quite large. In one compilation instance it produced a 74KiB data
> structure. These are embedded in the vmd_irq_list struct, and an N=64 allocation
> can exceed MAX_ORDER, violating reclaim rules.
> 
[snip]
> ---
> Added Paul to make him aware of srcu_struct size with these options

There was some discussion of making the srcu_struct structure's ->node[]
array be separately allocated, which would allow this array to be
right-sized for the system in question.  However, I believe they ended up
instead separately allocating the srcu_struct structure itself.

Without doing something like that, I am kind of stuck.  After all,
at compile time, the kernel build system tells SRCU that it needs to
be prepared to run on systems with thousands of CPUs.  Which requires
substantial memory to keep track of all those CPUs.  Which are not
present on most systems.
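
For a rough sense of the scale involved (assuming NR_CPUS=8192 from
CONFIG_MAXSMP and the default fanouts RCU_FANOUT_LEAF=16 and
RCU_FANOUT=64, which together reproduce the node[521] seen in the
pahole dump above):

  leaves:   8192 / 16 = 512 srcu_node structures
  level 1:   512 / 64 =   8
  root:                   1
  total:                521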

Thoughts?

							Thanx, Paul


* Re: [PATCH v2] PCI: vmd: Add indirection layer to vmd irq lists
  2019-10-31 23:11 ` Paul E. McKenney
@ 2019-11-06 16:25   ` Derrick, Jonathan
  2020-02-28 11:10     ` Lorenzo Pieralisi
  0 siblings, 1 reply; 9+ messages in thread
From: Derrick, Jonathan @ 2019-11-06 16:25 UTC (permalink / raw)
  To: paulmck; +Cc: lorenzo.pieralisi, linux-pci, helgaas

On Thu, 2019-10-31 at 16:11 -0700, Paul E. McKenney wrote:
> On Thu, Oct 31, 2019 at 07:08:53AM -0600, Jon Derrick wrote:
> > With CONFIG_MAXSMP and CONFIG_PROVE_LOCKING, the size of an srcu_struct can
> > grow quite large. In one compilation instance it produced a 74KiB data
> > structure. These are embedded in the vmd_irq_list struct, and an N=64 allocation
> > can exceed MAX_ORDER, violating reclaim rules.
> > 
[snip]
> > ---
> > Added Paul to make him aware of srcu_struct size with these options
> 
> There was some discussion of making the srcu_struct structure's ->node[]
> array be separately allocated, which would allow this array to be
> right-sized for the system in question.  However, I believe they ended up
> instead separately allocating the srcu_struct structure itself.
> 
> Without doing something like that, I am kind of stuck.  After all,
> at compile time, the kernel build system tells SRCU that it needs to
> be prepared to run on systems with thousands of CPUs.  Which requires
> substantial memory to keep track of all those CPUs.  Which are not
> present on most systems.
> 
> Thoughts?
> 
> 							Thanx, Paul
> 

Yes I haven't seen an elegant solution other than making users aware of
the situation.

Thanks for your input


* Re: [PATCH v2] PCI: vmd: Add indirection layer to vmd irq lists
  2019-11-06 16:25   ` Derrick, Jonathan
@ 2020-02-28 11:10     ` Lorenzo Pieralisi
  2020-02-28 14:34       ` Paul E. McKenney
  0 siblings, 1 reply; 9+ messages in thread
From: Lorenzo Pieralisi @ 2020-02-28 11:10 UTC (permalink / raw)
  To: Derrick, Jonathan; +Cc: paulmck, linux-pci, helgaas

On Wed, Nov 06, 2019 at 04:25:25PM +0000, Derrick, Jonathan wrote:
> On Thu, 2019-10-31 at 16:11 -0700, Paul E. McKenney wrote:
> > On Thu, Oct 31, 2019 at 07:08:53AM -0600, Jon Derrick wrote:
> > > With CONFIG_MAXSMP and CONFIG_PROVE_LOCKING, the size of an srcu_struct can
> > > grow quite large. In one compilation instance it produced a 74KiB data
> > > structure. These are embedded in the vmd_irq_list struct, and an N=64 allocation
> > > can exceed MAX_ORDER, violating reclaim rules.
> > > 
[snip]
> > > ---
> > > Added Paul to make him aware of srcu_struct size with these options
> > 
> > There was some discussion of making the srcu_struct structure's ->node[]
> > array be separately allocated, which would allow this array to be
> > right-sized for the system in question.  However, I believe they ended up
> > instead separately allocating the srcu_struct structure itself.
> > 
> > Without doing something like that, I am kind of stuck.  After all,
> > at compile time, the kernel build system tells SRCU that it needs to
> > be prepared to run on systems with thousands of CPUs.  Which requires
> > substantial memory to keep track of all those CPUs.  Which are not
> > present on most systems.
> > 
> > Thoughts?
> > 
> > 							Thanx, Paul
> > 
> 
> Yes I haven't seen an elegant solution other than making users aware
> of the situation.
> 
> Thanks for your input

Jon, Paul,

I don't know if there was any further development in this area in the
meantime. Should we proceed with this patch?

Thanks,
Lorenzo


* Re: [PATCH v2] PCI: vmd: Add indirection layer to vmd irq lists
  2020-02-28 11:10     ` Lorenzo Pieralisi
@ 2020-02-28 14:34       ` Paul E. McKenney
  2020-02-28 16:36         ` Derrick, Jonathan
  0 siblings, 1 reply; 9+ messages in thread
From: Paul E. McKenney @ 2020-02-28 14:34 UTC (permalink / raw)
  To: Lorenzo Pieralisi; +Cc: Derrick, Jonathan, linux-pci, helgaas

On Fri, Feb 28, 2020 at 11:10:10AM +0000, Lorenzo Pieralisi wrote:
> On Wed, Nov 06, 2019 at 04:25:25PM +0000, Derrick, Jonathan wrote:
> > On Thu, 2019-10-31 at 16:11 -0700, Paul E. McKenney wrote:
> > > On Thu, Oct 31, 2019 at 07:08:53AM -0600, Jon Derrick wrote:
> > > > With CONFIG_MAXSMP and CONFIG_PROVE_LOCKING, the size of an srcu_struct can
> > > > grow quite large. In one compilation instance it produced a 74KiB data
> > > > structure. These are embedded in the vmd_irq_list struct, and an N=64 allocation
> > > > can exceed MAX_ORDER, violating reclaim rules.
> > > > 
[snip]
> > > > ---
> > > > Added Paul to make him aware of srcu_struct size with these options
> > > 
> > > There was some discussion of making the srcu_struct structure's ->node[]
> > > array be separately allocated, which would allow this array to be
> > > right-sized for the system in question.  However, I believe they ended up
> > > instead separately allocating the srcu_struct structure itself.
> > > 
> > > Without doing something like that, I am kind of stuck.  After all,
> > > at compile time, the kernel build system tells SRCU that it needs to
> > > be prepared to run on systems with thousands of CPUs.  Which requires
> > > substantial memory to keep track of all those CPUs.  Which are not
> > > present on most systems.
> > > 
> > > Thoughts?
> > 
> > Yes I haven't seen an elegant solution other than making users aware
> > of the situation.
> > 
> > Thanks for your input
> 
> Jon, Paul,
> 
> I don't know if there was any further development in this area in the
> meantime. Should we proceed with this patch?

Let me be more explicit.  Would it be helpful to you guys if there was
a variable-sized ->node[] array that is separately allocated?  If so,
please do tell me.  After all, I cannot read your minds  ;-)

An instance of such a variant would not be available via DEFINE_SRCU(),
which at compile time would absolutely need to allocate as many elements
as Kconfig said to allocate.  In addition, instances of srcu_struct
taking this approach would not be usable until after init_srcu_struct()
was invoked, which would allocate a right-sized ->node array.
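
In other words, such an srcu_struct would have to follow the dynamic
usage pattern (a minimal sketch, with error handling trimmed):

  struct srcu_struct my_srcu;
  int idx;

  if (init_srcu_struct(&my_srcu))  /* allocates the right-sized */
          return -ENOMEM;          /* ->node array at run time  */

  idx = srcu_read_lock(&my_srcu);
  /* ... read-side critical section ... */
  srcu_read_unlock(&my_srcu, idx);

  cleanup_srcu_struct(&my_srcu);   /* releases the ->node array */

rather than the compile-time DEFINE_SRCU(my_srcu) form.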

Again, would this be helpful?

							Thanx, Paul


* Re: [PATCH v2] PCI: vmd: Add indirection layer to vmd irq lists
  2020-02-28 14:34       ` Paul E. McKenney
@ 2020-02-28 16:36         ` Derrick, Jonathan
  0 siblings, 0 replies; 9+ messages in thread
From: Derrick, Jonathan @ 2020-02-28 16:36 UTC (permalink / raw)
  To: lorenzo.pieralisi, paulmck; +Cc: linux-pci, helgaas

On Fri, 2020-02-28 at 06:34 -0800, Paul E. McKenney wrote:
> On Fri, Feb 28, 2020 at 11:10:10AM +0000, Lorenzo Pieralisi wrote:
> > On Wed, Nov 06, 2019 at 04:25:25PM +0000, Derrick, Jonathan wrote:
> > > On Thu, 2019-10-31 at 16:11 -0700, Paul E. McKenney wrote:
> > > > On Thu, Oct 31, 2019 at 07:08:53AM -0600, Jon Derrick wrote:
> > > > > With CONFIG_MAXSMP and CONFIG_PROVE_LOCKING, the size of an srcu_struct can
> > > > > grow quite large. In one compilation instance it produced a 74KiB data
> > > > > structure. These are embedded in the vmd_irq_list struct, and an N=64 allocation
> > > > > can exceed MAX_ORDER, violating reclaim rules.
> > > > > 
[snip]
> > > > > ---
> > > > > Added Paul to make him aware of srcu_struct size with these options
> > > > 
> > > > There was some discussion of making the srcu_struct structure's ->node[]
> > > > array be separately allocated, which would allow this array to be
> > > > right-sized for the system in question.  However, I believe they ended up
> > > > instead separately allocating the srcu_struct structure itself.
> > > > 
> > > > Without doing something like that, I am kind of stuck.  After all,
> > > > at compile time, the kernel build system tells SRCU that it needs to
> > > > be prepared to run on systems with thousands of CPUs.  Which requires
> > > > substantial memory to keep track of all those CPUs.  Which are not
> > > > present on most systems.
> > > > 
> > > > Thoughts?
> > > 
> > > Yes I haven't seen an elegant solution other than making users aware
> > > of the situation.
> > > 
> > > Thanks for your input
> > 
> > Jon, Paul,
> > 
> > I don't know if there was any further development in this area in the
> > meantime. Should we proceed with this patch?
> 
> Let me be more explicit.  Would it be helpful to you guys if there was
> a variable-sized ->node[] array that is separately allocated?  If so,
> please do tell me.  After all, I cannot read your minds  ;-)
> 
Frankly I'm not versed enough in RCU to know the implications of this
change. How often have you come across the same issue I am facing? Is
it worth the effort versus my abstraction? Will it affect performance
or will the Sleepable component absorb it?

> An instance of such a variant would not be available via DEFINE_SRCU(),
> which at compile time would absolutely need to allocate as many elements
> as Kconfig said to allocate. 
So the implication is that it needs allocation later via
init_srcu_struct?

> In addition, instances of srcu_struct
> taking this approach would not be usable until after init_srcu_struct()
> was invoked, which would allocate a right-sized ->node array.
So similar to other lock init functions. Are there existing users doing
RCU before init_srcu_struct is called?

> 
> Again, would this be helpful?
> 
> 							Thanx, Paul


* Re: [PATCH v2] PCI: vmd: Add indirection layer to vmd irq lists
  2019-10-31 13:08 [PATCH v2] PCI: vmd: Add indirection layer to vmd irq lists Jon Derrick
  2019-10-31 23:11 ` Paul E. McKenney
@ 2022-01-22  0:03 ` Scott Wood
  2022-01-22  0:22   ` Paul E. McKenney
  2022-01-22  0:19 ` Scott Wood
  2 siblings, 1 reply; 9+ messages in thread
From: Scott Wood @ 2022-01-22  0:03 UTC (permalink / raw)
  To: Jon Derrick; +Cc: Lorenzo Pieralisi, linux-pci, Paul McKenney, Bjorn Helgaas

On Thu, Oct 31, 2019 at 07:08:53AM -0600, Jon Derrick wrote:
> With CONFIG_MAXSMP and CONFIG_PROVE_LOCKING, the size of an srcu_struct can
> grow quite large. In one compilation instance it produced a 74KiB data
> structure. These are embedded in the vmd_irq_list struct, and an N=64 allocation
> can exceed MAX_ORDER, violating reclaim rules.
[snip]
> ---
> Added Paul to make him aware of srcu_struct size with these options
> 
> v1->v2:
> Squashed the revert into this commit
> changed n=1 kcalloc to kzalloc
> 
>  drivers/pci/controller/vmd.c | 47 ++++++++++++++++++++++----------------------
>  1 file changed, 24 insertions(+), 23 deletions(-)

Has there been any progress on this?  We're seeing a similar problem
in a PREEMPT_RT kernel with MAXSMP.

-Scott



* Re: [PATCH v2] PCI: vmd: Add indirection layer to vmd irq lists
  2019-10-31 13:08 [PATCH v2] PCI: vmd: Add indirection layer to vmd irq lists Jon Derrick
  2019-10-31 23:11 ` Paul E. McKenney
  2022-01-22  0:03 ` Scott Wood
@ 2022-01-22  0:19 ` Scott Wood
  2 siblings, 0 replies; 9+ messages in thread
From: Scott Wood @ 2022-01-22  0:19 UTC (permalink / raw)
  To: Jonathan Derrick
  Cc: Lorenzo Pieralisi, linux-pci, Paul McKenney, Bjorn Helgaas

[resent with Jonathan's current address]

On Thu, Oct 31, 2019 at 07:08:53AM -0600, Jon Derrick wrote:
> With CONFIG_MAXSMP and CONFIG_PROVE_LOCKING, the size of an srcu_struct can
> grow quite large. In one compilation instance it produced a 74KiB data
> structure. These are embedded in the vmd_irq_list struct, and an N=64 allocation
> can exceed MAX_ORDER, violating reclaim rules.
[snip]
> ---
> Added Paul to make him aware of srcu_struct size with these options
> 
> v1->v2:
> Squashed the revert into this commit
> changed n=1 kcalloc to kzalloc
> 
>  drivers/pci/controller/vmd.c | 47 ++++++++++++++++++++++----------------------
>  1 file changed, 24 insertions(+), 23 deletions(-)

Has there been any progress on this?  We're seeing a similar problem
in a PREEMPT_RT kernel with MAXSMP.

-Scott



* Re: [PATCH v2] PCI: vmd: Add indirection layer to vmd irq lists
  2022-01-22  0:03 ` Scott Wood
@ 2022-01-22  0:22   ` Paul E. McKenney
  0 siblings, 0 replies; 9+ messages in thread
From: Paul E. McKenney @ 2022-01-22  0:22 UTC (permalink / raw)
  To: Scott Wood; +Cc: Jon Derrick, Lorenzo Pieralisi, linux-pci, Bjorn Helgaas

On Fri, Jan 21, 2022 at 06:03:33PM -0600, Scott Wood wrote:
> On Thu, Oct 31, 2019 at 07:08:53AM -0600, Jon Derrick wrote:
> > With CONFIG_MAXSMP and CONFIG_PROVE_LOCKING, the size of an srcu_struct can
> > grow quite large. In one compilation instance it produced a 74KiB data
> > structure. These are embedded in the vmd_irq_list struct, and an N=64 allocation
> > can exceed MAX_ORDER, violating reclaim rules.
> [snip]
> > ---
> > Added Paul to make him aware of srcu_struct size with these options
> > 
> > v1->v2:
> > Squashed the revert into this commit
> > changed n=1 kcalloc to kzalloc
> > 
> >  drivers/pci/controller/vmd.c | 47 ++++++++++++++++++++++----------------------
> >  1 file changed, 24 insertions(+), 23 deletions(-)
> 
> Has there been any progress on this?  We're seeing a similar problem
> in a PREEMPT_RT kernel with MAXSMP.

In case this helps...

I have started working on shrinking the srcu_struct structure.  Those
possessing intestinal fortitude of mythic proportions might wish to
try out this very lightly tested -rcu commit:

1385139340b7 ("srcu: Dynamically allocate srcu_node array")

This will be followed by additional commits that will dispense
entirely with the srcu_node array unless or until that particular
srcu_struct structure encounters significant update-side contention.

Thoughts?

							Thanx, Paul

------------------------------------------------------------------------

commit 1385139340b7a1c8f35cb7a52af221096cdef86e
Author: Paul E. McKenney <paulmck@kernel.org>
Date:   Fri Jan 21 16:13:52 2022 -0800

    srcu: Dynamically allocate srcu_node array
    
    This commit shrinks the srcu_struct structure by converting its ->node
    field from a fixed-size compile-time array to a pointer to a dynamically
    allocated array.  In kernels built with large values of NR_CPUS that boot
    on systems with smaller numbers of CPUs, this can save significant memory.
    
    Reported-by: A cast of thousands
    Signed-off-by: Paul E. McKenney <paulmck@kernel.org>

diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h
index 4025840ba9a38..f7190058fb8ab 100644
--- a/include/linux/srcutree.h
+++ b/include/linux/srcutree.h
@@ -60,7 +60,7 @@ struct srcu_node {
  * Per-SRCU-domain structure, similar in function to rcu_state.
  */
 struct srcu_struct {
-	struct srcu_node node[NUM_RCU_NODES];	/* Combining tree. */
+	struct srcu_node *node;			/* Combining tree. */
 	struct srcu_node *level[RCU_NUM_LVLS + 1];
 						/* First node at each level. */
 	struct mutex srcu_cb_mutex;		/* Serialize CB preparation. */
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index 7d13e35e5d277..7cb5f34c62418 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -24,6 +24,7 @@
 #include <linux/smp.h>
 #include <linux/delay.h>
 #include <linux/module.h>
+#include <linux/slab.h>
 #include <linux/srcu.h>
 
 #include "rcu.h"
@@ -75,12 +76,44 @@ do {									\
 	spin_unlock_irqrestore(&ACCESS_PRIVATE(p, lock), flags)	\
 
 /*
- * Initialize SRCU combining tree.  Note that statically allocated
+ * Initialize SRCU per-CPU data.  Note that statically allocated
  * srcu_struct structures might already have srcu_read_lock() and
  * srcu_read_unlock() running against them.  So if the is_static parameter
  * is set, don't initialize ->srcu_lock_count[] and ->srcu_unlock_count[].
+ *
+ * This function does no memory allocation and thus cannot fail.
+ */
+static void init_srcu_struct_data(struct srcu_struct *ssp)
+{
+	int cpu;
+	struct srcu_data *sdp;
+
+	/*
+	 * Initialize the per-CPU srcu_data array, which feeds into the
+	 * leaves of the srcu_node tree.
+	 */
+	WARN_ON_ONCE(ARRAY_SIZE(sdp->srcu_lock_count) !=
+		     ARRAY_SIZE(sdp->srcu_unlock_count));
+	for_each_possible_cpu(cpu) {
+		sdp = per_cpu_ptr(ssp->sda, cpu);
+		spin_lock_init(&ACCESS_PRIVATE(sdp, lock));
+		rcu_segcblist_init(&sdp->srcu_cblist);
+		sdp->srcu_cblist_invoking = false;
+		sdp->srcu_gp_seq_needed = ssp->srcu_gp_seq;
+		sdp->srcu_gp_seq_needed_exp = ssp->srcu_gp_seq;
+		sdp->mynode = NULL;
+		sdp->cpu = cpu;
+		INIT_WORK(&sdp->work, srcu_invoke_callbacks);
+		timer_setup(&sdp->delay_work, srcu_delay_timer, 0);
+		sdp->ssp = ssp;
+	}
+}
+
+/*
+ * Allocate and initialize the SRCU combining tree.  Returns @true if
+ * allocation succeeded and @false otherwise.
  */
-static void init_srcu_struct_nodes(struct srcu_struct *ssp)
+static bool init_srcu_struct_nodes(struct srcu_struct *ssp)
 {
 	int cpu;
 	int i;
@@ -92,6 +125,9 @@ static void init_srcu_struct_nodes(struct srcu_struct *ssp)
 
 	/* Initialize geometry if it has not already been initialized. */
 	rcu_init_geometry();
+	ssp->node = kcalloc(rcu_num_nodes, sizeof(*ssp->node), GFP_ATOMIC);
+	if (!ssp->node)
+		return false;
 
 	/* Work out the overall tree geometry. */
 	ssp->level[0] = &ssp->node[0];
@@ -129,29 +165,19 @@ static void init_srcu_struct_nodes(struct srcu_struct *ssp)
 	 * Initialize the per-CPU srcu_data array, which feeds into the
 	 * leaves of the srcu_node tree.
 	 */
-	WARN_ON_ONCE(ARRAY_SIZE(sdp->srcu_lock_count) !=
-		     ARRAY_SIZE(sdp->srcu_unlock_count));
 	level = rcu_num_lvls - 1;
 	snp_first = ssp->level[level];
 	for_each_possible_cpu(cpu) {
 		sdp = per_cpu_ptr(ssp->sda, cpu);
-		spin_lock_init(&ACCESS_PRIVATE(sdp, lock));
-		rcu_segcblist_init(&sdp->srcu_cblist);
-		sdp->srcu_cblist_invoking = false;
-		sdp->srcu_gp_seq_needed = ssp->srcu_gp_seq;
-		sdp->srcu_gp_seq_needed_exp = ssp->srcu_gp_seq;
 		sdp->mynode = &snp_first[cpu / levelspread[level]];
 		for (snp = sdp->mynode; snp != NULL; snp = snp->srcu_parent) {
 			if (snp->grplo < 0)
 				snp->grplo = cpu;
 			snp->grphi = cpu;
 		}
-		sdp->cpu = cpu;
-		INIT_WORK(&sdp->work, srcu_invoke_callbacks);
-		timer_setup(&sdp->delay_work, srcu_delay_timer, 0);
-		sdp->ssp = ssp;
 		sdp->grpmask = 1 << (cpu - sdp->mynode->grplo);
 	}
+	return true;
 }
 
 /*
@@ -162,6 +188,7 @@ static void init_srcu_struct_nodes(struct srcu_struct *ssp)
  */
 static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static)
 {
+	ssp->node = NULL;
 	mutex_init(&ssp->srcu_cb_mutex);
 	mutex_init(&ssp->srcu_gp_mutex);
 	ssp->srcu_idx = 0;
@@ -174,7 +201,8 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static)
 		ssp->sda = alloc_percpu(struct srcu_data);
 	if (!ssp->sda)
 		return -ENOMEM;
-	init_srcu_struct_nodes(ssp);
+	init_srcu_struct_data(ssp);
+	WARN_ON_ONCE(!init_srcu_struct_nodes(ssp));
 	ssp->srcu_gp_seq_needed_exp = 0;
 	ssp->srcu_last_gp_end = ktime_get_mono_fast_ns();
 	smp_store_release(&ssp->srcu_gp_seq_needed, 0); /* Init done. */


Thread overview: 9 messages
2019-10-31 13:08 [PATCH v2] PCI: vmd: Add indirection layer to vmd irq lists Jon Derrick
2019-10-31 23:11 ` Paul E. McKenney
2019-11-06 16:25   ` Derrick, Jonathan
2020-02-28 11:10     ` Lorenzo Pieralisi
2020-02-28 14:34       ` Paul E. McKenney
2020-02-28 16:36         ` Derrick, Jonathan
2022-01-22  0:03 ` Scott Wood
2022-01-22  0:22   ` Paul E. McKenney
2022-01-22  0:19 ` Scott Wood
