All of lore.kernel.org
 help / color / mirror / Atom feed
* [net-next PATCH v2] net: dsa: qca8k: pack driver struct and improve cache use
@ 2022-02-28 11:04 Ansuel Smith
  2022-03-03  1:53 ` Vladimir Oltean
  0 siblings, 1 reply; 10+ messages in thread
From: Ansuel Smith @ 2022-02-28 11:04 UTC (permalink / raw)
  To: Andrew Lunn, Vivien Didelot, Florian Fainelli, Vladimir Oltean,
	David S. Miller, Jakub Kicinski, Russell King, netdev,
	linux-kernel
  Cc: Ansuel Smith

Pack qca8k priv and other struct using pahole and set the first priv
struct entry to mgmt_master and mgmt_eth_data to speedup access.
While at it also rework pcs struct and move it qca8k_ports_config
following other configuration set for the cpu ports.

Signed-off-by: Ansuel Smith <ansuelsmth@gmail.com>
---
 drivers/net/dsa/qca8k.c |  8 ++++----
 drivers/net/dsa/qca8k.h | 33 ++++++++++++++++-----------------
 2 files changed, 20 insertions(+), 21 deletions(-)

diff --git a/drivers/net/dsa/qca8k.c b/drivers/net/dsa/qca8k.c
index ee0dbf324268..8d059da5f0ca 100644
--- a/drivers/net/dsa/qca8k.c
+++ b/drivers/net/dsa/qca8k.c
@@ -1685,11 +1685,11 @@ qca8k_phylink_mac_select_pcs(struct dsa_switch *ds, int port,
 	case PHY_INTERFACE_MODE_1000BASEX:
 		switch (port) {
 		case 0:
-			pcs = &priv->pcs_port_0.pcs;
+			pcs = &priv->ports_config.qpcs[QCA8K_CPU_PORT0].pcs;
 			break;
 
 		case 6:
-			pcs = &priv->pcs_port_6.pcs;
+			pcs = &priv->ports_config.qpcs[QCA8K_CPU_PORT6].pcs;
 			break;
 		}
 		break;
@@ -2889,8 +2889,8 @@ qca8k_setup(struct dsa_switch *ds)
 	if (ret)
 		return ret;
 
-	qca8k_setup_pcs(priv, &priv->pcs_port_0, 0);
-	qca8k_setup_pcs(priv, &priv->pcs_port_6, 6);
+	qca8k_setup_pcs(priv, &priv->ports_config.qpcs[QCA8K_CPU_PORT0], 0);
+	qca8k_setup_pcs(priv, &priv->ports_config.qpcs[QCA8K_CPU_PORT6], 6);
 
 	/* Make sure MAC06 is disabled */
 	ret = regmap_clear_bits(priv->regmap, QCA8K_REG_PORT0_PAD_CTRL,
diff --git a/drivers/net/dsa/qca8k.h b/drivers/net/dsa/qca8k.h
index f375627174c8..611dc2335dbe 100644
--- a/drivers/net/dsa/qca8k.h
+++ b/drivers/net/dsa/qca8k.h
@@ -341,18 +341,24 @@ enum {
 
 struct qca8k_mgmt_eth_data {
 	struct completion rw_done;
-	struct mutex mutex; /* Enforce one mdio read/write at time */
+	u32 data[4];
 	bool ack;
 	u32 seq;
-	u32 data[4];
+	struct mutex mutex; /* Enforce one mdio read/write at time */
 };
 
 struct qca8k_mib_eth_data {
 	struct completion rw_done;
+	u64 *data; /* pointer to ethtool data */
+	u8 req_port;
 	struct mutex mutex; /* Process one command at time */
 	refcount_t port_parsed; /* Counter to track parsed port */
-	u8 req_port;
-	u64 *data; /* pointer to ethtool data */
+};
+
+struct qca8k_pcs {
+	struct phylink_pcs pcs;
+	struct qca8k_priv *priv;
+	int port;
 };
 
 struct qca8k_ports_config {
@@ -361,6 +367,7 @@ struct qca8k_ports_config {
 	bool sgmii_enable_pll;
 	u8 rgmii_rx_delay[QCA8K_NUM_CPU_PORTS]; /* 0: CPU port0, 1: CPU port6 */
 	u8 rgmii_tx_delay[QCA8K_NUM_CPU_PORTS]; /* 0: CPU port0, 1: CPU port6 */
+	struct qca8k_pcs qpcs[QCA8K_NUM_CPU_PORTS];
 };
 
 struct qca8k_mdio_cache {
@@ -376,13 +383,10 @@ struct qca8k_mdio_cache {
 	u16 hi;
 };
 
-struct qca8k_pcs {
-	struct phylink_pcs pcs;
-	struct qca8k_priv *priv;
-	int port;
-};
-
 struct qca8k_priv {
+	struct net_device *mgmt_master; /* Track if mdio/mib Ethernet is available */
+	struct qca8k_mgmt_eth_data mgmt_eth_data;
+	struct qca8k_mdio_cache mdio_cache;
 	u8 switch_id;
 	u8 switch_revision;
 	u8 mirror_rx;
@@ -396,15 +400,10 @@ struct qca8k_priv {
 	struct dsa_switch *ds;
 	struct mutex reg_mutex;
 	struct device *dev;
-	struct dsa_switch_ops ops;
 	struct gpio_desc *reset_gpio;
-	unsigned int port_mtu[QCA8K_NUM_PORTS];
-	struct net_device *mgmt_master; /* Track if mdio/mib Ethernet is available */
-	struct qca8k_mgmt_eth_data mgmt_eth_data;
+	struct dsa_switch_ops ops;
 	struct qca8k_mib_eth_data mib_eth_data;
-	struct qca8k_mdio_cache mdio_cache;
-	struct qca8k_pcs pcs_port_0;
-	struct qca8k_pcs pcs_port_6;
+	unsigned int port_mtu[QCA8K_NUM_PORTS];
 };
 
 struct qca8k_mib_desc {
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [net-next PATCH v2] net: dsa: qca8k: pack driver struct and improve cache use
  2022-02-28 11:04 [net-next PATCH v2] net: dsa: qca8k: pack driver struct and improve cache use Ansuel Smith
@ 2022-03-03  1:53 ` Vladimir Oltean
  2022-03-03  2:25   ` Andrew Lunn
  2022-03-21 16:07   ` Ansuel Smith
  0 siblings, 2 replies; 10+ messages in thread
From: Vladimir Oltean @ 2022-03-03  1:53 UTC (permalink / raw)
  To: Ansuel Smith
  Cc: Andrew Lunn, Vivien Didelot, Florian Fainelli, David S. Miller,
	Jakub Kicinski, Russell King, netdev, linux-kernel

On Mon, Feb 28, 2022 at 12:04:08PM +0100, Ansuel Smith wrote:
> Pack qca8k priv and other struct using pahole and set the first priv
> struct entry to mgmt_master and mgmt_eth_data to speedup access.
> While at it also rework pcs struct and move it qca8k_ports_config
> following other configuration set for the cpu ports.
> 
> Signed-off-by: Ansuel Smith <ansuelsmth@gmail.com>
> ---

How did you "pack" struct qca8k_priv exactly?

Before:

struct qca8k_priv {
        u8                         switch_id;            /*     0     1 */
        u8                         switch_revision;      /*     1     1 */
        u8                         mirror_rx;            /*     2     1 */
        u8                         mirror_tx;            /*     3     1 */
        u8                         lag_hash_mode;        /*     4     1 */
        bool                       legacy_phy_port_mapping; /*     5     1 */
        struct qca8k_ports_config  ports_config;         /*     6     7 */

        /* XXX 3 bytes hole, try to pack */

        struct regmap *            regmap;               /*    16     8 */
        struct mii_bus *           bus;                  /*    24     8 */
        struct ar8xxx_port_status  port_sts[7];          /*    32    28 */

        /* XXX 4 bytes hole, try to pack */

        /* --- cacheline 1 boundary (64 bytes) --- */
        struct dsa_switch *        ds;                   /*    64     8 */
        struct mutex               reg_mutex;            /*    72   160 */
        /* --- cacheline 3 boundary (192 bytes) was 40 bytes ago --- */
        struct device *            dev;                  /*   232     8 */
        struct dsa_switch_ops      ops;                  /*   240   864 */
        /* --- cacheline 17 boundary (1088 bytes) was 16 bytes ago --- */
        struct gpio_desc *         reset_gpio;           /*  1104     8 */
        unsigned int               port_mtu[7];          /*  1112    28 */

        /* XXX 4 bytes hole, try to pack */

        struct net_device *        mgmt_master;          /*  1144     8 */
        /* --- cacheline 18 boundary (1152 bytes) --- */
        struct qca8k_mgmt_eth_data mgmt_eth_data;        /*  1152   280 */
        /* --- cacheline 22 boundary (1408 bytes) was 24 bytes ago --- */
        struct qca8k_mib_eth_data  mib_eth_data;         /*  1432   272 */
        /* --- cacheline 26 boundary (1664 bytes) was 40 bytes ago --- */
        struct qca8k_mdio_cache    mdio_cache;           /*  1704     6 */

        /* XXX 2 bytes hole, try to pack */

        struct qca8k_pcs           pcs_port_0;           /*  1712    32 */

        /* XXX last struct has 4 bytes of padding */

        /* --- cacheline 27 boundary (1728 bytes) was 16 bytes ago --- */
        struct qca8k_pcs           pcs_port_6;           /*  1744    32 */

        /* XXX last struct has 4 bytes of padding */

        /* size: 1776, cachelines: 28, members: 22 */
        /* sum members: 1763, holes: 4, sum holes: 13 */
        /* paddings: 2, sum paddings: 8 */
        /* last cacheline: 48 bytes */
};

After:

struct qca8k_priv {
        struct net_device *        mgmt_master;          /*     0     8 */
        struct qca8k_mgmt_eth_data mgmt_eth_data;        /*     8   280 */
        /* --- cacheline 4 boundary (256 bytes) was 32 bytes ago --- */
        struct qca8k_mdio_cache    mdio_cache;           /*   288     6 */
        u8                         switch_id;            /*   294     1 */
        u8                         switch_revision;      /*   295     1 */
        u8                         mirror_rx;            /*   296     1 */
        u8                         mirror_tx;            /*   297     1 */
        u8                         lag_hash_mode;        /*   298     1 */
        bool                       legacy_phy_port_mapping; /*   299     1 */

        /* XXX 4 bytes hole, try to pack */

        struct qca8k_ports_config  ports_config;         /*   304    72 */
        /* --- cacheline 5 boundary (320 bytes) was 56 bytes ago --- */
        struct regmap *            regmap;               /*   376     8 */
        /* --- cacheline 6 boundary (384 bytes) --- */
        struct mii_bus *           bus;                  /*   384     8 */
        struct ar8xxx_port_status  port_sts[7];          /*   392    28 */

        /* XXX 4 bytes hole, try to pack */

        struct dsa_switch *        ds;                   /*   424     8 */
        struct mutex               reg_mutex;            /*   432   160 */
        /* --- cacheline 9 boundary (576 bytes) was 16 bytes ago --- */
        struct device *            dev;                  /*   592     8 */
        struct gpio_desc *         reset_gpio;           /*   600     8 */
        struct dsa_switch_ops      ops;                  /*   608   864 */
        /* --- cacheline 23 boundary (1472 bytes) --- */
        struct qca8k_mib_eth_data  mib_eth_data;         /*  1472   280 */

        /* XXX last struct has 4 bytes of padding */

        /* --- cacheline 27 boundary (1728 bytes) was 24 bytes ago --- */
        unsigned int               port_mtu[7];          /*  1752    28 */

        /* size: 1784, cachelines: 28, members: 20 */
        /* sum members: 1772, holes: 2, sum holes: 8 */
        /* padding: 4 */
        /* paddings: 1, sum paddings: 4 */
        /* last cacheline: 56 bytes */
};

1776 vs 1784. That's... larger?!

Also, struct qca8k_priv is so large because the "ops" member is a full
copy of qca8k_switch_ops. I understand why commit db460c54b67f ("net:
dsa: qca8k: extend slave-bus implementations") did this, but I wonder,
is there no better way?

>  drivers/net/dsa/qca8k.c |  8 ++++----
>  drivers/net/dsa/qca8k.h | 33 ++++++++++++++++-----------------
>  2 files changed, 20 insertions(+), 21 deletions(-)
> 
> diff --git a/drivers/net/dsa/qca8k.c b/drivers/net/dsa/qca8k.c
> index ee0dbf324268..8d059da5f0ca 100644
> --- a/drivers/net/dsa/qca8k.c
> +++ b/drivers/net/dsa/qca8k.c
> @@ -1685,11 +1685,11 @@ qca8k_phylink_mac_select_pcs(struct dsa_switch *ds, int port,
>  	case PHY_INTERFACE_MODE_1000BASEX:
>  		switch (port) {
>  		case 0:
> -			pcs = &priv->pcs_port_0.pcs;
> +			pcs = &priv->ports_config.qpcs[QCA8K_CPU_PORT0].pcs;
>  			break;
>  
>  		case 6:
> -			pcs = &priv->pcs_port_6.pcs;
> +			pcs = &priv->ports_config.qpcs[QCA8K_CPU_PORT6].pcs;
>  			break;
>  		}
>  		break;
> @@ -2889,8 +2889,8 @@ qca8k_setup(struct dsa_switch *ds)
>  	if (ret)
>  		return ret;
>  
> -	qca8k_setup_pcs(priv, &priv->pcs_port_0, 0);
> -	qca8k_setup_pcs(priv, &priv->pcs_port_6, 6);
> +	qca8k_setup_pcs(priv, &priv->ports_config.qpcs[QCA8K_CPU_PORT0], 0);
> +	qca8k_setup_pcs(priv, &priv->ports_config.qpcs[QCA8K_CPU_PORT6], 6);
>  
>  	/* Make sure MAC06 is disabled */
>  	ret = regmap_clear_bits(priv->regmap, QCA8K_REG_PORT0_PAD_CTRL,
> diff --git a/drivers/net/dsa/qca8k.h b/drivers/net/dsa/qca8k.h
> index f375627174c8..611dc2335dbe 100644
> --- a/drivers/net/dsa/qca8k.h
> +++ b/drivers/net/dsa/qca8k.h
> @@ -341,18 +341,24 @@ enum {
>  
>  struct qca8k_mgmt_eth_data {
>  	struct completion rw_done;
> -	struct mutex mutex; /* Enforce one mdio read/write at time */
> +	u32 data[4];
>  	bool ack;
>  	u32 seq;
> -	u32 data[4];
> +	struct mutex mutex; /* Enforce one mdio read/write at time */
>  };
>  
>  struct qca8k_mib_eth_data {
>  	struct completion rw_done;
> +	u64 *data; /* pointer to ethtool data */
> +	u8 req_port;
>  	struct mutex mutex; /* Process one command at time */
>  	refcount_t port_parsed; /* Counter to track parsed port */
> -	u8 req_port;
> -	u64 *data; /* pointer to ethtool data */
> +};
> +
> +struct qca8k_pcs {
> +	struct phylink_pcs pcs;
> +	struct qca8k_priv *priv;
> +	int port;
>  };
>  
>  struct qca8k_ports_config {
> @@ -361,6 +367,7 @@ struct qca8k_ports_config {
>  	bool sgmii_enable_pll;
>  	u8 rgmii_rx_delay[QCA8K_NUM_CPU_PORTS]; /* 0: CPU port0, 1: CPU port6 */
>  	u8 rgmii_tx_delay[QCA8K_NUM_CPU_PORTS]; /* 0: CPU port0, 1: CPU port6 */
> +	struct qca8k_pcs qpcs[QCA8K_NUM_CPU_PORTS];
>  };
>  
>  struct qca8k_mdio_cache {
> @@ -376,13 +383,10 @@ struct qca8k_mdio_cache {
>  	u16 hi;
>  };
>  
> -struct qca8k_pcs {
> -	struct phylink_pcs pcs;
> -	struct qca8k_priv *priv;
> -	int port;
> -};
> -
>  struct qca8k_priv {
> +	struct net_device *mgmt_master; /* Track if mdio/mib Ethernet is available */
> +	struct qca8k_mgmt_eth_data mgmt_eth_data;
> +	struct qca8k_mdio_cache mdio_cache;
>  	u8 switch_id;
>  	u8 switch_revision;
>  	u8 mirror_rx;
> @@ -396,15 +400,10 @@ struct qca8k_priv {
>  	struct dsa_switch *ds;
>  	struct mutex reg_mutex;
>  	struct device *dev;
> -	struct dsa_switch_ops ops;
>  	struct gpio_desc *reset_gpio;
> -	unsigned int port_mtu[QCA8K_NUM_PORTS];
> -	struct net_device *mgmt_master; /* Track if mdio/mib Ethernet is available */
> -	struct qca8k_mgmt_eth_data mgmt_eth_data;
> +	struct dsa_switch_ops ops;
>  	struct qca8k_mib_eth_data mib_eth_data;
> -	struct qca8k_mdio_cache mdio_cache;
> -	struct qca8k_pcs pcs_port_0;
> -	struct qca8k_pcs pcs_port_6;
> +	unsigned int port_mtu[QCA8K_NUM_PORTS];
>  };
>  
>  struct qca8k_mib_desc {
> -- 
> 2.34.1
> 


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [net-next PATCH v2] net: dsa: qca8k: pack driver struct and improve cache use
  2022-03-03  1:53 ` Vladimir Oltean
@ 2022-03-03  2:25   ` Andrew Lunn
  2022-03-21 16:07   ` Ansuel Smith
  1 sibling, 0 replies; 10+ messages in thread
From: Andrew Lunn @ 2022-03-03  2:25 UTC (permalink / raw)
  To: Ansuel Smith
  Cc: Vladimir Oltean, Vivien Didelot, Florian Fainelli,
	David S. Miller, Jakub Kicinski, Russell King, netdev,
	linux-kernel

On Thu, Mar 03, 2022 at 03:53:27AM +0200, Vladimir Oltean wrote:
> On Mon, Feb 28, 2022 at 12:04:08PM +0100, Ansuel Smith wrote:
> > Pack qca8k priv and other struct using pahole and set the first priv
> > struct entry to mgmt_master and mgmt_eth_data to speedup access.
> > While at it also rework pcs struct and move it qca8k_ports_config
> > following other configuration set for the cpu ports.
> > 
> > Signed-off-by: Ansuel Smith <ansuelsmth@gmail.com>
> > ---
> 
> How did you "pack" struct qca8k_priv exactly?

As far as i can see, nothing in qca8k_priv is on the hot path. So i
doubt anything changed here will make a difference. The MDIO bus is
the bottleneck, not the memory interface.

    Andrew

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [net-next PATCH v2] net: dsa: qca8k: pack driver struct and improve cache use
  2022-03-03  1:53 ` Vladimir Oltean
  2022-03-03  2:25   ` Andrew Lunn
@ 2022-03-21 16:07   ` Ansuel Smith
  2022-03-21 17:22     ` Vladimir Oltean
  1 sibling, 1 reply; 10+ messages in thread
From: Ansuel Smith @ 2022-03-21 16:07 UTC (permalink / raw)
  To: Vladimir Oltean
  Cc: Andrew Lunn, Vivien Didelot, Florian Fainelli, David S. Miller,
	Jakub Kicinski, Russell King, netdev, linux-kernel

On Thu, Mar 03, 2022 at 03:53:27AM +0200, Vladimir Oltean wrote:
> On Mon, Feb 28, 2022 at 12:04:08PM +0100, Ansuel Smith wrote:
> > Pack qca8k priv and other struct using pahole and set the first priv
> > struct entry to mgmt_master and mgmt_eth_data to speedup access.
> > While at it also rework pcs struct and move it qca8k_ports_config
> > following other configuration set for the cpu ports.
> > 
> > Signed-off-by: Ansuel Smith <ansuelsmth@gmail.com>
> > ---
> 
> How did you "pack" struct qca8k_priv exactly?
>

I'm trying to understand it too... also this was done basend on what
target? 

> Before:
> 
> struct qca8k_priv {
>         u8                         switch_id;            /*     0     1 */
>         u8                         switch_revision;      /*     1     1 */
>         u8                         mirror_rx;            /*     2     1 */
>         u8                         mirror_tx;            /*     3     1 */
>         u8                         lag_hash_mode;        /*     4     1 */
>         bool                       legacy_phy_port_mapping; /*     5     1 */
>         struct qca8k_ports_config  ports_config;         /*     6     7 */
> 
>         /* XXX 3 bytes hole, try to pack */
> 
>         struct regmap *            regmap;               /*    16     8 */
>         struct mii_bus *           bus;                  /*    24     8 */
>         struct ar8xxx_port_status  port_sts[7];          /*    32    28 */
> 
>         /* XXX 4 bytes hole, try to pack */
> 
>         /* --- cacheline 1 boundary (64 bytes) --- */
>         struct dsa_switch *        ds;                   /*    64     8 */
>         struct mutex               reg_mutex;            /*    72   160 */
>         /* --- cacheline 3 boundary (192 bytes) was 40 bytes ago --- */
>         struct device *            dev;                  /*   232     8 */
>         struct dsa_switch_ops      ops;                  /*   240   864 */
>         /* --- cacheline 17 boundary (1088 bytes) was 16 bytes ago --- */
>         struct gpio_desc *         reset_gpio;           /*  1104     8 */
>         unsigned int               port_mtu[7];          /*  1112    28 */
> 
>         /* XXX 4 bytes hole, try to pack */
> 
>         struct net_device *        mgmt_master;          /*  1144     8 */
>         /* --- cacheline 18 boundary (1152 bytes) --- */
>         struct qca8k_mgmt_eth_data mgmt_eth_data;        /*  1152   280 */
>         /* --- cacheline 22 boundary (1408 bytes) was 24 bytes ago --- */
>         struct qca8k_mib_eth_data  mib_eth_data;         /*  1432   272 */
>         /* --- cacheline 26 boundary (1664 bytes) was 40 bytes ago --- */
>         struct qca8k_mdio_cache    mdio_cache;           /*  1704     6 */
> 
>         /* XXX 2 bytes hole, try to pack */
> 
>         struct qca8k_pcs           pcs_port_0;           /*  1712    32 */
> 
>         /* XXX last struct has 4 bytes of padding */
> 
>         /* --- cacheline 27 boundary (1728 bytes) was 16 bytes ago --- */
>         struct qca8k_pcs           pcs_port_6;           /*  1744    32 */
> 
>         /* XXX last struct has 4 bytes of padding */
> 
>         /* size: 1776, cachelines: 28, members: 22 */
>         /* sum members: 1763, holes: 4, sum holes: 13 */
>         /* paddings: 2, sum paddings: 8 */
>         /* last cacheline: 48 bytes */
> };
> 
> After:
> 
> struct qca8k_priv {
>         struct net_device *        mgmt_master;          /*     0     8 */
>         struct qca8k_mgmt_eth_data mgmt_eth_data;        /*     8   280 */
>         /* --- cacheline 4 boundary (256 bytes) was 32 bytes ago --- */
>         struct qca8k_mdio_cache    mdio_cache;           /*   288     6 */
>         u8                         switch_id;            /*   294     1 */
>         u8                         switch_revision;      /*   295     1 */
>         u8                         mirror_rx;            /*   296     1 */
>         u8                         mirror_tx;            /*   297     1 */
>         u8                         lag_hash_mode;        /*   298     1 */
>         bool                       legacy_phy_port_mapping; /*   299     1 */
> 
>         /* XXX 4 bytes hole, try to pack */
> 
>         struct qca8k_ports_config  ports_config;         /*   304    72 */
>         /* --- cacheline 5 boundary (320 bytes) was 56 bytes ago --- */
>         struct regmap *            regmap;               /*   376     8 */
>         /* --- cacheline 6 boundary (384 bytes) --- */
>         struct mii_bus *           bus;                  /*   384     8 */
>         struct ar8xxx_port_status  port_sts[7];          /*   392    28 */
> 
>         /* XXX 4 bytes hole, try to pack */
> 
>         struct dsa_switch *        ds;                   /*   424     8 */
>         struct mutex               reg_mutex;            /*   432   160 */
>         /* --- cacheline 9 boundary (576 bytes) was 16 bytes ago --- */
>         struct device *            dev;                  /*   592     8 */
>         struct gpio_desc *         reset_gpio;           /*   600     8 */
>         struct dsa_switch_ops      ops;                  /*   608   864 */
>         /* --- cacheline 23 boundary (1472 bytes) --- */
>         struct qca8k_mib_eth_data  mib_eth_data;         /*  1472   280 */
> 
>         /* XXX last struct has 4 bytes of padding */
> 
>         /* --- cacheline 27 boundary (1728 bytes) was 24 bytes ago --- */
>         unsigned int               port_mtu[7];          /*  1752    28 */
> 
>         /* size: 1784, cachelines: 28, members: 20 */
>         /* sum members: 1772, holes: 2, sum holes: 8 */
>         /* padding: 4 */
>         /* paddings: 1, sum paddings: 4 */
>         /* last cacheline: 56 bytes */
> };
> 
> 1776 vs 1784. That's... larger?!
> 
> Also, struct qca8k_priv is so large because the "ops" member is a full
> copy of qca8k_switch_ops. I understand why commit db460c54b67f ("net:
> dsa: qca8k: extend slave-bus implementations") did this, but I wonder,
> is there no better way?
> 

Actually from what I can see the struct can be improved in 2 way...
The ancient struct
ar8xxx_port_status port_sts[QCA8K_NUM_PORTS];
can be totally dropped as I can't see why we still need it.

The duplicated ops is funny. I could be very confused but why it's done
like that? Can't we modify directly the already defined one and drop
struct dsa_switch_ops ops; from qca8k_priv?

I mean is all of that to use priv->ops.phy_read instead of
priv->ds->ops.phy_read or even qca8k_switch_ops directly? o.o

Am I missing something?

> >  drivers/net/dsa/qca8k.c |  8 ++++----
> >  drivers/net/dsa/qca8k.h | 33 ++++++++++++++++-----------------
> >  2 files changed, 20 insertions(+), 21 deletions(-)
> > 
> > diff --git a/drivers/net/dsa/qca8k.c b/drivers/net/dsa/qca8k.c
> > index ee0dbf324268..8d059da5f0ca 100644
> > --- a/drivers/net/dsa/qca8k.c
> > +++ b/drivers/net/dsa/qca8k.c
> > @@ -1685,11 +1685,11 @@ qca8k_phylink_mac_select_pcs(struct dsa_switch *ds, int port,
> >  	case PHY_INTERFACE_MODE_1000BASEX:
> >  		switch (port) {
> >  		case 0:
> > -			pcs = &priv->pcs_port_0.pcs;
> > +			pcs = &priv->ports_config.qpcs[QCA8K_CPU_PORT0].pcs;
> >  			break;
> >  
> >  		case 6:
> > -			pcs = &priv->pcs_port_6.pcs;
> > +			pcs = &priv->ports_config.qpcs[QCA8K_CPU_PORT6].pcs;
> >  			break;
> >  		}
> >  		break;
> > @@ -2889,8 +2889,8 @@ qca8k_setup(struct dsa_switch *ds)
> >  	if (ret)
> >  		return ret;
> >  
> > -	qca8k_setup_pcs(priv, &priv->pcs_port_0, 0);
> > -	qca8k_setup_pcs(priv, &priv->pcs_port_6, 6);
> > +	qca8k_setup_pcs(priv, &priv->ports_config.qpcs[QCA8K_CPU_PORT0], 0);
> > +	qca8k_setup_pcs(priv, &priv->ports_config.qpcs[QCA8K_CPU_PORT6], 6);
> >  
> >  	/* Make sure MAC06 is disabled */
> >  	ret = regmap_clear_bits(priv->regmap, QCA8K_REG_PORT0_PAD_CTRL,
> > diff --git a/drivers/net/dsa/qca8k.h b/drivers/net/dsa/qca8k.h
> > index f375627174c8..611dc2335dbe 100644
> > --- a/drivers/net/dsa/qca8k.h
> > +++ b/drivers/net/dsa/qca8k.h
> > @@ -341,18 +341,24 @@ enum {
> >  
> >  struct qca8k_mgmt_eth_data {
> >  	struct completion rw_done;
> > -	struct mutex mutex; /* Enforce one mdio read/write at time */
> > +	u32 data[4];
> >  	bool ack;
> >  	u32 seq;
> > -	u32 data[4];
> > +	struct mutex mutex; /* Enforce one mdio read/write at time */
> >  };
> >  
> >  struct qca8k_mib_eth_data {
> >  	struct completion rw_done;
> > +	u64 *data; /* pointer to ethtool data */
> > +	u8 req_port;
> >  	struct mutex mutex; /* Process one command at time */
> >  	refcount_t port_parsed; /* Counter to track parsed port */
> > -	u8 req_port;
> > -	u64 *data; /* pointer to ethtool data */
> > +};
> > +
> > +struct qca8k_pcs {
> > +	struct phylink_pcs pcs;
> > +	struct qca8k_priv *priv;
> > +	int port;
> >  };
> >  
> >  struct qca8k_ports_config {
> > @@ -361,6 +367,7 @@ struct qca8k_ports_config {
> >  	bool sgmii_enable_pll;
> >  	u8 rgmii_rx_delay[QCA8K_NUM_CPU_PORTS]; /* 0: CPU port0, 1: CPU port6 */
> >  	u8 rgmii_tx_delay[QCA8K_NUM_CPU_PORTS]; /* 0: CPU port0, 1: CPU port6 */
> > +	struct qca8k_pcs qpcs[QCA8K_NUM_CPU_PORTS];
> >  };
> >  
> >  struct qca8k_mdio_cache {
> > @@ -376,13 +383,10 @@ struct qca8k_mdio_cache {
> >  	u16 hi;
> >  };
> >  
> > -struct qca8k_pcs {
> > -	struct phylink_pcs pcs;
> > -	struct qca8k_priv *priv;
> > -	int port;
> > -};
> > -
> >  struct qca8k_priv {
> > +	struct net_device *mgmt_master; /* Track if mdio/mib Ethernet is available */
> > +	struct qca8k_mgmt_eth_data mgmt_eth_data;
> > +	struct qca8k_mdio_cache mdio_cache;
> >  	u8 switch_id;
> >  	u8 switch_revision;
> >  	u8 mirror_rx;
> > @@ -396,15 +400,10 @@ struct qca8k_priv {
> >  	struct dsa_switch *ds;
> >  	struct mutex reg_mutex;
> >  	struct device *dev;
> > -	struct dsa_switch_ops ops;
> >  	struct gpio_desc *reset_gpio;
> > -	unsigned int port_mtu[QCA8K_NUM_PORTS];
> > -	struct net_device *mgmt_master; /* Track if mdio/mib Ethernet is available */
> > -	struct qca8k_mgmt_eth_data mgmt_eth_data;
> > +	struct dsa_switch_ops ops;
> >  	struct qca8k_mib_eth_data mib_eth_data;
> > -	struct qca8k_mdio_cache mdio_cache;
> > -	struct qca8k_pcs pcs_port_0;
> > -	struct qca8k_pcs pcs_port_6;
> > +	unsigned int port_mtu[QCA8K_NUM_PORTS];
> >  };
> >  
> >  struct qca8k_mib_desc {
> > -- 
> > 2.34.1
> > 
> 

-- 
	Ansuel

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [net-next PATCH v2] net: dsa: qca8k: pack driver struct and improve cache use
  2022-03-21 16:07   ` Ansuel Smith
@ 2022-03-21 17:22     ` Vladimir Oltean
  2022-03-21 17:26       ` Ansuel Smith
  0 siblings, 1 reply; 10+ messages in thread
From: Vladimir Oltean @ 2022-03-21 17:22 UTC (permalink / raw)
  To: Ansuel Smith
  Cc: Andrew Lunn, Vivien Didelot, Florian Fainelli, David S. Miller,
	Jakub Kicinski, Russell King, netdev, linux-kernel

On Mon, Mar 21, 2022 at 05:07:55PM +0100, Ansuel Smith wrote:
> On Thu, Mar 03, 2022 at 03:53:27AM +0200, Vladimir Oltean wrote:
> > On Mon, Feb 28, 2022 at 12:04:08PM +0100, Ansuel Smith wrote:
> > > Pack qca8k priv and other struct using pahole and set the first priv
> > > struct entry to mgmt_master and mgmt_eth_data to speedup access.
> > > While at it also rework pcs struct and move it qca8k_ports_config
> > > following other configuration set for the cpu ports.
> > > 
> > > Signed-off-by: Ansuel Smith <ansuelsmth@gmail.com>
> > > ---
> > 
> > How did you "pack" struct qca8k_priv exactly?
> >
> 
> I'm trying to understand it too... also this was done basend on what
> target? 

I used an arm64 toolchain, if that's what you're asking.

> > Before:
> > 
> > struct qca8k_priv {
> >         u8                         switch_id;            /*     0     1 */
> >         u8                         switch_revision;      /*     1     1 */
> >         u8                         mirror_rx;            /*     2     1 */
> >         u8                         mirror_tx;            /*     3     1 */
> >         u8                         lag_hash_mode;        /*     4     1 */
> >         bool                       legacy_phy_port_mapping; /*     5     1 */
> >         struct qca8k_ports_config  ports_config;         /*     6     7 */
> > 
> >         /* XXX 3 bytes hole, try to pack */
> > 
> >         struct regmap *            regmap;               /*    16     8 */
> >         struct mii_bus *           bus;                  /*    24     8 */
> >         struct ar8xxx_port_status  port_sts[7];          /*    32    28 */
> > 
> >         /* XXX 4 bytes hole, try to pack */
> > 
> >         /* --- cacheline 1 boundary (64 bytes) --- */
> >         struct dsa_switch *        ds;                   /*    64     8 */
> >         struct mutex               reg_mutex;            /*    72   160 */
> >         /* --- cacheline 3 boundary (192 bytes) was 40 bytes ago --- */
> >         struct device *            dev;                  /*   232     8 */
> >         struct dsa_switch_ops      ops;                  /*   240   864 */
> >         /* --- cacheline 17 boundary (1088 bytes) was 16 bytes ago --- */
> >         struct gpio_desc *         reset_gpio;           /*  1104     8 */
> >         unsigned int               port_mtu[7];          /*  1112    28 */
> > 
> >         /* XXX 4 bytes hole, try to pack */
> > 
> >         struct net_device *        mgmt_master;          /*  1144     8 */
> >         /* --- cacheline 18 boundary (1152 bytes) --- */
> >         struct qca8k_mgmt_eth_data mgmt_eth_data;        /*  1152   280 */
> >         /* --- cacheline 22 boundary (1408 bytes) was 24 bytes ago --- */
> >         struct qca8k_mib_eth_data  mib_eth_data;         /*  1432   272 */
> >         /* --- cacheline 26 boundary (1664 bytes) was 40 bytes ago --- */
> >         struct qca8k_mdio_cache    mdio_cache;           /*  1704     6 */
> > 
> >         /* XXX 2 bytes hole, try to pack */
> > 
> >         struct qca8k_pcs           pcs_port_0;           /*  1712    32 */
> > 
> >         /* XXX last struct has 4 bytes of padding */
> > 
> >         /* --- cacheline 27 boundary (1728 bytes) was 16 bytes ago --- */
> >         struct qca8k_pcs           pcs_port_6;           /*  1744    32 */
> > 
> >         /* XXX last struct has 4 bytes of padding */
> > 
> >         /* size: 1776, cachelines: 28, members: 22 */
> >         /* sum members: 1763, holes: 4, sum holes: 13 */
> >         /* paddings: 2, sum paddings: 8 */
> >         /* last cacheline: 48 bytes */
> > };
> > 
> > After:
> > 
> > struct qca8k_priv {
> >         struct net_device *        mgmt_master;          /*     0     8 */
> >         struct qca8k_mgmt_eth_data mgmt_eth_data;        /*     8   280 */
> >         /* --- cacheline 4 boundary (256 bytes) was 32 bytes ago --- */
> >         struct qca8k_mdio_cache    mdio_cache;           /*   288     6 */
> >         u8                         switch_id;            /*   294     1 */
> >         u8                         switch_revision;      /*   295     1 */
> >         u8                         mirror_rx;            /*   296     1 */
> >         u8                         mirror_tx;            /*   297     1 */
> >         u8                         lag_hash_mode;        /*   298     1 */
> >         bool                       legacy_phy_port_mapping; /*   299     1 */
> > 
> >         /* XXX 4 bytes hole, try to pack */
> > 
> >         struct qca8k_ports_config  ports_config;         /*   304    72 */
> >         /* --- cacheline 5 boundary (320 bytes) was 56 bytes ago --- */
> >         struct regmap *            regmap;               /*   376     8 */
> >         /* --- cacheline 6 boundary (384 bytes) --- */
> >         struct mii_bus *           bus;                  /*   384     8 */
> >         struct ar8xxx_port_status  port_sts[7];          /*   392    28 */
> > 
> >         /* XXX 4 bytes hole, try to pack */
> > 
> >         struct dsa_switch *        ds;                   /*   424     8 */
> >         struct mutex               reg_mutex;            /*   432   160 */
> >         /* --- cacheline 9 boundary (576 bytes) was 16 bytes ago --- */
> >         struct device *            dev;                  /*   592     8 */
> >         struct gpio_desc *         reset_gpio;           /*   600     8 */
> >         struct dsa_switch_ops      ops;                  /*   608   864 */
> >         /* --- cacheline 23 boundary (1472 bytes) --- */
> >         struct qca8k_mib_eth_data  mib_eth_data;         /*  1472   280 */
> > 
> >         /* XXX last struct has 4 bytes of padding */
> > 
> >         /* --- cacheline 27 boundary (1728 bytes) was 24 bytes ago --- */
> >         unsigned int               port_mtu[7];          /*  1752    28 */
> > 
> >         /* size: 1784, cachelines: 28, members: 20 */
> >         /* sum members: 1772, holes: 2, sum holes: 8 */
> >         /* padding: 4 */
> >         /* paddings: 1, sum paddings: 4 */
> >         /* last cacheline: 56 bytes */
> > };
> > 
> > 1776 vs 1784. That's... larger?!
> > 
> > Also, struct qca8k_priv is so large because the "ops" member is a full
> > copy of qca8k_switch_ops. I understand why commit db460c54b67f ("net:
> > dsa: qca8k: extend slave-bus implementations") did this, but I wonder,
> > is there no better way?
> > 
> 
> Actually from what I can see the struct can be improved in 2 way...
> The ancient struct
> ar8xxx_port_status port_sts[QCA8K_NUM_PORTS];
> can be totally dropped as I can't see why we still need it.
> 
> The duplicated ops is funny. I could be very confused but why it's done
> like that? Can't we modify directly the already defined one and drop
> struct dsa_switch_ops ops; from qca8k_priv?

Simply put, qca8k_switch_ops is per kernel, priv->ops is per driver
instance.

> I mean is all of that to use priv->ops.phy_read instead of
> priv->ds->ops.phy_read or even qca8k_switch_ops directly? o.o
> 
> Am I missing something?

The thing is that qca8k_setup_mdio_bus() wants DSA to use the
legacy_phy_port_mapping only as a fallback. DSA uses this simplistic
condition:

dsa_switch_setup():
	if (!ds->slave_mii_bus && ds->ops->phy_read) {
		ds->slave_mii_bus = mdiobus_alloc();

You might be tempted to say: hey, qca8k_mdio_register() populates
ds->slave_mii_bus to an MDIO bus allocated by itself. So you'd be able
to prune the mdiobus_alloc() because the first part of the condition is
false.

But dsa_switch_teardown() has:

	if (ds->slave_mii_bus && ds->ops->phy_read) {
		mdiobus_unregister(ds->slave_mii_bus);

In other words, dsa_switch_teardown() has lost track of who allocated
the MDIO bus - it assumes that the DSA core has. That will conflict with
qca8k which uses devres, so it would result in double free.

So that's basically what the problem is. The qca8k driver needs to have
a NULL ds->ops->phy_read if it doesn't use the legacy_phy_port_mapping,
so it won't confuse the DSA core.

Please note that I'm not really too familiar with this part of the DSA
core since I don't have any hardware that makes any use of ds->slave_mii_bus.

> > >  drivers/net/dsa/qca8k.c |  8 ++++----
> > >  drivers/net/dsa/qca8k.h | 33 ++++++++++++++++-----------------
> > >  2 files changed, 20 insertions(+), 21 deletions(-)
> > > 
> > > diff --git a/drivers/net/dsa/qca8k.c b/drivers/net/dsa/qca8k.c
> > > index ee0dbf324268..8d059da5f0ca 100644
> > > --- a/drivers/net/dsa/qca8k.c
> > > +++ b/drivers/net/dsa/qca8k.c
> > > @@ -1685,11 +1685,11 @@ qca8k_phylink_mac_select_pcs(struct dsa_switch *ds, int port,
> > >  	case PHY_INTERFACE_MODE_1000BASEX:
> > >  		switch (port) {
> > >  		case 0:
> > > -			pcs = &priv->pcs_port_0.pcs;
> > > +			pcs = &priv->ports_config.qpcs[QCA8K_CPU_PORT0].pcs;
> > >  			break;
> > >  
> > >  		case 6:
> > > -			pcs = &priv->pcs_port_6.pcs;
> > > +			pcs = &priv->ports_config.qpcs[QCA8K_CPU_PORT6].pcs;
> > >  			break;
> > >  		}
> > >  		break;
> > > @@ -2889,8 +2889,8 @@ qca8k_setup(struct dsa_switch *ds)
> > >  	if (ret)
> > >  		return ret;
> > >  
> > > -	qca8k_setup_pcs(priv, &priv->pcs_port_0, 0);
> > > -	qca8k_setup_pcs(priv, &priv->pcs_port_6, 6);
> > > +	qca8k_setup_pcs(priv, &priv->ports_config.qpcs[QCA8K_CPU_PORT0], 0);
> > > +	qca8k_setup_pcs(priv, &priv->ports_config.qpcs[QCA8K_CPU_PORT6], 6);
> > >  
> > >  	/* Make sure MAC06 is disabled */
> > >  	ret = regmap_clear_bits(priv->regmap, QCA8K_REG_PORT0_PAD_CTRL,
> > > diff --git a/drivers/net/dsa/qca8k.h b/drivers/net/dsa/qca8k.h
> > > index f375627174c8..611dc2335dbe 100644
> > > --- a/drivers/net/dsa/qca8k.h
> > > +++ b/drivers/net/dsa/qca8k.h
> > > @@ -341,18 +341,24 @@ enum {
> > >  
> > >  struct qca8k_mgmt_eth_data {
> > >  	struct completion rw_done;
> > > -	struct mutex mutex; /* Enforce one mdio read/write at time */
> > > +	u32 data[4];
> > >  	bool ack;
> > >  	u32 seq;
> > > -	u32 data[4];
> > > +	struct mutex mutex; /* Enforce one mdio read/write at time */
> > >  };
> > >  
> > >  struct qca8k_mib_eth_data {
> > >  	struct completion rw_done;
> > > +	u64 *data; /* pointer to ethtool data */
> > > +	u8 req_port;
> > >  	struct mutex mutex; /* Process one command at time */
> > >  	refcount_t port_parsed; /* Counter to track parsed port */
> > > -	u8 req_port;
> > > -	u64 *data; /* pointer to ethtool data */
> > > +};
> > > +
> > > +struct qca8k_pcs {
> > > +	struct phylink_pcs pcs;
> > > +	struct qca8k_priv *priv;
> > > +	int port;
> > >  };
> > >  
> > >  struct qca8k_ports_config {
> > > @@ -361,6 +367,7 @@ struct qca8k_ports_config {
> > >  	bool sgmii_enable_pll;
> > >  	u8 rgmii_rx_delay[QCA8K_NUM_CPU_PORTS]; /* 0: CPU port0, 1: CPU port6 */
> > >  	u8 rgmii_tx_delay[QCA8K_NUM_CPU_PORTS]; /* 0: CPU port0, 1: CPU port6 */
> > > +	struct qca8k_pcs qpcs[QCA8K_NUM_CPU_PORTS];
> > >  };
> > >  
> > >  struct qca8k_mdio_cache {
> > > @@ -376,13 +383,10 @@ struct qca8k_mdio_cache {
> > >  	u16 hi;
> > >  };
> > >  
> > > -struct qca8k_pcs {
> > > -	struct phylink_pcs pcs;
> > > -	struct qca8k_priv *priv;
> > > -	int port;
> > > -};
> > > -
> > >  struct qca8k_priv {
> > > +	struct net_device *mgmt_master; /* Track if mdio/mib Ethernet is available */
> > > +	struct qca8k_mgmt_eth_data mgmt_eth_data;
> > > +	struct qca8k_mdio_cache mdio_cache;
> > >  	u8 switch_id;
> > >  	u8 switch_revision;
> > >  	u8 mirror_rx;
> > > @@ -396,15 +400,10 @@ struct qca8k_priv {
> > >  	struct dsa_switch *ds;
> > >  	struct mutex reg_mutex;
> > >  	struct device *dev;
> > > -	struct dsa_switch_ops ops;
> > >  	struct gpio_desc *reset_gpio;
> > > -	unsigned int port_mtu[QCA8K_NUM_PORTS];
> > > -	struct net_device *mgmt_master; /* Track if mdio/mib Ethernet is available */
> > > -	struct qca8k_mgmt_eth_data mgmt_eth_data;
> > > +	struct dsa_switch_ops ops;
> > >  	struct qca8k_mib_eth_data mib_eth_data;
> > > -	struct qca8k_mdio_cache mdio_cache;
> > > -	struct qca8k_pcs pcs_port_0;
> > > -	struct qca8k_pcs pcs_port_6;
> > > +	unsigned int port_mtu[QCA8K_NUM_PORTS];
> > >  };
> > >  
> > >  struct qca8k_mib_desc {
> > > -- 
> > > 2.34.1
> > > 
> > 
> 
> -- 
> 	Ansuel

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [net-next PATCH v2] net: dsa: qca8k: pack driver struct and improve cache use
  2022-03-21 17:22     ` Vladimir Oltean
@ 2022-03-21 17:26       ` Ansuel Smith
  2022-03-21 17:31         ` Ansuel Smith
  0 siblings, 1 reply; 10+ messages in thread
From: Ansuel Smith @ 2022-03-21 17:26 UTC (permalink / raw)
  To: Vladimir Oltean
  Cc: Andrew Lunn, Vivien Didelot, Florian Fainelli, David S. Miller,
	Jakub Kicinski, Russell King, netdev, linux-kernel

On Mon, Mar 21, 2022 at 07:22:00PM +0200, Vladimir Oltean wrote:
> On Mon, Mar 21, 2022 at 05:07:55PM +0100, Ansuel Smith wrote:
> > On Thu, Mar 03, 2022 at 03:53:27AM +0200, Vladimir Oltean wrote:
> > > On Mon, Feb 28, 2022 at 12:04:08PM +0100, Ansuel Smith wrote:
> > > > Pack qca8k priv and other struct using pahole and set the first priv
> > > > struct entry to mgmt_master and mgmt_eth_data to speedup access.
> > > > While at it also rework pcs struct and move it qca8k_ports_config
> > > > following other configuration set for the cpu ports.
> > > > 
> > > > Signed-off-by: Ansuel Smith <ansuelsmth@gmail.com>
> > > > ---
> > > 
> > > How did you "pack" struct qca8k_priv exactly?
> > >
> > 
> > I'm trying to understand it too... also this was done basend on what
> > target? 
> 
> I used an arm64 toolchain, if that's what you're asking.
>

That could be the reason of different results. Qca8k is mostly used on
32bit systems with 4byte cache wonder. That could be the reason i have
different results and on a system like that it did suggest this order?

> > > Before:
> > > 
> > > struct qca8k_priv {
> > >         u8                         switch_id;            /*     0     1 */
> > >         u8                         switch_revision;      /*     1     1 */
> > >         u8                         mirror_rx;            /*     2     1 */
> > >         u8                         mirror_tx;            /*     3     1 */
> > >         u8                         lag_hash_mode;        /*     4     1 */
> > >         bool                       legacy_phy_port_mapping; /*     5     1 */
> > >         struct qca8k_ports_config  ports_config;         /*     6     7 */
> > > 
> > >         /* XXX 3 bytes hole, try to pack */
> > > 
> > >         struct regmap *            regmap;               /*    16     8 */
> > >         struct mii_bus *           bus;                  /*    24     8 */
> > >         struct ar8xxx_port_status  port_sts[7];          /*    32    28 */
> > > 
> > >         /* XXX 4 bytes hole, try to pack */
> > > 
> > >         /* --- cacheline 1 boundary (64 bytes) --- */
> > >         struct dsa_switch *        ds;                   /*    64     8 */
> > >         struct mutex               reg_mutex;            /*    72   160 */
> > >         /* --- cacheline 3 boundary (192 bytes) was 40 bytes ago --- */
> > >         struct device *            dev;                  /*   232     8 */
> > >         struct dsa_switch_ops      ops;                  /*   240   864 */
> > >         /* --- cacheline 17 boundary (1088 bytes) was 16 bytes ago --- */
> > >         struct gpio_desc *         reset_gpio;           /*  1104     8 */
> > >         unsigned int               port_mtu[7];          /*  1112    28 */
> > > 
> > >         /* XXX 4 bytes hole, try to pack */
> > > 
> > >         struct net_device *        mgmt_master;          /*  1144     8 */
> > >         /* --- cacheline 18 boundary (1152 bytes) --- */
> > >         struct qca8k_mgmt_eth_data mgmt_eth_data;        /*  1152   280 */
> > >         /* --- cacheline 22 boundary (1408 bytes) was 24 bytes ago --- */
> > >         struct qca8k_mib_eth_data  mib_eth_data;         /*  1432   272 */
> > >         /* --- cacheline 26 boundary (1664 bytes) was 40 bytes ago --- */
> > >         struct qca8k_mdio_cache    mdio_cache;           /*  1704     6 */
> > > 
> > >         /* XXX 2 bytes hole, try to pack */
> > > 
> > >         struct qca8k_pcs           pcs_port_0;           /*  1712    32 */
> > > 
> > >         /* XXX last struct has 4 bytes of padding */
> > > 
> > >         /* --- cacheline 27 boundary (1728 bytes) was 16 bytes ago --- */
> > >         struct qca8k_pcs           pcs_port_6;           /*  1744    32 */
> > > 
> > >         /* XXX last struct has 4 bytes of padding */
> > > 
> > >         /* size: 1776, cachelines: 28, members: 22 */
> > >         /* sum members: 1763, holes: 4, sum holes: 13 */
> > >         /* paddings: 2, sum paddings: 8 */
> > >         /* last cacheline: 48 bytes */
> > > };
> > > 
> > > After:
> > > 
> > > struct qca8k_priv {
> > >         struct net_device *        mgmt_master;          /*     0     8 */
> > >         struct qca8k_mgmt_eth_data mgmt_eth_data;        /*     8   280 */
> > >         /* --- cacheline 4 boundary (256 bytes) was 32 bytes ago --- */
> > >         struct qca8k_mdio_cache    mdio_cache;           /*   288     6 */
> > >         u8                         switch_id;            /*   294     1 */
> > >         u8                         switch_revision;      /*   295     1 */
> > >         u8                         mirror_rx;            /*   296     1 */
> > >         u8                         mirror_tx;            /*   297     1 */
> > >         u8                         lag_hash_mode;        /*   298     1 */
> > >         bool                       legacy_phy_port_mapping; /*   299     1 */
> > > 
> > >         /* XXX 4 bytes hole, try to pack */
> > > 
> > >         struct qca8k_ports_config  ports_config;         /*   304    72 */
> > >         /* --- cacheline 5 boundary (320 bytes) was 56 bytes ago --- */
> > >         struct regmap *            regmap;               /*   376     8 */
> > >         /* --- cacheline 6 boundary (384 bytes) --- */
> > >         struct mii_bus *           bus;                  /*   384     8 */
> > >         struct ar8xxx_port_status  port_sts[7];          /*   392    28 */
> > > 
> > >         /* XXX 4 bytes hole, try to pack */
> > > 
> > >         struct dsa_switch *        ds;                   /*   424     8 */
> > >         struct mutex               reg_mutex;            /*   432   160 */
> > >         /* --- cacheline 9 boundary (576 bytes) was 16 bytes ago --- */
> > >         struct device *            dev;                  /*   592     8 */
> > >         struct gpio_desc *         reset_gpio;           /*   600     8 */
> > >         struct dsa_switch_ops      ops;                  /*   608   864 */
> > >         /* --- cacheline 23 boundary (1472 bytes) --- */
> > >         struct qca8k_mib_eth_data  mib_eth_data;         /*  1472   280 */
> > > 
> > >         /* XXX last struct has 4 bytes of padding */
> > > 
> > >         /* --- cacheline 27 boundary (1728 bytes) was 24 bytes ago --- */
> > >         unsigned int               port_mtu[7];          /*  1752    28 */
> > > 
> > >         /* size: 1784, cachelines: 28, members: 20 */
> > >         /* sum members: 1772, holes: 2, sum holes: 8 */
> > >         /* padding: 4 */
> > >         /* paddings: 1, sum paddings: 4 */
> > >         /* last cacheline: 56 bytes */
> > > };
> > > 
> > > 1776 vs 1784. That's... larger?!
> > > 
> > > Also, struct qca8k_priv is so large because the "ops" member is a full
> > > copy of qca8k_switch_ops. I understand why commit db460c54b67f ("net:
> > > dsa: qca8k: extend slave-bus implementations") did this, but I wonder,
> > > is there no better way?
> > > 
> > 
> > Actually from what I can see the struct can be improved in 2 way...
> > The ancient struct
> > ar8xxx_port_status port_sts[QCA8K_NUM_PORTS];
> > can be totally dropped as I can't see why we still need it.
> > 
> > The duplicated ops is funny. I could be very confused but why it's done
> > like that? Can't we modify directly the already defined one and drop
> > struct dsa_switch_ops ops; from qca8k_priv?
> 
> Simply put, qca8k_switch_ops is per kernel, priv->ops is per driver
> instance.
> 

Right I totally missed that.

> > I mean is all of that to use priv->ops.phy_read instead of
> > priv->ds->ops.phy_read or even qca8k_switch_ops directly? o.o
> > 
> > Am I missing something?
> 
> The thing is that qca8k_setup_mdio_bus() wants DSA to use the
> legacy_phy_port_mapping only as a fallback. DSA uses this simplistic
> condition:
> 
> dsa_switch_setup():
> 	if (!ds->slave_mii_bus && ds->ops->phy_read) {
> 		ds->slave_mii_bus = mdiobus_alloc();
> 
> You might be tempted to say: hey, qca8k_mdio_register() populates
> ds->slave_mii_bus to an MDIO bus allocated by itself. So you'd be able
> to prune the mdiobus_alloc() because the first part of the condition is
> false.
> 
> But dsa_switch_teardown() has:
> 
> 	if (ds->slave_mii_bus && ds->ops->phy_read) {
> 		mdiobus_unregister(ds->slave_mii_bus);
> 
> In other words, dsa_switch_teardown() has lost track of who allocated
> the MDIO bus - it assumes that the DSA core has. That will conflict with
> qca8k which uses devres, so it would result in double free.
> 
> So that's basically what the problem is. The qca8k driver needs to have
> a NULL ds->ops->phy_read if it doesn't use the legacy_phy_port_mapping,
> so it won't confuse the DSA core.
> 
> Please note that I'm not really too familiar with this part of the DSA
> core since I don't have any hardware that makes any use of ds->slave_mii_bus.
> 

Ok now I see the problem. Thx for the good explaination. I wonder...
Should I use the easy path and just drop phy_read/write? Qca8k already
allocate a dedicated mdio for the same task... Should I just add some
check and make the dedicated mdio register without the OF API when it's
in legacy mode?

I think that at times the idea of that patch was to try to use as much
as possible generic dsa ops but if that would result in having the big
ops duplicated for every switch that doesn't seems that optmizied.

An alternative would be adding an additional flag to dsa to control how
DSA should handle slave_mii_bus. But I assume a solution like that would
be directly NACK as it's just an hack for a driver to save some space.

Or now that I think about it generilize all of this and add some API to
DSA to register an mdiobus with an mdio node if provided. Or even
provide some way to make the driver decide what to do... Some type of
configuration? Is it overkill or other driver would benefit from this?
Do we have other driver that declare a custom mdiobus instead of using
phy_read/write?

> > > >  drivers/net/dsa/qca8k.c |  8 ++++----
> > > >  drivers/net/dsa/qca8k.h | 33 ++++++++++++++++-----------------
> > > >  2 files changed, 20 insertions(+), 21 deletions(-)
> > > > 
> > > > diff --git a/drivers/net/dsa/qca8k.c b/drivers/net/dsa/qca8k.c
> > > > index ee0dbf324268..8d059da5f0ca 100644
> > > > --- a/drivers/net/dsa/qca8k.c
> > > > +++ b/drivers/net/dsa/qca8k.c
> > > > @@ -1685,11 +1685,11 @@ qca8k_phylink_mac_select_pcs(struct dsa_switch *ds, int port,
> > > >  	case PHY_INTERFACE_MODE_1000BASEX:
> > > >  		switch (port) {
> > > >  		case 0:
> > > > -			pcs = &priv->pcs_port_0.pcs;
> > > > +			pcs = &priv->ports_config.qpcs[QCA8K_CPU_PORT0].pcs;
> > > >  			break;
> > > >  
> > > >  		case 6:
> > > > -			pcs = &priv->pcs_port_6.pcs;
> > > > +			pcs = &priv->ports_config.qpcs[QCA8K_CPU_PORT6].pcs;
> > > >  			break;
> > > >  		}
> > > >  		break;
> > > > @@ -2889,8 +2889,8 @@ qca8k_setup(struct dsa_switch *ds)
> > > >  	if (ret)
> > > >  		return ret;
> > > >  
> > > > -	qca8k_setup_pcs(priv, &priv->pcs_port_0, 0);
> > > > -	qca8k_setup_pcs(priv, &priv->pcs_port_6, 6);
> > > > +	qca8k_setup_pcs(priv, &priv->ports_config.qpcs[QCA8K_CPU_PORT0], 0);
> > > > +	qca8k_setup_pcs(priv, &priv->ports_config.qpcs[QCA8K_CPU_PORT6], 6);
> > > >  
> > > >  	/* Make sure MAC06 is disabled */
> > > >  	ret = regmap_clear_bits(priv->regmap, QCA8K_REG_PORT0_PAD_CTRL,
> > > > diff --git a/drivers/net/dsa/qca8k.h b/drivers/net/dsa/qca8k.h
> > > > index f375627174c8..611dc2335dbe 100644
> > > > --- a/drivers/net/dsa/qca8k.h
> > > > +++ b/drivers/net/dsa/qca8k.h
> > > > @@ -341,18 +341,24 @@ enum {
> > > >  
> > > >  struct qca8k_mgmt_eth_data {
> > > >  	struct completion rw_done;
> > > > -	struct mutex mutex; /* Enforce one mdio read/write at time */
> > > > +	u32 data[4];
> > > >  	bool ack;
> > > >  	u32 seq;
> > > > -	u32 data[4];
> > > > +	struct mutex mutex; /* Enforce one mdio read/write at time */
> > > >  };
> > > >  
> > > >  struct qca8k_mib_eth_data {
> > > >  	struct completion rw_done;
> > > > +	u64 *data; /* pointer to ethtool data */
> > > > +	u8 req_port;
> > > >  	struct mutex mutex; /* Process one command at time */
> > > >  	refcount_t port_parsed; /* Counter to track parsed port */
> > > > -	u8 req_port;
> > > > -	u64 *data; /* pointer to ethtool data */
> > > > +};
> > > > +
> > > > +struct qca8k_pcs {
> > > > +	struct phylink_pcs pcs;
> > > > +	struct qca8k_priv *priv;
> > > > +	int port;
> > > >  };
> > > >  
> > > >  struct qca8k_ports_config {
> > > > @@ -361,6 +367,7 @@ struct qca8k_ports_config {
> > > >  	bool sgmii_enable_pll;
> > > >  	u8 rgmii_rx_delay[QCA8K_NUM_CPU_PORTS]; /* 0: CPU port0, 1: CPU port6 */
> > > >  	u8 rgmii_tx_delay[QCA8K_NUM_CPU_PORTS]; /* 0: CPU port0, 1: CPU port6 */
> > > > +	struct qca8k_pcs qpcs[QCA8K_NUM_CPU_PORTS];
> > > >  };
> > > >  
> > > >  struct qca8k_mdio_cache {
> > > > @@ -376,13 +383,10 @@ struct qca8k_mdio_cache {
> > > >  	u16 hi;
> > > >  };
> > > >  
> > > > -struct qca8k_pcs {
> > > > -	struct phylink_pcs pcs;
> > > > -	struct qca8k_priv *priv;
> > > > -	int port;
> > > > -};
> > > > -
> > > >  struct qca8k_priv {
> > > > +	struct net_device *mgmt_master; /* Track if mdio/mib Ethernet is available */
> > > > +	struct qca8k_mgmt_eth_data mgmt_eth_data;
> > > > +	struct qca8k_mdio_cache mdio_cache;
> > > >  	u8 switch_id;
> > > >  	u8 switch_revision;
> > > >  	u8 mirror_rx;
> > > > @@ -396,15 +400,10 @@ struct qca8k_priv {
> > > >  	struct dsa_switch *ds;
> > > >  	struct mutex reg_mutex;
> > > >  	struct device *dev;
> > > > -	struct dsa_switch_ops ops;
> > > >  	struct gpio_desc *reset_gpio;
> > > > -	unsigned int port_mtu[QCA8K_NUM_PORTS];
> > > > -	struct net_device *mgmt_master; /* Track if mdio/mib Ethernet is available */
> > > > -	struct qca8k_mgmt_eth_data mgmt_eth_data;
> > > > +	struct dsa_switch_ops ops;
> > > >  	struct qca8k_mib_eth_data mib_eth_data;
> > > > -	struct qca8k_mdio_cache mdio_cache;
> > > > -	struct qca8k_pcs pcs_port_0;
> > > > -	struct qca8k_pcs pcs_port_6;
> > > > +	unsigned int port_mtu[QCA8K_NUM_PORTS];
> > > >  };
> > > >  
> > > >  struct qca8k_mib_desc {
> > > > -- 
> > > > 2.34.1
> > > > 
> > > 
> > 
> > -- 
> > 	Ansuel

-- 
	Ansuel

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [net-next PATCH v2] net: dsa: qca8k: pack driver struct and improve cache use
  2022-03-21 17:26       ` Ansuel Smith
@ 2022-03-21 17:31         ` Ansuel Smith
  2022-03-21 18:22           ` Vladimir Oltean
  0 siblings, 1 reply; 10+ messages in thread
From: Ansuel Smith @ 2022-03-21 17:31 UTC (permalink / raw)
  To: Vladimir Oltean
  Cc: Andrew Lunn, Vivien Didelot, Florian Fainelli, David S. Miller,
	Jakub Kicinski, Russell King, netdev, linux-kernel

On Mon, Mar 21, 2022 at 06:26:24PM +0100, Ansuel Smith wrote:
> On Mon, Mar 21, 2022 at 07:22:00PM +0200, Vladimir Oltean wrote:
> > On Mon, Mar 21, 2022 at 05:07:55PM +0100, Ansuel Smith wrote:
> > > On Thu, Mar 03, 2022 at 03:53:27AM +0200, Vladimir Oltean wrote:
> > > > On Mon, Feb 28, 2022 at 12:04:08PM +0100, Ansuel Smith wrote:
> > > > > Pack qca8k priv and other struct using pahole and set the first priv
> > > > > struct entry to mgmt_master and mgmt_eth_data to speedup access.
> > > > > While at it also rework pcs struct and move it qca8k_ports_config
> > > > > following other configuration set for the cpu ports.
> > > > > 
> > > > > Signed-off-by: Ansuel Smith <ansuelsmth@gmail.com>
> > > > > ---
> > > > 
> > > > How did you "pack" struct qca8k_priv exactly?
> > > >
> > > 
> > > I'm trying to understand it too... also this was done basend on what
> > > target? 
> > 
> > I used an arm64 toolchain, if that's what you're asking.
> >
> 
> That could be the reason of different results. Qca8k is mostly used on
> 32bit systems with 4byte cache wonder. That could be the reason i have
> different results and on a system like that it did suggest this order?
> 
> > > > Before:
> > > > 
> > > > struct qca8k_priv {
> > > >         u8                         switch_id;            /*     0     1 */
> > > >         u8                         switch_revision;      /*     1     1 */
> > > >         u8                         mirror_rx;            /*     2     1 */
> > > >         u8                         mirror_tx;            /*     3     1 */
> > > >         u8                         lag_hash_mode;        /*     4     1 */
> > > >         bool                       legacy_phy_port_mapping; /*     5     1 */
> > > >         struct qca8k_ports_config  ports_config;         /*     6     7 */
> > > > 
> > > >         /* XXX 3 bytes hole, try to pack */
> > > > 
> > > >         struct regmap *            regmap;               /*    16     8 */
> > > >         struct mii_bus *           bus;                  /*    24     8 */
> > > >         struct ar8xxx_port_status  port_sts[7];          /*    32    28 */
> > > > 
> > > >         /* XXX 4 bytes hole, try to pack */
> > > > 
> > > >         /* --- cacheline 1 boundary (64 bytes) --- */
> > > >         struct dsa_switch *        ds;                   /*    64     8 */
> > > >         struct mutex               reg_mutex;            /*    72   160 */
> > > >         /* --- cacheline 3 boundary (192 bytes) was 40 bytes ago --- */
> > > >         struct device *            dev;                  /*   232     8 */
> > > >         struct dsa_switch_ops      ops;                  /*   240   864 */
> > > >         /* --- cacheline 17 boundary (1088 bytes) was 16 bytes ago --- */
> > > >         struct gpio_desc *         reset_gpio;           /*  1104     8 */
> > > >         unsigned int               port_mtu[7];          /*  1112    28 */
> > > > 
> > > >         /* XXX 4 bytes hole, try to pack */
> > > > 
> > > >         struct net_device *        mgmt_master;          /*  1144     8 */
> > > >         /* --- cacheline 18 boundary (1152 bytes) --- */
> > > >         struct qca8k_mgmt_eth_data mgmt_eth_data;        /*  1152   280 */
> > > >         /* --- cacheline 22 boundary (1408 bytes) was 24 bytes ago --- */
> > > >         struct qca8k_mib_eth_data  mib_eth_data;         /*  1432   272 */
> > > >         /* --- cacheline 26 boundary (1664 bytes) was 40 bytes ago --- */
> > > >         struct qca8k_mdio_cache    mdio_cache;           /*  1704     6 */
> > > > 
> > > >         /* XXX 2 bytes hole, try to pack */
> > > > 
> > > >         struct qca8k_pcs           pcs_port_0;           /*  1712    32 */
> > > > 
> > > >         /* XXX last struct has 4 bytes of padding */
> > > > 
> > > >         /* --- cacheline 27 boundary (1728 bytes) was 16 bytes ago --- */
> > > >         struct qca8k_pcs           pcs_port_6;           /*  1744    32 */
> > > > 
> > > >         /* XXX last struct has 4 bytes of padding */
> > > > 
> > > >         /* size: 1776, cachelines: 28, members: 22 */
> > > >         /* sum members: 1763, holes: 4, sum holes: 13 */
> > > >         /* paddings: 2, sum paddings: 8 */
> > > >         /* last cacheline: 48 bytes */
> > > > };
> > > > 
> > > > After:
> > > > 
> > > > struct qca8k_priv {
> > > >         struct net_device *        mgmt_master;          /*     0     8 */
> > > >         struct qca8k_mgmt_eth_data mgmt_eth_data;        /*     8   280 */
> > > >         /* --- cacheline 4 boundary (256 bytes) was 32 bytes ago --- */
> > > >         struct qca8k_mdio_cache    mdio_cache;           /*   288     6 */
> > > >         u8                         switch_id;            /*   294     1 */
> > > >         u8                         switch_revision;      /*   295     1 */
> > > >         u8                         mirror_rx;            /*   296     1 */
> > > >         u8                         mirror_tx;            /*   297     1 */
> > > >         u8                         lag_hash_mode;        /*   298     1 */
> > > >         bool                       legacy_phy_port_mapping; /*   299     1 */
> > > > 
> > > >         /* XXX 4 bytes hole, try to pack */
> > > > 
> > > >         struct qca8k_ports_config  ports_config;         /*   304    72 */
> > > >         /* --- cacheline 5 boundary (320 bytes) was 56 bytes ago --- */
> > > >         struct regmap *            regmap;               /*   376     8 */
> > > >         /* --- cacheline 6 boundary (384 bytes) --- */
> > > >         struct mii_bus *           bus;                  /*   384     8 */
> > > >         struct ar8xxx_port_status  port_sts[7];          /*   392    28 */
> > > > 
> > > >         /* XXX 4 bytes hole, try to pack */
> > > > 
> > > >         struct dsa_switch *        ds;                   /*   424     8 */
> > > >         struct mutex               reg_mutex;            /*   432   160 */
> > > >         /* --- cacheline 9 boundary (576 bytes) was 16 bytes ago --- */
> > > >         struct device *            dev;                  /*   592     8 */
> > > >         struct gpio_desc *         reset_gpio;           /*   600     8 */
> > > >         struct dsa_switch_ops      ops;                  /*   608   864 */
> > > >         /* --- cacheline 23 boundary (1472 bytes) --- */
> > > >         struct qca8k_mib_eth_data  mib_eth_data;         /*  1472   280 */
> > > > 
> > > >         /* XXX last struct has 4 bytes of padding */
> > > > 
> > > >         /* --- cacheline 27 boundary (1728 bytes) was 24 bytes ago --- */
> > > >         unsigned int               port_mtu[7];          /*  1752    28 */
> > > > 
> > > >         /* size: 1784, cachelines: 28, members: 20 */
> > > >         /* sum members: 1772, holes: 2, sum holes: 8 */
> > > >         /* padding: 4 */
> > > >         /* paddings: 1, sum paddings: 4 */
> > > >         /* last cacheline: 56 bytes */
> > > > };
> > > > 
> > > > 1776 vs 1784. That's... larger?!
> > > > 
> > > > Also, struct qca8k_priv is so large because the "ops" member is a full
> > > > copy of qca8k_switch_ops. I understand why commit db460c54b67f ("net:
> > > > dsa: qca8k: extend slave-bus implementations") did this, but I wonder,
> > > > is there no better way?
> > > > 
> > > 
> > > Actually from what I can see the struct can be improved in 2 way...
> > > The ancient struct
> > > ar8xxx_port_status port_sts[QCA8K_NUM_PORTS];
> > > can be totally dropped as I can't see why we still need it.
> > > 
> > > The duplicated ops is funny. I could be very confused but why it's done
> > > like that? Can't we modify directly the already defined one and drop
> > > struct dsa_switch_ops ops; from qca8k_priv?
> > 
> > Simply put, qca8k_switch_ops is per kernel, priv->ops is per driver
> > instance.
> > 
> 
> Right I totally missed that.
> 
> > > I mean is all of that to use priv->ops.phy_read instead of
> > > priv->ds->ops.phy_read or even qca8k_switch_ops directly? o.o
> > > 
> > > Am I missing something?
> > 
> > The thing is that qca8k_setup_mdio_bus() wants DSA to use the
> > legacy_phy_port_mapping only as a fallback. DSA uses this simplistic
> > condition:
> > 
> > dsa_switch_setup():
> > 	if (!ds->slave_mii_bus && ds->ops->phy_read) {
> > 		ds->slave_mii_bus = mdiobus_alloc();
> > 
> > You might be tempted to say: hey, qca8k_mdio_register() populates
> > ds->slave_mii_bus to an MDIO bus allocated by itself. So you'd be able
> > to prune the mdiobus_alloc() because the first part of the condition is
> > false.
> > 
> > But dsa_switch_teardown() has:
> > 
> > 	if (ds->slave_mii_bus && ds->ops->phy_read) {
> > 		mdiobus_unregister(ds->slave_mii_bus);
> > 
> > In other words, dsa_switch_teardown() has lost track of who allocated
> > the MDIO bus - it assumes that the DSA core has. That will conflict with
> > qca8k which uses devres, so it would result in double free.
> > 
> > So that's basically what the problem is. The qca8k driver needs to have
> > a NULL ds->ops->phy_read if it doesn't use the legacy_phy_port_mapping,
> > so it won't confuse the DSA core.
> > 
> > Please note that I'm not really too familiar with this part of the DSA
> > core since I don't have any hardware that makes any use of ds->slave_mii_bus.
> > 
> 
> Ok now I see the problem. Thx for the good explaination. I wonder...
> Should I use the easy path and just drop phy_read/write? Qca8k already
> allocate a dedicated mdio for the same task... Should I just add some
> check and make the dedicated mdio register without the OF API when it's
> in legacy mode?
> 
> I think that at times the idea of that patch was to try to use as much
> as possible generic dsa ops but if that would result in having the big
> ops duplicated for every switch that doesn't seems that optmizied.
> 
> An alternative would be adding an additional flag to dsa to control how
> DSA should handle slave_mii_bus. But I assume a solution like that would
> be directly NACK as it's just an hack for a driver to save some space.
> 
> Or now that I think about it generilize all of this and add some API to
> DSA to register an mdiobus with an mdio node if provided. Or even
> provide some way to make the driver decide what to do... Some type of
> configuration? Is it overkill or other driver would benefit from this?
> Do we have other driver that declare a custom mdiobus instead of using
> phy_read/write?
>

I just checked we have 4 driver that implement custom mdiobus using an
of node and would benefit from this... Still don't know if it would be a
good solution.

> > > > >  drivers/net/dsa/qca8k.c |  8 ++++----
> > > > >  drivers/net/dsa/qca8k.h | 33 ++++++++++++++++-----------------
> > > > >  2 files changed, 20 insertions(+), 21 deletions(-)
> > > > > 
> > > > > diff --git a/drivers/net/dsa/qca8k.c b/drivers/net/dsa/qca8k.c
> > > > > index ee0dbf324268..8d059da5f0ca 100644
> > > > > --- a/drivers/net/dsa/qca8k.c
> > > > > +++ b/drivers/net/dsa/qca8k.c
> > > > > @@ -1685,11 +1685,11 @@ qca8k_phylink_mac_select_pcs(struct dsa_switch *ds, int port,
> > > > >  	case PHY_INTERFACE_MODE_1000BASEX:
> > > > >  		switch (port) {
> > > > >  		case 0:
> > > > > -			pcs = &priv->pcs_port_0.pcs;
> > > > > +			pcs = &priv->ports_config.qpcs[QCA8K_CPU_PORT0].pcs;
> > > > >  			break;
> > > > >  
> > > > >  		case 6:
> > > > > -			pcs = &priv->pcs_port_6.pcs;
> > > > > +			pcs = &priv->ports_config.qpcs[QCA8K_CPU_PORT6].pcs;
> > > > >  			break;
> > > > >  		}
> > > > >  		break;
> > > > > @@ -2889,8 +2889,8 @@ qca8k_setup(struct dsa_switch *ds)
> > > > >  	if (ret)
> > > > >  		return ret;
> > > > >  
> > > > > -	qca8k_setup_pcs(priv, &priv->pcs_port_0, 0);
> > > > > -	qca8k_setup_pcs(priv, &priv->pcs_port_6, 6);
> > > > > +	qca8k_setup_pcs(priv, &priv->ports_config.qpcs[QCA8K_CPU_PORT0], 0);
> > > > > +	qca8k_setup_pcs(priv, &priv->ports_config.qpcs[QCA8K_CPU_PORT6], 6);
> > > > >  
> > > > >  	/* Make sure MAC06 is disabled */
> > > > >  	ret = regmap_clear_bits(priv->regmap, QCA8K_REG_PORT0_PAD_CTRL,
> > > > > diff --git a/drivers/net/dsa/qca8k.h b/drivers/net/dsa/qca8k.h
> > > > > index f375627174c8..611dc2335dbe 100644
> > > > > --- a/drivers/net/dsa/qca8k.h
> > > > > +++ b/drivers/net/dsa/qca8k.h
> > > > > @@ -341,18 +341,24 @@ enum {
> > > > >  
> > > > >  struct qca8k_mgmt_eth_data {
> > > > >  	struct completion rw_done;
> > > > > -	struct mutex mutex; /* Enforce one mdio read/write at time */
> > > > > +	u32 data[4];
> > > > >  	bool ack;
> > > > >  	u32 seq;
> > > > > -	u32 data[4];
> > > > > +	struct mutex mutex; /* Enforce one mdio read/write at time */
> > > > >  };
> > > > >  
> > > > >  struct qca8k_mib_eth_data {
> > > > >  	struct completion rw_done;
> > > > > +	u64 *data; /* pointer to ethtool data */
> > > > > +	u8 req_port;
> > > > >  	struct mutex mutex; /* Process one command at time */
> > > > >  	refcount_t port_parsed; /* Counter to track parsed port */
> > > > > -	u8 req_port;
> > > > > -	u64 *data; /* pointer to ethtool data */
> > > > > +};
> > > > > +
> > > > > +struct qca8k_pcs {
> > > > > +	struct phylink_pcs pcs;
> > > > > +	struct qca8k_priv *priv;
> > > > > +	int port;
> > > > >  };
> > > > >  
> > > > >  struct qca8k_ports_config {
> > > > > @@ -361,6 +367,7 @@ struct qca8k_ports_config {
> > > > >  	bool sgmii_enable_pll;
> > > > >  	u8 rgmii_rx_delay[QCA8K_NUM_CPU_PORTS]; /* 0: CPU port0, 1: CPU port6 */
> > > > >  	u8 rgmii_tx_delay[QCA8K_NUM_CPU_PORTS]; /* 0: CPU port0, 1: CPU port6 */
> > > > > +	struct qca8k_pcs qpcs[QCA8K_NUM_CPU_PORTS];
> > > > >  };
> > > > >  
> > > > >  struct qca8k_mdio_cache {
> > > > > @@ -376,13 +383,10 @@ struct qca8k_mdio_cache {
> > > > >  	u16 hi;
> > > > >  };
> > > > >  
> > > > > -struct qca8k_pcs {
> > > > > -	struct phylink_pcs pcs;
> > > > > -	struct qca8k_priv *priv;
> > > > > -	int port;
> > > > > -};
> > > > > -
> > > > >  struct qca8k_priv {
> > > > > +	struct net_device *mgmt_master; /* Track if mdio/mib Ethernet is available */
> > > > > +	struct qca8k_mgmt_eth_data mgmt_eth_data;
> > > > > +	struct qca8k_mdio_cache mdio_cache;
> > > > >  	u8 switch_id;
> > > > >  	u8 switch_revision;
> > > > >  	u8 mirror_rx;
> > > > > @@ -396,15 +400,10 @@ struct qca8k_priv {
> > > > >  	struct dsa_switch *ds;
> > > > >  	struct mutex reg_mutex;
> > > > >  	struct device *dev;
> > > > > -	struct dsa_switch_ops ops;
> > > > >  	struct gpio_desc *reset_gpio;
> > > > > -	unsigned int port_mtu[QCA8K_NUM_PORTS];
> > > > > -	struct net_device *mgmt_master; /* Track if mdio/mib Ethernet is available */
> > > > > -	struct qca8k_mgmt_eth_data mgmt_eth_data;
> > > > > +	struct dsa_switch_ops ops;
> > > > >  	struct qca8k_mib_eth_data mib_eth_data;
> > > > > -	struct qca8k_mdio_cache mdio_cache;
> > > > > -	struct qca8k_pcs pcs_port_0;
> > > > > -	struct qca8k_pcs pcs_port_6;
> > > > > +	unsigned int port_mtu[QCA8K_NUM_PORTS];
> > > > >  };
> > > > >  
> > > > >  struct qca8k_mib_desc {
> > > > > -- 
> > > > > 2.34.1
> > > > > 
> > > > 
> > > 
> > > -- 
> > > 	Ansuel
> 
> -- 
> 	Ansuel

-- 
	Ansuel

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [net-next PATCH v2] net: dsa: qca8k: pack driver struct and improve cache use
  2022-03-21 18:22           ` Vladimir Oltean
@ 2022-03-21 18:06             ` Ansuel Smith
  2022-03-21 18:55               ` Vladimir Oltean
  0 siblings, 1 reply; 10+ messages in thread
From: Ansuel Smith @ 2022-03-21 18:06 UTC (permalink / raw)
  To: Vladimir Oltean
  Cc: Andrew Lunn, Vivien Didelot, Florian Fainelli, David S. Miller,
	Jakub Kicinski, Russell King, netdev, linux-kernel

On Mon, Mar 21, 2022 at 08:22:26PM +0200, Vladimir Oltean wrote:
> On Mon, Mar 21, 2022 at 06:31:27PM +0100, Ansuel Smith wrote:
> > On Mon, Mar 21, 2022 at 06:26:24PM +0100, Ansuel Smith wrote:
> > > On Mon, Mar 21, 2022 at 07:22:00PM +0200, Vladimir Oltean wrote:
> > > > On Mon, Mar 21, 2022 at 05:07:55PM +0100, Ansuel Smith wrote:
> > > > > On Thu, Mar 03, 2022 at 03:53:27AM +0200, Vladimir Oltean wrote:
> > > > > > On Mon, Feb 28, 2022 at 12:04:08PM +0100, Ansuel Smith wrote:
> > > > > > > Pack qca8k priv and other struct using pahole and set the first priv
> > > > > > > struct entry to mgmt_master and mgmt_eth_data to speedup access.
> > > > > > > While at it also rework pcs struct and move it qca8k_ports_config
> > > > > > > following other configuration set for the cpu ports.
> > > > > > > 
> > > > > > > Signed-off-by: Ansuel Smith <ansuelsmth@gmail.com>
> > > > > > > ---
> > > > > > 
> > > > > > How did you "pack" struct qca8k_priv exactly?
> > > > > >
> > > > > 
> > > > > I'm trying to understand it too... also this was done basend on what
> > > > > target? 
> > > > 
> > > > I used an arm64 toolchain, if that's what you're asking.
> > > >
> > > 
> > > That could be the reason of different results. Qca8k is mostly used on
> > > 32bit systems with 4byte cache wonder. That could be the reason i have
> > > different results and on a system like that it did suggest this order?
> 
> Is it impossible to use the qca8k driver on 64-bit systems?
>

It's possible but reality is that 99% it will be used on 32bit system.
Wonder if we should optimize for that.

> > > > > > Before:
> > > > > > 
> > > > > > struct qca8k_priv {
> > > > > >         u8                         switch_id;            /*     0     1 */
> > > > > >         u8                         switch_revision;      /*     1     1 */
> > > > > >         u8                         mirror_rx;            /*     2     1 */
> > > > > >         u8                         mirror_tx;            /*     3     1 */
> > > > > >         u8                         lag_hash_mode;        /*     4     1 */
> > > > > >         bool                       legacy_phy_port_mapping; /*     5     1 */
> > > > > >         struct qca8k_ports_config  ports_config;         /*     6     7 */
> > > > > > 
> > > > > >         /* XXX 3 bytes hole, try to pack */
> > > > > > 
> > > > > >         struct regmap *            regmap;               /*    16     8 */
> > > > > >         struct mii_bus *           bus;                  /*    24     8 */
> > > > > >         struct ar8xxx_port_status  port_sts[7];          /*    32    28 */
> > > > > > 
> > > > > >         /* XXX 4 bytes hole, try to pack */
> > > > > > 
> > > > > >         /* --- cacheline 1 boundary (64 bytes) --- */
> > > > > >         struct dsa_switch *        ds;                   /*    64     8 */
> > > > > >         struct mutex               reg_mutex;            /*    72   160 */
> > > > > >         /* --- cacheline 3 boundary (192 bytes) was 40 bytes ago --- */
> > > > > >         struct device *            dev;                  /*   232     8 */
> > > > > >         struct dsa_switch_ops      ops;                  /*   240   864 */
> > > > > >         /* --- cacheline 17 boundary (1088 bytes) was 16 bytes ago --- */
> > > > > >         struct gpio_desc *         reset_gpio;           /*  1104     8 */
> > > > > >         unsigned int               port_mtu[7];          /*  1112    28 */
> > > > > > 
> > > > > >         /* XXX 4 bytes hole, try to pack */
> > > > > > 
> > > > > >         struct net_device *        mgmt_master;          /*  1144     8 */
> > > > > >         /* --- cacheline 18 boundary (1152 bytes) --- */
> > > > > >         struct qca8k_mgmt_eth_data mgmt_eth_data;        /*  1152   280 */
> > > > > >         /* --- cacheline 22 boundary (1408 bytes) was 24 bytes ago --- */
> > > > > >         struct qca8k_mib_eth_data  mib_eth_data;         /*  1432   272 */
> > > > > >         /* --- cacheline 26 boundary (1664 bytes) was 40 bytes ago --- */
> > > > > >         struct qca8k_mdio_cache    mdio_cache;           /*  1704     6 */
> > > > > > 
> > > > > >         /* XXX 2 bytes hole, try to pack */
> > > > > > 
> > > > > >         struct qca8k_pcs           pcs_port_0;           /*  1712    32 */
> > > > > > 
> > > > > >         /* XXX last struct has 4 bytes of padding */
> > > > > > 
> > > > > >         /* --- cacheline 27 boundary (1728 bytes) was 16 bytes ago --- */
> > > > > >         struct qca8k_pcs           pcs_port_6;           /*  1744    32 */
> > > > > > 
> > > > > >         /* XXX last struct has 4 bytes of padding */
> > > > > > 
> > > > > >         /* size: 1776, cachelines: 28, members: 22 */
> > > > > >         /* sum members: 1763, holes: 4, sum holes: 13 */
> > > > > >         /* paddings: 2, sum paddings: 8 */
> > > > > >         /* last cacheline: 48 bytes */
> > > > > > };
> > > > > > 
> > > > > > After:
> > > > > > 
> > > > > > struct qca8k_priv {
> > > > > >         struct net_device *        mgmt_master;          /*     0     8 */
> > > > > >         struct qca8k_mgmt_eth_data mgmt_eth_data;        /*     8   280 */
> > > > > >         /* --- cacheline 4 boundary (256 bytes) was 32 bytes ago --- */
> > > > > >         struct qca8k_mdio_cache    mdio_cache;           /*   288     6 */
> > > > > >         u8                         switch_id;            /*   294     1 */
> > > > > >         u8                         switch_revision;      /*   295     1 */
> > > > > >         u8                         mirror_rx;            /*   296     1 */
> > > > > >         u8                         mirror_tx;            /*   297     1 */
> > > > > >         u8                         lag_hash_mode;        /*   298     1 */
> > > > > >         bool                       legacy_phy_port_mapping; /*   299     1 */
> > > > > > 
> > > > > >         /* XXX 4 bytes hole, try to pack */
> > > > > > 
> > > > > >         struct qca8k_ports_config  ports_config;         /*   304    72 */
> > > > > >         /* --- cacheline 5 boundary (320 bytes) was 56 bytes ago --- */
> > > > > >         struct regmap *            regmap;               /*   376     8 */
> > > > > >         /* --- cacheline 6 boundary (384 bytes) --- */
> > > > > >         struct mii_bus *           bus;                  /*   384     8 */
> > > > > >         struct ar8xxx_port_status  port_sts[7];          /*   392    28 */
> > > > > > 
> > > > > >         /* XXX 4 bytes hole, try to pack */
> > > > > > 
> > > > > >         struct dsa_switch *        ds;                   /*   424     8 */
> > > > > >         struct mutex               reg_mutex;            /*   432   160 */
> > > > > >         /* --- cacheline 9 boundary (576 bytes) was 16 bytes ago --- */
> > > > > >         struct device *            dev;                  /*   592     8 */
> > > > > >         struct gpio_desc *         reset_gpio;           /*   600     8 */
> > > > > >         struct dsa_switch_ops      ops;                  /*   608   864 */
> > > > > >         /* --- cacheline 23 boundary (1472 bytes) --- */
> > > > > >         struct qca8k_mib_eth_data  mib_eth_data;         /*  1472   280 */
> > > > > > 
> > > > > >         /* XXX last struct has 4 bytes of padding */
> > > > > > 
> > > > > >         /* --- cacheline 27 boundary (1728 bytes) was 24 bytes ago --- */
> > > > > >         unsigned int               port_mtu[7];          /*  1752    28 */
> > > > > > 
> > > > > >         /* size: 1784, cachelines: 28, members: 20 */
> > > > > >         /* sum members: 1772, holes: 2, sum holes: 8 */
> > > > > >         /* padding: 4 */
> > > > > >         /* paddings: 1, sum paddings: 4 */
> > > > > >         /* last cacheline: 56 bytes */
> > > > > > };
> > > > > > 
> > > > > > 1776 vs 1784. That's... larger?!
> > > > > > 
> > > > > > Also, struct qca8k_priv is so large because the "ops" member is a full
> > > > > > copy of qca8k_switch_ops. I understand why commit db460c54b67f ("net:
> > > > > > dsa: qca8k: extend slave-bus implementations") did this, but I wonder,
> > > > > > is there no better way?
> > > > > > 
> > > > > 
> > > > > Actually from what I can see the struct can be improved in 2 way...
> > > > > The ancient struct
> > > > > ar8xxx_port_status port_sts[QCA8K_NUM_PORTS];
> > > > > can be totally dropped as I can't see why we still need it.
> > > > > 
> > > > > The duplicated ops is funny. I could be very confused but why it's done
> > > > > like that? Can't we modify directly the already defined one and drop
> > > > > struct dsa_switch_ops ops; from qca8k_priv?
> > > > 
> > > > Simply put, qca8k_switch_ops is per kernel, priv->ops is per driver
> > > > instance.
> > > > 
> > > 
> > > Right I totally missed that.
> > > 
> > > > > I mean is all of that to use priv->ops.phy_read instead of
> > > > > priv->ds->ops.phy_read or even qca8k_switch_ops directly? o.o
> > > > > 
> > > > > Am I missing something?
> > > > 
> > > > The thing is that qca8k_setup_mdio_bus() wants DSA to use the
> > > > legacy_phy_port_mapping only as a fallback. DSA uses this simplistic
> > > > condition:
> > > > 
> > > > dsa_switch_setup():
> > > > 	if (!ds->slave_mii_bus && ds->ops->phy_read) {
> > > > 		ds->slave_mii_bus = mdiobus_alloc();
> > > > 
> > > > You might be tempted to say: hey, qca8k_mdio_register() populates
> > > > ds->slave_mii_bus to an MDIO bus allocated by itself. So you'd be able
> > > > to prune the mdiobus_alloc() because the first part of the condition is
> > > > false.
> > > > 
> > > > But dsa_switch_teardown() has:
> > > > 
> > > > 	if (ds->slave_mii_bus && ds->ops->phy_read) {
> > > > 		mdiobus_unregister(ds->slave_mii_bus);
> > > > 
> > > > In other words, dsa_switch_teardown() has lost track of who allocated
> > > > the MDIO bus - it assumes that the DSA core has. That will conflict with
> > > > qca8k which uses devres, so it would result in double free.
> > > > 
> > > > So that's basically what the problem is. The qca8k driver needs to have
> > > > a NULL ds->ops->phy_read if it doesn't use the legacy_phy_port_mapping,
> > > > so it won't confuse the DSA core.
> > > > 
> > > > Please note that I'm not really too familiar with this part of the DSA
> > > > core since I don't have any hardware that makes any use of ds->slave_mii_bus.
> > > > 
> > > 
> > > Ok now I see the problem. Thx for the good explaination. I wonder...
> > > Should I use the easy path and just drop phy_read/write? Qca8k already
> > > allocate a dedicated mdio for the same task... Should I just add some
> > > check and make the dedicated mdio register without the OF API when it's
> > > in legacy mode?
> > > 
> > > I think that at times the idea of that patch was to try to use as much
> > > as possible generic dsa ops but if that would result in having the big
> > > ops duplicated for every switch that doesn't seems that optmizied.
> > > 
> > > An alternative would be adding an additional flag to dsa to control how
> > > DSA should handle slave_mii_bus. But I assume a solution like that would
> > > be directly NACK as it's just an hack for a driver to save some space.
> > > 
> > > Or now that I think about it generilize all of this and add some API to
> > > DSA to register an mdiobus with an mdio node if provided. Or even
> > > provide some way to make the driver decide what to do... Some type of
> > > configuration? Is it overkill or other driver would benefit from this?
> > > Do we have other driver that declare a custom mdiobus instead of using
> > > phy_read/write?
> > >
> > 
> > I just checked we have 4 driver that implement custom mdiobus using an
> > of node and would benefit from this... Still don't know if it would be a
> > good solution.
> 
> If you provide your own ds->slave_mii_bus there should be no reason to
> need a ds->ops->phy_read, since your slave_mii_bus already has one.
> 
> Bottom line, phy_read is just there in case they it is helpful.
> Whereas ds->slave_mii_bus is there if you want to have a non-OF based
> phy_connect, with the implicit assumption that the phy_addr is equal to
> the port, and it can't be in any other way.
> 
> So if ds->ops->phy_read / ds->ops->phy_write aren't useful, don't use them.
> I suppose one of the drivers you saw with "custom mdiobus" was sja1105.
> I have no interest whatsoever in converting that driver to use
> ds->slave_mii_bus or ds->phy_read.

My idea was to provide some type of API where the switch insert some
type of data and DSA decide to declare a dsa mdiobus or a
dedicated one based on the config provided. Think I will have to provide
an RFC to better describe this idea. The idea would be to drop phy_read
and phy_write and replace them with a single op that provide a
configuration.

-- 
	Ansuel

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [net-next PATCH v2] net: dsa: qca8k: pack driver struct and improve cache use
  2022-03-21 17:31         ` Ansuel Smith
@ 2022-03-21 18:22           ` Vladimir Oltean
  2022-03-21 18:06             ` Ansuel Smith
  0 siblings, 1 reply; 10+ messages in thread
From: Vladimir Oltean @ 2022-03-21 18:22 UTC (permalink / raw)
  To: Ansuel Smith
  Cc: Andrew Lunn, Vivien Didelot, Florian Fainelli, David S. Miller,
	Jakub Kicinski, Russell King, netdev, linux-kernel

On Mon, Mar 21, 2022 at 06:31:27PM +0100, Ansuel Smith wrote:
> On Mon, Mar 21, 2022 at 06:26:24PM +0100, Ansuel Smith wrote:
> > On Mon, Mar 21, 2022 at 07:22:00PM +0200, Vladimir Oltean wrote:
> > > On Mon, Mar 21, 2022 at 05:07:55PM +0100, Ansuel Smith wrote:
> > > > On Thu, Mar 03, 2022 at 03:53:27AM +0200, Vladimir Oltean wrote:
> > > > > On Mon, Feb 28, 2022 at 12:04:08PM +0100, Ansuel Smith wrote:
> > > > > > Pack qca8k priv and other struct using pahole and set the first priv
> > > > > > struct entry to mgmt_master and mgmt_eth_data to speedup access.
> > > > > > While at it also rework pcs struct and move it qca8k_ports_config
> > > > > > following other configuration set for the cpu ports.
> > > > > > 
> > > > > > Signed-off-by: Ansuel Smith <ansuelsmth@gmail.com>
> > > > > > ---
> > > > > 
> > > > > How did you "pack" struct qca8k_priv exactly?
> > > > >
> > > > 
> > > > I'm trying to understand it too... also this was done basend on what
> > > > target? 
> > > 
> > > I used an arm64 toolchain, if that's what you're asking.
> > >
> > 
> > That could be the reason of different results. Qca8k is mostly used on
> > 32bit systems with 4byte cache wonder. That could be the reason i have
> > different results and on a system like that it did suggest this order?

Is it impossible to use the qca8k driver on 64-bit systems?

> > > > > Before:
> > > > > 
> > > > > struct qca8k_priv {
> > > > >         u8                         switch_id;            /*     0     1 */
> > > > >         u8                         switch_revision;      /*     1     1 */
> > > > >         u8                         mirror_rx;            /*     2     1 */
> > > > >         u8                         mirror_tx;            /*     3     1 */
> > > > >         u8                         lag_hash_mode;        /*     4     1 */
> > > > >         bool                       legacy_phy_port_mapping; /*     5     1 */
> > > > >         struct qca8k_ports_config  ports_config;         /*     6     7 */
> > > > > 
> > > > >         /* XXX 3 bytes hole, try to pack */
> > > > > 
> > > > >         struct regmap *            regmap;               /*    16     8 */
> > > > >         struct mii_bus *           bus;                  /*    24     8 */
> > > > >         struct ar8xxx_port_status  port_sts[7];          /*    32    28 */
> > > > > 
> > > > >         /* XXX 4 bytes hole, try to pack */
> > > > > 
> > > > >         /* --- cacheline 1 boundary (64 bytes) --- */
> > > > >         struct dsa_switch *        ds;                   /*    64     8 */
> > > > >         struct mutex               reg_mutex;            /*    72   160 */
> > > > >         /* --- cacheline 3 boundary (192 bytes) was 40 bytes ago --- */
> > > > >         struct device *            dev;                  /*   232     8 */
> > > > >         struct dsa_switch_ops      ops;                  /*   240   864 */
> > > > >         /* --- cacheline 17 boundary (1088 bytes) was 16 bytes ago --- */
> > > > >         struct gpio_desc *         reset_gpio;           /*  1104     8 */
> > > > >         unsigned int               port_mtu[7];          /*  1112    28 */
> > > > > 
> > > > >         /* XXX 4 bytes hole, try to pack */
> > > > > 
> > > > >         struct net_device *        mgmt_master;          /*  1144     8 */
> > > > >         /* --- cacheline 18 boundary (1152 bytes) --- */
> > > > >         struct qca8k_mgmt_eth_data mgmt_eth_data;        /*  1152   280 */
> > > > >         /* --- cacheline 22 boundary (1408 bytes) was 24 bytes ago --- */
> > > > >         struct qca8k_mib_eth_data  mib_eth_data;         /*  1432   272 */
> > > > >         /* --- cacheline 26 boundary (1664 bytes) was 40 bytes ago --- */
> > > > >         struct qca8k_mdio_cache    mdio_cache;           /*  1704     6 */
> > > > > 
> > > > >         /* XXX 2 bytes hole, try to pack */
> > > > > 
> > > > >         struct qca8k_pcs           pcs_port_0;           /*  1712    32 */
> > > > > 
> > > > >         /* XXX last struct has 4 bytes of padding */
> > > > > 
> > > > >         /* --- cacheline 27 boundary (1728 bytes) was 16 bytes ago --- */
> > > > >         struct qca8k_pcs           pcs_port_6;           /*  1744    32 */
> > > > > 
> > > > >         /* XXX last struct has 4 bytes of padding */
> > > > > 
> > > > >         /* size: 1776, cachelines: 28, members: 22 */
> > > > >         /* sum members: 1763, holes: 4, sum holes: 13 */
> > > > >         /* paddings: 2, sum paddings: 8 */
> > > > >         /* last cacheline: 48 bytes */
> > > > > };
> > > > > 
> > > > > After:
> > > > > 
> > > > > struct qca8k_priv {
> > > > >         struct net_device *        mgmt_master;          /*     0     8 */
> > > > >         struct qca8k_mgmt_eth_data mgmt_eth_data;        /*     8   280 */
> > > > >         /* --- cacheline 4 boundary (256 bytes) was 32 bytes ago --- */
> > > > >         struct qca8k_mdio_cache    mdio_cache;           /*   288     6 */
> > > > >         u8                         switch_id;            /*   294     1 */
> > > > >         u8                         switch_revision;      /*   295     1 */
> > > > >         u8                         mirror_rx;            /*   296     1 */
> > > > >         u8                         mirror_tx;            /*   297     1 */
> > > > >         u8                         lag_hash_mode;        /*   298     1 */
> > > > >         bool                       legacy_phy_port_mapping; /*   299     1 */
> > > > > 
> > > > >         /* XXX 4 bytes hole, try to pack */
> > > > > 
> > > > >         struct qca8k_ports_config  ports_config;         /*   304    72 */
> > > > >         /* --- cacheline 5 boundary (320 bytes) was 56 bytes ago --- */
> > > > >         struct regmap *            regmap;               /*   376     8 */
> > > > >         /* --- cacheline 6 boundary (384 bytes) --- */
> > > > >         struct mii_bus *           bus;                  /*   384     8 */
> > > > >         struct ar8xxx_port_status  port_sts[7];          /*   392    28 */
> > > > > 
> > > > >         /* XXX 4 bytes hole, try to pack */
> > > > > 
> > > > >         struct dsa_switch *        ds;                   /*   424     8 */
> > > > >         struct mutex               reg_mutex;            /*   432   160 */
> > > > >         /* --- cacheline 9 boundary (576 bytes) was 16 bytes ago --- */
> > > > >         struct device *            dev;                  /*   592     8 */
> > > > >         struct gpio_desc *         reset_gpio;           /*   600     8 */
> > > > >         struct dsa_switch_ops      ops;                  /*   608   864 */
> > > > >         /* --- cacheline 23 boundary (1472 bytes) --- */
> > > > >         struct qca8k_mib_eth_data  mib_eth_data;         /*  1472   280 */
> > > > > 
> > > > >         /* XXX last struct has 4 bytes of padding */
> > > > > 
> > > > >         /* --- cacheline 27 boundary (1728 bytes) was 24 bytes ago --- */
> > > > >         unsigned int               port_mtu[7];          /*  1752    28 */
> > > > > 
> > > > >         /* size: 1784, cachelines: 28, members: 20 */
> > > > >         /* sum members: 1772, holes: 2, sum holes: 8 */
> > > > >         /* padding: 4 */
> > > > >         /* paddings: 1, sum paddings: 4 */
> > > > >         /* last cacheline: 56 bytes */
> > > > > };
> > > > > 
> > > > > 1776 vs 1784. That's... larger?!
> > > > > 
> > > > > Also, struct qca8k_priv is so large because the "ops" member is a full
> > > > > copy of qca8k_switch_ops. I understand why commit db460c54b67f ("net:
> > > > > dsa: qca8k: extend slave-bus implementations") did this, but I wonder,
> > > > > is there no better way?
> > > > > 
> > > > 
> > > > Actually from what I can see the struct can be improved in 2 way...
> > > > The ancient struct
> > > > ar8xxx_port_status port_sts[QCA8K_NUM_PORTS];
> > > > can be totally dropped as I can't see why we still need it.
> > > > 
> > > > The duplicated ops is funny. I could be very confused but why it's done
> > > > like that? Can't we modify directly the already defined one and drop
> > > > struct dsa_switch_ops ops; from qca8k_priv?
> > > 
> > > Simply put, qca8k_switch_ops is per kernel, priv->ops is per driver
> > > instance.
> > > 
> > 
> > Right I totally missed that.
> > 
> > > > I mean is all of that to use priv->ops.phy_read instead of
> > > > priv->ds->ops.phy_read or even qca8k_switch_ops directly? o.o
> > > > 
> > > > Am I missing something?
> > > 
> > > The thing is that qca8k_setup_mdio_bus() wants DSA to use the
> > > legacy_phy_port_mapping only as a fallback. DSA uses this simplistic
> > > condition:
> > > 
> > > dsa_switch_setup():
> > > 	if (!ds->slave_mii_bus && ds->ops->phy_read) {
> > > 		ds->slave_mii_bus = mdiobus_alloc();
> > > 
> > > You might be tempted to say: hey, qca8k_mdio_register() populates
> > > ds->slave_mii_bus to an MDIO bus allocated by itself. So you'd be able
> > > to prune the mdiobus_alloc() because the first part of the condition is
> > > false.
> > > 
> > > But dsa_switch_teardown() has:
> > > 
> > > 	if (ds->slave_mii_bus && ds->ops->phy_read) {
> > > 		mdiobus_unregister(ds->slave_mii_bus);
> > > 
> > > In other words, dsa_switch_teardown() has lost track of who allocated
> > > the MDIO bus - it assumes that the DSA core has. That will conflict with
> > > qca8k which uses devres, so it would result in double free.
> > > 
> > > So that's basically what the problem is. The qca8k driver needs to have
> > > a NULL ds->ops->phy_read if it doesn't use the legacy_phy_port_mapping,
> > > so it won't confuse the DSA core.
> > > 
> > > Please note that I'm not really too familiar with this part of the DSA
> > > core since I don't have any hardware that makes any use of ds->slave_mii_bus.
> > > 
> > 
> > Ok now I see the problem. Thx for the good explaination. I wonder...
> > Should I use the easy path and just drop phy_read/write? Qca8k already
> > allocate a dedicated mdio for the same task... Should I just add some
> > check and make the dedicated mdio register without the OF API when it's
> > in legacy mode?
> > 
> > I think that at times the idea of that patch was to try to use as much
> > as possible generic dsa ops but if that would result in having the big
> > ops duplicated for every switch that doesn't seems that optmizied.
> > 
> > An alternative would be adding an additional flag to dsa to control how
> > DSA should handle slave_mii_bus. But I assume a solution like that would
> > be directly NACK as it's just an hack for a driver to save some space.
> > 
> > Or now that I think about it generilize all of this and add some API to
> > DSA to register an mdiobus with an mdio node if provided. Or even
> > provide some way to make the driver decide what to do... Some type of
> > configuration? Is it overkill or other driver would benefit from this?
> > Do we have other driver that declare a custom mdiobus instead of using
> > phy_read/write?
> >
> 
> I just checked we have 4 driver that implement custom mdiobus using an
> of node and would benefit from this... Still don't know if it would be a
> good solution.

If you provide your own ds->slave_mii_bus there should be no reason to
need a ds->ops->phy_read, since your slave_mii_bus already has one.

Bottom line, phy_read is just there in case they it is helpful.
Whereas ds->slave_mii_bus is there if you want to have a non-OF based
phy_connect, with the implicit assumption that the phy_addr is equal to
the port, and it can't be in any other way.

So if ds->ops->phy_read / ds->ops->phy_write aren't useful, don't use them.
I suppose one of the drivers you saw with "custom mdiobus" was sja1105.
I have no interest whatsoever in converting that driver to use
ds->slave_mii_bus or ds->phy_read.

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [net-next PATCH v2] net: dsa: qca8k: pack driver struct and improve cache use
  2022-03-21 18:06             ` Ansuel Smith
@ 2022-03-21 18:55               ` Vladimir Oltean
  0 siblings, 0 replies; 10+ messages in thread
From: Vladimir Oltean @ 2022-03-21 18:55 UTC (permalink / raw)
  To: Ansuel Smith
  Cc: Andrew Lunn, Vivien Didelot, Florian Fainelli, David S. Miller,
	Jakub Kicinski, Russell King, netdev, linux-kernel

On Mon, Mar 21, 2022 at 07:06:07PM +0100, Ansuel Smith wrote:
> > If you provide your own ds->slave_mii_bus there should be no reason to
> > need a ds->ops->phy_read, since your slave_mii_bus already has one.
> > 
> > Bottom line, phy_read is just there in case they it is helpful.
> > Whereas ds->slave_mii_bus is there if you want to have a non-OF based
> > phy_connect, with the implicit assumption that the phy_addr is equal to
> > the port, and it can't be in any other way.
> > 
> > So if ds->ops->phy_read / ds->ops->phy_write aren't useful, don't use them.
> > I suppose one of the drivers you saw with "custom mdiobus" was sja1105.
> > I have no interest whatsoever in converting that driver to use
> > ds->slave_mii_bus or ds->phy_read.
> 
> My idea was to provide some type of API where the switch insert some
> type of data and DSA decide to declare a dsa mdiobus or a
> dedicated one based on the config provided. Think I will have to provide
> an RFC to better describe this idea. The idea would be to drop phy_read
> and phy_write and replace them with a single op that provide a
> configuration.

Just register a non-OF based MDIO bus for the legacy_phy_port_mapping = true
case using plain mdiobus_register() and provide it to ds->slave_mii_bus,
and delete the ds->ops->phy_read and ds->ops->phy_write, instead
adapting qca8k_phy_read and qca8k_phy_write to be your bus->read and
bus->write. Problem solved. No need to touch the existing logic and
implicitly the other drivers that use it.

^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2022-03-21 18:55 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-02-28 11:04 [net-next PATCH v2] net: dsa: qca8k: pack driver struct and improve cache use Ansuel Smith
2022-03-03  1:53 ` Vladimir Oltean
2022-03-03  2:25   ` Andrew Lunn
2022-03-21 16:07   ` Ansuel Smith
2022-03-21 17:22     ` Vladimir Oltean
2022-03-21 17:26       ` Ansuel Smith
2022-03-21 17:31         ` Ansuel Smith
2022-03-21 18:22           ` Vladimir Oltean
2022-03-21 18:06             ` Ansuel Smith
2022-03-21 18:55               ` Vladimir Oltean

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.