All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH net v2]bonding: check port and aggregator when select
@ 2021-01-28  8:20 Aichun Li
  2021-02-01 21:52 ` Jakub Kicinski
  2021-02-02 20:02 ` Jay Vosburgh
  0 siblings, 2 replies; 6+ messages in thread
From: Aichun Li @ 2021-01-28  8:20 UTC (permalink / raw)
  To: davem, kuba, j.vosburgh, vfalico, andy; +Cc: netdev, rose.chen, liaichun

When the network service is repeatedly restarted in 802.3ad, there is a low
 probability that oops occurs.
Test commands:systemctl restart network

1.crash: __enable_port():port->slave is NULL
crash> bt
PID: 2508692  TASK: ffff803e72a7ec80  CPU: 29  COMMAND: "kworker/u192:0"
 #0 [ffff0000b13cb5c0] machine_kexec at ffff0000800a3964
 #1 [ffff0000b13cb620] __crash_kexec at ffff0000801bf054
 #2 [ffff0000b13cb7b0] panic at ffff0000800f350c
 #3 [ffff0000b13cb890] die at ffff00008008f940
 #4 [ffff0000b13cb8e0] die_kernel_fault at ffff0000800abbc0
 #5 [ffff0000b13cb910] __do_kernel_fault at ffff0000800ab8c4
 #6 [ffff0000b13cb940] do_page_fault at ffff000080a3eb44
 #7 [ffff0000b13cba40] do_translation_fault at ffff000080a3f064
 #8 [ffff0000b13cba70] do_mem_abort at ffff0000800812cc
 #9 [ffff0000b13cbc70] el1_ia at ffff00008008320c
     PC: ffff000000e2fcd0  [ad_agg_selection_logic+328]
     LR: ffff000000e2fcb0  [ad_agg_selection_logic+296]
     SP: ffff0000b13cbc80  PSTATE: 40c00009
    X29: ffff0000b13cbc90  X28: ffff803e71c31438  X27: ffff000000e41eb8
    X26: ffff0000b13cbd97  X25: ffff000000e4c0b8  X24: ffff803e71c31400
    X23: ffff000081229000  X22: 0000000000000000  X21: ffff803e71c31400
    X20: ffff0000b13cbcf0  X19: ffff803f4c772ac0  X18: ffffffffffffffff
    X17: 0000000000000000  X16: ffff0000808acc70  X15: ffff000081229708
    X14: 7361772074756220  X13: 353335353620726f  X12: 7461676572676761
    X11: 206f742064657461  X10: 0000000000000000   X9: ffff00008122baf0
     X8: 00000000000e97a8   X7: ffff000081408080   X6: ffff805f7fa27448
     X5: ffff805f7fa27448   X4: 0000000000000000   X3: 0000000000000006
     X2: 0000000000000004   X1: 0000000000000000   X0: ffff803e739bea38
crash> struct port ffff803e739bea38
struct port {
  actor_port_number = 2,
  actor_port_priority = 255,
  actor_system = {
    mac_addr_value = "\254\215\064\037\016y"
  },
  actor_system_priority = 65535,
  actor_port_aggregator_identifier = 2094,
  ntt = false,
  actor_admin_port_key = 0,
  actor_oper_port_key = 0,
  actor_admin_port_state = 5 '\005',
  actor_oper_port_state = 3 '\003',
  partner_admin = {
    system = {
      mac_addr_value = "\000\000\000\000\000"
    },
    system_priority = 65535,
    key = 1,
    port_number = 1,
    port_priority = 255,
    port_state = 1
  },
  partner_oper = {
    system = {
      mac_addr_value = "\254\263\265\367b!"
    },
    system_priority = 32768,
    key = 1089,
    port_number = 8,
    port_priority = 32768,
    port_state = 61
  },
  is_enabled = false,
  sm_vars = 304,
  sm_rx_state = AD_RX_PORT_DISABLED,
  sm_rx_timer_counter = 26,
  sm_periodic_state = AD_NO_PERIODIC,
  sm_periodic_timer_counter = 0,
  sm_mux_state = AD_MUX_COLLECTING_DISTRIBUTING,
  sm_mux_timer_counter = 0,
  sm_tx_state = AD_TX_DUMMY,
  sm_tx_timer_counter = 1,
  sm_churn_actor_timer_counter = 0,
  sm_churn_partner_timer_counter = 0,
  churn_actor_count = 0,
  churn_partner_count = 0,
  lacpdu_send_success_count = 10,
  lacpdu_send_failure_count = 0,
  lacpdu_recv_count = 150,
  marker_info_recv_count = 0,
  marker_resp_recv_count = 0,
  marker_unkown_recv_count = 0,
  sm_churn_actor_state = AD_NO_CHURN,
  sm_churn_partner_state = AD_NO_CHURN,
  slave = 0x0,
  aggregator = 0xffff803e739bea00,
  next_port_in_aggregator = 0x0,
  transaction_id = 0,
 -- MORE --

2.I also have another call stack, same as in another person's post:
https://lore.kernel.org/netdev/52630cba-cc60-a024-8dd0-8319e5245044@huawei.com/

Signed-off-by: Aichun Li <liaichun@huawei.com>
---
 drivers/net/bonding/bond_3ad.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/net/bonding/bond_3ad.c b/drivers/net/bonding/bond_3ad.c
index aa001b16765a..9c8894631bdd 100644
--- a/drivers/net/bonding/bond_3ad.c
+++ b/drivers/net/bonding/bond_3ad.c
@@ -183,7 +183,7 @@ static inline void __enable_port(struct port *port)
 {
 	struct slave *slave = port->slave;
 
-	if ((slave->link == BOND_LINK_UP) && bond_slave_is_up(slave))
+	if (slave && (slave->link == BOND_LINK_UP) && bond_slave_is_up(slave))
 		bond_set_slave_active_flags(slave, BOND_SLAVE_NOTIFY_LATER);
 }
 
@@ -1516,6 +1516,7 @@ static void ad_port_selection_logic(struct port *port, bool *update_slave_arr)
 				  port->actor_port_number,
 				  port->aggregator->aggregator_identifier);
 		} else {
+			port->aggregator = &(SLAVE_AD_INFO(slave)->aggregator);
 			slave_err(bond->dev, port->slave->dev,
 				  "Port %d did not find a suitable aggregator\n",
 				  port->actor_port_number);
-- 
2.19.1


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* Re: [PATCH net v2]bonding: check port and aggregator when select
  2021-01-28  8:20 [PATCH net v2]bonding: check port and aggregator when select Aichun Li
@ 2021-02-01 21:52 ` Jakub Kicinski
  2021-02-02 20:02 ` Jay Vosburgh
  1 sibling, 0 replies; 6+ messages in thread
From: Jakub Kicinski @ 2021-02-01 21:52 UTC (permalink / raw)
  To: j.vosburgh, vfalico, andy; +Cc: Aichun Li, davem, netdev, rose.chen

On Thu, 28 Jan 2021 16:20:34 +0800 Aichun Li wrote:
> When the network service is repeatedly restarted in 802.3ad, there is a low
>  probability that oops occurs.
> Test commands:systemctl restart network

Jay, others, any thoughts?

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH net v2]bonding: check port and aggregator when select
  2021-01-28  8:20 [PATCH net v2]bonding: check port and aggregator when select Aichun Li
  2021-02-01 21:52 ` Jakub Kicinski
@ 2021-02-02 20:02 ` Jay Vosburgh
  1 sibling, 0 replies; 6+ messages in thread
From: Jay Vosburgh @ 2021-02-02 20:02 UTC (permalink / raw)
  To: Aichun Li; +Cc: davem, kuba, vfalico, andy, netdev, rose.chen, moyufeng

Aichun Li <liaichun@huawei.com> wrote:

>When the network service is repeatedly restarted in 802.3ad, there is a low
> probability that oops occurs.
>Test commands:systemctl restart network
>
>1.crash: __enable_port():port->slave is NULL
[...]
>     PC: ffff000000e2fcd0  [ad_agg_selection_logic+328]
[...]
>2.I also have another call stack, same as in another person's post:
>https://lore.kernel.org/netdev/52630cba-cc60-a024-8dd0-8319e5245044@huawei.com/

	What hardware platform are you using here?

	moyufeng <moyufeng@huawei.com> appears to be using the same
platform, and I've not had any success so far with the provided script
to reproduce the issue.  I'm using an x86_64 system, however, so I
wonder if perhaps this platform needs a barrier somewhere that x86 does
not, or there's something different in the timing of the device driver
close logic.

>Signed-off-by: Aichun Li <liaichun@huawei.com>
>---
> drivers/net/bonding/bond_3ad.c | 3 ++-
> 1 file changed, 2 insertions(+), 1 deletion(-)
>
>diff --git a/drivers/net/bonding/bond_3ad.c b/drivers/net/bonding/bond_3ad.c
>index aa001b16765a..9c8894631bdd 100644
>--- a/drivers/net/bonding/bond_3ad.c
>+++ b/drivers/net/bonding/bond_3ad.c
>@@ -183,7 +183,7 @@ static inline void __enable_port(struct port *port)
> {
> 	struct slave *slave = port->slave;
> 
>-	if ((slave->link == BOND_LINK_UP) && bond_slave_is_up(slave))
>+	if (slave && (slave->link == BOND_LINK_UP) && bond_slave_is_up(slave))
> 		bond_set_slave_active_flags(slave, BOND_SLAVE_NOTIFY_LATER);
> }

	This change seems like a band aid to cover the real problem.
The caller of __enable_port is ad_agg_selection_logic, and it shouldn't
be possible for port->slave to be NULL when assigned to an aggregator.

>@@ -1516,6 +1516,7 @@ static void ad_port_selection_logic(struct port *port, bool *update_slave_arr)
> 				  port->actor_port_number,
> 				  port->aggregator->aggregator_identifier);
> 		} else {
>+			port->aggregator = &(SLAVE_AD_INFO(slave)->aggregator);
> 			slave_err(bond->dev, port->slave->dev,
> 				  "Port %d did not find a suitable aggregator\n",
> 				  port->actor_port_number);

	This change isn't correct; it's assigning the port to a more or
less random aggregator.  This would eliminate the panic, but isn't doing
the right thing.  At this point in the code, the selection logic has
failed to find an aggregator that matches, and also failed to find a
free aggregator.

	I do need to fix up the failure handling here when it hits the
"did not find a suitable agg" case; the code here is simply wrong, and
has been wrong since the beginning.  I'll hack the driver to induce this
situation rather than reproducing whatever problem is making it unable
to find a suitable aggregator.

	-J

---
	-Jay Vosburgh, jay.vosburgh@canonical.com

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH net v2]bonding: check port and aggregator when select
@ 2021-05-24 15:21 liaichun
  0 siblings, 0 replies; 6+ messages in thread
From: liaichun @ 2021-05-24 15:21 UTC (permalink / raw)
  To: Jay Vosburgh
  Cc: liaichun, davem, kuba, vfalico, andy, netdev, Chenxiang (EulerOS),
	moyufeng

Hi Jay:

	I repeated the "did not find a suitable aggregator" issue by repeatedly hot-plugging the NIC device.

	https://lore.kernel.org/netdev/52630cba-cc60-a024-8dd0-8319e5245044@huawei.com/
	Add the hot swap operation of network adapters:
		while true; do
        	ETS_SSHCMD "echo 1 > /sys/bus/pci/devices/ enp3s0 /remove" &
        	ETS_SSHCMD "echo 1 > /sys/bus/pci/devices/ enp4s0 /remove" &
        	ETS_SSHCMD "echo 1 > /sys/bus/pci/devices/ enp5s0 /remove" &
        	ETS_SSHCMD "echo 1 > /sys/bus/pci/devices/ enp6s0 /remove" &
        	sleep 10
        	ETS_SSHCMD "echo 1 > /sys/bus/pci/rescan"
			sleep 100
		done &

	The numbers of port_params updated, and then can not find a suitable aggregator.

The detailed code analysis is as follows:
bond_3ad_lacpdu_recv ->bond_3ad_rx_indication-> ad_rx_machine-> __record_pdu
static void __record_pdu(struct lacpdu *lacpdu, struct port *port)
{
	if (lacpdu && port) {
		struct port_params *partner = &port->partner_oper;

		__choose_matched(lacpdu, port);
		/* record the new parameter values for the partner
		 * operational
		 */
		partner->port_number = ntohs(lacpdu->actor_port);
		partner->port_priority = ntohs(lacpdu->actor_port_priority);
		partner->system = lacpdu->actor_system;
		partner->system_priority = ntohs(lacpdu->actor_system_priority);
		partner->key = ntohs(lacpdu->actor_key);
		partner->port_state = lacpdu->actor_state;
--------analysis: Update the member of the partner here.

		/* set actor_oper_port_state.defaulted to FALSE */
		port->actor_oper_port_state &= ~AD_STATE_DEFAULTED;

		/* set the partner sync. to on if the partner is sync,
		 * and the port is matched
		 */
		if ((port->sm_vars & AD_PORT_MATCHED) &&
		    (lacpdu->actor_state & AD_STATE_SYNCHRONIZATION)) {
			partner->port_state |= AD_STATE_SYNCHRONIZATION;
			pr_debug("%s partner sync=1\n", port->slave->dev->name);
		} else {
			partner->port_state &= ~AD_STATE_SYNCHRONIZATION;
			pr_debug("%s partner sync=0\n", port->slave->dev->name);
		}
	}
}

static void ad_port_selection_logic(struct port *port, bool *update_slave_arr)
{
[...]
	/* search on all aggregators for a suitable aggregator for this port */
	bond_for_each_slave(bond, slave, iter) {
		aggregator = &(SLAVE_AD_INFO(slave)->aggregator);

		/* keep a free aggregator for later use(if needed) */
		if (!aggregator->lag_ports) {
			if (!free_aggregator)
				free_aggregator = aggregator;
			continue;
		}
		/* check if current aggregator suits us */

----analysis: here check the number of aggregator's partner, but it was updated by bond_3ad_lacpdu_recv

		if (((aggregator->actor_oper_aggregator_key == port->actor_oper_port_key) && /* if all parameters match AND */
		     MAC_ADDRESS_EQUAL(&(aggregator->partner_system), &(port->partner_oper.system)) &&
		     (aggregator->partner_system_priority == port->partner_oper.system_priority) &&
		     (aggregator->partner_oper_aggregator_key == port->partner_oper.key)
		    ) &&
		    ((!MAC_ADDRESS_EQUAL(&(port->partner_oper.system), &(null_mac_addr)) && /* partner answers */
		      !aggregator->is_individual)  /* but is not individual OR */
		    )
		   ) {
			/* attach to the founded aggregator */
			port->aggregator = aggregator;
			port->actor_port_aggregator_identifier =
				port->aggregator->aggregator_identifier;
			port->next_port_in_aggregator = aggregator->lag_ports;
			port->aggregator->num_of_ports++;
			aggregator->lag_ports = port;
			netdev_dbg(bond->dev, "Port %d joined LAG %d(existing LAG)\n",
				   port->actor_port_number,
				   port->aggregator->aggregator_identifier);

			/* mark this port as selected */
			port->sm_vars |= AD_PORT_SELECTED;
			found = 1;
----analysis: found is 0
			break;
		}
	}
	/* the port couldn't find an aggregator - attach it to a new
	 * aggregator
	 */
	if (!found) {
----analysis: No proper free_aggregator is found.
		if (free_aggregator) {     
			/* assign port a new aggregator */
			port->aggregator = free_aggregator;
			port->actor_port_aggregator_identifier =
				port->aggregator->aggregator_identifier;

			/* update the new aggregator's parameters
			 * if port was responsed from the end-user
			 */
			if (port->actor_oper_port_key & AD_DUPLEX_KEY_MASKS)
				/* if port is full duplex */
				port->aggregator->is_individual = false;
			else
				port->aggregator->is_individual = true;

			port->aggregator->actor_admin_aggregator_key =
				port->actor_admin_port_key;
			port->aggregator->actor_oper_aggregator_key =
				port->actor_oper_port_key;
			port->aggregator->partner_system =
				port->partner_oper.system;
			port->aggregator->partner_system_priority =
				port->partner_oper.system_priority;
			port->aggregator->partner_oper_aggregator_key = port->partner_oper.key;
			port->aggregator->receive_state = 1;
			port->aggregator->transmit_state = 1;
			port->aggregator->lag_ports = port;
			port->aggregator->num_of_ports++;

			/* mark this port as selected */
			port->sm_vars |= AD_PORT_SELECTED;

			netdev_dbg(bond->dev, "Port %d joined LAG %d(new LAG)\n",
				   port->actor_port_number,
				   port->aggregator->aggregator_identifier);
		} else {
----analysis: The fault scenario goes to this branch, and port->aggregator remains NULL.
			netdev_err(bond->dev, "Port %d (on %s) did not find a suitable aggregator\n",
			       port->actor_port_number, port->slave->dev->name);
		}
	}
	/* if all aggregator's ports are READY_N == TRUE, set ready=TRUE
	 * in all aggregator's ports, else set ready=FALSE in all
	 * aggregator's ports
	 */
	__set_agg_ports_ready(port->aggregator,
			      __agg_ports_are_ready(port->aggregator));

----analysis: port->aggregator is still NULL, which causes problem.

 [...]
}

 In this case ,I want to use old aggregator, but init it's number.And I don't know if that's correct.

diff --git a/drivers/net/bonding/bond_3ad.c b/drivers/net/bonding/bond_3ad.c
index cfb0060fc..bd66f3f48 100644
--- a/drivers/net/bonding/bond_3ad.c
+++ b/drivers/net/bonding/bond_3ad.c
@@ -1450,7 +1450,6 @@ static void ad_port_selection_logic(struct port *port, bool *update_slave_arr)
                                /* clear the port's relations to this
                                 * aggregator
                                 */
-                               port->aggregator = NULL;
                                port->next_port_in_aggregator = NULL;
                                port->actor_port_aggregator_identifier = 0;

@@ -1521,43 +1520,47 @@ static void ad_port_selection_logic(struct port *port, bool *update_slave_arr)
                if (free_aggregator) {
                        /* assign port a new aggregator */
                        port->aggregator = free_aggregator;
-                       port->actor_port_aggregator_identifier =
-                               port->aggregator->aggregator_identifier;
-
-                       /* update the new aggregator's parameters
-                        * if port was responsed from the end-user
-                        */
-                       if (port->actor_oper_port_key & AD_DUPLEX_KEY_MASKS)
-                               /* if port is full duplex */
-                               port->aggregator->is_individual = false;
-                       else
-                               port->aggregator->is_individual = true;
-
-                       port->aggregator->actor_admin_aggregator_key =
-                               port->actor_admin_port_key;
-                       port->aggregator->actor_oper_aggregator_key =
-                               port->actor_oper_port_key;
-                       port->aggregator->partner_system =
-                               port->partner_oper.system;
-                       port->aggregator->partner_system_priority =
-                               port->partner_oper.system_priority;
-                       port->aggregator->partner_oper_aggregator_key = port->partner_oper.key;
-                       port->aggregator->receive_state = 1;
-                       port->aggregator->transmit_state = 1;
-                       port->aggregator->lag_ports = port;
-                       port->aggregator->num_of_ports++;
-
-                       /* mark this port as selected */
-                       port->sm_vars |= AD_PORT_SELECTED;
-
-                       netdev_dbg(bond->dev, "Port %d joined LAG %d(new LAG)\n",
-                                  port->actor_port_number,
-                                  port->aggregator->aggregator_identifier);
                } else {
+                       /* use old aggregator */
                        netdev_err(bond->dev, "Port %d (on %s) did not find a suitable aggregator\n",
-                              port->actor_port_number, port->slave->dev->name);
-                       return;
+                                  port->actor_port_number, port->slave->dev->name);
+                       if (port->aggregator == NULL) {
+                               netdev_err(bond->dev, "old aggregator is still NULL\n");
+                               return;
+                       }
                }
+               port->actor_port_aggregator_identifier =
+                       port->aggregator->aggregator_identifier;
+
+               /* update the new aggregator's parameters
+                * if port was responsed from the end-user
+                */
+               if (port->actor_oper_port_key & AD_DUPLEX_KEY_MASKS)
+                       /* if port is full duplex */
+                       port->aggregator->is_individual = false;
+               else
+                       port->aggregator->is_individual = true;
+
+               port->aggregator->actor_admin_aggregator_key =
+                       port->actor_admin_port_key;
+               port->aggregator->actor_oper_aggregator_key =
+                       port->actor_oper_port_key;
+               port->aggregator->partner_system =
+                       port->partner_oper.system;
+               port->aggregator->partner_system_priority =
+                       port->partner_oper.system_priority;
+               port->aggregator->partner_oper_aggregator_key = port->partner_oper.key;
+               port->aggregator->receive_state = 1;
+               port->aggregator->transmit_state = 1;
+               port->aggregator->lag_ports = port;
+               port->aggregator->num_of_ports++;
+
+               /* mark this port as selected */
+               port->sm_vars |= AD_PORT_SELECTED;
+
+               netdev_dbg(bond->dev, "Port %d joined LAG %d(new LAG)\n",
+                          port->actor_port_number,
+                          port->aggregator->aggregator_identifier);
        }
        /* if all aggregator's ports are READY_N == TRUE, set ready=TRUE
         * in all aggregator's ports, else set ready=FALSE in all

Thanks.

^ permalink raw reply related	[flat|nested] 6+ messages in thread

* RE: [PATCH net v2]bonding: check port and aggregator when select
@ 2021-04-15 11:58 liaichun
  0 siblings, 0 replies; 6+ messages in thread
From: liaichun @ 2021-04-15 11:58 UTC (permalink / raw)
  To: Jay Vosburgh
  Cc: davem, kuba, vfalico, andy, netdev, Chenxiang (EulerOS), moyufeng


> > Aichun Li <liaichun@huawei.com> wrote:
> >
> > >When the network service is repeatedly restarted in 802.3ad, there is
> > >a low  probability that oops occurs.
> > >Test commands:systemctl restart network
> > >
> > >1.crash: __enable_port():port->slave is NULL
> > [...]
> > >     PC: ffff000000e2fcd0  [ad_agg_selection_logic+328]
> > [...]
> > >2.I also have another call stack, same as in another person's post:
> > >https://lore.kernel.org/netdev/52630cba-cc60-a024-8dd0-8319e5245044@ huawei.com
> >
> > 	What hardware platform are you using here?
> >
> > 	moyufeng <moyufeng@huawei.com> appears to be using the same platform,
> > and I've not had any success so far with the provided script to
> > reproduce the issue.  I'm using an x86_64 system, however, so I wonder
> > if perhaps this platform needs a barrier somewhere that x86 does not,
> > or there's something different in the timing of the device driver close logic.
> 	Yes, I'm using an arm64 system.
>     And i'm different from moyufeng. I'm a physical machine, and he's a virtual
> machine.

I'm sorry, I haven't heard from you in a while. I was wondering if you've reproduced this question?

> >
> > >Signed-off-by: Aichun Li <liaichun@huawei.com>
> > >---
> > > drivers/net/bonding/bond_3ad.c | 3 ++-
> > > 1 file changed, 2 insertions(+), 1 deletion(-)
> > >
> > >diff --git a/drivers/net/bonding/bond_3ad.c
> > >b/drivers/net/bonding/bond_3ad.c index aa001b16765a..9c8894631bdd
> > >100644
> > >--- a/drivers/net/bonding/bond_3ad.c
> > >+++ b/drivers/net/bonding/bond_3ad.c
> > >@@ -183,7 +183,7 @@ static inline void __enable_port(struct port
> > >*port) {
> > > 	struct slave *slave = port->slave;
> > >
> > >-	if ((slave->link == BOND_LINK_UP) && bond_slave_is_up(slave))
> > >+	if (slave && (slave->link == BOND_LINK_UP) &&
> > >+bond_slave_is_up(slave))
> > > 		bond_set_slave_active_flags(slave, BOND_SLAVE_NOTIFY_LATER); }
> >
> > 	This change seems like a band aid to cover the real problem.
> > The caller of __enable_port is ad_agg_selection_logic, and it
> > shouldn't be possible for port->slave to be NULL when assigned to an
> aggregator.
> >
> > >@@ -1516,6 +1516,7 @@ static void ad_port_selection_logic(struct port
> > *port, bool *update_slave_arr)
> > > 				  port->actor_port_number,
> > > 				  port->aggregator->aggregator_identifier);
> > > 		} else {
> > >+			port->aggregator = &(SLAVE_AD_INFO(slave)->aggregator);
> > > 			slave_err(bond->dev, port->slave->dev,
> > > 				  "Port %d did not find a suitable aggregator\n",
> > > 				  port->actor_port_number);
> >
> > 	This change isn't correct; it's assigning the port to a more or less
> > random aggregator.  This would eliminate the panic, but isn't doing the right
> thing.
> > At this point in the code, the selection logic has failed to find an
> > aggregator that matches, and also failed to find a free aggregator.
> >
> > 	I do need to fix up the failure handling here when it hits the "did
> > not find a suitable agg" case; the code here is simply wrong, and has
> > been wrong since the beginning.  I'll hack the driver to induce this
> > situation rather than reproducing whatever problem is making it unable
> > to find a suitable aggregator.
> 
>     Thank you for your reply and look forward to your solution.
> >
> > 	-J
> >
> > ---
> > 	-Jay Vosburgh, jay.vosburgh@canonical.com
> 
> ---
> 	-Aichun Li <liaichun@huawei.com>


^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH net v2]bonding: check port and aggregator when select
@ 2021-02-03  1:54 liaichun
  0 siblings, 0 replies; 6+ messages in thread
From: liaichun @ 2021-02-03  1:54 UTC (permalink / raw)
  To: Jay Vosburgh
  Cc: davem, kuba, vfalico, andy, netdev, Chenxiang (EulerOS), moyufeng

> Aichun Li <liaichun@huawei.com> wrote:
> 
> >When the network service is repeatedly restarted in 802.3ad, there is a
> >low  probability that oops occurs.
> >Test commands:systemctl restart network
> >
> >1.crash: __enable_port():port->slave is NULL
> [...]
> >     PC: ffff000000e2fcd0  [ad_agg_selection_logic+328]
> [...]
> >2.I also have another call stack, same as in another person's post:
> >https://lore.kernel.org/netdev/52630cba-cc60-a024-8dd0-8319e5245044@hu
> a
> >wei.com/
> 
> 	What hardware platform are you using here?
> 
> 	moyufeng <moyufeng@huawei.com> appears to be using the same
> platform, and I've not had any success so far with the provided script to
> reproduce the issue.  I'm using an x86_64 system, however, so I wonder if
> perhaps this platform needs a barrier somewhere that x86 does not, or there's
> something different in the timing of the device driver close logic.
	Yes, I'm using an arm64 system.
    And i'm different from moyufeng. I'm a physical machine, and he's a virtual
 machine.
> 
> >Signed-off-by: Aichun Li <liaichun@huawei.com>
> >---
> > drivers/net/bonding/bond_3ad.c | 3 ++-
> > 1 file changed, 2 insertions(+), 1 deletion(-)
> >
> >diff --git a/drivers/net/bonding/bond_3ad.c
> >b/drivers/net/bonding/bond_3ad.c index aa001b16765a..9c8894631bdd
> >100644
> >--- a/drivers/net/bonding/bond_3ad.c
> >+++ b/drivers/net/bonding/bond_3ad.c
> >@@ -183,7 +183,7 @@ static inline void __enable_port(struct port *port)
> >{
> > 	struct slave *slave = port->slave;
> >
> >-	if ((slave->link == BOND_LINK_UP) && bond_slave_is_up(slave))
> >+	if (slave && (slave->link == BOND_LINK_UP) &&
> >+bond_slave_is_up(slave))
> > 		bond_set_slave_active_flags(slave, BOND_SLAVE_NOTIFY_LATER); }
> 
> 	This change seems like a band aid to cover the real problem.
> The caller of __enable_port is ad_agg_selection_logic, and it shouldn't be
> possible for port->slave to be NULL when assigned to an aggregator.
> 
> >@@ -1516,6 +1516,7 @@ static void ad_port_selection_logic(struct port
> *port, bool *update_slave_arr)
> > 				  port->actor_port_number,
> > 				  port->aggregator->aggregator_identifier);
> > 		} else {
> >+			port->aggregator = &(SLAVE_AD_INFO(slave)->aggregator);
> > 			slave_err(bond->dev, port->slave->dev,
> > 				  "Port %d did not find a suitable aggregator\n",
> > 				  port->actor_port_number);
> 
> 	This change isn't correct; it's assigning the port to a more or less random
> aggregator.  This would eliminate the panic, but isn't doing the right thing.
> At this point in the code, the selection logic has failed to find an aggregator
> that matches, and also failed to find a free aggregator.
> 
> 	I do need to fix up the failure handling here when it hits the "did not find a
> suitable agg" case; the code here is simply wrong, and has been wrong since
> the beginning.  I'll hack the driver to induce this situation rather than
> reproducing whatever problem is making it unable to find a suitable
> aggregator.
	
    Thank you for your reply and look forward to your solution.
> 
> 	-J
> 
> ---
> 	-Jay Vosburgh, jay.vosburgh@canonical.com

---
	-Aichun Li <liaichun@huawei.com>


^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2021-05-24 15:23 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-01-28  8:20 [PATCH net v2]bonding: check port and aggregator when select Aichun Li
2021-02-01 21:52 ` Jakub Kicinski
2021-02-02 20:02 ` Jay Vosburgh
2021-02-03  1:54 liaichun
2021-04-15 11:58 liaichun
2021-05-24 15:21 liaichun

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.