netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: "Алексей Захаров" <zaharov@selectel.ru>
To: Jay Vosburgh <jay.vosburgh@canonical.com>
Cc: netdev@vger.kernel.org
Subject: Re: Fwd: [PATCH] bonding/802.3ad: fix slave initialization states race
Date: Fri, 20 Sep 2019 19:00:17 +0300	[thread overview]
Message-ID: <CAJYOGF-L0bEF_BqbyeKqv4xmLV=e2VKUvo5zPx4rULWdwt8e0Q@mail.gmail.com> (raw)
In-Reply-To: <7154.1568987531@nyx>

>
> Jay Vosburgh <jay.vosburgh@canonical.com> wrote:
> [...]
> >       In any event, I think I see what the failure is, I'm working up
> >a patch to test and will post it when I have it ready.
>
>         Aleksei,
>
>         Would you be able to test the following patch and see if it
> resolves the issue in your testing?  This is against current net-next,
> but applies to current net as well.  Your kernel appears to be a bit
> older (as the message formats differ), so hopefully it will apply there
> as well.  I've tested this a bit (but not the ARP mon portion), and it
> seems to do the right thing, but I can't reproduce the original issue
> locally.
We're testing on ubuntu-bionic 4.15 kernel.

>
>         Thanks,
>
>         -J
>
> diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
> index 931d9d935686..38042399717b 100644
> --- a/drivers/net/bonding/bond_main.c
> +++ b/drivers/net/bonding/bond_main.c
> @@ -2086,8 +2086,7 @@ static int bond_miimon_inspect(struct bonding *bond)
>         ignore_updelay = !rcu_dereference(bond->curr_active_slave);
>
>         bond_for_each_slave_rcu(bond, slave, iter) {
> -               slave->new_link = BOND_LINK_NOCHANGE;
> -               slave->link_new_state = slave->link;
> +               bond_propose_link_state(slave, BOND_LINK_NOCHANGE);
>
>                 link_state = bond_check_dev_link(bond, slave->dev, 0);
>
> @@ -2121,7 +2120,7 @@ static int bond_miimon_inspect(struct bonding *bond)
>                         }
>
>                         if (slave->delay <= 0) {
> -                               slave->new_link = BOND_LINK_DOWN;
> +                               bond_propose_link_state(slave, BOND_LINK_DOWN);
>                                 commit++;
>                                 continue;
>                         }
> @@ -2158,7 +2157,7 @@ static int bond_miimon_inspect(struct bonding *bond)
>                                 slave->delay = 0;
>
>                         if (slave->delay <= 0) {
> -                               slave->new_link = BOND_LINK_UP;
> +                               bond_propose_link_state(slave, BOND_LINK_UP);
>                                 commit++;
>                                 ignore_updelay = false;
>                                 continue;
> @@ -2196,7 +2195,7 @@ static void bond_miimon_commit(struct bonding *bond)
>         struct slave *slave, *primary;
>
>         bond_for_each_slave(bond, slave, iter) {
> -               switch (slave->new_link) {
> +               switch (slave->link_new_state) {
>                 case BOND_LINK_NOCHANGE:
>                         /* For 802.3ad mode, check current slave speed and
>                          * duplex again in case its port was disabled after
> @@ -2268,8 +2267,8 @@ static void bond_miimon_commit(struct bonding *bond)
>
>                 default:
>                         slave_err(bond->dev, slave->dev, "invalid new link %d on slave\n",
> -                                 slave->new_link);
> -                       slave->new_link = BOND_LINK_NOCHANGE;
> +                                 slave->link_new_state);
> +                       bond_propose_link_state(slave, BOND_LINK_NOCHANGE);
>
>                         continue;
>                 }
> @@ -2677,13 +2676,13 @@ static void bond_loadbalance_arp_mon(struct bonding *bond)
>         bond_for_each_slave_rcu(bond, slave, iter) {
>                 unsigned long trans_start = dev_trans_start(slave->dev);
>
> -               slave->new_link = BOND_LINK_NOCHANGE;
> +               bond_propose_link_state(slave, BOND_LINK_NOCHANGE);
>
>                 if (slave->link != BOND_LINK_UP) {
>                         if (bond_time_in_interval(bond, trans_start, 1) &&
>                             bond_time_in_interval(bond, slave->last_rx, 1)) {
>
> -                               slave->new_link = BOND_LINK_UP;
> +                               bond_propose_link_state(slave, BOND_LINK_UP);
>                                 slave_state_changed = 1;
>
>                                 /* primary_slave has no meaning in round-robin
> @@ -2708,7 +2707,7 @@ static void bond_loadbalance_arp_mon(struct bonding *bond)
>                         if (!bond_time_in_interval(bond, trans_start, 2) ||
>                             !bond_time_in_interval(bond, slave->last_rx, 2)) {
>
> -                               slave->new_link = BOND_LINK_DOWN;
> +                               bond_propose_link_state(slave, BOND_LINK_DOWN);
>                                 slave_state_changed = 1;
>
>                                 if (slave->link_failure_count < UINT_MAX)
> @@ -2739,8 +2738,8 @@ static void bond_loadbalance_arp_mon(struct bonding *bond)
>                         goto re_arm;
>
>                 bond_for_each_slave(bond, slave, iter) {
> -                       if (slave->new_link != BOND_LINK_NOCHANGE)
> -                               slave->link = slave->new_link;
> +                       if (slave->link_new_state != BOND_LINK_NOCHANGE)
> +                               slave->link = slave->link_new_state;
>                 }
>
>                 if (slave_state_changed) {
> @@ -2763,9 +2762,9 @@ static void bond_loadbalance_arp_mon(struct bonding *bond)
>  }
>
>  /* Called to inspect slaves for active-backup mode ARP monitor link state
> - * changes.  Sets new_link in slaves to specify what action should take
> - * place for the slave.  Returns 0 if no changes are found, >0 if changes
> - * to link states must be committed.
> + * changes.  Sets proposed link state in slaves to specify what action
> + * should take place for the slave.  Returns 0 if no changes are found, >0
> + * if changes to link states must be committed.
>   *
>   * Called with rcu_read_lock held.
>   */
> @@ -2777,12 +2776,12 @@ static int bond_ab_arp_inspect(struct bonding *bond)
>         int commit = 0;
>
>         bond_for_each_slave_rcu(bond, slave, iter) {
> -               slave->new_link = BOND_LINK_NOCHANGE;
> +               bond_propose_link_state(slave, BOND_LINK_NOCHANGE);
>                 last_rx = slave_last_rx(bond, slave);
>
>                 if (slave->link != BOND_LINK_UP) {
>                         if (bond_time_in_interval(bond, last_rx, 1)) {
> -                               slave->new_link = BOND_LINK_UP;
> +                               bond_propose_link_state(slave, BOND_LINK_UP);
>                                 commit++;
>                         }
>                         continue;
> @@ -2810,7 +2809,7 @@ static int bond_ab_arp_inspect(struct bonding *bond)
>                 if (!bond_is_active_slave(slave) &&
>                     !rcu_access_pointer(bond->current_arp_slave) &&
>                     !bond_time_in_interval(bond, last_rx, 3)) {
> -                       slave->new_link = BOND_LINK_DOWN;
> +                       bond_propose_link_state(slave, BOND_LINK_DOWN);
>                         commit++;
>                 }
>
> @@ -2823,7 +2822,7 @@ static int bond_ab_arp_inspect(struct bonding *bond)
>                 if (bond_is_active_slave(slave) &&
>                     (!bond_time_in_interval(bond, trans_start, 2) ||
>                      !bond_time_in_interval(bond, last_rx, 2))) {
> -                       slave->new_link = BOND_LINK_DOWN;
> +                       bond_propose_link_state(slave, BOND_LINK_DOWN);
>                         commit++;
>                 }
>         }
> @@ -2843,7 +2842,7 @@ static void bond_ab_arp_commit(struct bonding *bond)
>         struct slave *slave;
>
>         bond_for_each_slave(bond, slave, iter) {
> -               switch (slave->new_link) {
> +               switch (slave->link_new_state) {
>                 case BOND_LINK_NOCHANGE:
>                         continue;
>
> @@ -2893,8 +2892,9 @@ static void bond_ab_arp_commit(struct bonding *bond)
>                         continue;
>
>                 default:
> -                       slave_err(bond->dev, slave->dev, "impossible: new_link %d on slave\n",
> -                                 slave->new_link);
> +                       slave_err(bond->dev, slave->dev,
> +                                 "impossible: link_new_state %d on slave\n",
> +                                 slave->link_new_state);
>                         continue;
>                 }
>
> diff --git a/include/net/bonding.h b/include/net/bonding.h
> index f7fe45689142..d416af72404b 100644
> --- a/include/net/bonding.h
> +++ b/include/net/bonding.h
> @@ -159,7 +159,6 @@ struct slave {
>         unsigned long target_last_arp_rx[BOND_MAX_ARP_TARGETS];
>         s8     link;            /* one of BOND_LINK_XXXX */
>         s8     link_new_state;  /* one of BOND_LINK_XXXX */
> -       s8     new_link;
>         u8     backup:1,   /* indicates backup slave. Value corresponds with
>                               BOND_STATE_ACTIVE and BOND_STATE_BACKUP */
>                inactive:1, /* indicates inactive slave */
> @@ -549,7 +548,7 @@ static inline void bond_propose_link_state(struct slave *slave, int state)
>
>  static inline void bond_commit_link_state(struct slave *slave, bool notify)
>  {
> -       if (slave->link == slave->link_new_state)
> +       if (slave->link_new_state == BOND_LINK_NOCHANGE)
>                 return;
>
>         slave->link = slave->link_new_state;
>
>
> ---
>         -Jay Vosburgh, jay.vosburgh@canonical.com
I had to change slave_err to netdev_err, because there's no slave_err
macro in 4.15.
I did some fast testing, things become a bit more weird for me.
Right after reboot one of the slaves hangs with actor port state 71
and partner port state 1.
It doesn't send lacpdu and seems to be broken.
Setting link down and up again fixes slave state.
Dmesg after boot:
[Fri Sep 20 17:56:53 2019] bond-san: Enslaving eth3 as a backup
interface with an up link
[Fri Sep 20 17:56:53 2019] bond-san: Enslaving eth2 as a backup
interface with an up link
[Fri Sep 20 17:56:54 2019] bond-san: Warning: No 802.3ad response from
the link partner for any adapters in the bond
[Fri Sep 20 17:56:54 2019] IPv6: ADDRCONF(NETDEV_UP): bond-san: link
is not ready
[Fri Sep 20 17:56:54 2019] 8021q: adding VLAN 0 to HW filter on device bond-san
[Fri Sep 20 17:56:54 2019] IPv6: ADDRCONF(NETDEV_CHANGE): bond-san:
link becomes ready
[Fri Sep 20 17:56:54 2019] bond-san: link status definitely up for
interface eth3, 10000 Mbps full duplex
[Fri Sep 20 17:56:54 2019] bond-san: first active interface up!

Broken link here is eth2. After set it down and up:
[Fri Sep 20 18:02:04 2019] mlx4_en: eth2: Link Up
[Fri Sep 20 18:02:04 2019] bond-san: link status up again after -200
ms for interface eth2
[Fri Sep 20 18:02:04 2019] bond-san: link status definitely up for
interface eth2, 10000 Mbps full duplex

If I'm trying to reproduce previous behavior, I get different messages
from time to time:
[Fri Sep 20 18:04:48 2019] mlx4_en: eth2: Link Up
[Fri Sep 20 18:04:48 2019] bond-san: link status up for interface
eth2, enabling it in 500 ms
[Fri Sep 20 18:04:48 2019] bond-san: invalid new link 3 on slave eth2
[Fri Sep 20 18:04:49 2019] bond-san: link status definitely up for
interface eth2, 10000 Mbps full duplex
or:
[Fri Sep 20 18:05:48 2019] mlx4_en: eth2: Link Up
[Fri Sep 20 18:05:49 2019] bond-san: link status up again after 0 ms
for interface eth2
[Fri Sep 20 18:05:49 2019] bond-san: link status definitely up for
interface eth2, 10000 Mbps full duplex

In both cases this slave works after up.

-- 
Best Regards,
Aleksei Zakharov
System administrator

  reply	other threads:[~2019-09-20 16:00 UTC|newest]

Thread overview: 19+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2019-09-18 13:05 [PATCH] bonding/802.3ad: fix slave initialization states race Aleksei Zakharov
2019-09-18 14:34 ` Jay Vosburgh
     [not found]   ` <CAJYOGF9KZdouvmTxQcTOQgsi-uBxbvW50K3ufW1=8neeW98QVA@mail.gmail.com>
2019-09-18 18:27     ` Fwd: " Алексей Захаров
2019-09-19  8:00       ` Jay Vosburgh
2019-09-19  9:53         ` Алексей Захаров
2019-09-19 15:27           ` Jay Vosburgh
2019-09-20 13:52             ` Jay Vosburgh
2019-09-20 16:00               ` Алексей Захаров [this message]
2019-09-21  7:06                 ` Jay Vosburgh
2019-09-21 11:17                   ` Алексей Захаров
2019-09-25  0:31                     ` Jay Vosburgh
2019-09-25 11:01                       ` Aleksei Zakharov
2019-09-26  4:38                         ` Jay Vosburgh
2019-09-26 14:25                           ` Aleksei Zakharov
2019-09-26 20:01                             ` Jay Vosburgh
2019-09-27 11:14                               ` Aleksei Zakharov
2019-09-27  9:43                         ` zhangsha (A)
2019-10-22 12:05                       ` Aleksei Zakharov
2019-09-24 13:52 ` David Miller

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to='CAJYOGF-L0bEF_BqbyeKqv4xmLV=e2VKUvo5zPx4rULWdwt8e0Q@mail.gmail.com' \
    --to=zaharov@selectel.ru \
    --cc=jay.vosburgh@canonical.com \
    --cc=netdev@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).