All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] mac80211: Move crypto IV generation to after TXQ dequeue.
@ 2016-08-17 12:58 Toke Høiland-Jørgensen
  2016-08-17 13:08 ` Johannes Berg
  2016-08-17 14:45 ` [PATCH v2] " Toke Høiland-Jørgensen
  0 siblings, 2 replies; 51+ messages in thread
From: Toke Høiland-Jørgensen @ 2016-08-17 12:58 UTC (permalink / raw)
  To: make-wifi-fast, linux-wireless
  Cc: Toke Høiland-Jørgensen, Felix Fietkau

The FQ portion of the intermediate queues will reorder packets, which
means that crypto IV generation needs to happen after dequeue when they
are enabled, or the receiver will throw packets away when receiving
them.

This fixes the performance regression introduced by enabling softq in
ath9k.

Cc: Felix Fietkau <nbd@nbd.name>
Tested-by: Dave Taht <dave@taht.net>
Signed-off-by: Toke Høiland-Jørgensen <toke@toke.dk>
---
 include/net/mac80211.h  |  2 ++
 net/mac80211/sta_info.h |  3 +--
 net/mac80211/tx.c       | 55 +++++++++++++++++++++++++++++++------------------
 3 files changed, 38 insertions(+), 22 deletions(-)

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index cca510a..b23deba 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -1556,6 +1556,7 @@ enum ieee80211_key_flags {
  * @tx_pn: PN used for TX keys, may be used by the driver as well if it
  *	needs to do software PN assignment by itself (e.g. due to TSO)
  * @flags: key flags, see &enum ieee80211_key_flags.
+ * @pn_offs: offset where to put PN for crypto (or 0 if not needed)
  * @keyidx: the key index (0-3)
  * @keylen: key material length
  * @key: key material. For ALG_TKIP the key is encoded as a 256-bit (32 byte)
@@ -1573,6 +1574,7 @@ struct ieee80211_key_conf {
 	u8 iv_len;
 	u8 hw_key_idx;
 	u8 flags;
+	u8 pn_offs;
 	s8 keyidx;
 	u8 keylen;
 	u8 key[0];
diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h
index 0556be3..c9d4d69 100644
--- a/net/mac80211/sta_info.h
+++ b/net/mac80211/sta_info.h
@@ -266,7 +266,6 @@ struct sta_ampdu_mlme {
  * @hdr_len: actual 802.11 header length
  * @sa_offs: offset of the SA
  * @da_offs: offset of the DA
- * @pn_offs: offset where to put PN for crypto (or 0 if not needed)
  * @band: band this will be transmitted on, for tx_info
  * @rcu_head: RCU head to free this struct
  *
@@ -277,7 +276,7 @@ struct sta_ampdu_mlme {
 struct ieee80211_fast_tx {
 	struct ieee80211_key *key;
 	u8 hdr_len;
-	u8 sa_offs, da_offs, pn_offs;
+	u8 sa_offs, da_offs;
 	u8 band;
 	u8 hdr[30 + 2 + IEEE80211_FAST_XMIT_MAX_IV +
 	       sizeof(rfc1042_header)] __aligned(2);
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 1d0746d..4ae1f2c 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -1074,6 +1074,33 @@ ieee80211_tx_h_calculate_duration(struct ieee80211=
_tx_data *tx)
 	return TX_CONTINUE;
 }
=20
+static inline void ieee80211_set_crypto_pn(struct ieee80211_key_conf *co=
nf,
+					   struct sk_buff *skb)
+{
+	u64 pn;
+	u8 *crypto_hdr =3D skb->data + conf->pn_offs;
+
+	if (!conf->pn_offs)
+		return;
+
+	switch (conf->cipher) {
+	case WLAN_CIPHER_SUITE_CCMP:
+	case WLAN_CIPHER_SUITE_CCMP_256:
+	case WLAN_CIPHER_SUITE_GCMP:
+	case WLAN_CIPHER_SUITE_GCMP_256:
+		pn =3D atomic64_inc_return(&conf->tx_pn);
+		crypto_hdr[0] =3D pn;
+		crypto_hdr[1] =3D pn >> 8;
+		crypto_hdr[4] =3D pn >> 16;
+		crypto_hdr[5] =3D pn >> 24;
+		crypto_hdr[6] =3D pn >> 32;
+		crypto_hdr[7] =3D pn >> 40;
+		break;
+	}
+}
+
+
+
 /* actual transmit path */
=20
 static bool ieee80211_tx_prep_agg(struct ieee80211_tx_data *tx,
@@ -1503,6 +1530,10 @@ struct sk_buff *ieee80211_tx_dequeue(struct ieee80=
211_hw *hw,
 						    sta);
 		struct ieee80211_tx_info *info =3D IEEE80211_SKB_CB(skb);
=20
+		if (info->control.hw_key) {
+			ieee80211_set_crypto_pn(info->control.hw_key, skb);
+		}
+
 		hdr->seq_ctrl =3D ieee80211_tx_next_seq(sta, txq->tid);
 		if (test_bit(IEEE80211_TXQ_AMPDU, &txqi->flags))
 			info->flags |=3D IEEE80211_TX_CTL_AMPDU;
@@ -2874,7 +2905,7 @@ void ieee80211_check_fast_xmit(struct sta_info *sta=
)
 			if (gen_iv) {
 				(build.hdr + build.hdr_len)[3] =3D
 					0x20 | (build.key->conf.keyidx << 6);
-				build.pn_offs =3D build.hdr_len;
+				build.key->conf.pn_offs =3D build.hdr_len;
 			}
 			if (gen_iv || iv_spc)
 				build.hdr_len +=3D IEEE80211_CCMP_HDR_LEN;
@@ -2885,7 +2916,7 @@ void ieee80211_check_fast_xmit(struct sta_info *sta=
)
 			if (gen_iv) {
 				(build.hdr + build.hdr_len)[3] =3D
 					0x20 | (build.key->conf.keyidx << 6);
-				build.pn_offs =3D build.hdr_len;
+				build.key->conf.pn_offs =3D build.hdr_len;
 			}
 			if (gen_iv || iv_spc)
 				build.hdr_len +=3D IEEE80211_GCMP_HDR_LEN;
@@ -3289,24 +3320,8 @@ static bool ieee80211_xmit_fast(struct ieee80211_s=
ub_if_data *sdata,
 	sta->tx_stats.bytes[skb_get_queue_mapping(skb)] +=3D skb->len;
 	sta->tx_stats.packets[skb_get_queue_mapping(skb)]++;
=20
-	if (fast_tx->pn_offs) {
-		u64 pn;
-		u8 *crypto_hdr =3D skb->data + fast_tx->pn_offs;
-
-		switch (fast_tx->key->conf.cipher) {
-		case WLAN_CIPHER_SUITE_CCMP:
-		case WLAN_CIPHER_SUITE_CCMP_256:
-		case WLAN_CIPHER_SUITE_GCMP:
-		case WLAN_CIPHER_SUITE_GCMP_256:
-			pn =3D atomic64_inc_return(&fast_tx->key->conf.tx_pn);
-			crypto_hdr[0] =3D pn;
-			crypto_hdr[1] =3D pn >> 8;
-			crypto_hdr[4] =3D pn >> 16;
-			crypto_hdr[5] =3D pn >> 24;
-			crypto_hdr[6] =3D pn >> 32;
-			crypto_hdr[7] =3D pn >> 40;
-			break;
-		}
+	if (fast_tx->key && !local->ops->wake_tx_queue) {
+		ieee80211_set_crypto_pn(&fast_tx->key->conf, skb);
 	}
=20
 	if (sdata->vif.type =3D=3D NL80211_IFTYPE_AP_VLAN)
--=20
2.9.2

^ permalink raw reply related	[flat|nested] 51+ messages in thread

* Re: [PATCH] mac80211: Move crypto IV generation to after TXQ dequeue.
  2016-08-17 12:58 [PATCH] mac80211: Move crypto IV generation to after TXQ dequeue Toke Høiland-Jørgensen
@ 2016-08-17 13:08 ` Johannes Berg
  2016-08-17 13:16   ` Toke Høiland-Jørgensen
  2016-08-17 14:45 ` [PATCH v2] " Toke Høiland-Jørgensen
  1 sibling, 1 reply; 51+ messages in thread
From: Johannes Berg @ 2016-08-17 13:08 UTC (permalink / raw)
  To: Toke Høiland-Jørgensen, make-wifi-fast, linux-wireless
  Cc: Felix Fietkau


> @@ -1573,6 +1574,7 @@ struct ieee80211_key_conf {
>  	u8 iv_len;
>  	u8 hw_key_idx;
>  	u8 flags;
> +	u8 pn_offs;
> 
This is completely wrong.

johannes

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH] mac80211: Move crypto IV generation to after TXQ dequeue.
  2016-08-17 13:08 ` Johannes Berg
@ 2016-08-17 13:16   ` Toke Høiland-Jørgensen
  2016-08-17 13:18     ` Johannes Berg
  0 siblings, 1 reply; 51+ messages in thread
From: Toke Høiland-Jørgensen @ 2016-08-17 13:16 UTC (permalink / raw)
  To: Johannes Berg; +Cc: make-wifi-fast, linux-wireless, Felix Fietkau

Johannes Berg <johannes@sipsolutions.net> writes:

>> @@ -1573,6 +1574,7 @@ struct ieee80211_key_conf {
>>  	u8 iv_len;
>>  	u8 hw_key_idx;
>>  	u8 flags;
>> +	u8 pn_offs;
>> 
> This is completely wrong.

Well, the ieee80211_fast_tx struct is not available in
ieee80211_tx_dequeue, and I need the offset there. I thought about
sticking it into ieee80211_tx_info, but that is kinda full, and since
the ieee80211_key_conf is already available there, carrying it there
seems to work.

What would be a better way to do this?

-Toke

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH] mac80211: Move crypto IV generation to after TXQ dequeue.
  2016-08-17 13:16   ` Toke Høiland-Jørgensen
@ 2016-08-17 13:18     ` Johannes Berg
  2016-08-17 13:23       ` Toke Høiland-Jørgensen
  0 siblings, 1 reply; 51+ messages in thread
From: Johannes Berg @ 2016-08-17 13:18 UTC (permalink / raw)
  To: Toke Høiland-Jørgensen
  Cc: make-wifi-fast, linux-wireless, Felix Fietkau

On Wed, 2016-08-17 at 15:16 +0200, Toke Høiland-Jørgensen wrote:
> Johannes Berg <johannes@sipsolutions.net> writes:
> 
> > 
> > > 
> > > @@ -1573,6 +1574,7 @@ struct ieee80211_key_conf {
> > >  	u8 iv_len;
> > >  	u8 hw_key_idx;
> > >  	u8 flags;
> > > +	u8 pn_offs;
> > > 
> > This is completely wrong.
> 
> Well, the ieee80211_fast_tx struct is not available in
> ieee80211_tx_dequeue, and I need the offset there. I thought about
> sticking it into ieee80211_tx_info, but that is kinda full, and since
> the ieee80211_key_conf is already available there, carrying it there
> seems to work.

For very limited testing, perhaps. But this isn't static across all
usages of the key, so this is still completely broken.

> What would be a better way to do this?
> 

Some redesign/rearchitecture, probably. Or just do it all in the driver
like iwlmvm?

johannes

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH] mac80211: Move crypto IV generation to after TXQ dequeue.
  2016-08-17 13:18     ` Johannes Berg
@ 2016-08-17 13:23       ` Toke Høiland-Jørgensen
  0 siblings, 0 replies; 51+ messages in thread
From: Toke Høiland-Jørgensen @ 2016-08-17 13:23 UTC (permalink / raw)
  To: Johannes Berg; +Cc: make-wifi-fast, linux-wireless, Felix Fietkau

Johannes Berg <johannes@sipsolutions.net> writes:

> On Wed, 2016-08-17 at 15:16 +0200, Toke Høiland-Jørgensen wrote:
>> Johannes Berg <johannes@sipsolutions.net> writes:
>> 
>> > 
>> > > 
>> > > @@ -1573,6 +1574,7 @@ struct ieee80211_key_conf {
>> > >  	u8 iv_len;
>> > >  	u8 hw_key_idx;
>> > >  	u8 flags;
>> > > +	u8 pn_offs;
>> > > 
>> > This is completely wrong.
>> 
>> Well, the ieee80211_fast_tx struct is not available in
>> ieee80211_tx_dequeue, and I need the offset there. I thought about
>> sticking it into ieee80211_tx_info, but that is kinda full, and since
>> the ieee80211_key_conf is already available there, carrying it there
>> seems to work.
>
> For very limited testing, perhaps. But this isn't static across all
> usages of the key, so this is still completely broken.

OK, noted.

>> What would be a better way to do this?
>> 
>
> Some redesign/rearchitecture, probably. Or just do it all in the driver
> like iwlmvm?

Will look it over again. Should be possible to re-calculate the offset,
I guess.

-Toke

^ permalink raw reply	[flat|nested] 51+ messages in thread

* [PATCH v2] mac80211: Move crypto IV generation to after TXQ dequeue.
  2016-08-17 12:58 [PATCH] mac80211: Move crypto IV generation to after TXQ dequeue Toke Høiland-Jørgensen
  2016-08-17 13:08 ` Johannes Berg
@ 2016-08-17 14:45 ` Toke Høiland-Jørgensen
  2016-08-17 19:49   ` Johannes Berg
  2016-08-24 16:20   ` [PATCH v3] mac80211: Move reorder-sensitive TX handlers " Toke Høiland-Jørgensen
  1 sibling, 2 replies; 51+ messages in thread
From: Toke Høiland-Jørgensen @ 2016-08-17 14:45 UTC (permalink / raw)
  To: make-wifi-fast, linux-wireless
  Cc: Toke Høiland-Jørgensen, Felix Fietkau

The FQ portion of the intermediate queues will reorder packets, which
means that crypto IV generation needs to happen after dequeue when they
are enabled, or the receiver will throw packets away when receiving
them.

This fixes the performance regression introduced by enabling softq in
ath9k.

Cc: Felix Fietkau <nbd@nbd.name>
Tested-by: Dave Taht <dave@taht.net>
Signed-off-by: Toke Høiland-Jørgensen <toke@toke.dk>
---
Changes since v1:
  - Recalculate pn_offs when needed instead of storing it.

 net/mac80211/sta_info.h |  3 +-
 net/mac80211/tx.c       | 85 +++++++++++++++++++++++++++++++++++++------------
 2 files changed, 66 insertions(+), 22 deletions(-)

diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h
index 0556be3..c9d4d69 100644
--- a/net/mac80211/sta_info.h
+++ b/net/mac80211/sta_info.h
@@ -266,7 +266,6 @@ struct sta_ampdu_mlme {
  * @hdr_len: actual 802.11 header length
  * @sa_offs: offset of the SA
  * @da_offs: offset of the DA
- * @pn_offs: offset where to put PN for crypto (or 0 if not needed)
  * @band: band this will be transmitted on, for tx_info
  * @rcu_head: RCU head to free this struct
  *
@@ -277,7 +276,7 @@ struct sta_ampdu_mlme {
 struct ieee80211_fast_tx {
 	struct ieee80211_key *key;
 	u8 hdr_len;
-	u8 sa_offs, da_offs, pn_offs;
+	u8 sa_offs, da_offs;
 	u8 band;
 	u8 hdr[30 + 2 + IEEE80211_FAST_XMIT_MAX_IV +
 	       sizeof(rfc1042_header)] __aligned(2);
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 1d0746d..9caf75f 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -1074,6 +1074,64 @@ ieee80211_tx_h_calculate_duration(struct ieee80211=
_tx_data *tx)
 	return TX_CONTINUE;
 }
=20
+static void ieee80211_gen_crypto_iv(struct ieee80211_key_conf *conf,
+					   struct sta_info *sta, struct sk_buff *skb)
+{
+	struct ieee80211_sub_if_data *sdata;
+	u64 pn;
+	u8 *crypto_hdr;
+	u8 pn_offs =3D 0;
+
+	if (!conf || !sta || !(conf->flags & IEEE80211_KEY_FLAG_GENERATE_IV))
+		return;
+
+	sdata =3D sta->sdata;
+
+	switch (sdata->vif.type) {
+	case NL80211_IFTYPE_STATION:
+		if (sdata->u.mgd.use_4addr) {
+			pn_offs =3D 30;
+			break;
+		}
+		pn_offs =3D 24;
+		break;
+	case NL80211_IFTYPE_AP_VLAN:
+		if (sdata->wdev.use_4addr) {
+			pn_offs =3D 30;
+			break;
+		}
+		/* fall through */
+	case NL80211_IFTYPE_ADHOC:
+	case NL80211_IFTYPE_AP:
+		pn_offs =3D 24;
+		break;
+	default:
+		return;
+	}
+
+	if (sta->sta.wme) {
+		pn_offs +=3D 2;
+	}
+
+	crypto_hdr =3D skb->data + pn_offs;
+	switch (conf->cipher) {
+	case WLAN_CIPHER_SUITE_CCMP:
+	case WLAN_CIPHER_SUITE_CCMP_256:
+	case WLAN_CIPHER_SUITE_GCMP:
+	case WLAN_CIPHER_SUITE_GCMP_256:
+		pn =3D atomic64_inc_return(&conf->tx_pn);
+		crypto_hdr[0] =3D pn;
+		crypto_hdr[1] =3D pn >> 8;
+		crypto_hdr[4] =3D pn >> 16;
+		crypto_hdr[5] =3D pn >> 24;
+		crypto_hdr[6] =3D pn >> 32;
+		crypto_hdr[7] =3D pn >> 40;
+		break;
+	}
+}
+
+
+
 /* actual transmit path */
=20
 static bool ieee80211_tx_prep_agg(struct ieee80211_tx_data *tx,
@@ -1503,6 +1561,11 @@ struct sk_buff *ieee80211_tx_dequeue(struct ieee80=
211_hw *hw,
 						    sta);
 		struct ieee80211_tx_info *info =3D IEEE80211_SKB_CB(skb);
=20
+		if (info->control.hw_key) {
+			ieee80211_gen_crypto_iv(info->control.hw_key,
+			container_of(txq->sta, struct sta_info, sta), skb);
+		}
+
 		hdr->seq_ctrl =3D ieee80211_tx_next_seq(sta, txq->tid);
 		if (test_bit(IEEE80211_TXQ_AMPDU, &txqi->flags))
 			info->flags |=3D IEEE80211_TX_CTL_AMPDU;
@@ -2874,7 +2937,6 @@ void ieee80211_check_fast_xmit(struct sta_info *sta=
)
 			if (gen_iv) {
 				(build.hdr + build.hdr_len)[3] =3D
 					0x20 | (build.key->conf.keyidx << 6);
-				build.pn_offs =3D build.hdr_len;
 			}
 			if (gen_iv || iv_spc)
 				build.hdr_len +=3D IEEE80211_CCMP_HDR_LEN;
@@ -2885,7 +2947,6 @@ void ieee80211_check_fast_xmit(struct sta_info *sta=
)
 			if (gen_iv) {
 				(build.hdr + build.hdr_len)[3] =3D
 					0x20 | (build.key->conf.keyidx << 6);
-				build.pn_offs =3D build.hdr_len;
 			}
 			if (gen_iv || iv_spc)
 				build.hdr_len +=3D IEEE80211_GCMP_HDR_LEN;
@@ -3289,24 +3350,8 @@ static bool ieee80211_xmit_fast(struct ieee80211_s=
ub_if_data *sdata,
 	sta->tx_stats.bytes[skb_get_queue_mapping(skb)] +=3D skb->len;
 	sta->tx_stats.packets[skb_get_queue_mapping(skb)]++;
=20
-	if (fast_tx->pn_offs) {
-		u64 pn;
-		u8 *crypto_hdr =3D skb->data + fast_tx->pn_offs;
-
-		switch (fast_tx->key->conf.cipher) {
-		case WLAN_CIPHER_SUITE_CCMP:
-		case WLAN_CIPHER_SUITE_CCMP_256:
-		case WLAN_CIPHER_SUITE_GCMP:
-		case WLAN_CIPHER_SUITE_GCMP_256:
-			pn =3D atomic64_inc_return(&fast_tx->key->conf.tx_pn);
-			crypto_hdr[0] =3D pn;
-			crypto_hdr[1] =3D pn >> 8;
-			crypto_hdr[4] =3D pn >> 16;
-			crypto_hdr[5] =3D pn >> 24;
-			crypto_hdr[6] =3D pn >> 32;
-			crypto_hdr[7] =3D pn >> 40;
-			break;
-		}
+	if (fast_tx->key && !local->ops->wake_tx_queue) {
+		ieee80211_gen_crypto_iv(&fast_tx->key->conf, sta, skb);
 	}
=20
 	if (sdata->vif.type =3D=3D NL80211_IFTYPE_AP_VLAN)
--=20
2.9.2

^ permalink raw reply related	[flat|nested] 51+ messages in thread

* Re: [PATCH v2] mac80211: Move crypto IV generation to after TXQ dequeue.
  2016-08-17 14:45 ` [PATCH v2] " Toke Høiland-Jørgensen
@ 2016-08-17 19:49   ` Johannes Berg
  2016-08-17 20:07     ` [Make-wifi-fast] " Dave Taht
  2016-08-24 16:20   ` [PATCH v3] mac80211: Move reorder-sensitive TX handlers " Toke Høiland-Jørgensen
  1 sibling, 1 reply; 51+ messages in thread
From: Johannes Berg @ 2016-08-17 19:49 UTC (permalink / raw)
  To: Toke Høiland-Jørgensen, make-wifi-fast, linux-wireless
  Cc: Felix Fietkau

Hi,

You need to work on coding style, a lot of your indentation is
completely messed up.

> +	switch (sdata->vif.type) {
> +	case NL80211_IFTYPE_STATION:
> +		if (sdata->u.mgd.use_4addr) {
> +			pn_offs = 30;
> +			break;
> +		}
> +		pn_offs = 24;
> +		break;
> +	case NL80211_IFTYPE_AP_VLAN:
> +		if (sdata->wdev.use_4addr) {
> +			pn_offs = 30;
> +			break;
> +		}
> +		/* fall through */
> +	case NL80211_IFTYPE_ADHOC:
> +	case NL80211_IFTYPE_AP:
> +		pn_offs = 24;
> +		break;
> +	default:
> +		return;
> +	}
> +
> +	if (sta->sta.wme) {
> +		pn_offs += 2;
> +	}

I think you just reinvented ieee80211_hdrlen(). No?

> -	if (fast_tx->pn_offs) {
> -		u64 pn;
> -		u8 *crypto_hdr = skb->data + fast_tx->pn_offs;

No need to undo the pn_offs optimisation for the !txq case, you can
pass it in to the new function that will fill it.

However, you're still doing it wrong - now you haven't fixed anything
for TKIP, which won't hit the fastpath.

johannes

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [Make-wifi-fast] [PATCH v2] mac80211: Move crypto IV generation to after TXQ dequeue.
  2016-08-17 19:49   ` Johannes Berg
@ 2016-08-17 20:07     ` Dave Taht
  2016-08-17 20:43       ` Johannes Berg
  0 siblings, 1 reply; 51+ messages in thread
From: Dave Taht @ 2016-08-17 20:07 UTC (permalink / raw)
  To: Johannes Berg
  Cc: Toke Høiland-Jørgensen, make-wifi-fast, linux-wireless,
	Felix Fietkau

On Wed, Aug 17, 2016 at 9:49 PM, Johannes Berg
<johannes@sipsolutions.net> wrote:
> Hi,
>
> You need to work on coding style, a lot of your indentation is
> completely messed up.
>
>> +     switch (sdata->vif.type) {
>> +     case NL80211_IFTYPE_STATION:
>> +             if (sdata->u.mgd.use_4addr) {
>> +                     pn_offs =3D 30;
>> +                     break;
>> +             }
>> +             pn_offs =3D 24;
>> +             break;
>> +     case NL80211_IFTYPE_AP_VLAN:
>> +             if (sdata->wdev.use_4addr) {
>> +                     pn_offs =3D 30;
>> +                     break;
>> +             }
>> +             /* fall through */
>> +     case NL80211_IFTYPE_ADHOC:
>> +     case NL80211_IFTYPE_AP:
>> +             pn_offs =3D 24;
>> +             break;
>> +     default:
>> +             return;
>> +     }
>> +
>> +     if (sta->sta.wme) {
>> +             pn_offs +=3D 2;
>> +     }
>
> I think you just reinvented ieee80211_hdrlen(). No?
>
>> -     if (fast_tx->pn_offs) {
>> -             u64 pn;
>> -             u8 *crypto_hdr =3D skb->data + fast_tx->pn_offs;
>
> No need to undo the pn_offs optimisation for the !txq case, you can
> pass it in to the new function that will fill it.
>
> However, you're still doing it wrong - now you haven't fixed anything
> for TKIP, which won't hit the fastpath.

Well, we're getting there. The results of both patch attempts were
really nice, and brought encrypted performance with fq back into line
with unencrypted. Still running encrypted tests as I write...

So fixing TKIP would be next, forcing the AP to use that? What other
scenarios do we have to worry about? WDS?


> johannes
> _______________________________________________
> Make-wifi-fast mailing list
> Make-wifi-fast@lists.bufferbloat.net
> https://lists.bufferbloat.net/listinfo/make-wifi-fast



-- 
Dave Täht
Let's go make home routers and wifi faster! With better software!
http://blog.cerowrt.org

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [Make-wifi-fast] [PATCH v2] mac80211: Move crypto IV generation to after TXQ dequeue.
  2016-08-17 20:07     ` [Make-wifi-fast] " Dave Taht
@ 2016-08-17 20:43       ` Johannes Berg
  2016-08-22 14:47         ` Toke Høiland-Jørgensen
  0 siblings, 1 reply; 51+ messages in thread
From: Johannes Berg @ 2016-08-17 20:43 UTC (permalink / raw)
  To: Dave Taht
  Cc: Toke Høiland-Jørgensen, make-wifi-fast, linux-wireless,
	Felix Fietkau


> well, we're getting there. the results of both patch attempts were
> really nice, and brought encrypted performance with fq back into line
> with unencrypted. Still running crypted tests as I write...
> 
> So fixing TKIP would be next, forcing the AP to use that? What other
> scenarios do we have to worry about? WDS?
> 

I don't think there's anything else, I just don't really feel it's
getting anywhere. This is a mere symptom of the design.

Felix had worked around the SN assignment in a similar way, but I feel
that perhaps the whole thing isn't quite the right architecture. Why
are we applying FQ after the wifi conversion, when clearly that doesn't
work well? Seems to me that it would make more sense to let the frames
sit on the queues as they come in, and do most of the wifi handling
only when needed (obviously, things like control port would still have
to be done).
We even count those packets that are dropped for TX statistics, which
would seem to be a big behavioural difference vs. applying a qdisc.

Now, it's unlikely to be that simple - fragmentation, for example,
might mess this up.

Overall though, I'm definitely wondering if it should be this way,
since all the special cases just add complexity.

johannes

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [Make-wifi-fast] [PATCH v2] mac80211: Move crypto IV generation to after TXQ dequeue.
  2016-08-17 20:43       ` Johannes Berg
@ 2016-08-22 14:47         ` Toke Høiland-Jørgensen
  2016-08-26  8:38           ` Johannes Berg
  0 siblings, 1 reply; 51+ messages in thread
From: Toke Høiland-Jørgensen @ 2016-08-22 14:47 UTC (permalink / raw)
  To: Johannes Berg; +Cc: Dave Taht, make-wifi-fast, linux-wireless, Felix Fietkau

Johannes Berg <johannes@sipsolutions.net> writes:

>> well, we're getting there. the results of both patch attempts were
>> really nice, and brought encrypted performance with fq back into line
>> with unencrypted. Still running crypted tests as I write...
>> 
>> So fixing TKIP would be next, forcing the AP to use that? What other
>> scenarios do we have to worry about? WDS?
>> 
>
> I don't think there's anything else, I just don't really feel it's
> getting anywhere. This is a mere symptom of the design.
>
> Felix had worked around the SN assignment in a similar way, but I feel
> that perhaps the whole thing isn't quite the right architecture. Why
> are we applying FQ after the wifi conversion, when clearly that doesn't
> work well? Seems to me that it would make more sense to let the frames
> sit on the queues as they come in, and do most of the wifi handling
> only when needed (obviously, things like control port would still have
> to be done).

I suppose that could be a way to do it (i.e. have ieee80211_tx_dequeue
call all the TX hooks etc), but I am not sure whether there would be
problems doing all this work in the loop that's building aggregates
(which is what would happen for ath9k at least).

An alternative could be to split the process up in two: An "early" and
"late" stage, where the early stage does everything that is not
sensitive to reordering and the occasional drop, and the late stage is
everything that is. Then the queueing step could happen in-between the
two stages, and the non-queueing path could just call both stages at
once. In effect, this would just make the current work-arounds be more
explicit in the structure, rather than being marked as exceptions.

> We even count those packets that are dropped for TX statistics, which
> would seem to be a big behavioural difference vs. applying a qdisc.

While you're right in principle, in practice I don't think this has too
big of an impact. In normal operation, CoDel drops (at most) dozens of
packets per *minute*, so it's not going to skew the statistics too much.

> Now, it's unlikely to be that simple - fragmentation, for example,
> might mess this up.
>
> Overall though, I'm definitely wondering if it should be this way,
> since all the special cases just add complexity.

I agree that the work-arounds are iffy, but I do also think it's
important to keep in mind that we are improving latency by orders of
magnitude here. A few special cases is worth it to achieve that, IMO.
And then iterating towards a design that doesn't need them, of course :)

-Toke

^ permalink raw reply	[flat|nested] 51+ messages in thread

* [PATCH v3] mac80211: Move reorder-sensitive TX handlers to after TXQ dequeue.
  2016-08-17 14:45 ` [PATCH v2] " Toke Høiland-Jørgensen
  2016-08-17 19:49   ` Johannes Berg
@ 2016-08-24 16:20   ` Toke Høiland-Jørgensen
  2016-08-30 13:15     ` [PATCH v4] " Toke Høiland-Jørgensen
  1 sibling, 1 reply; 51+ messages in thread
From: Toke Høiland-Jørgensen @ 2016-08-24 16:20 UTC (permalink / raw)
  To: make-wifi-fast, linux-wireless; +Cc: Toke Høiland-Jørgensen

The TXQ intermediate queues can cause packet reordering when more than
one flow is active to a single station. Since some of the wifi-specific
packet handling (notably sequence number and encryption handling) is
sensitive to re-ordering, things break if it is applied before the
TXQ.

This splits up the TX handlers and fast_xmit logic into two parts: An
early part and a late part. The former is applied before TXQ enqueue,
and the latter after dequeue. The non-TXQ path just applies both parts
at once.

To avoid having to deal with fragmentation on dequeue, the split is set
to be after the fragmentation handler. This means that some reordering
of TX handlers is necessary, and some handlers had to be made aware of
fragmentation due to this reordering.

This approach avoids having to scatter special cases for when TXQ is
enabled, at the cost of making the fast_xmit and TX handler code
slightly more complex.

Signed-off-by: Toke Høiland-Jørgensen <toke@toke.dk>
---
Changes since v2:

This is a completely different approach: Instead of adding exceptions
for TXQ handling, split up the entire TX path in an early and late part,
and apply the latter after TXQ dequeue. This should fix things that
don't hit the fast path as well.

I've tested this with both unencrypted traffic and with CCMP and TKIP
and it appears to fix the previous performance regression seen with
softq-enabled ath9k. I most likely haven't hit all code paths, though
(not sure how I would even go about ensuring that), but it looks
promising so far.

 include/net/mac80211.h |   2 +
 net/mac80211/tx.c      | 276 ++++++++++++++++++++++++++++++++++++++-----------
 net/mac80211/wpa.c     |  18 +++-
 3 files changed, 235 insertions(+), 61 deletions(-)

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index cca510a..9a6a3e9 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -715,6 +715,7 @@ enum mac80211_tx_info_flags {
  *	frame (PS-Poll or uAPSD).
  * @IEEE80211_TX_CTRL_RATE_INJECT: This frame is injected with rate information
  * @IEEE80211_TX_CTRL_AMSDU: This frame is an A-MSDU frame
+ * @IEEE80211_TX_CTRL_FAST_XMIT: This frame is going through the fast_xmit path
  *
  * These flags are used in tx_info->control.flags.
  */
@@ -723,6 +724,7 @@ enum mac80211_tx_control_flags {
 	IEEE80211_TX_CTRL_PS_RESPONSE		=3D BIT(1),
 	IEEE80211_TX_CTRL_RATE_INJECT		=3D BIT(2),
 	IEEE80211_TX_CTRL_AMSDU			=3D BIT(3),
+	IEEE80211_TX_CTRL_FAST_XMIT		=3D BIT(4),
 };
=20
 /*
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 1d0746d..7042d2c 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -38,6 +38,12 @@
 #include "wme.h"
 #include "rate.h"
=20
+static int invoke_tx_handlers_late(struct ieee80211_tx_data *tx);
+static bool ieee80211_xmit_fast_finish(struct ieee80211_sub_if_data *sda=
ta,
+				       struct sta_info *sta,
+				       struct ieee80211_fast_tx *fast_tx,
+				       struct sk_buff *skb, bool xmit);
+
 /* misc utils */
=20
 static inline void ieee80211_tx_stats(struct net_device *dev, u32 len)
@@ -585,20 +591,27 @@ static ieee80211_tx_result debug_noinline
 ieee80211_tx_h_select_key(struct ieee80211_tx_data *tx)
 {
 	struct ieee80211_key *key;
-	struct ieee80211_tx_info *info =3D IEEE80211_SKB_CB(tx->skb);
-	struct ieee80211_hdr *hdr =3D (struct ieee80211_hdr *)tx->skb->data;
+	struct ieee80211_tx_info *info;
+	struct ieee80211_hdr *hdr;
+	struct sk_buff *skb =3D tx->skb;
+
+	if (!skb)
+		skb =3D skb_peek(&tx->skbs);
+
+	info =3D IEEE80211_SKB_CB(skb);
+	hdr =3D (struct ieee80211_hdr *)skb->data;
=20
 	if (unlikely(info->flags & IEEE80211_TX_INTFL_DONT_ENCRYPT))
 		tx->key =3D NULL;
 	else if (tx->sta &&
 		 (key =3D rcu_dereference(tx->sta->ptk[tx->sta->ptk_idx])))
 		tx->key =3D key;
-	else if (ieee80211_is_group_privacy_action(tx->skb) &&
+	else if (ieee80211_is_group_privacy_action(skb) &&
 		(key =3D rcu_dereference(tx->sdata->default_multicast_key)))
 		tx->key =3D key;
 	else if (ieee80211_is_mgmt(hdr->frame_control) &&
 		 is_multicast_ether_addr(hdr->addr1) &&
-		 ieee80211_is_robust_mgmt_frame(tx->skb) &&
+		 ieee80211_is_robust_mgmt_frame(skb) &&
 		 (key =3D rcu_dereference(tx->sdata->default_mgmt_key)))
 		tx->key =3D key;
 	else if (is_multicast_ether_addr(hdr->addr1) &&
@@ -628,8 +641,8 @@ ieee80211_tx_h_select_key(struct ieee80211_tx_data *t=
x)
 		case WLAN_CIPHER_SUITE_GCMP_256:
 			if (!ieee80211_is_data_present(hdr->frame_control) &&
 			    !ieee80211_use_mfp(hdr->frame_control, tx->sta,
-					       tx->skb) &&
-			    !ieee80211_is_group_privacy_action(tx->skb))
+					       skb) &&
+			    !ieee80211_is_group_privacy_action(skb))
 				tx->key =3D NULL;
 			else
 				skip_hw =3D (tx->key->conf.flags &
@@ -799,10 +812,12 @@ static __le16 ieee80211_tx_next_seq(struct sta_info=
 *sta, int tid)
 static ieee80211_tx_result debug_noinline
 ieee80211_tx_h_sequence(struct ieee80211_tx_data *tx)
 {
-	struct ieee80211_tx_info *info =3D IEEE80211_SKB_CB(tx->skb);
-	struct ieee80211_hdr *hdr =3D (struct ieee80211_hdr *)tx->skb->data;
+	struct sk_buff *skb =3D skb_peek(&tx->skbs);
+	struct ieee80211_tx_info *info =3D IEEE80211_SKB_CB(skb);
+	struct ieee80211_hdr *hdr =3D (struct ieee80211_hdr *)skb->data;
 	u8 *qc;
 	int tid;
+	u16 fragnum, seq;
=20
 	/*
 	 * Packet injection may want to control the sequence
@@ -829,10 +844,16 @@ ieee80211_tx_h_sequence(struct ieee80211_tx_data *t=
x)
 	 */
 	if (!ieee80211_is_data_qos(hdr->frame_control) ||
 	    is_multicast_ether_addr(hdr->addr1)) {
-		/* driver should assign sequence number */
-		info->flags |=3D IEEE80211_TX_CTL_ASSIGN_SEQ;
-		/* for pure STA mode without beacons, we can do it */
-		hdr->seq_ctrl =3D cpu_to_le16(tx->sdata->sequence_number);
+		fragnum =3D 0;
+		seq =3D cpu_to_le16(tx->sdata->sequence_number);
+		skb_queue_walk(&tx->skbs, skb) {
+			info =3D IEEE80211_SKB_CB(skb);
+			hdr =3D (struct ieee80211_hdr *)skb->data;
+			/* driver should assign sequence number */
+			info->flags |=3D IEEE80211_TX_CTL_ASSIGN_SEQ;
+			/* for pure STA mode without beacons, we can do it */
+			hdr->seq_ctrl =3D seq | fragnum++;
+		}
 		tx->sdata->sequence_number +=3D 0x10;
 		if (tx->sta)
 			tx->sta->tx_stats.msdu[IEEE80211_NUM_TIDS]++;
@@ -853,8 +874,14 @@ ieee80211_tx_h_sequence(struct ieee80211_tx_data *tx=
)
 	tid =3D *qc & IEEE80211_QOS_CTL_TID_MASK;
 	tx->sta->tx_stats.msdu[tid]++;
=20
-	if (!tx->sta->sta.txq[0])
-		hdr->seq_ctrl =3D ieee80211_tx_next_seq(tx->sta, tid);
+	if (!tx->sta->sta.txq[0]) {
+		seq =3D ieee80211_tx_next_seq(tx->sta, tid);
+		fragnum =3D 0;
+		skb_queue_walk(&tx->skbs, skb) {
+			hdr =3D (struct ieee80211_hdr *)skb->data;
+			hdr->seq_ctrl =3D seq | fragnum++;
+		}
+	}
=20
 	return TX_CONTINUE;
 }
@@ -927,7 +954,6 @@ ieee80211_tx_h_fragment(struct ieee80211_tx_data *tx)
 	struct ieee80211_hdr *hdr =3D (void *)skb->data;
 	int frag_threshold =3D tx->local->hw.wiphy->frag_threshold;
 	int hdrlen;
-	int fragnum;
=20
 	/* no matter what happens, tx->skb moves to tx->skbs */
 	__skb_queue_tail(&tx->skbs, skb);
@@ -964,9 +990,6 @@ ieee80211_tx_h_fragment(struct ieee80211_tx_data *tx)
 	if (ieee80211_fragment(tx, skb, hdrlen, frag_threshold))
 		return TX_DROP;
=20
-	/* update duration/seq/flags of fragments */
-	fragnum =3D 0;
-
 	skb_queue_walk(&tx->skbs, skb) {
 		const __le16 morefrags =3D cpu_to_le16(IEEE80211_FCTL_MOREFRAGS);
=20
@@ -987,8 +1010,6 @@ ieee80211_tx_h_fragment(struct ieee80211_tx_data *tx=
)
 		} else {
 			hdr->frame_control &=3D ~morefrags;
 		}
-		hdr->seq_ctrl |=3D cpu_to_le16(fragnum & IEEE80211_SCTL_FRAG);
-		fragnum++;
 	}
=20
 	return TX_CONTINUE;
@@ -1481,33 +1502,59 @@ struct sk_buff *ieee80211_tx_dequeue(struct ieee8=
0211_hw *hw,
 {
 	struct ieee80211_local *local =3D hw_to_local(hw);
 	struct txq_info *txqi =3D container_of(txq, struct txq_info, txq);
-	struct ieee80211_hdr *hdr;
 	struct sk_buff *skb =3D NULL;
 	struct fq *fq =3D &local->fq;
 	struct fq_tin *tin =3D &txqi->tin;
+	struct ieee80211_tx_info *info;
=20
 	spin_lock_bh(&fq->lock);
=20
 	if (test_bit(IEEE80211_TXQ_STOP, &txqi->flags))
 		goto out;
=20
+begin:
 	skb =3D fq_tin_dequeue(fq, tin, fq_tin_dequeue_func);
 	if (!skb)
 		goto out;
=20
 	ieee80211_set_skb_vif(skb, txqi);
=20
-	hdr =3D (struct ieee80211_hdr *)skb->data;
-	if (txq->sta && ieee80211_is_data_qos(hdr->frame_control)) {
+	info =3D IEEE80211_SKB_CB(skb);
+	if (txq->sta && info->control.flags & IEEE80211_TX_CTRL_FAST_XMIT) {
 		struct sta_info *sta =3D container_of(txq->sta, struct sta_info,
 						    sta);
-		struct ieee80211_tx_info *info =3D IEEE80211_SKB_CB(skb);
+		struct ieee80211_fast_tx *fast_tx;
=20
-		hdr->seq_ctrl =3D ieee80211_tx_next_seq(sta, txq->tid);
-		if (test_bit(IEEE80211_TXQ_AMPDU, &txqi->flags))
-			info->flags |=3D IEEE80211_TX_CTL_AMPDU;
-		else
-			info->flags &=3D ~IEEE80211_TX_CTL_AMPDU;
+		fast_tx =3D rcu_dereference(sta->fast_tx);
+		if (!fast_tx ||
+		    !ieee80211_xmit_fast_finish(sta->sdata, sta, fast_tx, skb,
+						false)) {
+			/* fast xmit was started, but fails to finish */
+			ieee80211_free_txskb(hw, skb);
+			goto begin;
+		}
+	} else {
+		struct ieee80211_tx_data tx =3D { };
+
+		__skb_queue_head_init(&tx.skbs);
+		tx.local =3D local;
+		if (txq->sta) {
+			struct sta_info *sta =3D container_of(txq->sta,
+							    struct sta_info,
+							    sta);
+			tx.sta =3D container_of(txq->sta, struct sta_info, sta);
+			tx.sdata =3D sta->sdata;
+		} else {
+			tx.sdata =3D container_of(info->control.vif,
+					struct ieee80211_sub_if_data, vif);
+		}
+
+		__skb_queue_tail(&tx.skbs, skb);
+
+		if (invoke_tx_handlers_late(&tx))
+			goto begin;
+
+		__skb_unlink(skb, &tx.skbs);
 	}
=20
 out:
@@ -1521,6 +1568,77 @@ out:
 }
 EXPORT_SYMBOL(ieee80211_tx_dequeue);
=20
+static bool ieee80211_queue_skb(struct ieee80211_local *local,
+				struct ieee80211_vif *vif,
+				struct ieee80211_sta *sta,
+				struct sk_buff *skb)
+{
+	struct ieee80211_tx_info *info =3D IEEE80211_SKB_CB(skb);
+	struct fq *fq =3D &local->fq;
+	struct txq_info *txqi =3D ieee80211_get_txq(local, vif, sta, skb);
+
+	if (!txqi)
+		return false;
+
+	info->control.vif =3D vif;
+
+	spin_lock_bh(&fq->lock);
+	ieee80211_txq_enqueue(local, txqi, skb);
+	spin_unlock_bh(&fq->lock);
+
+	drv_wake_tx_queue(local, txqi);
+
+	return true;
+}
+
+static bool ieee80211_queue_frags(struct ieee80211_local *local,
+				  struct sta_info *sta,
+				  struct sk_buff_head *skbs)
+{
+	struct txq_info *txqi;
+	struct sk_buff *skb, *tmp;
+	struct ieee80211_tx_info *info;
+	struct ieee80211_sub_if_data *sdata;
+	struct ieee80211_vif *vif;
+	struct ieee80211_sta *pubsta;
+
+	if (WARN_ON(skb_queue_empty(skbs)))
+		return true;
+
+	skb =3D skb_peek(skbs);
+	info =3D IEEE80211_SKB_CB(skb);
+	sdata =3D vif_to_sdata(info->control.vif);
+	if (sta && !sta->uploaded)
+		sta =3D NULL;
+
+	if (sta)
+		pubsta =3D &sta->sta;
+	else
+		pubsta =3D NULL;
+
+	switch (sdata->vif.type) {
+	case NL80211_IFTYPE_MONITOR:
+		return false;
+	case NL80211_IFTYPE_AP_VLAN:
+		sdata =3D container_of(sdata->bss,
+				     struct ieee80211_sub_if_data, u.ap);
+		/* fall through */
+	default:
+		vif =3D &sdata->vif;
+		break;
+	}
+
+	skb_queue_walk_safe(skbs, skb, tmp) {
+		txqi =3D ieee80211_get_txq(local, vif, pubsta, skb);
+		if (txqi) {
+			__skb_unlink(skb, skbs);
+			ieee80211_queue_skb(local, vif, pubsta, skb);
+		}
+	}
+
+	return !!skb_queue_empty(skbs);
+}
+
 static bool ieee80211_tx_frags(struct ieee80211_local *local,
 			       struct ieee80211_vif *vif,
 			       struct ieee80211_sta *sta,
@@ -1528,9 +1646,7 @@ static bool ieee80211_tx_frags(struct ieee80211_loc=
al *local,
 			       bool txpending)
 {
 	struct ieee80211_tx_control control =3D {};
-	struct fq *fq =3D &local->fq;
 	struct sk_buff *skb, *tmp;
-	struct txq_info *txqi;
 	unsigned long flags;
=20
 	skb_queue_walk_safe(skbs, skb, tmp) {
@@ -1545,21 +1661,6 @@ static bool ieee80211_tx_frags(struct ieee80211_lo=
cal *local,
 		}
 #endif
=20
-		txqi =3D ieee80211_get_txq(local, vif, sta, skb);
-		if (txqi) {
-			info->control.vif =3D vif;
-
-			__skb_unlink(skb, skbs);
-
-			spin_lock_bh(&fq->lock);
-			ieee80211_txq_enqueue(local, txqi, skb);
-			spin_unlock_bh(&fq->lock);
-
-			drv_wake_tx_queue(local, txqi);
-
-			continue;
-		}
-
 		spin_lock_irqsave(&local->queue_stop_reason_lock, flags);
 		if (local->queue_stop_reasons[q] ||
 		    (!txpending && !skb_queue_empty(&local->pending[q]))) {
@@ -1680,8 +1781,12 @@ static bool __ieee80211_tx(struct ieee80211_local =
*local,
 /*
  * Invoke TX handlers, return 0 on success and non-zero if the
  * frame was dropped or queued.
+ *
+ * The handlers are split into an early and late part. The latter is eve=
rything
+ * that can be sensitive to reordering, and will be deferred to after pa=
ckets
+ * are dequeued from the intermediate queues (when they are enabled).
  */
-static int invoke_tx_handlers(struct ieee80211_tx_data *tx)
+static int invoke_tx_handlers_early(struct ieee80211_tx_data *tx)
 {
 	struct ieee80211_tx_info *info =3D IEEE80211_SKB_CB(tx->skb);
 	ieee80211_tx_result res =3D TX_DROP;
@@ -1697,7 +1802,6 @@ static int invoke_tx_handlers(struct ieee80211_tx_d=
ata *tx)
 	CALL_TXH(ieee80211_tx_h_check_assoc);
 	CALL_TXH(ieee80211_tx_h_ps_buf);
 	CALL_TXH(ieee80211_tx_h_check_control_port_protocol);
-	CALL_TXH(ieee80211_tx_h_select_key);
 	if (!ieee80211_hw_check(&tx->local->hw, HAS_RATE_CONTROL))
 		CALL_TXH(ieee80211_tx_h_rate_ctrl);
=20
@@ -1706,11 +1810,32 @@ static int invoke_tx_handlers(struct ieee80211_tx=
_data *tx)
 		tx->skb =3D NULL;
 		goto txh_done;
 	}
+	CALL_TXH(ieee80211_tx_h_fragment);
+
+ txh_done:
+	if (unlikely(res =3D=3D TX_DROP)) {
+		I802_DEBUG_INC(tx->local->tx_handlers_drop);
+		if (tx->skb)
+			ieee80211_free_txskb(&tx->local->hw, tx->skb);
+		else
+			ieee80211_purge_tx_queue(&tx->local->hw, &tx->skbs);
+		return -1;
+	} else if (unlikely(res =3D=3D TX_QUEUED)) {
+		I802_DEBUG_INC(tx->local->tx_handlers_queued);
+		return -1;
+	}
=20
+	return 0;
+}
+
+static int invoke_tx_handlers_late(struct ieee80211_tx_data *tx)
+{
+	ieee80211_tx_result res =3D TX_DROP;
+
+	/* late tx handlers must be aware of tx info fragmentation! */
+	CALL_TXH(ieee80211_tx_h_select_key);
 	CALL_TXH(ieee80211_tx_h_michael_mic_add);
 	CALL_TXH(ieee80211_tx_h_sequence);
-	CALL_TXH(ieee80211_tx_h_fragment);
-	/* handlers after fragment must be aware of tx info fragmentation! */
 	CALL_TXH(ieee80211_tx_h_stats);
 	CALL_TXH(ieee80211_tx_h_encrypt);
 	if (!ieee80211_hw_check(&tx->local->hw, HAS_RATE_CONTROL))
@@ -1733,6 +1858,11 @@ static int invoke_tx_handlers(struct ieee80211_tx_=
data *tx)
 	return 0;
 }
=20
+static int invoke_tx_handlers(struct ieee80211_tx_data *tx)
+{
+	return invoke_tx_handlers_early(tx) || invoke_tx_handlers_late(tx);
+}
+
 bool ieee80211_tx_prepare_skb(struct ieee80211_hw *hw,
 			      struct ieee80211_vif *vif, struct sk_buff *skb,
 			      int band, struct ieee80211_sta **sta)
@@ -1782,7 +1912,7 @@ static bool ieee80211_tx(struct ieee80211_sub_if_da=
ta *sdata,
 	struct ieee80211_tx_data tx;
 	ieee80211_tx_result res_prepare;
 	struct ieee80211_tx_info *info =3D IEEE80211_SKB_CB(skb);
-	bool result =3D true;
+	bool result =3D true, queue =3D !!(local->ops->wake_tx_queue);
 	int led_len;
=20
 	if (unlikely(skb->len < 10)) {
@@ -1807,7 +1937,13 @@ static bool ieee80211_tx(struct ieee80211_sub_if_d=
ata *sdata,
 		info->hw_queue =3D
 			sdata->vif.hw_queue[skb_get_queue_mapping(skb)];
=20
-	if (!invoke_tx_handlers(&tx))
+	if (invoke_tx_handlers_early(&tx))
+		return false;
+
+	if (queue && ieee80211_queue_frags(local, tx.sta, &tx.skbs))
+		return true;
+
+	if (!invoke_tx_handlers_late(&tx))
 		result =3D __ieee80211_tx(local, &tx.skbs, led_len,
 					tx.sta, txpending);
=20
@@ -3170,10 +3306,9 @@ static bool ieee80211_xmit_fast(struct ieee80211_s=
ub_if_data *sdata,
 	struct ethhdr eth;
 	struct ieee80211_tx_info *info =3D IEEE80211_SKB_CB(skb);
 	struct ieee80211_hdr *hdr =3D (void *)fast_tx->hdr;
-	struct ieee80211_tx_data tx;
-	ieee80211_tx_result r;
 	struct tid_ampdu_tx *tid_tx =3D NULL;
 	u8 tid =3D IEEE80211_NUM_TIDS;
+	bool queue =3D !!(local->ops->wake_tx_queue);
=20
 	/* control port protocol needs a lot of special handling */
 	if (cpu_to_be16(ethertype) =3D=3D sdata->control_port_protocol)
@@ -3240,8 +3375,32 @@ static bool ieee80211_xmit_fast(struct ieee80211_s=
ub_if_data *sdata,
 	info->flags =3D IEEE80211_TX_CTL_FIRST_FRAGMENT |
 		      IEEE80211_TX_CTL_DONTFRAG |
 		      (tid_tx ? IEEE80211_TX_CTL_AMPDU : 0);
+	info->control.flags =3D IEEE80211_TX_CTRL_FAST_XMIT;
+
+	if (sdata->vif.type =3D=3D NL80211_IFTYPE_AP_VLAN)
+		sdata =3D container_of(sdata->bss,
+				     struct ieee80211_sub_if_data, u.ap);
+
+	if (queue && ieee80211_queue_skb(local, &sdata->vif, &sta->sta, skb))
+		return true;
+
+	return ieee80211_xmit_fast_finish(sdata, sta, fast_tx, skb, true);
+}
+
+static bool ieee80211_xmit_fast_finish(struct ieee80211_sub_if_data *sda=
ta,
+				       struct sta_info *sta,
+				       struct ieee80211_fast_tx *fast_tx,
+				       struct sk_buff *skb, bool xmit)
+{
+	struct ieee80211_local *local =3D sdata->local;
+	struct ieee80211_tx_info *info =3D IEEE80211_SKB_CB(skb);
+	struct ieee80211_hdr *hdr =3D (void *)fast_tx->hdr;
+	struct ieee80211_tx_data tx;
+	ieee80211_tx_result r;
+	u8 tid =3D IEEE80211_NUM_TIDS;
=20
 	if (hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_QOS_DATA)) {
+		tid =3D skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK;
 		*ieee80211_get_qos_ctl(hdr) =3D tid;
 		if (!sta->sta.txq[0])
 			hdr->seq_ctrl =3D ieee80211_tx_next_seq(sta, tid);
@@ -3309,12 +3468,11 @@ static bool ieee80211_xmit_fast(struct ieee80211_=
sub_if_data *sdata,
 		}
 	}
=20
-	if (sdata->vif.type =3D=3D NL80211_IFTYPE_AP_VLAN)
-		sdata =3D container_of(sdata->bss,
-				     struct ieee80211_sub_if_data, u.ap);
+	if (xmit) {
+		__skb_queue_tail(&tx.skbs, skb);
+		ieee80211_tx_frags(local, &sdata->vif, &sta->sta, &tx.skbs, false);
+	}
=20
-	__skb_queue_tail(&tx.skbs, skb);
-	ieee80211_tx_frags(local, &sdata->vif, &sta->sta, &tx.skbs, false);
 	return true;
 }
=20
diff --git a/net/mac80211/wpa.c b/net/mac80211/wpa.c
index b48c1e1..71c479a 100644
--- a/net/mac80211/wpa.c
+++ b/net/mac80211/wpa.c
@@ -28,13 +28,13 @@
 #include "wpa.h"
=20
 ieee80211_tx_result
-ieee80211_tx_h_michael_mic_add(struct ieee80211_tx_data *tx)
+ieee80211_tx_h_michael_mic_add_skb(struct ieee80211_tx_data *tx,
+				   struct sk_buff *skb)
 {
 	u8 *data, *key, *mic;
 	size_t data_len;
 	unsigned int hdrlen;
 	struct ieee80211_hdr *hdr;
-	struct sk_buff *skb =3D tx->skb;
 	struct ieee80211_tx_info *info =3D IEEE80211_SKB_CB(skb);
 	int tail;
=20
@@ -83,6 +83,20 @@ ieee80211_tx_h_michael_mic_add(struct ieee80211_tx_dat=
a *tx)
 	return TX_CONTINUE;
 }
=20
+ieee80211_tx_result
+ieee80211_tx_h_michael_mic_add(struct ieee80211_tx_data *tx)
+{
+	struct sk_buff *skb;
+	ieee80211_tx_result r;
+
+	skb_queue_walk(&tx->skbs, skb) {
+		r =3D ieee80211_tx_h_michael_mic_add_skb(tx, skb);
+		if (r !=3D TX_CONTINUE)
+			return r;
+	}
+	return TX_CONTINUE;
+}
+
=20
 ieee80211_rx_result
 ieee80211_rx_h_michael_mic_verify(struct ieee80211_rx_data *rx)
--=20
2.9.3

^ permalink raw reply related	[flat|nested] 51+ messages in thread

* Re: [Make-wifi-fast] [PATCH v2] mac80211: Move crypto IV generation to after TXQ dequeue.
  2016-08-22 14:47         ` Toke Høiland-Jørgensen
@ 2016-08-26  8:38           ` Johannes Berg
  2016-08-26  8:54             ` Toke Høiland-Jørgensen
  0 siblings, 1 reply; 51+ messages in thread
From: Johannes Berg @ 2016-08-26  8:38 UTC (permalink / raw)
  To: Toke Høiland-Jørgensen
  Cc: Dave Taht, make-wifi-fast, linux-wireless, Felix Fietkau

On Mon, 2016-08-22 at 16:47 +0200, Toke Høiland-Jørgensen wrote:
> 
> I suppose that could be a way to do it (i.e. have
> ieee80211_tx_dequeue call all the TX hooks etc), but am not sure
> whether there would be problems doing all this work in the loop
> that's building aggregates (which is what would happen for ath9k at
> least).

I don't know, but it seems that it's worth trying.

> An alternative could be to split the process up in two: An "early"
> and "late" stage, where the early stage does everything that is not
> sensitive to reordering and the occasional drop, and the late stage
> is everything that is. Then the queueing step could happen in-between 
> the two stages, and the non-queueing path could just call both stages
> at once. In effect, this would just make the current work-arounds be
> more explicit in the structure, rather than being marked as
> exceptions.

I'm not sure that works the way you think it does.

What you did works for fast-xmit, but *only* because that doesn't do
software crypto. If, for some reason, the TXQ stuff combines with
software crypto, which doesn't seem impossible (ath9k even has a module
parameter, iirc), then you have no way for this to work.

> > Now, it's unlikely to be that simple - fragmentation, for example,
> > might mess this up.
> > 
> > Overall though, I'm definitely wondering if it should be this way,
> > since all the special cases just add complexity.
> 
> I agree that the work-arounds are iffy, but I do also think it's
> important to keep in mind that we are improving latency by orders of
> magnitude here. A few special cases is worth it to achieve that, IMO.
> And then iterating towards a design that doesn't need them, of course
> :)

I don't really agree, I'm not going to treat this unlike any other
feature, which gets merged when it's ready for that.

Right now, your code here obviously isn't, since it doesn't even
address the cases that ath9k could run into, so either ath9k shouldn't
use this mac80211 feature, or the mac80211 feature needs to be fixed
before ath9k can use it.

I have no problems with documenting that the TXQ stuff can only be used
with full hardware crypto, but then we should add some checks and
warnings in mac80211 to ensure that, i.e. not allow software keys when
TXQ stuff is used, nor allow keys with mac80211 PN assignment, etc.

Even QoS-seqno assignment will be broken btw, so you do need a bunch
more offloads to make this work.

johannes

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [Make-wifi-fast] [PATCH v2] mac80211: Move crypto IV generation to after TXQ dequeue.
  2016-08-26  8:38           ` Johannes Berg
@ 2016-08-26  8:54             ` Toke Høiland-Jørgensen
  0 siblings, 0 replies; 51+ messages in thread
From: Toke Høiland-Jørgensen @ 2016-08-26  8:54 UTC (permalink / raw)
  To: Johannes Berg; +Cc: Dave Taht, make-wifi-fast, linux-wireless, Felix Fietkau

Johannes Berg <johannes@sipsolutions.net> writes:

> On Mon, 2016-08-22 at 16:47 +0200, Toke Høiland-Jørgensen wrote:
>>
>> I suppose that could be a way to do it (i.e. have
>> ieee80211_tx_dequeue call all the TX hooks etc), but am not sure
>> whether there would be problems doing all this work in the loop
>> that's building aggregates (which is what would happen for ath9k at
>> least).
>
> I don't know, but it seems that it's worth trying.
>
>> An alternative could be to split the process up in two: An "early"
>> and "late" stage, where the early stage does everything that is not
>> sensitive to reordering and the occasional drop, and the late stage
>> is everything that is. Then the queueing step could happen in-between
>> the two stages, and the non-queueing path could just call both stages
>> at once. In effect, this would just make the current work-arounds be
>> more explicit in the structure, rather than being marked as
>> exceptions.
>
> I'm not sure that works the way you think it does.
>
> What you did works for fast-xmit, but *only* because that doesn't do
> software crypto. If, for some reason, the TXQ stuff combines with
> software crypto, which doesn't seem impossible (ath9k even has a module
> parameter, iirc), then you have no way for this to work.

Yeah, I realised that when I started reviewing the slow path (sorry for
not realising that straight away). The v3 takes the "split handlers"
approach for this reason. That saved having to deal with fragmentation
on TXQ dequeue, and it means that some of the processing can be done
before queueing (such as GSO splitting; having packets be as small as
possible before applying FQ to them is a good thing if we want to
realise the full potential).

It seems there are still some bugs to work out with that patch, but I'd
be grateful if you could glance at it and comment on whether you think
this is a viable way forward (provided we can work out all the bugs, of
course).

>> > Now, it's unlikely to be that simple - fragmentation, for example,
>> > might mess this up.
>> >
>> > Overall though, I'm definitely wondering if it should be this way,
>> > since all the special cases just add complexity.
>>=20
>> I agree that the work-arounds are iffy, but I do also think it's
>> important to keep in mind that we are improving latency by orders of
>> magnitude here. A few special cases is worth it to achieve that, IMO.
>> And then iterating towards a design that doesn't need them, of course
>> :)
>
> I don't really agree, I'm not going to treat this unlike any other
> feature, which gets merged when it's ready for that.
>
> Right now, your code here obviously isn't, since it doesn't even
> address the cases that ath9k could run into, so either ath9k shouldn't
> use this mac80211 feature, or the mac80211 feature needs to be fixed
> before ath9k can use it.

Yeah, I agree now that I've looked at it some more :)

> I have no problems with documenting that the TXQ stuff can only be
> used with full hardware crypto, but then we should add some checks and
> warnings in mac80211 to ensure that, i.e. not allow software keys when
> TXQ stuff is used, nor allow keys with mac80211 PN assignment, etc.

I'd much rather fix things so it works in all cases. My patch to ath9k
to use this stuff completely removes the old TX path, and things like
the airtime fairness scheduler need the intermediate queues to work.

-Toke

^ permalink raw reply	[flat|nested] 51+ messages in thread

* [PATCH v4] mac80211: Move reorder-sensitive TX handlers to after TXQ dequeue.
  2016-08-24 16:20   ` [PATCH v3] mac80211: Move reorder-sensitive TX handlers " Toke Høiland-Jørgensen
@ 2016-08-30 13:15     ` Toke Høiland-Jørgensen
  2016-08-31 21:06       ` Johannes Berg
  2016-09-01 16:03       ` [PATCH v5] " Toke Høiland-Jørgensen
  0 siblings, 2 replies; 51+ messages in thread
From: Toke Høiland-Jørgensen @ 2016-08-30 13:15 UTC (permalink / raw)
  To: make-wifi-fast, linux-wireless; +Cc: Toke Høiland-Jørgensen

The TXQ intermediate queues can cause packet reordering when more than
one flow is active to a single station. Since some of the wifi-specific
packet handling (notably sequence number and encryption handling) is
sensitive to re-ordering, things break if they are applied before the
TXQ.

This splits up the TX handlers and fast_xmit logic into two parts: An
early part and a late part. The former is applied before TXQ enqueue,
and the latter after dequeue. The non-TXQ path just applies both parts
at once.

To avoid having to deal with fragmentation on dequeue, the split is set
to be after the fragmentation handler. This means that some reordering
of TX handlers is necessary, and some handlers had to be made aware of
fragmentation due to this reordering.

This approach avoids having to scatter special cases for when TXQ is
enabled, at the cost of making the fast_xmit and TX handler code
slightly more complex.

Signed-off-by: Toke Høiland-Jørgensen <toke@toke.dk>
---
Changes since v3:
  - Fix sequence number assignment in the fast path.
  - Code cleanup.

 include/net/mac80211.h |   2 +
 net/mac80211/tx.c      | 269 ++++++++++++++++++++++++++++++++++++++-----=
------
 net/mac80211/wpa.c     |  18 +++-
 3 files changed, 227 insertions(+), 62 deletions(-)

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index cca510a..9a6a3e9 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -715,6 +715,7 @@ enum mac80211_tx_info_flags {
  *	frame (PS-Poll or uAPSD).
  * @IEEE80211_TX_CTRL_RATE_INJECT: This frame is injected with rate info=
rmation
  * @IEEE80211_TX_CTRL_AMSDU: This frame is an A-MSDU frame
+ * @IEEE80211_TX_CTRL_FAST_XMIT: This frame is going through the fast_xm=
it path
  *
  * These flags are used in tx_info->control.flags.
  */
@@ -723,6 +724,7 @@ enum mac80211_tx_control_flags {
 	IEEE80211_TX_CTRL_PS_RESPONSE		=3D BIT(1),
 	IEEE80211_TX_CTRL_RATE_INJECT		=3D BIT(2),
 	IEEE80211_TX_CTRL_AMSDU			=3D BIT(3),
+	IEEE80211_TX_CTRL_FAST_XMIT		=3D BIT(4),
 };
=20
 /*
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 1d0746d..56dca2d 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -38,6 +38,12 @@
 #include "wme.h"
 #include "rate.h"
=20
+static int invoke_tx_handlers_late(struct ieee80211_tx_data *tx);
+static bool ieee80211_xmit_fast_finish(struct ieee80211_sub_if_data *sda=
ta,
+				       struct sta_info *sta,
+				       struct ieee80211_fast_tx *fast_tx,
+				       struct sk_buff *skb, bool xmit);
+
 /* misc utils */
=20
 static inline void ieee80211_tx_stats(struct net_device *dev, u32 len)
@@ -585,20 +591,27 @@ static ieee80211_tx_result debug_noinline
 ieee80211_tx_h_select_key(struct ieee80211_tx_data *tx)
 {
 	struct ieee80211_key *key;
-	struct ieee80211_tx_info *info =3D IEEE80211_SKB_CB(tx->skb);
-	struct ieee80211_hdr *hdr =3D (struct ieee80211_hdr *)tx->skb->data;
+	struct ieee80211_tx_info *info;
+	struct ieee80211_hdr *hdr;
+	struct sk_buff *skb =3D tx->skb;
+
+	if (!skb)
+		skb =3D skb_peek(&tx->skbs);
+
+	info =3D IEEE80211_SKB_CB(skb);
+	hdr =3D (struct ieee80211_hdr *)skb->data;
=20
 	if (unlikely(info->flags & IEEE80211_TX_INTFL_DONT_ENCRYPT))
 		tx->key =3D NULL;
 	else if (tx->sta &&
 		 (key =3D rcu_dereference(tx->sta->ptk[tx->sta->ptk_idx])))
 		tx->key =3D key;
-	else if (ieee80211_is_group_privacy_action(tx->skb) &&
+	else if (ieee80211_is_group_privacy_action(skb) &&
 		(key =3D rcu_dereference(tx->sdata->default_multicast_key)))
 		tx->key =3D key;
 	else if (ieee80211_is_mgmt(hdr->frame_control) &&
 		 is_multicast_ether_addr(hdr->addr1) &&
-		 ieee80211_is_robust_mgmt_frame(tx->skb) &&
+		 ieee80211_is_robust_mgmt_frame(skb) &&
 		 (key =3D rcu_dereference(tx->sdata->default_mgmt_key)))
 		tx->key =3D key;
 	else if (is_multicast_ether_addr(hdr->addr1) &&
@@ -628,8 +641,8 @@ ieee80211_tx_h_select_key(struct ieee80211_tx_data *t=
x)
 		case WLAN_CIPHER_SUITE_GCMP_256:
 			if (!ieee80211_is_data_present(hdr->frame_control) &&
 			    !ieee80211_use_mfp(hdr->frame_control, tx->sta,
-					       tx->skb) &&
-			    !ieee80211_is_group_privacy_action(tx->skb))
+					       skb) &&
+			    !ieee80211_is_group_privacy_action(skb))
 				tx->key =3D NULL;
 			else
 				skip_hw =3D (tx->key->conf.flags &
@@ -799,10 +812,12 @@ static __le16 ieee80211_tx_next_seq(struct sta_info=
 *sta, int tid)
 static ieee80211_tx_result debug_noinline
 ieee80211_tx_h_sequence(struct ieee80211_tx_data *tx)
 {
-	struct ieee80211_tx_info *info =3D IEEE80211_SKB_CB(tx->skb);
-	struct ieee80211_hdr *hdr =3D (struct ieee80211_hdr *)tx->skb->data;
+	struct sk_buff *skb =3D skb_peek(&tx->skbs);
+	struct ieee80211_tx_info *info =3D IEEE80211_SKB_CB(skb);
+	struct ieee80211_hdr *hdr =3D (struct ieee80211_hdr *)skb->data;
 	u8 *qc;
 	int tid;
+	u16 fragnum, seq;
=20
 	/*
 	 * Packet injection may want to control the sequence
@@ -829,10 +844,16 @@ ieee80211_tx_h_sequence(struct ieee80211_tx_data *t=
x)
 	 */
 	if (!ieee80211_is_data_qos(hdr->frame_control) ||
 	    is_multicast_ether_addr(hdr->addr1)) {
-		/* driver should assign sequence number */
-		info->flags |=3D IEEE80211_TX_CTL_ASSIGN_SEQ;
-		/* for pure STA mode without beacons, we can do it */
-		hdr->seq_ctrl =3D cpu_to_le16(tx->sdata->sequence_number);
+		fragnum =3D 0;
+		seq =3D cpu_to_le16(tx->sdata->sequence_number);
+		skb_queue_walk(&tx->skbs, skb) {
+			info =3D IEEE80211_SKB_CB(skb);
+			hdr =3D (struct ieee80211_hdr *)skb->data;
+			/* driver should assign sequence number */
+			info->flags |=3D IEEE80211_TX_CTL_ASSIGN_SEQ;
+			/* for pure STA mode without beacons, we can do it */
+			hdr->seq_ctrl =3D seq | fragnum++;
+		}
 		tx->sdata->sequence_number +=3D 0x10;
 		if (tx->sta)
 			tx->sta->tx_stats.msdu[IEEE80211_NUM_TIDS]++;
@@ -853,8 +874,14 @@ ieee80211_tx_h_sequence(struct ieee80211_tx_data *tx=
)
 	tid =3D *qc & IEEE80211_QOS_CTL_TID_MASK;
 	tx->sta->tx_stats.msdu[tid]++;
=20
-	if (!tx->sta->sta.txq[0])
-		hdr->seq_ctrl =3D ieee80211_tx_next_seq(tx->sta, tid);
+	if (!tx->sta->sta.txq[0]) {
+		seq =3D ieee80211_tx_next_seq(tx->sta, tid);
+		fragnum =3D 0;
+		skb_queue_walk(&tx->skbs, skb) {
+			hdr =3D (struct ieee80211_hdr *)skb->data;
+			hdr->seq_ctrl =3D seq | fragnum++;
+		}
+	}
=20
 	return TX_CONTINUE;
 }
@@ -927,7 +954,6 @@ ieee80211_tx_h_fragment(struct ieee80211_tx_data *tx)
 	struct ieee80211_hdr *hdr =3D (void *)skb->data;
 	int frag_threshold =3D tx->local->hw.wiphy->frag_threshold;
 	int hdrlen;
-	int fragnum;
=20
 	/* no matter what happens, tx->skb moves to tx->skbs */
 	__skb_queue_tail(&tx->skbs, skb);
@@ -964,9 +990,6 @@ ieee80211_tx_h_fragment(struct ieee80211_tx_data *tx)
 	if (ieee80211_fragment(tx, skb, hdrlen, frag_threshold))
 		return TX_DROP;
=20
-	/* update duration/seq/flags of fragments */
-	fragnum =3D 0;
-
 	skb_queue_walk(&tx->skbs, skb) {
 		const __le16 morefrags =3D cpu_to_le16(IEEE80211_FCTL_MOREFRAGS);
=20
@@ -987,8 +1010,6 @@ ieee80211_tx_h_fragment(struct ieee80211_tx_data *tx=
)
 		} else {
 			hdr->frame_control &=3D ~morefrags;
 		}
-		hdr->seq_ctrl |=3D cpu_to_le16(fragnum & IEEE80211_SCTL_FRAG);
-		fragnum++;
 	}
=20
 	return TX_CONTINUE;
@@ -1481,33 +1502,58 @@ struct sk_buff *ieee80211_tx_dequeue(struct ieee8=
0211_hw *hw,
 {
 	struct ieee80211_local *local =3D hw_to_local(hw);
 	struct txq_info *txqi =3D container_of(txq, struct txq_info, txq);
-	struct ieee80211_hdr *hdr;
 	struct sk_buff *skb =3D NULL;
 	struct fq *fq =3D &local->fq;
 	struct fq_tin *tin =3D &txqi->tin;
+	struct ieee80211_tx_info *info;
=20
 	spin_lock_bh(&fq->lock);
=20
 	if (test_bit(IEEE80211_TXQ_STOP, &txqi->flags))
 		goto out;
=20
+begin:
 	skb =3D fq_tin_dequeue(fq, tin, fq_tin_dequeue_func);
 	if (!skb)
 		goto out;
=20
 	ieee80211_set_skb_vif(skb, txqi);
=20
-	hdr =3D (struct ieee80211_hdr *)skb->data;
-	if (txq->sta && ieee80211_is_data_qos(hdr->frame_control)) {
+	info =3D IEEE80211_SKB_CB(skb);
+	if (txq->sta && info->control.flags & IEEE80211_TX_CTRL_FAST_XMIT) {
 		struct sta_info *sta =3D container_of(txq->sta, struct sta_info,
 						    sta);
-		struct ieee80211_tx_info *info =3D IEEE80211_SKB_CB(skb);
+		struct ieee80211_fast_tx *fast_tx;
=20
-		hdr->seq_ctrl =3D ieee80211_tx_next_seq(sta, txq->tid);
-		if (test_bit(IEEE80211_TXQ_AMPDU, &txqi->flags))
-			info->flags |=3D IEEE80211_TX_CTL_AMPDU;
-		else
-			info->flags &=3D ~IEEE80211_TX_CTL_AMPDU;
+		fast_tx =3D rcu_dereference(sta->fast_tx);
+		if (!fast_tx ||
+		    !ieee80211_xmit_fast_finish(sta->sdata, sta, fast_tx, skb,
+						false)) {
+			/* fast xmit was started, but fails to finish */
+			ieee80211_free_txskb(hw, skb);
+			goto begin;
+		}
+	} else {
+		struct ieee80211_tx_data tx =3D { };
+
+		__skb_queue_head_init(&tx.skbs);
+		tx.local =3D local;
+		if (txq->sta) {
+			struct sta_info *sta =3D container_of(txq->sta,
+							    struct sta_info,
+							    sta);
+			tx.sta =3D container_of(txq->sta, struct sta_info, sta);
+			tx.sdata =3D sta->sdata;
+		} else {
+			tx.sdata =3D vif_to_sdata(info->control.vif);
+		}
+
+		__skb_queue_tail(&tx.skbs, skb);
+
+		if (invoke_tx_handlers_late(&tx))
+			goto begin;
+
+		__skb_unlink(skb, &tx.skbs);
 	}
=20
 out:
@@ -1521,6 +1567,71 @@ out:
 }
 EXPORT_SYMBOL(ieee80211_tx_dequeue);
=20
+static bool ieee80211_queue_skb(struct ieee80211_local *local,
+				struct ieee80211_sub_if_data *sdata,
+				struct ieee80211_sta *sta,
+				struct sk_buff *skb)
+{
+	struct ieee80211_tx_info *info =3D IEEE80211_SKB_CB(skb);
+	struct fq *fq =3D &local->fq;
+	struct ieee80211_vif *vif;
+	struct txq_info *txqi;
+
+	if (!local->ops->wake_tx_queue)
+		return false;
+
+	if (sdata->vif.type =3D=3D NL80211_IFTYPE_AP_VLAN)
+		sdata =3D container_of(sdata->bss,
+				struct ieee80211_sub_if_data, u.ap);
+
+	vif =3D &sdata->vif;
+	txqi =3D ieee80211_get_txq(local, vif, sta, skb);
+
+	if (!txqi)
+		return false;
+
+	info->control.vif =3D vif;
+
+	spin_lock_bh(&fq->lock);
+	ieee80211_txq_enqueue(local, txqi, skb);
+	spin_unlock_bh(&fq->lock);
+
+	drv_wake_tx_queue(local, txqi);
+
+	return true;
+}
+
+static bool ieee80211_queue_frags(struct ieee80211_local *local,
+				  struct ieee80211_sub_if_data *sdata,
+				  struct sta_info *sta,
+				  struct sk_buff_head *skbs)
+{
+	struct sk_buff *skb;
+	struct ieee80211_sta *pubsta;
+
+	if (WARN_ON(skb_queue_empty(skbs)))
+		return true;
+
+	if (!local->ops->wake_tx_queue ||
+	    sdata->vif.type =3D=3D NL80211_IFTYPE_MONITOR)
+		return false;
+
+	if (sta && sta->uploaded)
+		pubsta =3D &sta->sta;
+	else
+		pubsta =3D NULL;
+
+	while (!skb_queue_empty(skbs)) {
+		skb =3D __skb_dequeue(skbs);
+		if (unlikely(!ieee80211_queue_skb(local, sdata, pubsta, skb))) {
+			__skb_queue_head(skbs, skb);
+			return false;
+		}
+	}
+
+	return true;
+}
+
 static bool ieee80211_tx_frags(struct ieee80211_local *local,
 			       struct ieee80211_vif *vif,
 			       struct ieee80211_sta *sta,
@@ -1528,9 +1639,7 @@ static bool ieee80211_tx_frags(struct ieee80211_loc=
al *local,
 			       bool txpending)
 {
 	struct ieee80211_tx_control control =3D {};
-	struct fq *fq =3D &local->fq;
 	struct sk_buff *skb, *tmp;
-	struct txq_info *txqi;
 	unsigned long flags;
=20
 	skb_queue_walk_safe(skbs, skb, tmp) {
@@ -1545,21 +1654,6 @@ static bool ieee80211_tx_frags(struct ieee80211_lo=
cal *local,
 		}
 #endif
=20
-		txqi =3D ieee80211_get_txq(local, vif, sta, skb);
-		if (txqi) {
-			info->control.vif =3D vif;
-
-			__skb_unlink(skb, skbs);
-
-			spin_lock_bh(&fq->lock);
-			ieee80211_txq_enqueue(local, txqi, skb);
-			spin_unlock_bh(&fq->lock);
-
-			drv_wake_tx_queue(local, txqi);
-
-			continue;
-		}
-
 		spin_lock_irqsave(&local->queue_stop_reason_lock, flags);
 		if (local->queue_stop_reasons[q] ||
 		    (!txpending && !skb_queue_empty(&local->pending[q]))) {
@@ -1680,8 +1774,12 @@ static bool __ieee80211_tx(struct ieee80211_local =
*local,
 /*
  * Invoke TX handlers, return 0 on success and non-zero if the
  * frame was dropped or queued.
+ *
+ * The handlers are split into an early and late part. The latter is eve=
rything
+ * that can be sensitive to reordering, and will be deferred to after pa=
ckets
+ * are dequeued from the intermediate queues (when they are enabled).
  */
-static int invoke_tx_handlers(struct ieee80211_tx_data *tx)
+static int invoke_tx_handlers_early(struct ieee80211_tx_data *tx)
 {
 	struct ieee80211_tx_info *info =3D IEEE80211_SKB_CB(tx->skb);
 	ieee80211_tx_result res =3D TX_DROP;
@@ -1697,7 +1795,6 @@ static int invoke_tx_handlers(struct ieee80211_tx_d=
ata *tx)
 	CALL_TXH(ieee80211_tx_h_check_assoc);
 	CALL_TXH(ieee80211_tx_h_ps_buf);
 	CALL_TXH(ieee80211_tx_h_check_control_port_protocol);
-	CALL_TXH(ieee80211_tx_h_select_key);
 	if (!ieee80211_hw_check(&tx->local->hw, HAS_RATE_CONTROL))
 		CALL_TXH(ieee80211_tx_h_rate_ctrl);
=20
@@ -1706,11 +1803,32 @@ static int invoke_tx_handlers(struct ieee80211_tx=
_data *tx)
 		tx->skb =3D NULL;
 		goto txh_done;
 	}
+	CALL_TXH(ieee80211_tx_h_fragment);
+
+ txh_done:
+	if (unlikely(res =3D=3D TX_DROP)) {
+		I802_DEBUG_INC(tx->local->tx_handlers_drop);
+		if (tx->skb)
+			ieee80211_free_txskb(&tx->local->hw, tx->skb);
+		else
+			ieee80211_purge_tx_queue(&tx->local->hw, &tx->skbs);
+		return -1;
+	} else if (unlikely(res =3D=3D TX_QUEUED)) {
+		I802_DEBUG_INC(tx->local->tx_handlers_queued);
+		return -1;
+	}
=20
+	return 0;
+}
+
+static int invoke_tx_handlers_late(struct ieee80211_tx_data *tx)
+{
+	ieee80211_tx_result res =3D TX_DROP;
+
+	/* late tx handlers must be aware of tx info fragmentation! */
+	CALL_TXH(ieee80211_tx_h_select_key);
 	CALL_TXH(ieee80211_tx_h_michael_mic_add);
 	CALL_TXH(ieee80211_tx_h_sequence);
-	CALL_TXH(ieee80211_tx_h_fragment);
-	/* handlers after fragment must be aware of tx info fragmentation! */
 	CALL_TXH(ieee80211_tx_h_stats);
 	CALL_TXH(ieee80211_tx_h_encrypt);
 	if (!ieee80211_hw_check(&tx->local->hw, HAS_RATE_CONTROL))
@@ -1733,6 +1851,11 @@ static int invoke_tx_handlers(struct ieee80211_tx_=
data *tx)
 	return 0;
 }
=20
+static int invoke_tx_handlers(struct ieee80211_tx_data *tx)
+{
+	return invoke_tx_handlers_early(tx) || invoke_tx_handlers_late(tx);
+}
+
 bool ieee80211_tx_prepare_skb(struct ieee80211_hw *hw,
 			      struct ieee80211_vif *vif, struct sk_buff *skb,
 			      int band, struct ieee80211_sta **sta)
@@ -1807,7 +1930,13 @@ static bool ieee80211_tx(struct ieee80211_sub_if_d=
ata *sdata,
 		info->hw_queue =3D
 			sdata->vif.hw_queue[skb_get_queue_mapping(skb)];
=20
-	if (!invoke_tx_handlers(&tx))
+	if (invoke_tx_handlers_early(&tx))
+		return false;
+
+	if (ieee80211_queue_frags(local, sdata, tx.sta, &tx.skbs))
+		return true;
+
+	if (!invoke_tx_handlers_late(&tx))
 		result =3D __ieee80211_tx(local, &tx.skbs, led_len,
 					tx.sta, txpending);
=20
@@ -3170,8 +3299,6 @@ static bool ieee80211_xmit_fast(struct ieee80211_su=
b_if_data *sdata,
 	struct ethhdr eth;
 	struct ieee80211_tx_info *info =3D IEEE80211_SKB_CB(skb);
 	struct ieee80211_hdr *hdr =3D (void *)fast_tx->hdr;
-	struct ieee80211_tx_data tx;
-	ieee80211_tx_result r;
 	struct tid_ampdu_tx *tid_tx =3D NULL;
 	u8 tid =3D IEEE80211_NUM_TIDS;
=20
@@ -3240,11 +3367,30 @@ static bool ieee80211_xmit_fast(struct ieee80211_=
sub_if_data *sdata,
 	info->flags =3D IEEE80211_TX_CTL_FIRST_FRAGMENT |
 		      IEEE80211_TX_CTL_DONTFRAG |
 		      (tid_tx ? IEEE80211_TX_CTL_AMPDU : 0);
+	info->control.flags =3D IEEE80211_TX_CTRL_FAST_XMIT;
+
+	if (ieee80211_queue_skb(local, sdata, &sta->sta, skb))
+		return true;
+
+	return ieee80211_xmit_fast_finish(sdata, sta, fast_tx, skb, true);
+}
+
+static bool ieee80211_xmit_fast_finish(struct ieee80211_sub_if_data *sda=
ta,
+				       struct sta_info *sta,
+				       struct ieee80211_fast_tx *fast_tx,
+				       struct sk_buff *skb, bool xmit)
+{
+	struct ieee80211_local *local =3D sdata->local;
+	struct ieee80211_tx_info *info =3D IEEE80211_SKB_CB(skb);
+	struct ieee80211_hdr *hdr =3D (void *)skb->data;
+	struct ieee80211_tx_data tx;
+	ieee80211_tx_result r;
+	u8 tid =3D IEEE80211_NUM_TIDS;
=20
 	if (hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_QOS_DATA)) {
+		tid =3D skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK;
 		*ieee80211_get_qos_ctl(hdr) =3D tid;
-		if (!sta->sta.txq[0])
-			hdr->seq_ctrl =3D ieee80211_tx_next_seq(sta, tid);
+		hdr->seq_ctrl =3D ieee80211_tx_next_seq(sta, tid);
 	} else {
 		info->flags |=3D IEEE80211_TX_CTL_ASSIGN_SEQ;
 		hdr->seq_ctrl =3D cpu_to_le16(sdata->sequence_number);
@@ -3309,12 +3455,15 @@ static bool ieee80211_xmit_fast(struct ieee80211_=
sub_if_data *sdata,
 		}
 	}
=20
-	if (sdata->vif.type =3D=3D NL80211_IFTYPE_AP_VLAN)
-		sdata =3D container_of(sdata->bss,
-				     struct ieee80211_sub_if_data, u.ap);
+	if (xmit) {
+		if (sdata->vif.type =3D=3D NL80211_IFTYPE_AP_VLAN)
+			sdata =3D container_of(sdata->bss,
+					struct ieee80211_sub_if_data, u.ap);
+
+		__skb_queue_tail(&tx.skbs, skb);
+		ieee80211_tx_frags(local, &sdata->vif, &sta->sta, &tx.skbs, false);
+	}
=20
-	__skb_queue_tail(&tx.skbs, skb);
-	ieee80211_tx_frags(local, &sdata->vif, &sta->sta, &tx.skbs, false);
 	return true;
 }
=20
diff --git a/net/mac80211/wpa.c b/net/mac80211/wpa.c
index b48c1e1..71c479a 100644
--- a/net/mac80211/wpa.c
+++ b/net/mac80211/wpa.c
@@ -28,13 +28,13 @@
 #include "wpa.h"
=20
 ieee80211_tx_result
-ieee80211_tx_h_michael_mic_add(struct ieee80211_tx_data *tx)
+ieee80211_tx_h_michael_mic_add_skb(struct ieee80211_tx_data *tx,
+				   struct sk_buff *skb)
 {
 	u8 *data, *key, *mic;
 	size_t data_len;
 	unsigned int hdrlen;
 	struct ieee80211_hdr *hdr;
-	struct sk_buff *skb =3D tx->skb;
 	struct ieee80211_tx_info *info =3D IEEE80211_SKB_CB(skb);
 	int tail;
=20
@@ -83,6 +83,20 @@ ieee80211_tx_h_michael_mic_add(struct ieee80211_tx_dat=
a *tx)
 	return TX_CONTINUE;
 }
=20
+ieee80211_tx_result
+ieee80211_tx_h_michael_mic_add(struct ieee80211_tx_data *tx)
+{
+	struct sk_buff *skb;
+	ieee80211_tx_result r;
+
+	skb_queue_walk(&tx->skbs, skb) {
+		r =3D ieee80211_tx_h_michael_mic_add_skb(tx, skb);
+		if (r !=3D TX_CONTINUE)
+			return r;
+	}
+	return TX_CONTINUE;
+}
+
=20
 ieee80211_rx_result
 ieee80211_rx_h_michael_mic_verify(struct ieee80211_rx_data *rx)
--=20
2.9.3

^ permalink raw reply related	[flat|nested] 51+ messages in thread

* Re: [PATCH v4] mac80211: Move reorder-sensitive TX handlers to after TXQ dequeue.
  2016-08-30 13:15     ` [PATCH v4] " Toke Høiland-Jørgensen
@ 2016-08-31 21:06       ` Johannes Berg
  2016-09-01  8:23         ` Toke Høiland-Jørgensen
  2016-09-01 16:03       ` [PATCH v5] " Toke Høiland-Jørgensen
  1 sibling, 1 reply; 51+ messages in thread
From: Johannes Berg @ 2016-08-31 21:06 UTC (permalink / raw)
  To: Toke Høiland-Jørgensen, make-wifi-fast, linux-wireless

On Tue, 2016-08-30 at 15:15 +0200, Toke Høiland-Jørgensen wrote:

> @@ -829,10 +844,16 @@ ieee80211_tx_h_sequence(struct
> ieee80211_tx_data *tx)
>  	 */
>  	if (!ieee80211_is_data_qos(hdr->frame_control) ||
>  	    is_multicast_ether_addr(hdr->addr1)) {
> -		/* driver should assign sequence number */
> -		info->flags |= IEEE80211_TX_CTL_ASSIGN_SEQ;
> -		/* for pure STA mode without beacons, we can do it
> */
> -		hdr->seq_ctrl = cpu_to_le16(tx->sdata-
> >sequence_number);
> +		fragnum = 0;
> +		seq = cpu_to_le16(tx->sdata->sequence_number);
> +		skb_queue_walk(&tx->skbs, skb) {
> +			info = IEEE80211_SKB_CB(skb);
> +			hdr = (struct ieee80211_hdr *)skb->data;
> +			/* driver should assign sequence number */
> +			info->flags |= IEEE80211_TX_CTL_ASSIGN_SEQ;
> +			/* for pure STA mode without beacons, we can
> do it */
> +			hdr->seq_ctrl = seq | fragnum++;

I would very much prefer you kept fragnum assignment in the
fragmentation handler.

Also, you just broke this on big endian, please run sparse on your
patches if you don't see these things directly.

> +		if (!fast_tx ||
> +		    !ieee80211_xmit_fast_finish(sta->sdata, sta,
> fast_tx, skb,
> +						false)) {
> +			/* fast xmit was started, but fails to
> finish */
> +			ieee80211_free_txskb(hw, skb);
> +			goto begin;
> +		}

That obviously cannot happen, it can't fail to finish. See the comments
in xmit_fast() and the return values ...

> +static int invoke_tx_handlers(struct ieee80211_tx_data *tx)
> +{
> +	return invoke_tx_handlers_early(tx) ||
> invoke_tx_handlers_late(tx);
> +}

Ugh, please, no, don't be tricky where it's not necessary. Now every
person reading this has to first look up the return type, and then the
return value, and make sure they understand that success is actually
the value 0 ... that's way too much to ask.
 
> +ieee80211_tx_result
> +ieee80211_tx_h_michael_mic_add(struct ieee80211_tx_data *tx)
> +{
> +	struct sk_buff *skb;
> +	ieee80211_tx_result r;
> +
> +	skb_queue_walk(&tx->skbs, skb) {
> +		r = ieee80211_tx_h_michael_mic_add_skb(tx, skb);
> +		if (r != TX_CONTINUE)
> +			return r;
> +	}
> +	return TX_CONTINUE;
> +}

You just broke TKIP completely again. Adding the MMIC and fragmentation
are not commutative operations.

johannes

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH v4] mac80211: Move reorder-sensitive TX handlers to after TXQ dequeue.
  2016-08-31 21:06       ` Johannes Berg
@ 2016-09-01  8:23         ` Toke Høiland-Jørgensen
  2016-09-01  8:34           ` Johannes Berg
  0 siblings, 1 reply; 51+ messages in thread
From: Toke Høiland-Jørgensen @ 2016-09-01  8:23 UTC (permalink / raw)
  To: Johannes Berg; +Cc: make-wifi-fast, linux-wireless

Johannes Berg <johannes@sipsolutions.net> writes:

>> +static int invoke_tx_handlers(struct ieee80211_tx_data *tx)
>> +{
>> +	return invoke_tx_handlers_early(tx) ||
>> invoke_tx_handlers_late(tx);
>> +}
>
> Ugh, please, no, don't be tricky where it's not necessary. Now every
> person reading this has to first look up the return type, and then the
> return value, and make sure they understand that success is actually
> the value 0 ... that's way too much to ask.

Noted. Any objections to turning these into bool return types?


I'll go through and fix your other comments and send a new version.
Thanks for the feedback :)

-Toke

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH v4] mac80211: Move reorder-sensitive TX handlers to after TXQ dequeue.
  2016-09-01  8:23         ` Toke Høiland-Jørgensen
@ 2016-09-01  8:34           ` Johannes Berg
  2016-09-01  8:38             ` Toke Høiland-Jørgensen
  0 siblings, 1 reply; 51+ messages in thread
From: Johannes Berg @ 2016-09-01  8:34 UTC (permalink / raw)
  To: Toke Høiland-Jørgensen; +Cc: make-wifi-fast, linux-wireless


> > > +static int invoke_tx_handlers(struct ieee80211_tx_data *tx)
> > > +{
> > > +	return invoke_tx_handlers_early(tx) ||
> > > invoke_tx_handlers_late(tx);
> > > +}
> > 
> > Ugh, please, no, don't be tricky where it's not necessary. Now
> > every
> > person reading this has to first look up the return type, and then
> > the
> > return value, and make sure they understand that success is
> > actually
> > the value 0 ... that's way too much to ask.
> 
> Noted. Any objections to turning these into bool return types?

They have three possible values ... :)

johannes

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH v4] mac80211: Move reorder-sensitive TX handlers to after TXQ dequeue.
  2016-09-01  8:34           ` Johannes Berg
@ 2016-09-01  8:38             ` Toke Høiland-Jørgensen
  2016-09-01  9:07               ` Johannes Berg
  0 siblings, 1 reply; 51+ messages in thread
From: Toke Høiland-Jørgensen @ 2016-09-01  8:38 UTC (permalink / raw)
  To: Johannes Berg; +Cc: make-wifi-fast, linux-wireless

Johannes Berg <johannes@sipsolutions.net> writes:

>> > > +static int invoke_tx_handlers(struct ieee80211_tx_data *tx)
>> > > +{
>> > > +	return invoke_tx_handlers_early(tx) ||
>> > > invoke_tx_handlers_late(tx);
>> > > +}
>> > 
>> > Ugh, please, no, don't be tricky where it's not necessary. Now
>> > every
>> > person reading this has to first look up the return type, and then
>> > the
>> > return value, and make sure they understand that success is
>> > actually
>> > the value 0 ... that's way too much to ask.
>> 
>> Noted. Any objections to turning these into bool return types?
>
> They have three possible values ... :)

Ah, no, not the handlers themselves. Meant the invoke_tx_handlers()
function (or all three of them after my patch; hence the plural). To
avoid the "0 means true" confusion you alluded to :)

-Toke

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH v4] mac80211: Move reorder-sensitive TX handlers to after TXQ dequeue.
  2016-09-01  8:38             ` Toke Høiland-Jørgensen
@ 2016-09-01  9:07               ` Johannes Berg
  2016-09-01  9:20                 ` Toke Høiland-Jørgensen
  0 siblings, 1 reply; 51+ messages in thread
From: Johannes Berg @ 2016-09-01  9:07 UTC (permalink / raw)
  To: Toke Høiland-Jørgensen; +Cc: make-wifi-fast, linux-wireless


> > They have three possible values ... :)
> 
> Ah, no, not the handlers themselves. Meant the invoke_tx_handlers()
> function (or all three of them after my patch; hence the plural). To
> avoid the "0 means true" confusion you alluded to :)
> 

Ah. Actually, even I got confused and thought the return value *was*
the same as the handler.

I think it doesn't matter to be tricky, gcc is probably going to (have
to) generate exactly the same code like when you explicitly put an if
statement in there, it seems?

johannes

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH v4] mac80211: Move reorder-sensitive TX handlers to after TXQ dequeue.
  2016-09-01  9:07               ` Johannes Berg
@ 2016-09-01  9:20                 ` Toke Høiland-Jørgensen
  2016-09-01  9:27                   ` Johannes Berg
  0 siblings, 1 reply; 51+ messages in thread
From: Toke Høiland-Jørgensen @ 2016-09-01  9:20 UTC (permalink / raw)
  To: Johannes Berg; +Cc: make-wifi-fast, linux-wireless

Johannes Berg <johannes@sipsolutions.net> writes:

>> > They have three possible values ... :)
>> 
>> Ah, no, not the handlers themselves. Meant the invoke_tx_handlers()
>> function (or all three of them after my patch; hence the plural). To
>> avoid the "0 means true" confusion you alluded to :)
>> 
>
> Ah. Actually, even I got confused and thought the return value *was*
> the same as the handler.
>
> I think it doesn't matter to be tricky, gcc is probably going to (have
> to) generate exactly the same code like when you explicitly put an if
> statement in there, it seems?

Yeah, was going to do that anyway. But since I'm touching the code
anyway, this might be an opportunity to avoid constructs like this:

if (!invoke_tx_handlers(tx))
  /* continue sending the packet */

Most other succeed/fail functions seem to be of type bool, so it would
help consistency as well. Unless there is some particular reason why
this function happens to be using 0 to indicate success?

-Toke

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH v4] mac80211: Move reorder-sensitive TX handlers to after TXQ dequeue.
  2016-09-01  9:20                 ` Toke Høiland-Jørgensen
@ 2016-09-01  9:27                   ` Johannes Berg
  2016-09-01  9:42                     ` Toke Høiland-Jørgensen
  0 siblings, 1 reply; 51+ messages in thread
From: Johannes Berg @ 2016-09-01  9:27 UTC (permalink / raw)
  To: Toke Høiland-Jørgensen; +Cc: make-wifi-fast, linux-wireless


> Yeah, was going to do that anyway. But since I'm touching the code
> anyway, this might be an opportunity to avoid constructs like this:
> 
> if (!invoke_tx_handlers(tx))
>   /* continue sending the packet */
> 
> Most other succeed/fail functions seem to be of type bool, so it
> would help consistency as well. Unless there is some particular
> reason why this function happens to be using 0 to indicate success?
> 

It's just convention in the kernel, really.

IMHO if a function has a bool return value it should have a more
expressive name that indicates better what's going on, like e.g.

bool ieee80211_is_radar_required(...);

but of course that's not always done.

johannes

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH v4] mac80211: Move reorder-sensitive TX handlers to after TXQ dequeue.
  2016-09-01  9:27                   ` Johannes Berg
@ 2016-09-01  9:42                     ` Toke Høiland-Jørgensen
  0 siblings, 0 replies; 51+ messages in thread
From: Toke Høiland-Jørgensen @ 2016-09-01  9:42 UTC (permalink / raw)
  To: Johannes Berg; +Cc: make-wifi-fast, linux-wireless

Johannes Berg <johannes@sipsolutions.net> writes:

>> Yeah, was going to do that anyway. But since I'm touching the code
>> anyway, this might be an opportunity to avoid constructs like this:
>>
>> if (!invoke_tx_handlers(tx))
>>   /* continue sending the packet */
>>
>> Most other succeed/fail functions seem to be of type bool, so it
>> would help consistency as well. Unless there is some particular
>> reason why this function happens to be using 0 to indicate success?
>>
>
> It's just convention in the kernel, really.
>
> IMHO if a function has a bool return value it should have a more
> expressive name that indicates better what's going on, like e.g.
>
> bool ieee80211_is_radar_required(...);
>
> but of course that's not always done.

Well, it's applied somewhat inconsistently across mac80211, it seems
(e.g. ieee80211_tx() and ieee80211_tx_prepare_skb() are bool, while
invoke_tx_handlers() and ieee80211_skb_resize() are int). But okay,
don't have that strong an opinion about the colour of this particular
bikeshed so I'll keep it the way it is ;)

-Toke

^ permalink raw reply	[flat|nested] 51+ messages in thread

* [PATCH v5] mac80211: Move reorder-sensitive TX handlers to after TXQ dequeue.
  2016-08-30 13:15     ` [PATCH v4] " Toke Høiland-Jørgensen
  2016-08-31 21:06       ` Johannes Berg
@ 2016-09-01 16:03       ` Toke Høiland-Jørgensen
  2016-09-01 17:59         ` Johannes Berg
                           ` (2 more replies)
  1 sibling, 3 replies; 51+ messages in thread
From: Toke Høiland-Jørgensen @ 2016-09-01 16:03 UTC (permalink / raw)
  To: make-wifi-fast, linux-wireless; +Cc: Toke Høiland-Jørgensen

The TXQ intermediate queues can cause packet reordering when more than
one flow is active to a single station. Since some of the wifi-specific
packet handling (notably sequence number and encryption handling) is
sensitive to re-ordering, things break if they are applied before the
TXQ.

This splits up the TX handlers and fast_xmit logic into two parts: An
early part and a late part. The former is applied before TXQ enqueue,
and the latter after dequeue. The non-TXQ path just applies both parts
at once.

To avoid having to deal with fragmentation on dequeue, the split is set
to be after the fragmentation handler. This means that some reordering
of TX handlers is necessary, and some handlers had to be made aware of
fragmentation due to this reordering.

This approach avoids having to scatter special cases for when TXQ is
enabled, at the cost of making the fast_xmit and TX handler code
slightly more complex.

Signed-off-by: Toke Høiland-Jørgensen <toke@toke.dk>
---
Changes since v4:
- Keep fragnum assignment in fragmentation handler and fix endianness
  issues in seqno handler.
- Assume xmit_fast_finish can't fail in dequeue handler (and warn if
  fast_tx handle disappears).
- Move TKIP MIC and key selection handlers back before fragmentation
  handler. Turns out the MIC doesn't actually depend on a global
  sequence number, so it can be before the intermediate queueing step.
  The only cost of this is running the key selection handler twice in
  some cases.
- Improve readability of the composite invoke_tx_handlers() function.


 include/net/mac80211.h |   2 +
 net/mac80211/tx.c      | 266 +++++++++++++++++++++++++++++++++++++++----=
------
 2 files changed, 214 insertions(+), 54 deletions(-)

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index cca510a..9a6a3e9 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -715,6 +715,7 @@ enum mac80211_tx_info_flags {
  *	frame (PS-Poll or uAPSD).
  * @IEEE80211_TX_CTRL_RATE_INJECT: This frame is injected with rate info=
rmation
  * @IEEE80211_TX_CTRL_AMSDU: This frame is an A-MSDU frame
+ * @IEEE80211_TX_CTRL_FAST_XMIT: This frame is going through the fast_xm=
it path
  *
  * These flags are used in tx_info->control.flags.
  */
@@ -723,6 +724,7 @@ enum mac80211_tx_control_flags {
 	IEEE80211_TX_CTRL_PS_RESPONSE		=3D BIT(1),
 	IEEE80211_TX_CTRL_RATE_INJECT		=3D BIT(2),
 	IEEE80211_TX_CTRL_AMSDU			=3D BIT(3),
+	IEEE80211_TX_CTRL_FAST_XMIT		=3D BIT(4),
 };
=20
 /*
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 1d0746d..f7373c2 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -38,6 +38,12 @@
 #include "wme.h"
 #include "rate.h"
=20
+static int invoke_tx_handlers_late(struct ieee80211_tx_data *tx);
+static bool ieee80211_xmit_fast_finish(struct ieee80211_sub_if_data *sda=
ta,
+				       struct sta_info *sta,
+				       struct ieee80211_fast_tx *fast_tx,
+				       struct sk_buff *skb, bool xmit);
+
 /* misc utils */
=20
 static inline void ieee80211_tx_stats(struct net_device *dev, u32 len)
@@ -585,20 +591,27 @@ static ieee80211_tx_result debug_noinline
 ieee80211_tx_h_select_key(struct ieee80211_tx_data *tx)
 {
 	struct ieee80211_key *key;
-	struct ieee80211_tx_info *info =3D IEEE80211_SKB_CB(tx->skb);
-	struct ieee80211_hdr *hdr =3D (struct ieee80211_hdr *)tx->skb->data;
+	struct ieee80211_tx_info *info;
+	struct ieee80211_hdr *hdr;
+	struct sk_buff *skb =3D tx->skb;
+
+	if (!skb)
+		skb =3D skb_peek(&tx->skbs);
+
+	info =3D IEEE80211_SKB_CB(skb);
+	hdr =3D (struct ieee80211_hdr *)skb->data;
=20
 	if (unlikely(info->flags & IEEE80211_TX_INTFL_DONT_ENCRYPT))
 		tx->key =3D NULL;
 	else if (tx->sta &&
 		 (key =3D rcu_dereference(tx->sta->ptk[tx->sta->ptk_idx])))
 		tx->key =3D key;
-	else if (ieee80211_is_group_privacy_action(tx->skb) &&
+	else if (ieee80211_is_group_privacy_action(skb) &&
 		(key =3D rcu_dereference(tx->sdata->default_multicast_key)))
 		tx->key =3D key;
 	else if (ieee80211_is_mgmt(hdr->frame_control) &&
 		 is_multicast_ether_addr(hdr->addr1) &&
-		 ieee80211_is_robust_mgmt_frame(tx->skb) &&
+		 ieee80211_is_robust_mgmt_frame(skb) &&
 		 (key =3D rcu_dereference(tx->sdata->default_mgmt_key)))
 		tx->key =3D key;
 	else if (is_multicast_ether_addr(hdr->addr1) &&
@@ -628,8 +641,8 @@ ieee80211_tx_h_select_key(struct ieee80211_tx_data *t=
x)
 		case WLAN_CIPHER_SUITE_GCMP_256:
 			if (!ieee80211_is_data_present(hdr->frame_control) &&
 			    !ieee80211_use_mfp(hdr->frame_control, tx->sta,
-					       tx->skb) &&
-			    !ieee80211_is_group_privacy_action(tx->skb))
+					       skb) &&
+			    !ieee80211_is_group_privacy_action(skb))
 				tx->key =3D NULL;
 			else
 				skip_hw =3D (tx->key->conf.flags &
@@ -799,10 +812,12 @@ static __le16 ieee80211_tx_next_seq(struct sta_info=
 *sta, int tid)
 static ieee80211_tx_result debug_noinline
 ieee80211_tx_h_sequence(struct ieee80211_tx_data *tx)
 {
-	struct ieee80211_tx_info *info =3D IEEE80211_SKB_CB(tx->skb);
-	struct ieee80211_hdr *hdr =3D (struct ieee80211_hdr *)tx->skb->data;
+	struct sk_buff *skb =3D skb_peek(&tx->skbs);
+	struct ieee80211_tx_info *info =3D IEEE80211_SKB_CB(skb);
+	struct ieee80211_hdr *hdr =3D (struct ieee80211_hdr *)skb->data;
 	u8 *qc;
 	int tid;
+	__le16 seq;
=20
 	/*
 	 * Packet injection may want to control the sequence
@@ -829,10 +844,15 @@ ieee80211_tx_h_sequence(struct ieee80211_tx_data *t=
x)
 	 */
 	if (!ieee80211_is_data_qos(hdr->frame_control) ||
 	    is_multicast_ether_addr(hdr->addr1)) {
-		/* driver should assign sequence number */
-		info->flags |=3D IEEE80211_TX_CTL_ASSIGN_SEQ;
-		/* for pure STA mode without beacons, we can do it */
-		hdr->seq_ctrl =3D cpu_to_le16(tx->sdata->sequence_number);
+		seq =3D cpu_to_le16(tx->sdata->sequence_number);
+		skb_queue_walk(&tx->skbs, skb) {
+			info =3D IEEE80211_SKB_CB(skb);
+			hdr =3D (struct ieee80211_hdr *)skb->data;
+			/* driver should assign sequence number */
+			info->flags |=3D IEEE80211_TX_CTL_ASSIGN_SEQ;
+			/* for pure STA mode without beacons, we can do it */
+			hdr->seq_ctrl |=3D seq;
+		}
 		tx->sdata->sequence_number +=3D 0x10;
 		if (tx->sta)
 			tx->sta->tx_stats.msdu[IEEE80211_NUM_TIDS]++;
@@ -853,8 +873,13 @@ ieee80211_tx_h_sequence(struct ieee80211_tx_data *tx=
)
 	tid =3D *qc & IEEE80211_QOS_CTL_TID_MASK;
 	tx->sta->tx_stats.msdu[tid]++;
=20
-	if (!tx->sta->sta.txq[0])
-		hdr->seq_ctrl =3D ieee80211_tx_next_seq(tx->sta, tid);
+	if (!tx->sta->sta.txq[0]) {
+		seq =3D ieee80211_tx_next_seq(tx->sta, tid);
+		skb_queue_walk(&tx->skbs, skb) {
+			hdr =3D (struct ieee80211_hdr *)skb->data;
+			hdr->seq_ctrl |=3D seq;
+		}
+	}
=20
 	return TX_CONTINUE;
 }
@@ -1481,33 +1506,57 @@ struct sk_buff *ieee80211_tx_dequeue(struct ieee8=
0211_hw *hw,
 {
 	struct ieee80211_local *local =3D hw_to_local(hw);
 	struct txq_info *txqi =3D container_of(txq, struct txq_info, txq);
-	struct ieee80211_hdr *hdr;
 	struct sk_buff *skb =3D NULL;
 	struct fq *fq =3D &local->fq;
 	struct fq_tin *tin =3D &txqi->tin;
+	struct ieee80211_tx_info *info;
=20
 	spin_lock_bh(&fq->lock);
=20
 	if (test_bit(IEEE80211_TXQ_STOP, &txqi->flags))
 		goto out;
=20
+begin:
 	skb =3D fq_tin_dequeue(fq, tin, fq_tin_dequeue_func);
 	if (!skb)
 		goto out;
=20
 	ieee80211_set_skb_vif(skb, txqi);
=20
-	hdr =3D (struct ieee80211_hdr *)skb->data;
-	if (txq->sta && ieee80211_is_data_qos(hdr->frame_control)) {
+	info =3D IEEE80211_SKB_CB(skb);
+	if (txq->sta && info->control.flags & IEEE80211_TX_CTRL_FAST_XMIT) {
 		struct sta_info *sta =3D container_of(txq->sta, struct sta_info,
 						    sta);
-		struct ieee80211_tx_info *info =3D IEEE80211_SKB_CB(skb);
+		struct ieee80211_fast_tx *fast_tx;
=20
-		hdr->seq_ctrl =3D ieee80211_tx_next_seq(sta, txq->tid);
-		if (test_bit(IEEE80211_TXQ_AMPDU, &txqi->flags))
-			info->flags |=3D IEEE80211_TX_CTL_AMPDU;
-		else
-			info->flags &=3D ~IEEE80211_TX_CTL_AMPDU;
+		fast_tx =3D rcu_dereference(sta->fast_tx);
+		if (WARN_ON(!fast_tx)) {
+			/* lost the fast_tx pointer while the packet was queued */
+			ieee80211_free_txskb(hw, skb);
+			goto begin;
+		}
+		ieee80211_xmit_fast_finish(sta->sdata, sta, fast_tx, skb, false);
+	} else {
+		struct ieee80211_tx_data tx =3D { };
+
+		__skb_queue_head_init(&tx.skbs);
+		tx.local =3D local;
+		if (txq->sta) {
+			struct sta_info *sta =3D container_of(txq->sta,
+							    struct sta_info,
+							    sta);
+			tx.sta =3D container_of(txq->sta, struct sta_info, sta);
+			tx.sdata =3D sta->sdata;
+		} else {
+			tx.sdata =3D vif_to_sdata(info->control.vif);
+		}
+
+		__skb_queue_tail(&tx.skbs, skb);
+
+		if (invoke_tx_handlers_late(&tx))
+			goto begin;
+
+		__skb_unlink(skb, &tx.skbs);
 	}
=20
 out:
@@ -1521,6 +1570,71 @@ out:
 }
 EXPORT_SYMBOL(ieee80211_tx_dequeue);
=20
+static bool ieee80211_queue_skb(struct ieee80211_local *local,
+				struct ieee80211_sub_if_data *sdata,
+				struct ieee80211_sta *sta,
+				struct sk_buff *skb)
+{
+	struct ieee80211_tx_info *info =3D IEEE80211_SKB_CB(skb);
+	struct fq *fq =3D &local->fq;
+	struct ieee80211_vif *vif;
+	struct txq_info *txqi;
+
+	if (!local->ops->wake_tx_queue)
+		return false;
+
+	if (sdata->vif.type =3D=3D NL80211_IFTYPE_AP_VLAN)
+		sdata =3D container_of(sdata->bss,
+				struct ieee80211_sub_if_data, u.ap);
+
+	vif =3D &sdata->vif;
+	txqi =3D ieee80211_get_txq(local, vif, sta, skb);
+
+	if (!txqi)
+		return false;
+
+	info->control.vif =3D vif;
+
+	spin_lock_bh(&fq->lock);
+	ieee80211_txq_enqueue(local, txqi, skb);
+	spin_unlock_bh(&fq->lock);
+
+	drv_wake_tx_queue(local, txqi);
+
+	return true;
+}
+
+static bool ieee80211_queue_frags(struct ieee80211_local *local,
+				  struct ieee80211_sub_if_data *sdata,
+				  struct sta_info *sta,
+				  struct sk_buff_head *skbs)
+{
+	struct sk_buff *skb;
+	struct ieee80211_sta *pubsta;
+
+	if (WARN_ON(skb_queue_empty(skbs)))
+		return true;
+
+	if (!local->ops->wake_tx_queue ||
+	    sdata->vif.type =3D=3D NL80211_IFTYPE_MONITOR)
+		return false;
+
+	if (sta && sta->uploaded)
+		pubsta =3D &sta->sta;
+	else
+		pubsta =3D NULL;
+
+	while (!skb_queue_empty(skbs)) {
+		skb =3D __skb_dequeue(skbs);
+		if (unlikely(!ieee80211_queue_skb(local, sdata, pubsta, skb))) {
+			__skb_queue_head(skbs, skb);
+			return false;
+		}
+	}
+
+	return true;
+}
+
 static bool ieee80211_tx_frags(struct ieee80211_local *local,
 			       struct ieee80211_vif *vif,
 			       struct ieee80211_sta *sta,
@@ -1528,9 +1642,7 @@ static bool ieee80211_tx_frags(struct ieee80211_loc=
al *local,
 			       bool txpending)
 {
 	struct ieee80211_tx_control control =3D {};
-	struct fq *fq =3D &local->fq;
 	struct sk_buff *skb, *tmp;
-	struct txq_info *txqi;
 	unsigned long flags;
=20
 	skb_queue_walk_safe(skbs, skb, tmp) {
@@ -1545,21 +1657,6 @@ static bool ieee80211_tx_frags(struct ieee80211_lo=
cal *local,
 		}
 #endif
=20
-		txqi =3D ieee80211_get_txq(local, vif, sta, skb);
-		if (txqi) {
-			info->control.vif =3D vif;
-
-			__skb_unlink(skb, skbs);
-
-			spin_lock_bh(&fq->lock);
-			ieee80211_txq_enqueue(local, txqi, skb);
-			spin_unlock_bh(&fq->lock);
-
-			drv_wake_tx_queue(local, txqi);
-
-			continue;
-		}
-
 		spin_lock_irqsave(&local->queue_stop_reason_lock, flags);
 		if (local->queue_stop_reasons[q] ||
 		    (!txpending && !skb_queue_empty(&local->pending[q]))) {
@@ -1680,8 +1777,12 @@ static bool __ieee80211_tx(struct ieee80211_local =
*local,
 /*
  * Invoke TX handlers, return 0 on success and non-zero if the
  * frame was dropped or queued.
+ *
+ * The handlers are split into an early and late part. The latter is eve=
rything
+ * that can be sensitive to reordering, and will be deferred to after pa=
ckets
+ * are dequeued from the intermediate queues (when they are enabled).
  */
-static int invoke_tx_handlers(struct ieee80211_tx_data *tx)
+static int invoke_tx_handlers_early(struct ieee80211_tx_data *tx)
 {
 	struct ieee80211_tx_info *info =3D IEEE80211_SKB_CB(tx->skb);
 	ieee80211_tx_result res =3D TX_DROP;
@@ -1708,9 +1807,32 @@ static int invoke_tx_handlers(struct ieee80211_tx_=
data *tx)
 	}

 	CALL_TXH(ieee80211_tx_h_michael_mic_add);
-	CALL_TXH(ieee80211_tx_h_sequence);
 	CALL_TXH(ieee80211_tx_h_fragment);
-	/* handlers after fragment must be aware of tx info fragmentation! */
+
+ txh_done:
+	if (unlikely(res =3D=3D TX_DROP)) {
+		I802_DEBUG_INC(tx->local->tx_handlers_drop);
+		if (tx->skb)
+			ieee80211_free_txskb(&tx->local->hw, tx->skb);
+		else
+			ieee80211_purge_tx_queue(&tx->local->hw, &tx->skbs);
+		return -1;
+	} else if (unlikely(res =3D=3D TX_QUEUED)) {
+		I802_DEBUG_INC(tx->local->tx_handlers_queued);
+		return -1;
+	}
+
+	return 0;
+}
+
+/* late tx handlers must be aware of tx info fragmentation! */
+static int invoke_tx_handlers_late(struct ieee80211_tx_data *tx)
+{
+	ieee80211_tx_result res =3D TX_DROP;
+
+	if (!tx->key) /* Not set unless early and late handlers were chained. =
*/
+		CALL_TXH(ieee80211_tx_h_select_key);
+	CALL_TXH(ieee80211_tx_h_sequence);
 	CALL_TXH(ieee80211_tx_h_stats);
 	CALL_TXH(ieee80211_tx_h_encrypt);
 	if (!ieee80211_hw_check(&tx->local->hw, HAS_RATE_CONTROL))
@@ -1733,6 +1856,15 @@ static int invoke_tx_handlers(struct ieee80211_tx_=
data *tx)
 	return 0;
 }
=20
+static int invoke_tx_handlers(struct ieee80211_tx_data *tx)
+{
+	int r =3D invoke_tx_handlers_early(tx);
+	if (r)
+		return r;
+
+	return invoke_tx_handlers_late(tx);
+}
+
 bool ieee80211_tx_prepare_skb(struct ieee80211_hw *hw,
 			      struct ieee80211_vif *vif, struct sk_buff *skb,
 			      int band, struct ieee80211_sta **sta)
@@ -1807,7 +1939,13 @@ static bool ieee80211_tx(struct ieee80211_sub_if_d=
ata *sdata,
 		info->hw_queue =3D
 			sdata->vif.hw_queue[skb_get_queue_mapping(skb)];
=20
-	if (!invoke_tx_handlers(&tx))
+	if (invoke_tx_handlers_early(&tx))
+		return false;
+
+	if (ieee80211_queue_frags(local, sdata, tx.sta, &tx.skbs))
+		return true;
+
+	if (!invoke_tx_handlers_late(&tx))
 		result =3D __ieee80211_tx(local, &tx.skbs, led_len,
 					tx.sta, txpending);
=20
@@ -3170,8 +3308,6 @@ static bool ieee80211_xmit_fast(struct ieee80211_su=
b_if_data *sdata,
 	struct ethhdr eth;
 	struct ieee80211_tx_info *info =3D IEEE80211_SKB_CB(skb);
 	struct ieee80211_hdr *hdr =3D (void *)fast_tx->hdr;
-	struct ieee80211_tx_data tx;
-	ieee80211_tx_result r;
 	struct tid_ampdu_tx *tid_tx =3D NULL;
 	u8 tid =3D IEEE80211_NUM_TIDS;
=20
@@ -3240,11 +3376,30 @@ static bool ieee80211_xmit_fast(struct ieee80211_=
sub_if_data *sdata,
 	info->flags =3D IEEE80211_TX_CTL_FIRST_FRAGMENT |
 		      IEEE80211_TX_CTL_DONTFRAG |
 		      (tid_tx ? IEEE80211_TX_CTL_AMPDU : 0);
+	info->control.flags =3D IEEE80211_TX_CTRL_FAST_XMIT;
+
+	if (ieee80211_queue_skb(local, sdata, &sta->sta, skb))
+		return true;
+
+	return ieee80211_xmit_fast_finish(sdata, sta, fast_tx, skb, true);
+}
+
+static bool ieee80211_xmit_fast_finish(struct ieee80211_sub_if_data *sda=
ta,
+				       struct sta_info *sta,
+				       struct ieee80211_fast_tx *fast_tx,
+				       struct sk_buff *skb, bool xmit)
+{
+	struct ieee80211_local *local =3D sdata->local;
+	struct ieee80211_tx_info *info =3D IEEE80211_SKB_CB(skb);
+	struct ieee80211_hdr *hdr =3D (void *)skb->data;
+	struct ieee80211_tx_data tx;
+	ieee80211_tx_result r;
+	u8 tid =3D IEEE80211_NUM_TIDS;
=20
 	if (hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_QOS_DATA)) {
+		tid =3D skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK;
 		*ieee80211_get_qos_ctl(hdr) =3D tid;
-		if (!sta->sta.txq[0])
-			hdr->seq_ctrl =3D ieee80211_tx_next_seq(sta, tid);
+		hdr->seq_ctrl =3D ieee80211_tx_next_seq(sta, tid);
 	} else {
 		info->flags |=3D IEEE80211_TX_CTL_ASSIGN_SEQ;
 		hdr->seq_ctrl =3D cpu_to_le16(sdata->sequence_number);
@@ -3309,12 +3464,15 @@ static bool ieee80211_xmit_fast(struct ieee80211_=
sub_if_data *sdata,
 		}
 	}
=20
-	if (sdata->vif.type =3D=3D NL80211_IFTYPE_AP_VLAN)
-		sdata =3D container_of(sdata->bss,
-				     struct ieee80211_sub_if_data, u.ap);
+	if (xmit) {
+		if (sdata->vif.type =3D=3D NL80211_IFTYPE_AP_VLAN)
+			sdata =3D container_of(sdata->bss,
+					struct ieee80211_sub_if_data, u.ap);
+
+		__skb_queue_tail(&tx.skbs, skb);
+		ieee80211_tx_frags(local, &sdata->vif, &sta->sta, &tx.skbs, false);
+	}
=20
-	__skb_queue_tail(&tx.skbs, skb);
-	ieee80211_tx_frags(local, &sdata->vif, &sta->sta, &tx.skbs, false);
 	return true;
 }
=20
--=20
2.9.3

^ permalink raw reply related	[flat|nested] 51+ messages in thread

* Re: [PATCH v5] mac80211: Move reorder-sensitive TX handlers to after TXQ dequeue.
  2016-09-01 16:03       ` [PATCH v5] " Toke Høiland-Jørgensen
@ 2016-09-01 17:59         ` Johannes Berg
  2016-09-01 18:30           ` Toke Høiland-Jørgensen
  2016-09-02  2:48         ` Jason Andryuk
  2016-09-02 13:41         ` [PATCH v6] " Toke Høiland-Jørgensen
  2 siblings, 1 reply; 51+ messages in thread
From: Johannes Berg @ 2016-09-01 17:59 UTC (permalink / raw)
  To: Toke Høiland-Jørgensen, make-wifi-fast, linux-wireless


> To avoid having to deal with fragmentation on dequeue, the split is
> set to be after the fragmentation handler. This means that some
> reordering of TX handlers is necessary, and some handlers had to be
> made aware of fragmentation due to this reordering.

Come to think of it, that's actually counterproductive.

If a fragment is dropped, or even just if fragments are reordered, the
receiver will not be able to defragment the frame, and will thus drop
it. Therefore, it's all-or-nothing, and we shouldn't transmit any
fragment if we drop/reorder one (*).

So ... I think you'll just have to deal with fragmentation on the
codel/fq/whatever queues and keep fragments together, or do
fragmentation afterwards.

johannes


(*) also, couldn't this mean that we send something completely stupid
like

seq=1,frag=0
seq=2,frag=0
seq=2,frag=1
seq=2,frag=1

if reordering happened?

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH v5] mac80211: Move reorder-sensitive TX handlers to after TXQ dequeue.
  2016-09-01 17:59         ` Johannes Berg
@ 2016-09-01 18:30           ` Toke Høiland-Jørgensen
  2016-09-01 18:35             ` Johannes Berg
  0 siblings, 1 reply; 51+ messages in thread
From: Toke Høiland-Jørgensen @ 2016-09-01 18:30 UTC (permalink / raw)
  To: Johannes Berg; +Cc: make-wifi-fast, linux-wireless

Johannes Berg <johannes@sipsolutions.net> writes:

>> To avoid having to deal with fragmentation on dequeue, the split is
>> set to be after the fragmentation handler. This means that some
>> reordering of TX handlers is necessary, and some handlers had to be
>> made aware of fragmentation due to this reordering.
>
> Come to think of it, that's actually counterproductive.
>
> If a fragment is dropped, or even just if fragments are reordered, the
> receiver will not be able to defragment the frame, and will thus drop
> it. Therefore, it's all-or-nothing, and we shouldn't transmit any
> fragment if we drop/reorder one (*).
>
> So ... I think you'll just have to deal with fragmentation on the
> codel/fq/whatever queues and keep fragments together, or do
> fragmentation afterwards.

Hmm, guess that makes sense. Bugger. Will think about how to do that.

>
> johannes
>
> (*) also, couldn't this mean that we send something completely stupid
> like
>
> seq=1,frag=0
> seq=2,frag=0
> seq=2,frag=1
> seq=2,frag=1
>
> if reordering happened?

(assuming the last line was supposed to read 'seq=1,frag=1')

Yes, that could happen, in principle (it depends on the fragments' size
in relation to the FQ quantum).


When does fragmentation happen anyway? Is it safe to assume there's no
aggregation when it does?

-Toke

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH v5] mac80211: Move reorder-sensitive TX handlers to after TXQ dequeue.
  2016-09-01 18:30           ` Toke Høiland-Jørgensen
@ 2016-09-01 18:35             ` Johannes Berg
  0 siblings, 0 replies; 51+ messages in thread
From: Johannes Berg @ 2016-09-01 18:35 UTC (permalink / raw)
  To: Toke Høiland-Jørgensen; +Cc: make-wifi-fast, linux-wireless

On Thu, 2016-09-01 at 20:30 +0200, Toke Høiland-Jørgensen wrote:

> > seq=1,frag=0
> > seq=2,frag=0
> > seq=2,frag=1
> > seq=2,frag=1
> > 
> > if reordering happened?
> 
> (assuming the last line was supposed to read 'seq=1,frag=1')

I did actually mean seq=2,frag=1, since the seqno assignment happened
after fragmentation in your patch, and after codel reordering, and
would not change the seqno until it encountered a frag=0 packet.

Or maybe that was only with the previous version of the patch.

> When does fragmentation happen anyway? Is it safe to assume there's
> no aggregation when it does?
> 

Yes, fragmented packets are not allowed to be aggregated.

johannes

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH v5] mac80211: Move reorder-sensitive TX handlers to after TXQ dequeue.
  2016-09-01 16:03       ` [PATCH v5] " Toke Høiland-Jørgensen
  2016-09-01 17:59         ` Johannes Berg
@ 2016-09-02  2:48         ` Jason Andryuk
  2016-09-02  9:27           ` Toke Høiland-Jørgensen
  2016-09-02 13:41         ` [PATCH v6] " Toke Høiland-Jørgensen
  2 siblings, 1 reply; 51+ messages in thread
From: Jason Andryuk @ 2016-09-02  2:48 UTC (permalink / raw)
  To: Toke Høiland-Jørgensen; +Cc: make-wifi-fast, linux-wireless

On Thu, Sep 1, 2016 at 12:03 PM, Toke Høiland-Jørgensen <toke@toke.dk> wrote:
> @@ -1481,33 +1506,57 @@ struct sk_buff *ieee80211_tx_dequeue(struct ieee8=
0211_hw *hw,
>  {
>         struct ieee80211_local *local =3D hw_to_local(hw);
>         struct txq_info *txqi =3D container_of(txq, struct txq_info, txq)=
;
> -       struct ieee80211_hdr *hdr;
>         struct sk_buff *skb =3D NULL;
>         struct fq *fq =3D &local->fq;
>         struct fq_tin *tin =3D &txqi->tin;
> +       struct ieee80211_tx_info *info;
>
>         spin_lock_bh(&fq->lock);
>
>         if (test_bit(IEEE80211_TXQ_STOP, &txqi->flags))
>                 goto out;
>
> +begin:
>         skb =3D fq_tin_dequeue(fq, tin, fq_tin_dequeue_func);
>         if (!skb)
>                 goto out;
>
>         ieee80211_set_skb_vif(skb, txqi);
>
> -       hdr =3D (struct ieee80211_hdr *)skb->data;
> -       if (txq->sta && ieee80211_is_data_qos(hdr->frame_control)) {
> +       info =3D IEEE80211_SKB_CB(skb);
> +       if (txq->sta && info->control.flags & IEEE80211_TX_CTRL_FAST_XMIT=
) {
>                 struct sta_info *sta =3D container_of(txq->sta, struct st=
a_info,
>                                                     sta);
> -               struct ieee80211_tx_info *info =3D IEEE80211_SKB_CB(skb);
> +               struct ieee80211_fast_tx *fast_tx;
>
> -               hdr->seq_ctrl =3D ieee80211_tx_next_seq(sta, txq->tid);
> -               if (test_bit(IEEE80211_TXQ_AMPDU, &txqi->flags))
> -                       info->flags |=3D IEEE80211_TX_CTL_AMPDU;
> -               else
> -                       info->flags &=3D ~IEEE80211_TX_CTL_AMPDU;
> +               fast_tx =3D rcu_dereference(sta->fast_tx);
> +               if (WARN_ON(!fast_tx)) {
> +                       /* lost the fast_tx pointer while the packet was =
queued */
> +                       ieee80211_free_txskb(hw, skb);
> +                       goto begin;
> +               }
> +               ieee80211_xmit_fast_finish(sta->sdata, sta, fast_tx, skb,=
 false);
> +       } else {
> +               struct ieee80211_tx_data tx =3D { };
> +
> +               __skb_queue_head_init(&tx.skbs);
> +               tx.local =3D local;
> +               if (txq->sta) {
> +                       struct sta_info *sta =3D container_of(txq->sta,
> +                                                           struct sta_in=
fo,
> +                                                           sta);

sta is unneeded given the assignment below?

Regards,
Jason

> +                       tx.sta =3D container_of(txq->sta, struct sta_info=
, sta);
> +                       tx.sdata =3D sta->sdata;
> +               } else {
> +                       tx.sdata =3D vif_to_sdata(info->control.vif);
> +               }
> +
> +               __skb_queue_tail(&tx.skbs, skb);
> +
> +               if (invoke_tx_handlers_late(&tx))
> +                       goto begin;
> +
> +               __skb_unlink(skb, &tx.skbs);
>         }

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH v5] mac80211: Move reorder-sensitive TX handlers to after TXQ dequeue.
  2016-09-02  2:48         ` Jason Andryuk
@ 2016-09-02  9:27           ` Toke Høiland-Jørgensen
  0 siblings, 0 replies; 51+ messages in thread
From: Toke Høiland-Jørgensen @ 2016-09-02  9:27 UTC (permalink / raw)
  To: Jason Andryuk; +Cc: make-wifi-fast, linux-wireless

Jason Andryuk <jandryuk@gmail.com> writes:

> On Thu, Sep 1, 2016 at 12:03 PM, Toke Høiland-Jørgensen <toke@toke.dk> wrote:
>> @@ -1481,33 +1506,57 @@ struct sk_buff *ieee80211_tx_dequeue(struct ieee=
80211_hw *hw,
>>  {
>>         struct ieee80211_local *local =3D hw_to_local(hw);
>>         struct txq_info *txqi =3D container_of(txq, struct txq_info, txq=
);
>> -       struct ieee80211_hdr *hdr;
>>         struct sk_buff *skb =3D NULL;
>>         struct fq *fq =3D &local->fq;
>>         struct fq_tin *tin =3D &txqi->tin;
>> +       struct ieee80211_tx_info *info;
>>
>>         spin_lock_bh(&fq->lock);
>>
>>         if (test_bit(IEEE80211_TXQ_STOP, &txqi->flags))
>>                 goto out;
>>
>> +begin:
>>         skb =3D fq_tin_dequeue(fq, tin, fq_tin_dequeue_func);
>>         if (!skb)
>>                 goto out;
>>
>>         ieee80211_set_skb_vif(skb, txqi);
>>
>> -       hdr =3D (struct ieee80211_hdr *)skb->data;
>> -       if (txq->sta && ieee80211_is_data_qos(hdr->frame_control)) {
>> +       info =3D IEEE80211_SKB_CB(skb);
>> +       if (txq->sta && info->control.flags & IEEE80211_TX_CTRL_FAST_XMI=
T) {
>>                 struct sta_info *sta =3D container_of(txq->sta, struct s=
ta_info,
>>                                                     sta);
>> -               struct ieee80211_tx_info *info =3D IEEE80211_SKB_CB(skb);
>> +               struct ieee80211_fast_tx *fast_tx;
>>
>> -               hdr->seq_ctrl =3D ieee80211_tx_next_seq(sta, txq->tid);
>> -               if (test_bit(IEEE80211_TXQ_AMPDU, &txqi->flags))
>> -                       info->flags |=3D IEEE80211_TX_CTL_AMPDU;
>> -               else
>> -                       info->flags &=3D ~IEEE80211_TX_CTL_AMPDU;
>> +               fast_tx =3D rcu_dereference(sta->fast_tx);
>> +               if (WARN_ON(!fast_tx)) {
>> +                       /* lost the fast_tx pointer while the packet was=
 queued */
>> +                       ieee80211_free_txskb(hw, skb);
>> +                       goto begin;
>> +               }
>> +               ieee80211_xmit_fast_finish(sta->sdata, sta, fast_tx, skb=
, false);
>> +       } else {
>> +               struct ieee80211_tx_data tx =3D { };
>> +
>> +               __skb_queue_head_init(&tx.skbs);
>> +               tx.local =3D local;
>> +               if (txq->sta) {
>> +                       struct sta_info *sta =3D container_of(txq->sta,
>> +                                                           struct sta_i=
nfo,
>> +                                                           sta);
>
> sta is unneeded given the assignment below?

Yeah, you're right. Think that was left over from a previous version.
Thanks for spotting it :)

-Toke

^ permalink raw reply	[flat|nested] 51+ messages in thread

* [PATCH v6] mac80211: Move reorder-sensitive TX handlers to after TXQ dequeue.
  2016-09-01 16:03       ` [PATCH v5] " Toke Høiland-Jørgensen
  2016-09-01 17:59         ` Johannes Berg
  2016-09-02  2:48         ` Jason Andryuk
@ 2016-09-02 13:41         ` Toke Høiland-Jørgensen
  2016-09-02 14:44           ` Toke Høiland-Jørgensen
  2016-09-05 11:30           ` [PATCH v7] " Toke Høiland-Jørgensen
  2 siblings, 2 replies; 51+ messages in thread
From: Toke Høiland-Jørgensen @ 2016-09-02 13:41 UTC (permalink / raw)
  To: make-wifi-fast, linux-wireless; +Cc: Toke Høiland-Jørgensen

The TXQ intermediate queues can cause packet reordering when more than
one flow is active to a single station. Since some of the wifi-specific
packet handling (notably sequence number and encryption handling) is
sensitive to re-ordering, things break if they are applied before the
TXQ.

This splits up the TX handlers and fast_xmit logic into two parts: An
early part and a late part. The former is applied before TXQ enqueue,
and the latter after dequeue. The non-TXQ path just applies both parts
at once.

Because fragments shouldn't be split up or reordered, the fragmentation
handler is run after dequeue. Any fragments are then kept in the TXQ and
on subsequent dequeues they take precedence over dequeueing from the FQ
structure.

This approach avoids having to scatter special cases for when TXQ is
enabled, at the cost of making the fast_xmit and TX handler code
slightly more complex.

Signed-off-by: Toke Høiland-Jørgensen <toke@toke.dk>
---
Changes since v5:
- Move the fragmentation handler to *after* TXQ dequeue. Fragments are
  kept in the TXQ for subsequent dequeues. This change also means that
  the changes to make some of the handlers fragmentation aware are no
  longer necessary.
- One of the TX stats updates in the fast path was done before the
  enqueue step; move that to xmit_fast_finish().
- Move the rate selection handler to after dequeue, so it's run closer
  to the time where the packet is actually transmitted.
 =20
 include/net/mac80211.h     |   2 +
 net/mac80211/ieee80211_i.h |   2 +
 net/mac80211/tx.c          | 207 +++++++++++++++++++++++++++++++++++----=
------
 3 files changed, 168 insertions(+), 43 deletions(-)

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index cca510a..9a6a3e9 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -715,6 +715,7 @@ enum mac80211_tx_info_flags {
  *	frame (PS-Poll or uAPSD).
  * @IEEE80211_TX_CTRL_RATE_INJECT: This frame is injected with rate info=
rmation
  * @IEEE80211_TX_CTRL_AMSDU: This frame is an A-MSDU frame
+ * @IEEE80211_TX_CTRL_FAST_XMIT: This frame is going through the fast_xm=
it path
  *
  * These flags are used in tx_info->control.flags.
  */
@@ -723,6 +724,7 @@ enum mac80211_tx_control_flags {
 	IEEE80211_TX_CTRL_PS_RESPONSE		=3D BIT(1),
 	IEEE80211_TX_CTRL_RATE_INJECT		=3D BIT(2),
 	IEEE80211_TX_CTRL_AMSDU			=3D BIT(3),
+	IEEE80211_TX_CTRL_FAST_XMIT		=3D BIT(4),
 };
=20
 /*
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index f56d342..de9991d 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -813,11 +813,13 @@ enum txq_info_flags {
  * @def_flow: used as a fallback flow when a packet destined to @tin has=
hes to
  *	a fq_flow which is already owned by a different tin
  * @def_cvars: codel vars for @def_flow
+ * @frags: used to keep fragments created after dequeue
  */
 struct txq_info {
 	struct fq_tin tin;
 	struct fq_flow def_flow;
 	struct codel_vars def_cvars;
+	struct sk_buff_head frags;
 	unsigned long flags;
=20
 	/* keep last! */
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 1d0746d..a3a4593 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -38,6 +38,12 @@
 #include "wme.h"
 #include "rate.h"
=20
+static int invoke_tx_handlers_late(struct ieee80211_tx_data *tx);
+static bool ieee80211_xmit_fast_finish(struct ieee80211_sub_if_data *sda=
ta,
+				       struct sta_info *sta,
+				       struct ieee80211_fast_tx *fast_tx,
+				       struct sk_buff *skb, bool xmit);
+
 /* misc utils */
=20
 static inline void ieee80211_tx_stats(struct net_device *dev, u32 len)
@@ -1403,6 +1409,7 @@ void ieee80211_txq_init(struct ieee80211_sub_if_dat=
a *sdata,
 	fq_tin_init(&txqi->tin);
 	fq_flow_init(&txqi->def_flow);
 	codel_vars_init(&txqi->def_cvars);
+	__skb_queue_head_init(&txqi->frags);
=20
 	txqi->txq.vif =3D &sdata->vif;
=20
@@ -1425,6 +1432,7 @@ void ieee80211_txq_purge(struct ieee80211_local *lo=
cal,
 	struct fq_tin *tin =3D &txqi->tin;
=20
 	fq_tin_reset(fq, tin, fq_skb_free_func);
+	ieee80211_purge_tx_queue(&local->hw, &txqi->frags);
 }
=20
 int ieee80211_txq_setup_flows(struct ieee80211_local *local)
@@ -1481,33 +1489,62 @@ struct sk_buff *ieee80211_tx_dequeue(struct ieee8=
0211_hw *hw,
 {
 	struct ieee80211_local *local =3D hw_to_local(hw);
 	struct txq_info *txqi =3D container_of(txq, struct txq_info, txq);
-	struct ieee80211_hdr *hdr;
 	struct sk_buff *skb =3D NULL;
 	struct fq *fq =3D &local->fq;
 	struct fq_tin *tin =3D &txqi->tin;
+	struct ieee80211_tx_info *info;
=20
 	spin_lock_bh(&fq->lock);
=20
 	if (test_bit(IEEE80211_TXQ_STOP, &txqi->flags))
 		goto out;
=20
+	/* Make sure fragments stay together. */
+	skb =3D __skb_dequeue(&txqi->frags);
+	if (skb)
+		goto out;
+
+begin:
 	skb =3D fq_tin_dequeue(fq, tin, fq_tin_dequeue_func);
 	if (!skb)
 		goto out;
=20
 	ieee80211_set_skb_vif(skb, txqi);
=20
-	hdr =3D (struct ieee80211_hdr *)skb->data;
-	if (txq->sta && ieee80211_is_data_qos(hdr->frame_control)) {
+	info =3D IEEE80211_SKB_CB(skb);
+	if (txq->sta && info->control.flags & IEEE80211_TX_CTRL_FAST_XMIT) {
 		struct sta_info *sta =3D container_of(txq->sta, struct sta_info,
 						    sta);
-		struct ieee80211_tx_info *info =3D IEEE80211_SKB_CB(skb);
+		struct ieee80211_fast_tx *fast_tx;
=20
-		hdr->seq_ctrl =3D ieee80211_tx_next_seq(sta, txq->tid);
-		if (test_bit(IEEE80211_TXQ_AMPDU, &txqi->flags))
-			info->flags |=3D IEEE80211_TX_CTL_AMPDU;
-		else
-			info->flags &=3D ~IEEE80211_TX_CTL_AMPDU;
+		fast_tx =3D rcu_dereference(sta->fast_tx);
+		if (WARN_ON(!fast_tx)) {
+			/* lost the fast_tx pointer while the packet was queued */
+			ieee80211_free_txskb(hw, skb);
+			goto begin;
+		}
+		ieee80211_xmit_fast_finish(sta->sdata, sta, fast_tx, skb, false);
+	} else {
+		struct ieee80211_tx_data tx =3D { };
+
+		__skb_queue_head_init(&tx.skbs);
+		tx.local =3D local;
+		tx.skb =3D skb;
+		if (txq->sta) {
+			tx.sta =3D container_of(txq->sta, struct sta_info, sta);
+			tx.sdata =3D tx.sta->sdata;
+		} else {
+			tx.sdata =3D vif_to_sdata(info->control.vif);
+		}
+
+
+		if (invoke_tx_handlers_late(&tx))
+			goto begin;
+
+		skb =3D __skb_dequeue(&tx.skbs);
+
+		if (!skb_queue_empty(&tx.skbs))
+			skb_queue_splice_tail(&tx.skbs, &txqi->frags);
 	}
=20
 out:
@@ -1521,6 +1558,47 @@ out:
 }
 EXPORT_SYMBOL(ieee80211_tx_dequeue);
=20
+static bool ieee80211_queue_skb(struct ieee80211_local *local,
+				struct ieee80211_sub_if_data *sdata,
+				struct sta_info *sta,
+				struct sk_buff *skb)
+{
+	struct ieee80211_tx_info *info =3D IEEE80211_SKB_CB(skb);
+	struct fq *fq =3D &local->fq;
+	struct ieee80211_vif *vif;
+	struct txq_info *txqi;
+	struct ieee80211_sta *pubsta;
+
+	if (!local->ops->wake_tx_queue ||
+	    sdata->vif.type =3D=3D NL80211_IFTYPE_MONITOR)
+		return false;
+
+	if (sta && sta->uploaded)
+		pubsta =3D &sta->sta;
+	else
+		pubsta =3D NULL;
+
+	if (sdata->vif.type =3D=3D NL80211_IFTYPE_AP_VLAN)
+		sdata =3D container_of(sdata->bss,
+				     struct ieee80211_sub_if_data, u.ap);
+
+	vif =3D &sdata->vif;
+	txqi =3D ieee80211_get_txq(local, vif, pubsta, skb);
+
+	if (!txqi)
+		return false;
+
+	info->control.vif =3D vif;
+
+	spin_lock_bh(&fq->lock);
+	ieee80211_txq_enqueue(local, txqi, skb);
+	spin_unlock_bh(&fq->lock);
+
+	drv_wake_tx_queue(local, txqi);
+
+	return true;
+}
+
 static bool ieee80211_tx_frags(struct ieee80211_local *local,
 			       struct ieee80211_vif *vif,
 			       struct ieee80211_sta *sta,
@@ -1528,9 +1606,7 @@ static bool ieee80211_tx_frags(struct ieee80211_loc=
al *local,
 			       bool txpending)
 {
 	struct ieee80211_tx_control control =3D {};
-	struct fq *fq =3D &local->fq;
 	struct sk_buff *skb, *tmp;
-	struct txq_info *txqi;
 	unsigned long flags;
=20
 	skb_queue_walk_safe(skbs, skb, tmp) {
@@ -1545,21 +1621,6 @@ static bool ieee80211_tx_frags(struct ieee80211_lo=
cal *local,
 		}
 #endif
=20
-		txqi =3D ieee80211_get_txq(local, vif, sta, skb);
-		if (txqi) {
-			info->control.vif =3D vif;
-
-			__skb_unlink(skb, skbs);
-
-			spin_lock_bh(&fq->lock);
-			ieee80211_txq_enqueue(local, txqi, skb);
-			spin_unlock_bh(&fq->lock);
-
-			drv_wake_tx_queue(local, txqi);
-
-			continue;
-		}
-
 		spin_lock_irqsave(&local->queue_stop_reason_lock, flags);
 		if (local->queue_stop_reasons[q] ||
 		    (!txpending && !skb_queue_empty(&local->pending[q]))) {
@@ -1680,10 +1741,13 @@ static bool __ieee80211_tx(struct ieee80211_local=
 *local,
 /*
  * Invoke TX handlers, return 0 on success and non-zero if the
  * frame was dropped or queued.
+ *
+ * The handlers are split into an early and late part. The latter is eve=
rything
+ * that can be sensitive to reordering, and will be deferred to after pa=
ckets
+ * are dequeued from the intermediate queues (when they are enabled).
  */
-static int invoke_tx_handlers(struct ieee80211_tx_data *tx)
+static int invoke_tx_handlers_early(struct ieee80211_tx_data *tx)
 {
-	struct ieee80211_tx_info *info =3D IEEE80211_SKB_CB(tx->skb);
 	ieee80211_tx_result res =3D TX_DROP;
=20
 #define CALL_TXH(txh) \
@@ -1697,7 +1761,28 @@ static int invoke_tx_handlers(struct ieee80211_tx_=
data *tx)
 	CALL_TXH(ieee80211_tx_h_check_assoc);
 	CALL_TXH(ieee80211_tx_h_ps_buf);
 	CALL_TXH(ieee80211_tx_h_check_control_port_protocol);
-	CALL_TXH(ieee80211_tx_h_select_key);
+
+ txh_done:
+	if (unlikely(res =3D=3D TX_DROP)) {
+		I802_DEBUG_INC(tx->local->tx_handlers_drop);
+		if (tx->skb)
+			ieee80211_free_txskb(&tx->local->hw, tx->skb);
+		else
+			ieee80211_purge_tx_queue(&tx->local->hw, &tx->skbs);
+		return -1;
+	} else if (unlikely(res =3D=3D TX_QUEUED)) {
+		I802_DEBUG_INC(tx->local->tx_handlers_queued);
+		return -1;
+	}
+
+	return 0;
+}
+
+static int invoke_tx_handlers_late(struct ieee80211_tx_data *tx)
+{
+	struct ieee80211_tx_info *info =3D IEEE80211_SKB_CB(tx->skb);
+	ieee80211_tx_result res =3D TX_DROP;
+
 	if (!ieee80211_hw_check(&tx->local->hw, HAS_RATE_CONTROL))
 		CALL_TXH(ieee80211_tx_h_rate_ctrl);
=20
@@ -1707,6 +1792,7 @@ static int invoke_tx_handlers(struct ieee80211_tx_d=
ata *tx)
 		goto txh_done;
 	}
=20
+	CALL_TXH(ieee80211_tx_h_select_key);
 	CALL_TXH(ieee80211_tx_h_michael_mic_add);
 	CALL_TXH(ieee80211_tx_h_sequence);
 	CALL_TXH(ieee80211_tx_h_fragment);
@@ -1733,6 +1819,15 @@ static int invoke_tx_handlers(struct ieee80211_tx_=
data *tx)
 	return 0;
 }
=20
+static int invoke_tx_handlers(struct ieee80211_tx_data *tx)
+{
+	int r =3D invoke_tx_handlers_early(tx);
+	if (r)
+		return r;
+
+	return invoke_tx_handlers_late(tx);
+}
+
 bool ieee80211_tx_prepare_skb(struct ieee80211_hw *hw,
 			      struct ieee80211_vif *vif, struct sk_buff *skb,
 			      int band, struct ieee80211_sta **sta)
@@ -1807,7 +1902,13 @@ static bool ieee80211_tx(struct ieee80211_sub_if_d=
ata *sdata,
 		info->hw_queue =3D
 			sdata->vif.hw_queue[skb_get_queue_mapping(skb)];
=20
-	if (!invoke_tx_handlers(&tx))
+	if (invoke_tx_handlers_early(&tx))
+		return false;
+
+	if (ieee80211_queue_skb(local, sdata, tx.sta, tx.skb))
+		return true;
+
+	if (!invoke_tx_handlers_late(&tx))
 		result =3D __ieee80211_tx(local, &tx.skbs, led_len,
 					tx.sta, txpending);
=20
@@ -3159,7 +3260,7 @@ out:
 }
=20
 static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata,
-				struct net_device *dev, struct sta_info *sta,
+				struct sta_info *sta,
 				struct ieee80211_fast_tx *fast_tx,
 				struct sk_buff *skb)
 {
@@ -3170,8 +3271,6 @@ static bool ieee80211_xmit_fast(struct ieee80211_su=
b_if_data *sdata,
 	struct ethhdr eth;
 	struct ieee80211_tx_info *info =3D IEEE80211_SKB_CB(skb);
 	struct ieee80211_hdr *hdr =3D (void *)fast_tx->hdr;
-	struct ieee80211_tx_data tx;
-	ieee80211_tx_result r;
 	struct tid_ampdu_tx *tid_tx =3D NULL;
 	u8 tid =3D IEEE80211_NUM_TIDS;
=20
@@ -3210,8 +3309,6 @@ static bool ieee80211_xmit_fast(struct ieee80211_su=
b_if_data *sdata,
 			return true;
 	}
=20
-	ieee80211_tx_stats(dev, skb->len + extra_head);
-
 	if ((hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_QOS_DATA)) &&
 	    ieee80211_amsdu_aggregate(sdata, sta, fast_tx, skb))
 		return true;
@@ -3240,11 +3337,32 @@ static bool ieee80211_xmit_fast(struct ieee80211_=
sub_if_data *sdata,
 	info->flags =3D IEEE80211_TX_CTL_FIRST_FRAGMENT |
 		      IEEE80211_TX_CTL_DONTFRAG |
 		      (tid_tx ? IEEE80211_TX_CTL_AMPDU : 0);
+	info->control.flags =3D IEEE80211_TX_CTRL_FAST_XMIT;
+
+	if (ieee80211_queue_skb(local, sdata, sta, skb))
+		return true;
+
+	return ieee80211_xmit_fast_finish(sdata, sta, fast_tx, skb, true);
+}
+
+static bool ieee80211_xmit_fast_finish(struct ieee80211_sub_if_data *sda=
ta,
+				       struct sta_info *sta,
+				       struct ieee80211_fast_tx *fast_tx,
+				       struct sk_buff *skb, bool xmit)
+{
+	struct ieee80211_local *local =3D sdata->local;
+	struct ieee80211_tx_info *info =3D IEEE80211_SKB_CB(skb);
+	struct ieee80211_hdr *hdr =3D (void *)skb->data;
+	struct ieee80211_tx_data tx;
+	ieee80211_tx_result r;
+	u8 tid =3D IEEE80211_NUM_TIDS;
+
+	ieee80211_tx_stats(skb->dev, skb->len);
=20
 	if (hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_QOS_DATA)) {
+		tid =3D skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK;
 		*ieee80211_get_qos_ctl(hdr) =3D tid;
-		if (!sta->sta.txq[0])
-			hdr->seq_ctrl =3D ieee80211_tx_next_seq(sta, tid);
+		hdr->seq_ctrl =3D ieee80211_tx_next_seq(sta, tid);
 	} else {
 		info->flags |=3D IEEE80211_TX_CTL_ASSIGN_SEQ;
 		hdr->seq_ctrl =3D cpu_to_le16(sdata->sequence_number);
@@ -3309,12 +3427,15 @@ static bool ieee80211_xmit_fast(struct ieee80211_=
sub_if_data *sdata,
 		}
 	}
=20
-	if (sdata->vif.type =3D=3D NL80211_IFTYPE_AP_VLAN)
-		sdata =3D container_of(sdata->bss,
-				     struct ieee80211_sub_if_data, u.ap);
+	if (xmit) {
+		if (sdata->vif.type =3D=3D NL80211_IFTYPE_AP_VLAN)
+			sdata =3D container_of(sdata->bss,
+					struct ieee80211_sub_if_data, u.ap);
+
+		__skb_queue_tail(&tx.skbs, skb);
+		ieee80211_tx_frags(local, &sdata->vif, &sta->sta, &tx.skbs, false);
+	}
=20
-	__skb_queue_tail(&tx.skbs, skb);
-	ieee80211_tx_frags(local, &sdata->vif, &sta->sta, &tx.skbs, false);
 	return true;
 }
=20
@@ -3342,7 +3463,7 @@ void __ieee80211_subif_start_xmit(struct sk_buff *s=
kb,
 		fast_tx =3D rcu_dereference(sta->fast_tx);
=20
 		if (fast_tx &&
-		    ieee80211_xmit_fast(sdata, dev, sta, fast_tx, skb))
+		    ieee80211_xmit_fast(sdata, sta, fast_tx, skb))
 			goto out;
 	}
=20
--=20
2.9.3

^ permalink raw reply related	[flat|nested] 51+ messages in thread

* Re: [PATCH v6] mac80211: Move reorder-sensitive TX handlers to after TXQ dequeue.
  2016-09-02 13:41         ` [PATCH v6] " Toke Høiland-Jørgensen
@ 2016-09-02 14:44           ` Toke Høiland-Jørgensen
  2016-09-05 11:30           ` [PATCH v7] " Toke Høiland-Jørgensen
  1 sibling, 0 replies; 51+ messages in thread
From: Toke Høiland-Jørgensen @ 2016-09-02 14:44 UTC (permalink / raw)
  To: make-wifi-fast; +Cc: linux-wireless

Toke Høiland-Jørgensen <toke@toke.dk> writes:

> The TXQ intermediate queues can cause packet reordering when more than
> one flow is active to a single station. Since some of the wifi-specific
> packet handling (notably sequence number and encryption handling) is
> sensitive to re-ordering, things break if they are applied before the
> TXQ.
>
> This splits up the TX handlers and fast_xmit logic into two parts: An
> early part and a late part. The former is applied before TXQ enqueue,
> and the latter after dequeue. The non-TXQ path just applies both parts
> at once.
>
> Because fragments shouldn't be split up or reordered, the fragmentation
> handler is run after dequeue. Any fragments are then kept in the TXQ and
> on subsequent dequeues they take precedence over dequeueing from the FQ
> structure.
>
> This approach avoids having to scatter special cases for when TXQ is
> enabled, at the cost of making the fast_xmit and TX handler code
> slightly more complex.
>
> Signed-off-by: Toke Høiland-Jørgensen <toke@toke.dk>
> ---
> Changes since v5:
> - Move the fragmentation handler to *after* TXQ dequeue. Fragments are
>   kept in the TXQ for subsequent dequeues. This change also means that
>   the changes to make some of the handlers fragmentation aware are no
>   longer necessary.
> - One of the TX stats updates in the fast path was done before the
>   enqueue step; move that to xmit_fast_finish().
> - Move the rate selection handler to after dequeue, so it's run closer
>   to the time where the packet is actually transmitted.

Found one other thing that needs fixing shortly after posting this, but
figure that I'm probably not done anyway, so will leave it for the next
round. :)

-Toke

^ permalink raw reply	[flat|nested] 51+ messages in thread

* [PATCH v7] mac80211: Move reorder-sensitive TX handlers to after TXQ dequeue.
  2016-09-02 13:41         ` [PATCH v6] " Toke Høiland-Jørgensen
  2016-09-02 14:44           ` Toke Høiland-Jørgensen
@ 2016-09-05 11:30           ` Toke Høiland-Jørgensen
  2016-09-05 17:49             ` Felix Fietkau
                               ` (2 more replies)
  1 sibling, 3 replies; 51+ messages in thread
From: Toke Høiland-Jørgensen @ 2016-09-05 11:30 UTC (permalink / raw)
  To: make-wifi-fast, linux-wireless; +Cc: Toke Høiland-Jørgensen

The TXQ intermediate queues can cause packet reordering when more than
one flow is active to a single station. Since some of the wifi-specific
packet handling (notably sequence number and encryption handling) is
sensitive to re-ordering, things break if they are applied before the
TXQ.

This splits up the TX handlers and fast_xmit logic into two parts: An
early part and a late part. The former is applied before TXQ enqueue,
and the latter after dequeue. The non-TXQ path just applies both parts
at once.

Because fragments shouldn't be split up or reordered, the fragmentation
handler is run after dequeue. Any fragments are then kept in the TXQ and
on subsequent dequeues they take precedence over dequeueing from the FQ
structure.

This approach avoids having to scatter special cases for when TXQ is
enabled, at the cost of making the fast_xmit and TX handler code
slightly more complex.

Signed-off-by: Toke Høiland-Jørgensen <toke@toke.dk>
---
Changes since v6:
  - Invoking the rate control handler can cause packets to be generated
    (for establishing a BA session). This can cause a deadlock because
    dequeue can happen while sta->lock is held. So this version moves
    the rate control handler back before the intermediate queue step.
  - Fix sequence number allocation on the slow path.
 =20
 include/net/mac80211.h     |   2 +
 net/mac80211/ieee80211_i.h |   2 +
 net/mac80211/tx.c          | 250 ++++++++++++++++++++++++++++++++++-----=
------
 3 files changed, 192 insertions(+), 62 deletions(-)

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index cca510a..9a6a3e9 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -715,6 +715,7 @@ enum mac80211_tx_info_flags {
  *	frame (PS-Poll or uAPSD).
  * @IEEE80211_TX_CTRL_RATE_INJECT: This frame is injected with rate info=
rmation
  * @IEEE80211_TX_CTRL_AMSDU: This frame is an A-MSDU frame
+ * @IEEE80211_TX_CTRL_FAST_XMIT: This frame is going through the fast_xm=
it path
  *
  * These flags are used in tx_info->control.flags.
  */
@@ -723,6 +724,7 @@ enum mac80211_tx_control_flags {
 	IEEE80211_TX_CTRL_PS_RESPONSE		=3D BIT(1),
 	IEEE80211_TX_CTRL_RATE_INJECT		=3D BIT(2),
 	IEEE80211_TX_CTRL_AMSDU			=3D BIT(3),
+	IEEE80211_TX_CTRL_FAST_XMIT		=3D BIT(4),
 };
=20
 /*
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 9211cce..d36f3b1 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -813,11 +813,13 @@ enum txq_info_flags {
  * @def_flow: used as a fallback flow when a packet destined to @tin has=
hes to
  *	a fq_flow which is already owned by a different tin
  * @def_cvars: codel vars for @def_flow
+ * @frags: used to keep fragments created after dequeue
  */
 struct txq_info {
 	struct fq_tin tin;
 	struct fq_flow def_flow;
 	struct codel_vars def_cvars;
+	struct sk_buff_head frags;
 	unsigned long flags;
=20
 	/* keep last! */
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index efc38e7..94f38cc 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -38,6 +38,12 @@
 #include "wme.h"
 #include "rate.h"
=20
+static int invoke_tx_handlers_late(struct ieee80211_tx_data *tx);
+static bool ieee80211_xmit_fast_finish(struct ieee80211_sub_if_data *sda=
ta,
+				       struct sta_info *sta,
+				       struct ieee80211_fast_tx *fast_tx,
+				       struct sk_buff *skb);
+
 /* misc utils */
=20
 static inline void ieee80211_tx_stats(struct net_device *dev, u32 len)
@@ -853,8 +859,7 @@ ieee80211_tx_h_sequence(struct ieee80211_tx_data *tx)
 	tid =3D *qc & IEEE80211_QOS_CTL_TID_MASK;
 	tx->sta->tx_stats.msdu[tid]++;
=20
-	if (!tx->sta->sta.txq[0])
-		hdr->seq_ctrl =3D ieee80211_tx_next_seq(tx->sta, tid);
+	hdr->seq_ctrl =3D ieee80211_tx_next_seq(tx->sta, tid);
=20
 	return TX_CONTINUE;
 }
@@ -1403,6 +1408,7 @@ void ieee80211_txq_init(struct ieee80211_sub_if_dat=
a *sdata,
 	fq_tin_init(&txqi->tin);
 	fq_flow_init(&txqi->def_flow);
 	codel_vars_init(&txqi->def_cvars);
+	__skb_queue_head_init(&txqi->frags);
=20
 	txqi->txq.vif =3D &sdata->vif;
=20
@@ -1425,6 +1431,7 @@ void ieee80211_txq_purge(struct ieee80211_local *lo=
cal,
 	struct fq_tin *tin =3D &txqi->tin;
=20
 	fq_tin_reset(fq, tin, fq_skb_free_func);
+	ieee80211_purge_tx_queue(&local->hw, &txqi->frags);
 }
=20
 int ieee80211_txq_setup_flows(struct ieee80211_local *local)
@@ -1481,33 +1488,61 @@ struct sk_buff *ieee80211_tx_dequeue(struct ieee8=
0211_hw *hw,
 {
 	struct ieee80211_local *local =3D hw_to_local(hw);
 	struct txq_info *txqi =3D container_of(txq, struct txq_info, txq);
-	struct ieee80211_hdr *hdr;
 	struct sk_buff *skb =3D NULL;
 	struct fq *fq =3D &local->fq;
 	struct fq_tin *tin =3D &txqi->tin;
+	struct ieee80211_tx_info *info;
=20
 	spin_lock_bh(&fq->lock);
=20
 	if (test_bit(IEEE80211_TXQ_STOP, &txqi->flags))
 		goto out;
=20
+	/* Make sure fragments stay together. */
+	skb =3D __skb_dequeue(&txqi->frags);
+	if (skb)
+		goto out;
+
+begin:
 	skb =3D fq_tin_dequeue(fq, tin, fq_tin_dequeue_func);
 	if (!skb)
 		goto out;
=20
 	ieee80211_set_skb_vif(skb, txqi);
=20
-	hdr =3D (struct ieee80211_hdr *)skb->data;
-	if (txq->sta && ieee80211_is_data_qos(hdr->frame_control)) {
+	info =3D IEEE80211_SKB_CB(skb);
+	if (txq->sta && info->control.flags & IEEE80211_TX_CTRL_FAST_XMIT) {
 		struct sta_info *sta =3D container_of(txq->sta, struct sta_info,
 						    sta);
-		struct ieee80211_tx_info *info =3D IEEE80211_SKB_CB(skb);
+		struct ieee80211_fast_tx *fast_tx;
=20
-		hdr->seq_ctrl =3D ieee80211_tx_next_seq(sta, txq->tid);
-		if (test_bit(IEEE80211_TXQ_AMPDU, &txqi->flags))
-			info->flags |=3D IEEE80211_TX_CTL_AMPDU;
-		else
-			info->flags &=3D ~IEEE80211_TX_CTL_AMPDU;
+		fast_tx =3D rcu_dereference(sta->fast_tx);
+		if (WARN_ON(!fast_tx)) {
+			/* lost fast_tx pointer while the packet was queued */
+			ieee80211_free_txskb(hw, skb);
+			goto begin;
+		}
+		ieee80211_xmit_fast_finish(sta->sdata, sta, fast_tx, skb);
+	} else {
+		struct ieee80211_tx_data tx =3D { };
+
+		__skb_queue_head_init(&tx.skbs);
+		tx.local =3D local;
+		tx.skb =3D skb;
+		if (txq->sta) {
+			tx.sta =3D container_of(txq->sta, struct sta_info, sta);
+			tx.sdata =3D tx.sta->sdata;
+		} else {
+			tx.sdata =3D vif_to_sdata(info->control.vif);
+		}
+
+		if (invoke_tx_handlers_late(&tx))
+			goto begin;
+
+		skb =3D __skb_dequeue(&tx.skbs);
+
+		if (!skb_queue_empty(&tx.skbs))
+			skb_queue_splice_tail(&tx.skbs, &txqi->frags);
 	}
=20
 out:
@@ -1521,6 +1556,47 @@ out:
 }
 EXPORT_SYMBOL(ieee80211_tx_dequeue);
=20
+static bool ieee80211_queue_skb(struct ieee80211_local *local,
+				struct ieee80211_sub_if_data *sdata,
+				struct sta_info *sta,
+				struct sk_buff *skb)
+{
+	struct ieee80211_tx_info *info =3D IEEE80211_SKB_CB(skb);
+	struct fq *fq =3D &local->fq;
+	struct ieee80211_vif *vif;
+	struct txq_info *txqi;
+	struct ieee80211_sta *pubsta;
+
+	if (!local->ops->wake_tx_queue ||
+	    sdata->vif.type =3D=3D NL80211_IFTYPE_MONITOR)
+		return false;
+
+	if (sta && sta->uploaded)
+		pubsta =3D &sta->sta;
+	else
+		pubsta =3D NULL;
+
+	if (sdata->vif.type =3D=3D NL80211_IFTYPE_AP_VLAN)
+		sdata =3D container_of(sdata->bss,
+				     struct ieee80211_sub_if_data, u.ap);
+
+	vif =3D &sdata->vif;
+	txqi =3D ieee80211_get_txq(local, vif, pubsta, skb);
+
+	if (!txqi)
+		return false;
+
+	info->control.vif =3D vif;
+
+	spin_lock_bh(&fq->lock);
+	ieee80211_txq_enqueue(local, txqi, skb);
+	spin_unlock_bh(&fq->lock);
+
+	drv_wake_tx_queue(local, txqi);
+
+	return true;
+}
+
 static bool ieee80211_tx_frags(struct ieee80211_local *local,
 			       struct ieee80211_vif *vif,
 			       struct ieee80211_sta *sta,
@@ -1528,9 +1604,7 @@ static bool ieee80211_tx_frags(struct ieee80211_loc=
al *local,
 			       bool txpending)
 {
 	struct ieee80211_tx_control control =3D {};
-	struct fq *fq =3D &local->fq;
 	struct sk_buff *skb, *tmp;
-	struct txq_info *txqi;
 	unsigned long flags;
=20
 	skb_queue_walk_safe(skbs, skb, tmp) {
@@ -1545,21 +1619,6 @@ static bool ieee80211_tx_frags(struct ieee80211_lo=
cal *local,
 		}
 #endif
=20
-		txqi =3D ieee80211_get_txq(local, vif, sta, skb);
-		if (txqi) {
-			info->control.vif =3D vif;
-
-			__skb_unlink(skb, skbs);
-
-			spin_lock_bh(&fq->lock);
-			ieee80211_txq_enqueue(local, txqi, skb);
-			spin_unlock_bh(&fq->lock);
-
-			drv_wake_tx_queue(local, txqi);
-
-			continue;
-		}
-
 		spin_lock_irqsave(&local->queue_stop_reason_lock, flags);
 		if (local->queue_stop_reasons[q] ||
 		    (!txpending && !skb_queue_empty(&local->pending[q]))) {
@@ -1680,10 +1739,13 @@ static bool __ieee80211_tx(struct ieee80211_local=
 *local,
 /*
  * Invoke TX handlers, return 0 on success and non-zero if the
  * frame was dropped or queued.
+ *
+ * The handlers are split into an early and late part. The latter is eve=
rything
+ * that can be sensitive to reordering, and will be deferred to after pa=
ckets
+ * are dequeued from the intermediate queues (when they are enabled).
  */
-static int invoke_tx_handlers(struct ieee80211_tx_data *tx)
+static int invoke_tx_handlers_early(struct ieee80211_tx_data *tx)
 {
-	struct ieee80211_tx_info *info =3D IEEE80211_SKB_CB(tx->skb);
 	ieee80211_tx_result res =3D TX_DROP;
=20
 #define CALL_TXH(txh) \
@@ -1697,16 +1759,42 @@ static int invoke_tx_handlers(struct ieee80211_tx=
_data *tx)
 	CALL_TXH(ieee80211_tx_h_check_assoc);
 	CALL_TXH(ieee80211_tx_h_ps_buf);
 	CALL_TXH(ieee80211_tx_h_check_control_port_protocol);
-	CALL_TXH(ieee80211_tx_h_select_key);
+
 	if (!ieee80211_hw_check(&tx->local->hw, HAS_RATE_CONTROL))
 		CALL_TXH(ieee80211_tx_h_rate_ctrl);
=20
+ txh_done:
+	if (unlikely(res =3D=3D TX_DROP)) {
+		I802_DEBUG_INC(tx->local->tx_handlers_drop);
+		if (tx->skb)
+			ieee80211_free_txskb(&tx->local->hw, tx->skb);
+		else
+			ieee80211_purge_tx_queue(&tx->local->hw, &tx->skbs);
+		return -1;
+	} else if (unlikely(res =3D=3D TX_QUEUED)) {
+		I802_DEBUG_INC(tx->local->tx_handlers_queued);
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Late handlers can be called while the sta lock is held. Handlers that=
 can
+ * cause packets to be generated will cause deadlock!
+ */
+static int invoke_tx_handlers_late(struct ieee80211_tx_data *tx)
+{
+	struct ieee80211_tx_info *info =3D IEEE80211_SKB_CB(tx->skb);
+	ieee80211_tx_result res =3D TX_CONTINUE;
+
 	if (unlikely(info->flags & IEEE80211_TX_INTFL_RETRANSMISSION)) {
 		__skb_queue_tail(&tx->skbs, tx->skb);
 		tx->skb =3D NULL;
 		goto txh_done;
 	}
=20
+	CALL_TXH(ieee80211_tx_h_select_key);
 	CALL_TXH(ieee80211_tx_h_michael_mic_add);
 	CALL_TXH(ieee80211_tx_h_sequence);
 	CALL_TXH(ieee80211_tx_h_fragment);
@@ -1733,6 +1821,15 @@ static int invoke_tx_handlers(struct ieee80211_tx_=
data *tx)
 	return 0;
 }
=20
+static int invoke_tx_handlers(struct ieee80211_tx_data *tx)
+{
+	int r =3D invoke_tx_handlers_early(tx);
+	if (r)
+		return r;
+
+	return invoke_tx_handlers_late(tx);
+}
+
 bool ieee80211_tx_prepare_skb(struct ieee80211_hw *hw,
 			      struct ieee80211_vif *vif, struct sk_buff *skb,
 			      int band, struct ieee80211_sta **sta)
@@ -1807,7 +1904,13 @@ static bool ieee80211_tx(struct ieee80211_sub_if_d=
ata *sdata,
 		info->hw_queue =3D
 			sdata->vif.hw_queue[skb_get_queue_mapping(skb)];
=20
-	if (!invoke_tx_handlers(&tx))
+	if (invoke_tx_handlers_early(&tx))
+		return false;
+
+	if (ieee80211_queue_skb(local, sdata, tx.sta, tx.skb))
+		return true;
+
+	if (!invoke_tx_handlers_late(&tx))
 		result =3D __ieee80211_tx(local, &tx.skbs, led_len,
 					tx.sta, txpending);
=20
@@ -3159,7 +3262,7 @@ out:
 }
=20
 static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata,
-				struct net_device *dev, struct sta_info *sta,
+				struct sta_info *sta,
 				struct ieee80211_fast_tx *fast_tx,
 				struct sk_buff *skb)
 {
@@ -3170,9 +3273,9 @@ static bool ieee80211_xmit_fast(struct ieee80211_su=
b_if_data *sdata,
 	struct ethhdr eth;
 	struct ieee80211_tx_info *info =3D IEEE80211_SKB_CB(skb);
 	struct ieee80211_hdr *hdr =3D (void *)fast_tx->hdr;
-	struct ieee80211_tx_data tx;
-	ieee80211_tx_result r;
 	struct tid_ampdu_tx *tid_tx =3D NULL;
+	ieee80211_tx_result r;
+	struct ieee80211_tx_data tx;
 	u8 tid =3D IEEE80211_NUM_TIDS;
=20
 	/* control port protocol needs a lot of special handling */
@@ -3210,8 +3313,6 @@ static bool ieee80211_xmit_fast(struct ieee80211_su=
b_if_data *sdata,
 			return true;
 	}
=20
-	ieee80211_tx_stats(dev, skb->len + extra_head);
-
 	if ((hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_QOS_DATA)) &&
 	    ieee80211_amsdu_aggregate(sdata, sta, fast_tx, skb))
 		return true;
@@ -3240,24 +3341,7 @@ static bool ieee80211_xmit_fast(struct ieee80211_s=
ub_if_data *sdata,
 	info->flags =3D IEEE80211_TX_CTL_FIRST_FRAGMENT |
 		      IEEE80211_TX_CTL_DONTFRAG |
 		      (tid_tx ? IEEE80211_TX_CTL_AMPDU : 0);
-
-	if (hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_QOS_DATA)) {
-		*ieee80211_get_qos_ctl(hdr) =3D tid;
-		if (!sta->sta.txq[0])
-			hdr->seq_ctrl =3D ieee80211_tx_next_seq(sta, tid);
-	} else {
-		info->flags |=3D IEEE80211_TX_CTL_ASSIGN_SEQ;
-		hdr->seq_ctrl =3D cpu_to_le16(sdata->sequence_number);
-		sdata->sequence_number +=3D 0x10;
-	}
-
-	if (skb_shinfo(skb)->gso_size)
-		sta->tx_stats.msdu[tid] +=3D
-			DIV_ROUND_UP(skb->len, skb_shinfo(skb)->gso_size);
-	else
-		sta->tx_stats.msdu[tid]++;
-
-	info->hw_queue =3D sdata->vif.hw_queue[skb_get_queue_mapping(skb)];
+	info->control.flags =3D IEEE80211_TX_CTRL_FAST_XMIT;
=20
 	__skb_queue_head_init(&tx.skbs);
=20
@@ -3283,6 +3367,54 @@ static bool ieee80211_xmit_fast(struct ieee80211_s=
ub_if_data *sdata,
 		}
 	}
=20
+	if (ieee80211_queue_skb(local, sdata, sta, skb))
+		return true;
+
+	ieee80211_xmit_fast_finish(sdata, sta, fast_tx, skb);
+
+	if (sdata->vif.type =3D=3D NL80211_IFTYPE_AP_VLAN)
+		sdata =3D container_of(sdata->bss,
+				     struct ieee80211_sub_if_data, u.ap);
+
+	__skb_queue_tail(&tx.skbs, skb);
+	ieee80211_tx_frags(local, &sdata->vif, &sta->sta, &tx.skbs, false);
+
+	return true;
+}
+
+/*
+ * Can be called while the sta lock is held. Anything that can cause pac=
kets to
+ * be generated will cause deadlock!
+ */
+static bool ieee80211_xmit_fast_finish(struct ieee80211_sub_if_data *sda=
ta,
+				       struct sta_info *sta,
+				       struct ieee80211_fast_tx *fast_tx,
+				       struct sk_buff *skb)
+{
+	struct ieee80211_tx_info *info =3D IEEE80211_SKB_CB(skb);
+	struct ieee80211_hdr *hdr =3D (void *)skb->data;
+	u8 tid =3D IEEE80211_NUM_TIDS;
+
+	ieee80211_tx_stats(skb->dev, skb->len);
+
+	if (hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_QOS_DATA)) {
+		tid =3D skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK;
+		*ieee80211_get_qos_ctl(hdr) =3D tid;
+		hdr->seq_ctrl =3D ieee80211_tx_next_seq(sta, tid);
+	} else {
+		info->flags |=3D IEEE80211_TX_CTL_ASSIGN_SEQ;
+		hdr->seq_ctrl =3D cpu_to_le16(sdata->sequence_number);
+		sdata->sequence_number +=3D 0x10;
+	}
+
+	if (skb_shinfo(skb)->gso_size)
+		sta->tx_stats.msdu[tid] +=3D
+			DIV_ROUND_UP(skb->len, skb_shinfo(skb)->gso_size);
+	else
+		sta->tx_stats.msdu[tid]++;
+
+	info->hw_queue =3D sdata->vif.hw_queue[skb_get_queue_mapping(skb)];
+
 	/* statistics normally done by ieee80211_tx_h_stats (but that
 	 * has to consider fragmentation, so is more complex)
 	 */
@@ -3309,12 +3441,6 @@ static bool ieee80211_xmit_fast(struct ieee80211_s=
ub_if_data *sdata,
 		}
 	}
=20
-	if (sdata->vif.type =3D=3D NL80211_IFTYPE_AP_VLAN)
-		sdata =3D container_of(sdata->bss,
-				     struct ieee80211_sub_if_data, u.ap);
-
-	__skb_queue_tail(&tx.skbs, skb);
-	ieee80211_tx_frags(local, &sdata->vif, &sta->sta, &tx.skbs, false);
 	return true;
 }
=20
@@ -3342,7 +3468,7 @@ void __ieee80211_subif_start_xmit(struct sk_buff *s=
kb,
 		fast_tx =3D rcu_dereference(sta->fast_tx);
=20
 		if (fast_tx &&
-		    ieee80211_xmit_fast(sdata, dev, sta, fast_tx, skb))
+		    ieee80211_xmit_fast(sdata, sta, fast_tx, skb))
 			goto out;
 	}
=20
--=20
2.9.3

^ permalink raw reply related	[flat|nested] 51+ messages in thread

* Re: [PATCH v7] mac80211: Move reorder-sensitive TX handlers to after TXQ dequeue.
  2016-09-05 11:30           ` [PATCH v7] " Toke Høiland-Jørgensen
@ 2016-09-05 17:49             ` Felix Fietkau
  2016-09-05 17:59               ` Toke Høiland-Jørgensen
  2016-09-06 11:43             ` Toke Høiland-Jørgensen
  2016-09-06 11:44             ` [PATCH v8] " Toke Høiland-Jørgensen
  2 siblings, 1 reply; 51+ messages in thread
From: Felix Fietkau @ 2016-09-05 17:49 UTC (permalink / raw)
  To: Toke Høiland-Jørgensen, make-wifi-fast, linux-wireless

On 2016-09-05 13:30, Toke Høiland-Jørgensen wrote:
> The TXQ intermediate queues can cause packet reordering when more than
> one flow is active to a single station. Since some of the wifi-specific
> packet handling (notably sequence number and encryption handling) is
> sensitive to re-ordering, things break if they are applied before the
> TXQ.
> 
> This splits up the TX handlers and fast_xmit logic into two parts: An
> early part and a late part. The former is applied before TXQ enqueue,
> and the latter after dequeue. The non-TXQ path just applies both parts
> at once.
> 
> Because fragments shouldn't be split up or reordered, the fragmentation
> handler is run after dequeue. Any fragments are then kept in the TXQ and
> on subsequent dequeues they take precedence over dequeueing from the FQ
> structure.
> 
> This approach avoids having to scatter special cases for when TXQ is
> enabled, at the cost of making the fast_xmit and TX handler code
> slightly more complex.
In my test, this one completely breaks ath9k with the txq patch.
One or two packets go through, then tx stalls completely.

- Felix

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH v7] mac80211: Move reorder-sensitive TX handlers to after TXQ dequeue.
  2016-09-05 17:49             ` Felix Fietkau
@ 2016-09-05 17:59               ` Toke Høiland-Jørgensen
  2016-09-05 18:44                 ` Felix Fietkau
  0 siblings, 1 reply; 51+ messages in thread
From: Toke Høiland-Jørgensen @ 2016-09-05 17:59 UTC (permalink / raw)
  To: Felix Fietkau; +Cc: make-wifi-fast, linux-wireless

Felix Fietkau <nbd@nbd.name> writes:

> On 2016-09-05 13:30, Toke Høiland-Jørgensen wrote:
>> The TXQ intermediate queues can cause packet reordering when more than
>> one flow is active to a single station. Since some of the wifi-specific
>> packet handling (notably sequence number and encryption handling) is
>> sensitive to re-ordering, things break if they are applied before the
>> TXQ.
>>
>> This splits up the TX handlers and fast_xmit logic into two parts: An
>> early part and a late part. The former is applied before TXQ enqueue,
>> and the latter after dequeue. The non-TXQ path just applies both parts
>> at once.
>>
>> Because fragments shouldn't be split up or reordered, the fragmentation
>> handler is run after dequeue. Any fragments are then kept in the TXQ and
>> on subsequent dequeues they take precedence over dequeueing from the FQ
>> structure.
>>
>> This approach avoids having to scatter special cases for when TXQ is
>> enabled, at the cost of making the fast_xmit and TX handler code
>> slightly more complex.
> In my test, this one completely breaks ath9k with the txq patch.
> One or two packets go through, then tx stalls completely.

I assume you are testing on LEDE? It requires a change to work with the
patch in the LEDE tree that puts hdrlen into ieee80211_tx_data. Did you
fix that? Otherwise multicast (and possibly other things) will break
badly.

I have a version that should work with LEDE here:

https://kau.toke.dk/git/lede/tree/package/kernel/mac80211/patches/346-mac80211-move-reorder-sensitive-tx-handlers-to-after-TXQ-dequeue.patch

-Toke

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH v7] mac80211: Move reorder-sensitive TX handlers to after TXQ dequeue.
  2016-09-05 17:59               ` Toke Høiland-Jørgensen
@ 2016-09-05 18:44                 ` Felix Fietkau
  0 siblings, 0 replies; 51+ messages in thread
From: Felix Fietkau @ 2016-09-05 18:44 UTC (permalink / raw)
  To: Toke Høiland-Jørgensen; +Cc: make-wifi-fast, linux-wireless

On 2016-09-05 19:59, Toke Høiland-Jørgensen wrote:
> Felix Fietkau <nbd@nbd.name> writes:
> 
>> On 2016-09-05 13:30, Toke Høiland-Jørgensen wrote:
>>> The TXQ intermediate queues can cause packet reordering when more than
>>> one flow is active to a single station. Since some of the wifi-specific
>>> packet handling (notably sequence number and encryption handling) is
>>> sensitive to re-ordering, things break if they are applied before the
>>> TXQ.
>>> 
>>> This splits up the TX handlers and fast_xmit logic into two parts: An
>>> early part and a late part. The former is applied before TXQ enqueue,
>>> and the latter after dequeue. The non-TXQ path just applies both parts
>>> at once.
>>> 
>>> Because fragments shouldn't be split up or reordered, the fragmentation
>>> handler is run after dequeue. Any fragments are then kept in the TXQ and
>>> on subsequent dequeues they take precedence over dequeueing from the FQ
>>> structure.
>>> 
>>> This approach avoids having to scatter special cases for when TXQ is
>>> enabled, at the cost of making the fast_xmit and TX handler code
>>> slightly more complex.
>> In my test, this one completely breaks ath9k with the txq patch.
>> One or two packets go through, then tx stalls completely.
> 
> I assume you are testing on LEDE? It requires a change to work with the
> patch in the LEDE tree that puts hdrlen into ieee80211_tx_data. Did you
> fix that? Otherwise multicast (and possibly other things) will break
> badly.
You're right, I missed that.

> I have a version that should work with LEDE here:
> 
> https://kau.toke.dk/git/lede/tree/package/kernel/mac80211/patches/346-mac80211-move-reorder-sensitive-tx-handlers-to-after-TXQ-dequeue.patch
That one works fine in my test.

Thanks,

- Felix

^ permalink raw reply	[flat|nested] 51+ messages in thread

* [PATCH v7] mac80211: Move reorder-sensitive TX handlers to after TXQ dequeue.
  2016-09-05 11:30           ` [PATCH v7] " Toke Høiland-Jørgensen
  2016-09-05 17:49             ` Felix Fietkau
@ 2016-09-06 11:43             ` Toke Høiland-Jørgensen
  2016-09-06 11:45               ` Toke Høiland-Jørgensen
  2016-09-06 11:44             ` [PATCH v8] " Toke Høiland-Jørgensen
  2 siblings, 1 reply; 51+ messages in thread
From: Toke Høiland-Jørgensen @ 2016-09-06 11:43 UTC (permalink / raw)
  To: make-wifi-fast, linux-wireless; +Cc: Toke Høiland-Jørgensen

The TXQ intermediate queues can cause packet reordering when more than
one flow is active to a single station. Since some of the wifi-specific
packet handling (notably sequence number and encryption handling) is
sensitive to re-ordering, things break if they are applied before the
TXQ.

This splits up the TX handlers and fast_xmit logic into two parts: An
early part and a late part. The former is applied before TXQ enqueue,
and the latter after dequeue. The non-TXQ path just applies both parts
at once.

Because fragments shouldn't be split up or reordered, the fragmentation
handler is run after dequeue. Any fragments are then kept in the TXQ and
on subsequent dequeues they take precedence over dequeueing from the FQ
structure.

This approach avoids having to scatter special cases for when TXQ is
enabled, at the cost of making the fast_xmit and TX handler code
slightly more complex.

Signed-off-by: Toke Høiland-Jørgensen <toke@toke.dk>
---
Changes since v6:
  - Invoking the rate control handler can cause packets to be generated
    (for establishing a BA session). This can cause a deadlock because
    dequeue can happen while sta->lock is held. So this version moves
    the rate control handler back before the intermediate queue step.
  - Fix sequence number allocation on the slow path.
 =20
 include/net/mac80211.h     |   2 +
 net/mac80211/ieee80211_i.h |   2 +
 net/mac80211/tx.c          | 250 ++++++++++++++++++++++++++++++++++-----=
------
 3 files changed, 192 insertions(+), 62 deletions(-)

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index cca510a..9a6a3e9 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -715,6 +715,7 @@ enum mac80211_tx_info_flags {
  *	frame (PS-Poll or uAPSD).
  * @IEEE80211_TX_CTRL_RATE_INJECT: This frame is injected with rate info=
rmation
  * @IEEE80211_TX_CTRL_AMSDU: This frame is an A-MSDU frame
+ * @IEEE80211_TX_CTRL_FAST_XMIT: This frame is going through the fast_xm=
it path
  *
  * These flags are used in tx_info->control.flags.
  */
@@ -723,6 +724,7 @@ enum mac80211_tx_control_flags {
 	IEEE80211_TX_CTRL_PS_RESPONSE		=3D BIT(1),
 	IEEE80211_TX_CTRL_RATE_INJECT		=3D BIT(2),
 	IEEE80211_TX_CTRL_AMSDU			=3D BIT(3),
+	IEEE80211_TX_CTRL_FAST_XMIT		=3D BIT(4),
 };
=20
 /*
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 9211cce..d36f3b1 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -813,11 +813,13 @@ enum txq_info_flags {
  * @def_flow: used as a fallback flow when a packet destined to @tin has=
hes to
  *	a fq_flow which is already owned by a different tin
  * @def_cvars: codel vars for @def_flow
+ * @frags: used to keep fragments created after dequeue
  */
 struct txq_info {
 	struct fq_tin tin;
 	struct fq_flow def_flow;
 	struct codel_vars def_cvars;
+	struct sk_buff_head frags;
 	unsigned long flags;
=20
 	/* keep last! */
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index efc38e7..94f38cc 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -38,6 +38,12 @@
 #include "wme.h"
 #include "rate.h"
=20
+static int invoke_tx_handlers_late(struct ieee80211_tx_data *tx);
+static bool ieee80211_xmit_fast_finish(struct ieee80211_sub_if_data *sda=
ta,
+				       struct sta_info *sta,
+				       struct ieee80211_fast_tx *fast_tx,
+				       struct sk_buff *skb);
+
 /* misc utils */
=20
 static inline void ieee80211_tx_stats(struct net_device *dev, u32 len)
@@ -853,8 +859,7 @@ ieee80211_tx_h_sequence(struct ieee80211_tx_data *tx)
 	tid =3D *qc & IEEE80211_QOS_CTL_TID_MASK;
 	tx->sta->tx_stats.msdu[tid]++;
=20
-	if (!tx->sta->sta.txq[0])
-		hdr->seq_ctrl =3D ieee80211_tx_next_seq(tx->sta, tid);
+	hdr->seq_ctrl =3D ieee80211_tx_next_seq(tx->sta, tid);
=20
 	return TX_CONTINUE;
 }
@@ -1403,6 +1408,7 @@ void ieee80211_txq_init(struct ieee80211_sub_if_dat=
a *sdata,
 	fq_tin_init(&txqi->tin);
 	fq_flow_init(&txqi->def_flow);
 	codel_vars_init(&txqi->def_cvars);
+	__skb_queue_head_init(&txqi->frags);
=20
 	txqi->txq.vif =3D &sdata->vif;
=20
@@ -1425,6 +1431,7 @@ void ieee80211_txq_purge(struct ieee80211_local *lo=
cal,
 	struct fq_tin *tin =3D &txqi->tin;
=20
 	fq_tin_reset(fq, tin, fq_skb_free_func);
+	ieee80211_purge_tx_queue(&local->hw, &txqi->frags);
 }
=20
 int ieee80211_txq_setup_flows(struct ieee80211_local *local)
@@ -1481,33 +1488,61 @@ struct sk_buff *ieee80211_tx_dequeue(struct ieee8=
0211_hw *hw,
 {
 	struct ieee80211_local *local =3D hw_to_local(hw);
 	struct txq_info *txqi =3D container_of(txq, struct txq_info, txq);
-	struct ieee80211_hdr *hdr;
 	struct sk_buff *skb =3D NULL;
 	struct fq *fq =3D &local->fq;
 	struct fq_tin *tin =3D &txqi->tin;
+	struct ieee80211_tx_info *info;
=20
 	spin_lock_bh(&fq->lock);
=20
 	if (test_bit(IEEE80211_TXQ_STOP, &txqi->flags))
 		goto out;
=20
+	/* Make sure fragments stay together. */
+	skb =3D __skb_dequeue(&txqi->frags);
+	if (skb)
+		goto out;
+
+begin:
 	skb =3D fq_tin_dequeue(fq, tin, fq_tin_dequeue_func);
 	if (!skb)
 		goto out;
=20
 	ieee80211_set_skb_vif(skb, txqi);
=20
-	hdr =3D (struct ieee80211_hdr *)skb->data;
-	if (txq->sta && ieee80211_is_data_qos(hdr->frame_control)) {
+	info =3D IEEE80211_SKB_CB(skb);
+	if (txq->sta && info->control.flags & IEEE80211_TX_CTRL_FAST_XMIT) {
 		struct sta_info *sta =3D container_of(txq->sta, struct sta_info,
 						    sta);
-		struct ieee80211_tx_info *info =3D IEEE80211_SKB_CB(skb);
+		struct ieee80211_fast_tx *fast_tx;
=20
-		hdr->seq_ctrl =3D ieee80211_tx_next_seq(sta, txq->tid);
-		if (test_bit(IEEE80211_TXQ_AMPDU, &txqi->flags))
-			info->flags |=3D IEEE80211_TX_CTL_AMPDU;
-		else
-			info->flags &=3D ~IEEE80211_TX_CTL_AMPDU;
+		fast_tx =3D rcu_dereference(sta->fast_tx);
+		if (WARN_ON(!fast_tx)) {
+			/* lost fast_tx pointer while the packet was queued */
+			ieee80211_free_txskb(hw, skb);
+			goto begin;
+		}
+		ieee80211_xmit_fast_finish(sta->sdata, sta, fast_tx, skb);
+	} else {
+		struct ieee80211_tx_data tx =3D { };
+
+		__skb_queue_head_init(&tx.skbs);
+		tx.local =3D local;
+		tx.skb =3D skb;
+		if (txq->sta) {
+			tx.sta =3D container_of(txq->sta, struct sta_info, sta);
+			tx.sdata =3D tx.sta->sdata;
+		} else {
+			tx.sdata =3D vif_to_sdata(info->control.vif);
+		}
+
+		if (invoke_tx_handlers_late(&tx))
+			goto begin;
+
+		skb =3D __skb_dequeue(&tx.skbs);
+
+		if (!skb_queue_empty(&tx.skbs))
+			skb_queue_splice_tail(&tx.skbs, &txqi->frags);
 	}
=20
 out:
@@ -1521,6 +1556,47 @@ out:
 }
 EXPORT_SYMBOL(ieee80211_tx_dequeue);
=20
+static bool ieee80211_queue_skb(struct ieee80211_local *local,
+				struct ieee80211_sub_if_data *sdata,
+				struct sta_info *sta,
+				struct sk_buff *skb)
+{
+	struct ieee80211_tx_info *info =3D IEEE80211_SKB_CB(skb);
+	struct fq *fq =3D &local->fq;
+	struct ieee80211_vif *vif;
+	struct txq_info *txqi;
+	struct ieee80211_sta *pubsta;
+
+	if (!local->ops->wake_tx_queue ||
+	    sdata->vif.type =3D=3D NL80211_IFTYPE_MONITOR)
+		return false;
+
+	if (sta && sta->uploaded)
+		pubsta =3D &sta->sta;
+	else
+		pubsta =3D NULL;
+
+	if (sdata->vif.type =3D=3D NL80211_IFTYPE_AP_VLAN)
+		sdata =3D container_of(sdata->bss,
+				     struct ieee80211_sub_if_data, u.ap);
+
+	vif =3D &sdata->vif;
+	txqi =3D ieee80211_get_txq(local, vif, pubsta, skb);
+
+	if (!txqi)
+		return false;
+
+	info->control.vif =3D vif;
+
+	spin_lock_bh(&fq->lock);
+	ieee80211_txq_enqueue(local, txqi, skb);
+	spin_unlock_bh(&fq->lock);
+
+	drv_wake_tx_queue(local, txqi);
+
+	return true;
+}
+
 static bool ieee80211_tx_frags(struct ieee80211_local *local,
 			       struct ieee80211_vif *vif,
 			       struct ieee80211_sta *sta,
@@ -1528,9 +1604,7 @@ static bool ieee80211_tx_frags(struct ieee80211_loc=
al *local,
 			       bool txpending)
 {
 	struct ieee80211_tx_control control =3D {};
-	struct fq *fq =3D &local->fq;
 	struct sk_buff *skb, *tmp;
-	struct txq_info *txqi;
 	unsigned long flags;
=20
 	skb_queue_walk_safe(skbs, skb, tmp) {
@@ -1545,21 +1619,6 @@ static bool ieee80211_tx_frags(struct ieee80211_lo=
cal *local,
 		}
 #endif
=20
-		txqi =3D ieee80211_get_txq(local, vif, sta, skb);
-		if (txqi) {
-			info->control.vif =3D vif;
-
-			__skb_unlink(skb, skbs);
-
-			spin_lock_bh(&fq->lock);
-			ieee80211_txq_enqueue(local, txqi, skb);
-			spin_unlock_bh(&fq->lock);
-
-			drv_wake_tx_queue(local, txqi);
-
-			continue;
-		}
-
 		spin_lock_irqsave(&local->queue_stop_reason_lock, flags);
 		if (local->queue_stop_reasons[q] ||
 		    (!txpending && !skb_queue_empty(&local->pending[q]))) {
@@ -1680,10 +1739,13 @@ static bool __ieee80211_tx(struct ieee80211_local=
 *local,
 /*
  * Invoke TX handlers, return 0 on success and non-zero if the
  * frame was dropped or queued.
+ *
+ * The handlers are split into an early and late part. The latter is eve=
rything
+ * that can be sensitive to reordering, and will be deferred to after pa=
ckets
+ * are dequeued from the intermediate queues (when they are enabled).
  */
-static int invoke_tx_handlers(struct ieee80211_tx_data *tx)
+static int invoke_tx_handlers_early(struct ieee80211_tx_data *tx)
 {
-	struct ieee80211_tx_info *info =3D IEEE80211_SKB_CB(tx->skb);
 	ieee80211_tx_result res =3D TX_DROP;
=20
 #define CALL_TXH(txh) \
@@ -1697,16 +1759,42 @@ static int invoke_tx_handlers(struct ieee80211_tx=
_data *tx)
 	CALL_TXH(ieee80211_tx_h_check_assoc);
 	CALL_TXH(ieee80211_tx_h_ps_buf);
 	CALL_TXH(ieee80211_tx_h_check_control_port_protocol);
-	CALL_TXH(ieee80211_tx_h_select_key);
+
 	if (!ieee80211_hw_check(&tx->local->hw, HAS_RATE_CONTROL))
 		CALL_TXH(ieee80211_tx_h_rate_ctrl);
=20
+ txh_done:
+	if (unlikely(res =3D=3D TX_DROP)) {
+		I802_DEBUG_INC(tx->local->tx_handlers_drop);
+		if (tx->skb)
+			ieee80211_free_txskb(&tx->local->hw, tx->skb);
+		else
+			ieee80211_purge_tx_queue(&tx->local->hw, &tx->skbs);
+		return -1;
+	} else if (unlikely(res =3D=3D TX_QUEUED)) {
+		I802_DEBUG_INC(tx->local->tx_handlers_queued);
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Late handlers can be called while the sta lock is held. Handlers that=
 can
+ * cause packets to be generated will cause deadlock!
+ */
+static int invoke_tx_handlers_late(struct ieee80211_tx_data *tx)
+{
+	struct ieee80211_tx_info *info =3D IEEE80211_SKB_CB(tx->skb);
+	ieee80211_tx_result res =3D TX_CONTINUE;
+
 	if (unlikely(info->flags & IEEE80211_TX_INTFL_RETRANSMISSION)) {
 		__skb_queue_tail(&tx->skbs, tx->skb);
 		tx->skb =3D NULL;
 		goto txh_done;
 	}
=20
+	CALL_TXH(ieee80211_tx_h_select_key);
 	CALL_TXH(ieee80211_tx_h_michael_mic_add);
 	CALL_TXH(ieee80211_tx_h_sequence);
 	CALL_TXH(ieee80211_tx_h_fragment);
@@ -1733,6 +1821,15 @@ static int invoke_tx_handlers(struct ieee80211_tx_=
data *tx)
 	return 0;
 }
=20
+static int invoke_tx_handlers(struct ieee80211_tx_data *tx)
+{
+	int r =3D invoke_tx_handlers_early(tx);
+	if (r)
+		return r;
+
+	return invoke_tx_handlers_late(tx);
+}
+
 bool ieee80211_tx_prepare_skb(struct ieee80211_hw *hw,
 			      struct ieee80211_vif *vif, struct sk_buff *skb,
 			      int band, struct ieee80211_sta **sta)
@@ -1807,7 +1904,13 @@ static bool ieee80211_tx(struct ieee80211_sub_if_d=
ata *sdata,
 		info->hw_queue =3D
 			sdata->vif.hw_queue[skb_get_queue_mapping(skb)];
=20
-	if (!invoke_tx_handlers(&tx))
+	if (invoke_tx_handlers_early(&tx))
+		return false;
+
+	if (ieee80211_queue_skb(local, sdata, tx.sta, tx.skb))
+		return true;
+
+	if (!invoke_tx_handlers_late(&tx))
 		result =3D __ieee80211_tx(local, &tx.skbs, led_len,
 					tx.sta, txpending);
=20
@@ -3159,7 +3262,7 @@ out:
 }
=20
 static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata,
-				struct net_device *dev, struct sta_info *sta,
+				struct sta_info *sta,
 				struct ieee80211_fast_tx *fast_tx,
 				struct sk_buff *skb)
 {
@@ -3170,9 +3273,9 @@ static bool ieee80211_xmit_fast(struct ieee80211_su=
b_if_data *sdata,
 	struct ethhdr eth;
 	struct ieee80211_tx_info *info =3D IEEE80211_SKB_CB(skb);
 	struct ieee80211_hdr *hdr =3D (void *)fast_tx->hdr;
-	struct ieee80211_tx_data tx;
-	ieee80211_tx_result r;
 	struct tid_ampdu_tx *tid_tx =3D NULL;
+	ieee80211_tx_result r;
+	struct ieee80211_tx_data tx;
 	u8 tid =3D IEEE80211_NUM_TIDS;
=20
 	/* control port protocol needs a lot of special handling */
@@ -3210,8 +3313,6 @@ static bool ieee80211_xmit_fast(struct ieee80211_su=
b_if_data *sdata,
 			return true;
 	}
=20
-	ieee80211_tx_stats(dev, skb->len + extra_head);
-
 	if ((hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_QOS_DATA)) &&
 	    ieee80211_amsdu_aggregate(sdata, sta, fast_tx, skb))
 		return true;
@@ -3240,24 +3341,7 @@ static bool ieee80211_xmit_fast(struct ieee80211_s=
ub_if_data *sdata,
 	info->flags =3D IEEE80211_TX_CTL_FIRST_FRAGMENT |
 		      IEEE80211_TX_CTL_DONTFRAG |
 		      (tid_tx ? IEEE80211_TX_CTL_AMPDU : 0);
-
-	if (hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_QOS_DATA)) {
-		*ieee80211_get_qos_ctl(hdr) =3D tid;
-		if (!sta->sta.txq[0])
-			hdr->seq_ctrl =3D ieee80211_tx_next_seq(sta, tid);
-	} else {
-		info->flags |=3D IEEE80211_TX_CTL_ASSIGN_SEQ;
-		hdr->seq_ctrl =3D cpu_to_le16(sdata->sequence_number);
-		sdata->sequence_number +=3D 0x10;
-	}
-
-	if (skb_shinfo(skb)->gso_size)
-		sta->tx_stats.msdu[tid] +=3D
-			DIV_ROUND_UP(skb->len, skb_shinfo(skb)->gso_size);
-	else
-		sta->tx_stats.msdu[tid]++;
-
-	info->hw_queue =3D sdata->vif.hw_queue[skb_get_queue_mapping(skb)];
+	info->control.flags =3D IEEE80211_TX_CTRL_FAST_XMIT;
=20
 	__skb_queue_head_init(&tx.skbs);
=20
@@ -3283,6 +3367,54 @@ static bool ieee80211_xmit_fast(struct ieee80211_s=
ub_if_data *sdata,
 		}
 	}
=20
+	if (ieee80211_queue_skb(local, sdata, sta, skb))
+		return true;
+
+	ieee80211_xmit_fast_finish(sdata, sta, fast_tx, skb);
+
+	if (sdata->vif.type =3D=3D NL80211_IFTYPE_AP_VLAN)
+		sdata =3D container_of(sdata->bss,
+				     struct ieee80211_sub_if_data, u.ap);
+
+	__skb_queue_tail(&tx.skbs, skb);
+	ieee80211_tx_frags(local, &sdata->vif, &sta->sta, &tx.skbs, false);
+
+	return true;
+}
+
+/*
+ * Can be called while the sta lock is held. Anything that can cause pac=
kets to
+ * be generated will cause deadlock!
+ */
+static bool ieee80211_xmit_fast_finish(struct ieee80211_sub_if_data *sda=
ta,
+				       struct sta_info *sta,
+				       struct ieee80211_fast_tx *fast_tx,
+				       struct sk_buff *skb)
+{
+	struct ieee80211_tx_info *info =3D IEEE80211_SKB_CB(skb);
+	struct ieee80211_hdr *hdr =3D (void *)skb->data;
+	u8 tid =3D IEEE80211_NUM_TIDS;
+
+	ieee80211_tx_stats(skb->dev, skb->len);
+
+	if (hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_QOS_DATA)) {
+		tid =3D skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK;
+		*ieee80211_get_qos_ctl(hdr) =3D tid;
+		hdr->seq_ctrl =3D ieee80211_tx_next_seq(sta, tid);
+	} else {
+		info->flags |=3D IEEE80211_TX_CTL_ASSIGN_SEQ;
+		hdr->seq_ctrl =3D cpu_to_le16(sdata->sequence_number);
+		sdata->sequence_number +=3D 0x10;
+	}
+
+	if (skb_shinfo(skb)->gso_size)
+		sta->tx_stats.msdu[tid] +=3D
+			DIV_ROUND_UP(skb->len, skb_shinfo(skb)->gso_size);
+	else
+		sta->tx_stats.msdu[tid]++;
+
+	info->hw_queue =3D sdata->vif.hw_queue[skb_get_queue_mapping(skb)];
+
 	/* statistics normally done by ieee80211_tx_h_stats (but that
 	 * has to consider fragmentation, so is more complex)
 	 */
@@ -3309,12 +3441,6 @@ static bool ieee80211_xmit_fast(struct ieee80211_s=
ub_if_data *sdata,
 		}
 	}
=20
-	if (sdata->vif.type =3D=3D NL80211_IFTYPE_AP_VLAN)
-		sdata =3D container_of(sdata->bss,
-				     struct ieee80211_sub_if_data, u.ap);
-
-	__skb_queue_tail(&tx.skbs, skb);
-	ieee80211_tx_frags(local, &sdata->vif, &sta->sta, &tx.skbs, false);
 	return true;
 }
=20
@@ -3342,7 +3468,7 @@ void __ieee80211_subif_start_xmit(struct sk_buff *s=
kb,
 		fast_tx =3D rcu_dereference(sta->fast_tx);
=20
 		if (fast_tx &&
-		    ieee80211_xmit_fast(sdata, dev, sta, fast_tx, skb))
+		    ieee80211_xmit_fast(sdata, sta, fast_tx, skb))
 			goto out;
 	}
=20
--=20
2.9.3

^ permalink raw reply related	[flat|nested] 51+ messages in thread

* [PATCH v8] mac80211: Move reorder-sensitive TX handlers to after TXQ dequeue.
  2016-09-05 11:30           ` [PATCH v7] " Toke Høiland-Jørgensen
  2016-09-05 17:49             ` Felix Fietkau
  2016-09-06 11:43             ` Toke Høiland-Jørgensen
@ 2016-09-06 11:44             ` Toke Høiland-Jørgensen
  2016-09-06 22:04               ` Felix Fietkau
                                 ` (4 more replies)
  2 siblings, 5 replies; 51+ messages in thread
From: Toke Høiland-Jørgensen @ 2016-09-06 11:44 UTC (permalink / raw)
  To: make-wifi-fast, linux-wireless; +Cc: Toke Høiland-Jørgensen

The TXQ intermediate queues can cause packet reordering when more than
one flow is active to a single station. Since some of the wifi-specific
packet handling (notably sequence number and encryption handling) is
sensitive to re-ordering, things break if they are applied before the
TXQ.

This splits up the TX handlers and fast_xmit logic into two parts: An
early part and a late part. The former is applied before TXQ enqueue,
and the latter after dequeue. The non-TXQ path just applies both parts
at once.

Because fragments shouldn't be split up or reordered, the fragmentation
handler is run after dequeue. Any fragments are then kept in the TXQ and
on subsequent dequeues they take precedence over dequeueing from the FQ
structure.

This approach avoids having to scatter special cases for when TXQ is
enabled, at the cost of making the fast_xmit and TX handler code
slightly more complex.

Signed-off-by: Toke Høiland-Jørgensen <toke@toke.dk>
---
Changes since v7:
- Don't rely on the fast_tx pointer on TXQ dequeue; it can go away while
  the packet was queued, and we don't actually need it, since we can get
  the key configuration and offset from the packet info.

 include/net/mac80211.h     |   2 +
 net/mac80211/ieee80211_i.h |   2 +
 net/mac80211/tx.c          | 255 +++++++++++++++++++++++++++++++++------=
------
 3 files changed, 195 insertions(+), 64 deletions(-)

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index cca510a..9a6a3e9 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -715,6 +715,7 @@ enum mac80211_tx_info_flags {
  *	frame (PS-Poll or uAPSD).
  * @IEEE80211_TX_CTRL_RATE_INJECT: This frame is injected with rate info=
rmation
  * @IEEE80211_TX_CTRL_AMSDU: This frame is an A-MSDU frame
+ * @IEEE80211_TX_CTRL_FAST_XMIT: This frame is going through the fast_xm=
it path
  *
  * These flags are used in tx_info->control.flags.
  */
@@ -723,6 +724,7 @@ enum mac80211_tx_control_flags {
 	IEEE80211_TX_CTRL_PS_RESPONSE		=3D BIT(1),
 	IEEE80211_TX_CTRL_RATE_INJECT		=3D BIT(2),
 	IEEE80211_TX_CTRL_AMSDU			=3D BIT(3),
+	IEEE80211_TX_CTRL_FAST_XMIT		=3D BIT(4),
 };
=20
 /*
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 9211cce..d36f3b1 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -813,11 +813,13 @@ enum txq_info_flags {
  * @def_flow: used as a fallback flow when a packet destined to @tin has=
hes to
  *	a fq_flow which is already owned by a different tin
  * @def_cvars: codel vars for @def_flow
+ * @frags: used to keep fragments created after dequeue
  */
 struct txq_info {
 	struct fq_tin tin;
 	struct fq_flow def_flow;
 	struct codel_vars def_cvars;
+	struct sk_buff_head frags;
 	unsigned long flags;
=20
 	/* keep last! */
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index efc38e7..f8eec60 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -38,6 +38,12 @@
 #include "wme.h"
 #include "rate.h"
=20
+static int invoke_tx_handlers_late(struct ieee80211_tx_data *tx);
+static bool ieee80211_xmit_fast_finish(struct ieee80211_sub_if_data *sda=
ta,
+				       struct sta_info *sta, u8 pn_offs,
+				       struct ieee80211_key_conf *key_conf,
+				       struct sk_buff *skb);
+
 /* misc utils */
=20
 static inline void ieee80211_tx_stats(struct net_device *dev, u32 len)
@@ -853,8 +859,7 @@ ieee80211_tx_h_sequence(struct ieee80211_tx_data *tx)
 	tid =3D *qc & IEEE80211_QOS_CTL_TID_MASK;
 	tx->sta->tx_stats.msdu[tid]++;
=20
-	if (!tx->sta->sta.txq[0])
-		hdr->seq_ctrl =3D ieee80211_tx_next_seq(tx->sta, tid);
+	hdr->seq_ctrl =3D ieee80211_tx_next_seq(tx->sta, tid);
=20
 	return TX_CONTINUE;
 }
@@ -1403,6 +1408,7 @@ void ieee80211_txq_init(struct ieee80211_sub_if_dat=
a *sdata,
 	fq_tin_init(&txqi->tin);
 	fq_flow_init(&txqi->def_flow);
 	codel_vars_init(&txqi->def_cvars);
+	__skb_queue_head_init(&txqi->frags);
=20
 	txqi->txq.vif =3D &sdata->vif;
=20
@@ -1425,6 +1431,7 @@ void ieee80211_txq_purge(struct ieee80211_local *lo=
cal,
 	struct fq_tin *tin =3D &txqi->tin;
=20
 	fq_tin_reset(fq, tin, fq_skb_free_func);
+	ieee80211_purge_tx_queue(&local->hw, &txqi->frags);
 }
=20
 int ieee80211_txq_setup_flows(struct ieee80211_local *local)
@@ -1485,12 +1492,19 @@ struct sk_buff *ieee80211_tx_dequeue(struct ieee8=
0211_hw *hw,
 	struct sk_buff *skb =3D NULL;
 	struct fq *fq =3D &local->fq;
 	struct fq_tin *tin =3D &txqi->tin;
+	struct ieee80211_tx_info *info;
=20
 	spin_lock_bh(&fq->lock);
=20
 	if (test_bit(IEEE80211_TXQ_STOP, &txqi->flags))
 		goto out;
=20
+	/* Make sure fragments stay together. */
+	skb =3D __skb_dequeue(&txqi->frags);
+	if (skb)
+		goto out;
+
+begin:
 	skb =3D fq_tin_dequeue(fq, tin, fq_tin_dequeue_func);
 	if (!skb)
 		goto out;
@@ -1498,16 +1512,37 @@ struct sk_buff *ieee80211_tx_dequeue(struct ieee8=
0211_hw *hw,
 	ieee80211_set_skb_vif(skb, txqi);
=20
 	hdr =3D (struct ieee80211_hdr *)skb->data;
-	if (txq->sta && ieee80211_is_data_qos(hdr->frame_control)) {
+	info =3D IEEE80211_SKB_CB(skb);
+	if (txq->sta && info->control.flags & IEEE80211_TX_CTRL_FAST_XMIT) {
 		struct sta_info *sta =3D container_of(txq->sta, struct sta_info,
 						    sta);
-		struct ieee80211_tx_info *info =3D IEEE80211_SKB_CB(skb);
+		u8 pn_offs =3D 0;
=20
-		hdr->seq_ctrl =3D ieee80211_tx_next_seq(sta, txq->tid);
-		if (test_bit(IEEE80211_TXQ_AMPDU, &txqi->flags))
-			info->flags |=3D IEEE80211_TX_CTL_AMPDU;
-		else
-			info->flags &=3D ~IEEE80211_TX_CTL_AMPDU;
+		if (info->control.hw_key)
+			pn_offs =3D ieee80211_hdrlen(hdr->frame_control);
+
+		ieee80211_xmit_fast_finish(sta->sdata, sta, pn_offs,
+					   info->control.hw_key, skb);
+	} else {
+		struct ieee80211_tx_data tx =3D { };
+
+		__skb_queue_head_init(&tx.skbs);
+		tx.local =3D local;
+		tx.skb =3D skb;
+		if (txq->sta) {
+			tx.sta =3D container_of(txq->sta, struct sta_info, sta);
+			tx.sdata =3D tx.sta->sdata;
+		} else {
+			tx.sdata =3D vif_to_sdata(info->control.vif);
+		}
+
+		if (invoke_tx_handlers_late(&tx))
+			goto begin;
+
+		skb =3D __skb_dequeue(&tx.skbs);
+
+		if (!skb_queue_empty(&tx.skbs))
+			skb_queue_splice_tail(&tx.skbs, &txqi->frags);
 	}
=20
 out:
@@ -1521,6 +1556,47 @@ out:
 }
 EXPORT_SYMBOL(ieee80211_tx_dequeue);
=20
+static bool ieee80211_queue_skb(struct ieee80211_local *local,
+				struct ieee80211_sub_if_data *sdata,
+				struct sta_info *sta,
+				struct sk_buff *skb)
+{
+	struct ieee80211_tx_info *info =3D IEEE80211_SKB_CB(skb);
+	struct fq *fq =3D &local->fq;
+	struct ieee80211_vif *vif;
+	struct txq_info *txqi;
+	struct ieee80211_sta *pubsta;
+
+	if (!local->ops->wake_tx_queue ||
+	    sdata->vif.type =3D=3D NL80211_IFTYPE_MONITOR)
+		return false;
+
+	if (sta && sta->uploaded)
+		pubsta =3D &sta->sta;
+	else
+		pubsta =3D NULL;
+
+	if (sdata->vif.type =3D=3D NL80211_IFTYPE_AP_VLAN)
+		sdata =3D container_of(sdata->bss,
+				     struct ieee80211_sub_if_data, u.ap);
+
+	vif =3D &sdata->vif;
+	txqi =3D ieee80211_get_txq(local, vif, pubsta, skb);
+
+	if (!txqi)
+		return false;
+
+	info->control.vif =3D vif;
+
+	spin_lock_bh(&fq->lock);
+	ieee80211_txq_enqueue(local, txqi, skb);
+	spin_unlock_bh(&fq->lock);
+
+	drv_wake_tx_queue(local, txqi);
+
+	return true;
+}
+
 static bool ieee80211_tx_frags(struct ieee80211_local *local,
 			       struct ieee80211_vif *vif,
 			       struct ieee80211_sta *sta,
@@ -1528,9 +1604,7 @@ static bool ieee80211_tx_frags(struct ieee80211_loc=
al *local,
 			       bool txpending)
 {
 	struct ieee80211_tx_control control =3D {};
-	struct fq *fq =3D &local->fq;
 	struct sk_buff *skb, *tmp;
-	struct txq_info *txqi;
 	unsigned long flags;
=20
 	skb_queue_walk_safe(skbs, skb, tmp) {
@@ -1545,21 +1619,6 @@ static bool ieee80211_tx_frags(struct ieee80211_lo=
cal *local,
 		}
 #endif
=20
-		txqi =3D ieee80211_get_txq(local, vif, sta, skb);
-		if (txqi) {
-			info->control.vif =3D vif;
-
-			__skb_unlink(skb, skbs);
-
-			spin_lock_bh(&fq->lock);
-			ieee80211_txq_enqueue(local, txqi, skb);
-			spin_unlock_bh(&fq->lock);
-
-			drv_wake_tx_queue(local, txqi);
-
-			continue;
-		}
-
 		spin_lock_irqsave(&local->queue_stop_reason_lock, flags);
 		if (local->queue_stop_reasons[q] ||
 		    (!txpending && !skb_queue_empty(&local->pending[q]))) {
@@ -1680,10 +1739,13 @@ static bool __ieee80211_tx(struct ieee80211_local=
 *local,
 /*
  * Invoke TX handlers, return 0 on success and non-zero if the
  * frame was dropped or queued.
+ *
+ * The handlers are split into an early and late part. The latter is eve=
rything
+ * that can be sensitive to reordering, and will be deferred to after pa=
ckets
+ * are dequeued from the intermediate queues (when they are enabled).
  */
-static int invoke_tx_handlers(struct ieee80211_tx_data *tx)
+static int invoke_tx_handlers_early(struct ieee80211_tx_data *tx)
 {
-	struct ieee80211_tx_info *info =3D IEEE80211_SKB_CB(tx->skb);
 	ieee80211_tx_result res =3D TX_DROP;
=20
 #define CALL_TXH(txh) \
@@ -1697,16 +1759,42 @@ static int invoke_tx_handlers(struct ieee80211_tx=
_data *tx)
 	CALL_TXH(ieee80211_tx_h_check_assoc);
 	CALL_TXH(ieee80211_tx_h_ps_buf);
 	CALL_TXH(ieee80211_tx_h_check_control_port_protocol);
-	CALL_TXH(ieee80211_tx_h_select_key);
+
 	if (!ieee80211_hw_check(&tx->local->hw, HAS_RATE_CONTROL))
 		CALL_TXH(ieee80211_tx_h_rate_ctrl);
=20
+ txh_done:
+	if (unlikely(res =3D=3D TX_DROP)) {
+		I802_DEBUG_INC(tx->local->tx_handlers_drop);
+		if (tx->skb)
+			ieee80211_free_txskb(&tx->local->hw, tx->skb);
+		else
+			ieee80211_purge_tx_queue(&tx->local->hw, &tx->skbs);
+		return -1;
+	} else if (unlikely(res =3D=3D TX_QUEUED)) {
+		I802_DEBUG_INC(tx->local->tx_handlers_queued);
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Late handlers can be called while the sta lock is held. Handlers that=
 can
+ * cause packets to be generated will cause deadlock!
+ */
+static int invoke_tx_handlers_late(struct ieee80211_tx_data *tx)
+{
+	struct ieee80211_tx_info *info =3D IEEE80211_SKB_CB(tx->skb);
+	ieee80211_tx_result res =3D TX_CONTINUE;
+
 	if (unlikely(info->flags & IEEE80211_TX_INTFL_RETRANSMISSION)) {
 		__skb_queue_tail(&tx->skbs, tx->skb);
 		tx->skb =3D NULL;
 		goto txh_done;
 	}
=20
+	CALL_TXH(ieee80211_tx_h_select_key);
 	CALL_TXH(ieee80211_tx_h_michael_mic_add);
 	CALL_TXH(ieee80211_tx_h_sequence);
 	CALL_TXH(ieee80211_tx_h_fragment);
@@ -1733,6 +1821,15 @@ static int invoke_tx_handlers(struct ieee80211_tx_=
data *tx)
 	return 0;
 }
=20
+static int invoke_tx_handlers(struct ieee80211_tx_data *tx)
+{
+	int r =3D invoke_tx_handlers_early(tx);
+
+	if (r)
+		return r;
+	return invoke_tx_handlers_late(tx);
+}
+
 bool ieee80211_tx_prepare_skb(struct ieee80211_hw *hw,
 			      struct ieee80211_vif *vif, struct sk_buff *skb,
 			      int band, struct ieee80211_sta **sta)
@@ -1807,7 +1904,13 @@ static bool ieee80211_tx(struct ieee80211_sub_if_d=
ata *sdata,
 		info->hw_queue =3D
 			sdata->vif.hw_queue[skb_get_queue_mapping(skb)];
=20
-	if (!invoke_tx_handlers(&tx))
+	if (invoke_tx_handlers_early(&tx))
+		return false;
+
+	if (ieee80211_queue_skb(local, sdata, tx.sta, tx.skb))
+		return true;
+
+	if (!invoke_tx_handlers_late(&tx))
 		result =3D __ieee80211_tx(local, &tx.skbs, led_len,
 					tx.sta, txpending);
=20
@@ -3159,7 +3262,7 @@ out:
 }
=20
 static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata,
-				struct net_device *dev, struct sta_info *sta,
+				struct sta_info *sta,
 				struct ieee80211_fast_tx *fast_tx,
 				struct sk_buff *skb)
 {
@@ -3170,9 +3273,9 @@ static bool ieee80211_xmit_fast(struct ieee80211_su=
b_if_data *sdata,
 	struct ethhdr eth;
 	struct ieee80211_tx_info *info =3D IEEE80211_SKB_CB(skb);
 	struct ieee80211_hdr *hdr =3D (void *)fast_tx->hdr;
-	struct ieee80211_tx_data tx;
-	ieee80211_tx_result r;
 	struct tid_ampdu_tx *tid_tx =3D NULL;
+	ieee80211_tx_result r;
+	struct ieee80211_tx_data tx;
 	u8 tid =3D IEEE80211_NUM_TIDS;
=20
 	/* control port protocol needs a lot of special handling */
@@ -3210,8 +3313,6 @@ static bool ieee80211_xmit_fast(struct ieee80211_su=
b_if_data *sdata,
 			return true;
 	}
=20
-	ieee80211_tx_stats(dev, skb->len + extra_head);
-
 	if ((hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_QOS_DATA)) &&
 	    ieee80211_amsdu_aggregate(sdata, sta, fast_tx, skb))
 		return true;
@@ -3240,24 +3341,7 @@ static bool ieee80211_xmit_fast(struct ieee80211_s=
ub_if_data *sdata,
 	info->flags =3D IEEE80211_TX_CTL_FIRST_FRAGMENT |
 		      IEEE80211_TX_CTL_DONTFRAG |
 		      (tid_tx ? IEEE80211_TX_CTL_AMPDU : 0);
-
-	if (hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_QOS_DATA)) {
-		*ieee80211_get_qos_ctl(hdr) =3D tid;
-		if (!sta->sta.txq[0])
-			hdr->seq_ctrl =3D ieee80211_tx_next_seq(sta, tid);
-	} else {
-		info->flags |=3D IEEE80211_TX_CTL_ASSIGN_SEQ;
-		hdr->seq_ctrl =3D cpu_to_le16(sdata->sequence_number);
-		sdata->sequence_number +=3D 0x10;
-	}
-
-	if (skb_shinfo(skb)->gso_size)
-		sta->tx_stats.msdu[tid] +=3D
-			DIV_ROUND_UP(skb->len, skb_shinfo(skb)->gso_size);
-	else
-		sta->tx_stats.msdu[tid]++;
-
-	info->hw_queue =3D sdata->vif.hw_queue[skb_get_queue_mapping(skb)];
+	info->control.flags =3D IEEE80211_TX_CTRL_FAST_XMIT;
=20
 	__skb_queue_head_init(&tx.skbs);
=20
@@ -3283,22 +3367,71 @@ static bool ieee80211_xmit_fast(struct ieee80211_=
sub_if_data *sdata,
 		}
 	}
=20
+	if (ieee80211_queue_skb(local, sdata, sta, skb))
+		return true;
+
+	ieee80211_xmit_fast_finish(sdata, sta, fast_tx->pn_offs,
+				   &fast_tx->key->conf, skb);
+
+	if (sdata->vif.type =3D=3D NL80211_IFTYPE_AP_VLAN)
+		sdata =3D container_of(sdata->bss,
+				     struct ieee80211_sub_if_data, u.ap);
+
+	__skb_queue_tail(&tx.skbs, skb);
+	ieee80211_tx_frags(local, &sdata->vif, &sta->sta, &tx.skbs, false);
+
+	return true;
+}
+
+/*
+ * Can be called while the sta lock is held. Anything that can cause pac=
kets to
+ * be generated will cause deadlock!
+ */
+static bool ieee80211_xmit_fast_finish(struct ieee80211_sub_if_data *sda=
ta,
+				       struct sta_info *sta, u8 pn_offs,
+				       struct ieee80211_key_conf *key_conf,
+				       struct sk_buff *skb)
+{
+	struct ieee80211_tx_info *info =3D IEEE80211_SKB_CB(skb);
+	struct ieee80211_hdr *hdr =3D (void *)skb->data;
+	u8 tid =3D IEEE80211_NUM_TIDS;
+
+	ieee80211_tx_stats(skb->dev, skb->len);
+
+	if (hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_QOS_DATA)) {
+		tid =3D skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK;
+		*ieee80211_get_qos_ctl(hdr) =3D tid;
+		hdr->seq_ctrl =3D ieee80211_tx_next_seq(sta, tid);
+	} else {
+		info->flags |=3D IEEE80211_TX_CTL_ASSIGN_SEQ;
+		hdr->seq_ctrl =3D cpu_to_le16(sdata->sequence_number);
+		sdata->sequence_number +=3D 0x10;
+	}
+
+	if (skb_shinfo(skb)->gso_size)
+		sta->tx_stats.msdu[tid] +=3D
+			DIV_ROUND_UP(skb->len, skb_shinfo(skb)->gso_size);
+	else
+		sta->tx_stats.msdu[tid]++;
+
+	info->hw_queue =3D sdata->vif.hw_queue[skb_get_queue_mapping(skb)];
+
 	/* statistics normally done by ieee80211_tx_h_stats (but that
 	 * has to consider fragmentation, so is more complex)
 	 */
 	sta->tx_stats.bytes[skb_get_queue_mapping(skb)] +=3D skb->len;
 	sta->tx_stats.packets[skb_get_queue_mapping(skb)]++;
=20
-	if (fast_tx->pn_offs) {
+	if (pn_offs) {
 		u64 pn;
-		u8 *crypto_hdr =3D skb->data + fast_tx->pn_offs;
+		u8 *crypto_hdr =3D skb->data + pn_offs;
=20
-		switch (fast_tx->key->conf.cipher) {
+		switch (key_conf->cipher) {
 		case WLAN_CIPHER_SUITE_CCMP:
 		case WLAN_CIPHER_SUITE_CCMP_256:
 		case WLAN_CIPHER_SUITE_GCMP:
 		case WLAN_CIPHER_SUITE_GCMP_256:
-			pn =3D atomic64_inc_return(&fast_tx->key->conf.tx_pn);
+			pn =3D atomic64_inc_return(&key_conf->tx_pn);
 			crypto_hdr[0] =3D pn;
 			crypto_hdr[1] =3D pn >> 8;
 			crypto_hdr[4] =3D pn >> 16;
@@ -3309,12 +3442,6 @@ static bool ieee80211_xmit_fast(struct ieee80211_s=
ub_if_data *sdata,
 		}
 	}
=20
-	if (sdata->vif.type =3D=3D NL80211_IFTYPE_AP_VLAN)
-		sdata =3D container_of(sdata->bss,
-				     struct ieee80211_sub_if_data, u.ap);
-
-	__skb_queue_tail(&tx.skbs, skb);
-	ieee80211_tx_frags(local, &sdata->vif, &sta->sta, &tx.skbs, false);
 	return true;
 }
=20
@@ -3342,7 +3469,7 @@ void __ieee80211_subif_start_xmit(struct sk_buff *s=
kb,
 		fast_tx =3D rcu_dereference(sta->fast_tx);
=20
 		if (fast_tx &&
-		    ieee80211_xmit_fast(sdata, dev, sta, fast_tx, skb))
+		    ieee80211_xmit_fast(sdata, sta, fast_tx, skb))
 			goto out;
 	}
=20
--=20
2.9.3

^ permalink raw reply related	[flat|nested] 51+ messages in thread

* Re: [PATCH v7] mac80211: Move reorder-sensitive TX handlers to after TXQ dequeue.
  2016-09-06 11:43             ` Toke Høiland-Jørgensen
@ 2016-09-06 11:45               ` Toke Høiland-Jørgensen
  0 siblings, 0 replies; 51+ messages in thread
From: Toke Høiland-Jørgensen @ 2016-09-06 11:45 UTC (permalink / raw)
  To: make-wifi-fast; +Cc: linux-wireless

Toke Høiland-Jørgensen <toke@toke.dk> writes:

> The TXQ intermediate queues can cause packet reordering when more than
> one flow is active to a single station. Since some of the wifi-specific
> packet handling (notably sequence number and encryption handling) is
> sensitive to re-ordering, things break if they are applied before the
> TXQ.
>
> This splits up the TX handlers and fast_xmit logic into two parts: An
> early part and a late part. The former is applied before TXQ enqueue,
> and the latter after dequeue. The non-TXQ path just applies both parts
> at once.
>
> Because fragments shouldn't be split up or reordered, the fragmentation
> handler is run after dequeue. Any fragments are then kept in the TXQ and
> on subsequent dequeues they take precedence over dequeueing from the FQ
> structure.
>
> This approach avoids having to scatter special cases for when TXQ is
> enabled, at the cost of making the fast_xmit and TX handler code
> slightly more complex.
>
> Signed-off-by: Toke Høiland-Jørgensen <toke@toke.dk>

Sorry for sending this again; meant to send v8. :/

-Toke

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH v8] mac80211: Move reorder-sensitive TX handlers to after TXQ dequeue.
  2016-09-06 11:44             ` [PATCH v8] " Toke Høiland-Jørgensen
@ 2016-09-06 22:04               ` Felix Fietkau
  2016-09-12 12:35               ` Johannes Berg
                                 ` (3 subsequent siblings)
  4 siblings, 0 replies; 51+ messages in thread
From: Felix Fietkau @ 2016-09-06 22:04 UTC (permalink / raw)
  To: Toke Høiland-Jørgensen, make-wifi-fast, linux-wireless

On 2016-09-06 13:44, Toke Høiland-Jørgensen wrote:
> The TXQ intermediate queues can cause packet reordering when more than
> one flow is active to a single station. Since some of the wifi-specific
> packet handling (notably sequence number and encryption handling) is
> sensitive to re-ordering, things break if they are applied before the
> TXQ.
> 
> This splits up the TX handlers and fast_xmit logic into two parts: An
> early part and a late part. The former is applied before TXQ enqueue,
> and the latter after dequeue. The non-TXQ path just applies both parts
> at once.
> 
> Because fragments shouldn't be split up or reordered, the fragmentation
> handler is run after dequeue. Any fragments are then kept in the TXQ and
> on subsequent dequeues they take precedence over dequeueing from the FQ
> structure.
> 
> This approach avoids having to scatter special cases for when TXQ is
> enabled, at the cost of making the fast_xmit and TX handler code
> slightly more complex.
> 
> Signed-off-by: Toke Høiland-Jørgensen <toke@toke.dk>
Acked-by: Felix Fietkau <nbd@nbd.name>

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH v8] mac80211: Move reorder-sensitive TX handlers to after TXQ dequeue.
  2016-09-06 11:44             ` [PATCH v8] " Toke Høiland-Jørgensen
  2016-09-06 22:04               ` Felix Fietkau
@ 2016-09-12 12:35               ` Johannes Berg
  2016-09-12 13:08                 ` Toke Høiland-Jørgensen
  2016-09-22 17:04               ` [PATCH v9 0/2] mac80211: TXQ dequeue path rework Toke Høiland-Jørgensen
                                 ` (2 subsequent siblings)
  4 siblings, 1 reply; 51+ messages in thread
From: Johannes Berg @ 2016-09-12 12:35 UTC (permalink / raw)
  To: Toke Høiland-Jørgensen, make-wifi-fast, linux-wireless


> +static int invoke_tx_handlers_late(struct ieee80211_tx_data *tx);
> +static bool ieee80211_xmit_fast_finish(struct ieee80211_sub_if_data *sdata,
> +				       struct sta_info *sta, u8 pn_offs,
> +				       struct ieee80211_key_conf *key_conf,
> +				       struct sk_buff *skb);
> +

I'm not very happy with this - I think you should do some
refactoring/code move in a separate prior patch to avoid this.

> +	if (txq->sta && info->control.flags & IEEE80211_TX_CTRL_FAST_XMIT) {
>  		struct sta_info *sta = container_of(txq->sta, struct sta_info,
>  						    sta);
> -		struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
> +		u8 pn_offs = 0;
>  
> -		hdr->seq_ctrl = ieee80211_tx_next_seq(sta, txq->tid);
> -		if (test_bit(IEEE80211_TXQ_AMPDU, &txqi->flags))
> -			info->flags |= IEEE80211_TX_CTL_AMPDU;
> -		else
> -			info->flags &= ~IEEE80211_TX_CTL_AMPDU;
> +		if (info->control.hw_key)
> +			pn_offs = ieee80211_hdrlen(hdr->frame_control);

Not very happy with this either - the fast-xmit path explicitly tries
to avoid all these calculations.

I suppose I don't have to care all that much about the TXQs, but ...

Then again, adding a field in the skb->cb for the sake of this? No, not really either.


> +		ieee80211_xmit_fast_finish(sta->sdata, sta, pn_offs,
> +					   info->control.hw_key, skb);

I don't see how keeping the info->control.hw_key pointer across the
TXQ/FQ/Codel queueing isn't a potential bug? Probably one that already
exists in your code today, before this patch, of course.


> +	} else {
> +		struct ieee80211_tx_data tx = { };
> +
> +		__skb_queue_head_init(&tx.skbs);
> +		tx.local = local;
> +		tx.skb = skb;

an empty initializer is weird - why not at least move local/skb
initializations into it? Even txq->sta, I guess, since you can assign
txq->sta either way.

> -	CALL_TXH(ieee80211_tx_h_select_key);
> +
>  	if (!ieee80211_hw_check(&tx->local->hw, HAS_RATE_CONTROL))
>  		CALL_TXH(ieee80211_tx_h_rate_ctrl);
[...]
> 	if (unlikely(info->flags & IEEE80211_TX_INTFL_RETRANSMISSION)) {
>  		__skb_queue_tail(&tx->skbs, tx->skb);
>  		tx->skb = NULL;
>  		goto txh_done;
>  	}
> 
> +	CALL_TXH(ieee80211_tx_h_select_key);

What happens for the IEEE80211_TX_INTFL_RETRANSMISSION packets wrt. key
selection? Why is it OK to change this?

johannes

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH v8] mac80211: Move reorder-sensitive TX handlers to after TXQ dequeue.
  2016-09-12 12:35               ` Johannes Berg
@ 2016-09-12 13:08                 ` Toke Høiland-Jørgensen
  2016-09-12 13:19                   ` Johannes Berg
  0 siblings, 1 reply; 51+ messages in thread
From: Toke Høiland-Jørgensen @ 2016-09-12 13:08 UTC (permalink / raw)
  To: Johannes Berg; +Cc: make-wifi-fast, linux-wireless

Johannes Berg <johannes@sipsolutions.net> writes:

>> +static int invoke_tx_handlers_late(struct ieee80211_tx_data *tx);
>> +static bool ieee80211_xmit_fast_finish(struct ieee80211_sub_if_data *sdata,
>> +				       struct sta_info *sta, u8 pn_offs,
>> +				       struct ieee80211_key_conf *key_conf,
>> +				       struct sk_buff *skb);
>> +
>
> I'm not very happy with this - I think you should do some
> refactoring/code move in a separate prior patch to avoid this.

Noted, will do.

>> +	if (txq->sta && info->control.flags & IEEE80211_TX_CTRL_FAST_XMIT) {
>>  		struct sta_info *sta = container_of(txq->sta, struct sta_info,
>>  						    sta);
>> -		struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
>> +		u8 pn_offs = 0;
>>  
>> -		hdr->seq_ctrl = ieee80211_tx_next_seq(sta, txq->tid);
>> -		if (test_bit(IEEE80211_TXQ_AMPDU, &txqi->flags))
>> -			info->flags |= IEEE80211_TX_CTL_AMPDU;
>> -		else
>> -			info->flags &= ~IEEE80211_TX_CTL_AMPDU;
>> +		if (info->control.hw_key)
>> +			pn_offs = ieee80211_hdrlen(hdr->frame_control);
>
> Not very happy with this either - the fast-xmit path explicitly tries
> to avoid all these calculations.

Well, the TXQ already adds a lot of other overhead (hashing on the
packet header, for one), so my guess would be that this would be
negligible compared to all that?

> I suppose I don't have to care all that much about the TXQs, but ...
>
> Then again, adding a field in the skb->cb for the sake of this? No,
> not really either.

So that's a "keep it", then? :)

>> +		ieee80211_xmit_fast_finish(sta->sdata, sta, pn_offs,
>> +					   info->control.hw_key, skb);
>
> I don't see how keeping the info->control.hw_key pointer across the
> TXQ/FQ/Codel queueing isn't a potential bug? Probably one that already
> exists in your code today, before this patch, of course.

You mean the key could get removed from the hardware while the packet
was queued? Can certainly add a check for that. Under what conditions
does that happen? Does it make sense to try to recover from it (I guess
by calling tx_h_select_key), or is it rare enough that giving up and
dropping the packet makes more sense?

>> +	} else {
>> +		struct ieee80211_tx_data tx = { };
>> +
>> +		__skb_queue_head_init(&tx.skbs);
>> +		tx.local = local;
>> +		tx.skb = skb;
>
> an empty initializer is weird - why not at least move local/skb
> initializations into it? Even txq->sta, I guess, since you can assign
> txq->sta either way.

Yup, makes sense. Noted.

>> -	CALL_TXH(ieee80211_tx_h_select_key);
>> +
>>  	if (!ieee80211_hw_check(&tx->local->hw, HAS_RATE_CONTROL))
>>  		CALL_TXH(ieee80211_tx_h_rate_ctrl);
> [...]
>> 	if (unlikely(info->flags & IEEE80211_TX_INTFL_RETRANSMISSION)) {
>>  		__skb_queue_tail(&tx->skbs, tx->skb);
>>  		tx->skb = NULL;
>>  		goto txh_done;
>>  	}
>> 
>> +	CALL_TXH(ieee80211_tx_h_select_key);
>
> What happens for the IEEE80211_TX_INTFL_RETRANSMISSION packets wrt.
> key selection? Why is it OK to change this?

You're right, that's an oversight on my part. Will fix.

-Toke

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH v8] mac80211: Move reorder-sensitive TX handlers to after TXQ dequeue.
  2016-09-12 13:08                 ` Toke Høiland-Jørgensen
@ 2016-09-12 13:19                   ` Johannes Berg
  0 siblings, 0 replies; 51+ messages in thread
From: Johannes Berg @ 2016-09-12 13:19 UTC (permalink / raw)
  To: Toke Høiland-Jørgensen; +Cc: make-wifi-fast, linux-wireless


> Well, the TXQ already adds a lot of other overhead (hashing on the
> packet header, for one), so my guess would be that this would be
> negligible compared to all that? 
> 
> > 
> > I suppose I don't have to care all that much about the TXQs, but
> > ...
> > 
> > Then again, adding a field in the skb->cb for the sake of this? No,
> > not really either.
> 
> So that's a "keep it", then? :)

Yeah I think so :)

> > > +		ieee80211_xmit_fast_finish(sta->sdata, sta,
> > > pn_offs,
> > > +					   info->control.hw_key, 
> > > skb);
> > 
> > I don't see how keeping the info->control.hw_key pointer across the
> > TXQ/FQ/Codel queueing isn't a potential bug? Probably one that
> > already exists in your code today, before this patch, of course.
> 
> You mean the key could get removed from the hardware while the packet
> was queued? Can certainly add a check for that. Under what conditions
> does that happen? Does it make sense to try to recover from it (I
> guess by calling tx_h_select_key), or is it rare enough that giving
> up and dropping the packet makes more sense?

Not just from the hardware, more importantly the whole key structure
can be kfree()d, leading to use-after-free here, no?

Fast-xmit solves this by invalidating the fast-xmit cache when the key
pointer changes/goes away and possibly punting some frames to the slow
path, but you've absolutely no protection on these pointers here within
the TXQs, afaict?

A similar situation occurs with other pointers, like stations and vifs,
but when those are removed then obviously the entire TXQs are flushed,
so they're not relevant.

With the key though, frames can be on the queue while a key is removed,
and even before this patch, drivers would consequently access an
invalid key pointer.

Mind you, as I just wrote I think that issue exists even before this
patch, so you should probably look at it separately. Felix might know
better too.

johannes

^ permalink raw reply	[flat|nested] 51+ messages in thread

* [PATCH v9 0/2] mac80211: TXQ dequeue path rework
  2016-09-06 11:44             ` [PATCH v8] " Toke Høiland-Jørgensen
  2016-09-06 22:04               ` Felix Fietkau
  2016-09-12 12:35               ` Johannes Berg
@ 2016-09-22 17:04               ` Toke Høiland-Jørgensen
  2016-09-22 17:04               ` [PATCH v9 1/2] mac80211: Move ieee802111_tx_dequeue() to later in tx.c Toke Høiland-Jørgensen
  2016-09-22 17:04               ` [PATCH v9 2/2] mac80211: Move reorder-sensitive TX handlers to after TXQ dequeue Toke Høiland-Jørgensen
  4 siblings, 0 replies; 51+ messages in thread
From: Toke Høiland-Jørgensen @ 2016-09-22 17:04 UTC (permalink / raw)
  To: make-wifi-fast, linux-wireless; +Cc: Toke Høiland-Jørgensen

This is the ninth iteration of my attempts to reorder the TXQ dequeue
path to avoid issues with reorder-sensitive operations. This version is
split into two patches; the first one moves ieee80211_tx_dequeue() to
avoid adding function stubs at the top of tx.c.

Changes since v8:
- Don't add function stubs to the beginning of tx.c
- Don't use control.hw_key from the dequeued packet, since that can go
  away while the packet is queued. Instead, run the select_key handler
  on dequeue and use the key from that.
- Change places that check tin.backlog_packets as an indication of
  whether the TXQ has anything queued to also look at the 'frags' queue.
- Don't change the order of the select_key handler with respect to the
  other handlers.
- Rebase on current mac80211-next tree.

Toke Høiland-Jørgensen (2):
  mac80211: Move ieee802111_tx_dequeue() to later in tx.c
  mac80211: Move reorder-sensitive TX handlers to after TXQ dequeue

 include/net/mac80211.h     |   2 +
 net/mac80211/ieee80211_i.h |   8 ++
 net/mac80211/rx.c          |   4 +-
 net/mac80211/sta_info.c    |  10 +-
 net/mac80211/tx.c          | 335 +++++++++++++++++++++++++++++++--------------
 net/mac80211/util.c        |  11 +-
 6 files changed, 256 insertions(+), 114 deletions(-)

-- 
2.9.3

base-commit: c13ed534b8db543e4d8ead3885f4b06585a5771c

^ permalink raw reply	[flat|nested] 51+ messages in thread

* [PATCH v9 1/2] mac80211: Move ieee802111_tx_dequeue() to later in tx.c
  2016-09-06 11:44             ` [PATCH v8] " Toke Høiland-Jørgensen
                                 ` (2 preceding siblings ...)
  2016-09-22 17:04               ` [PATCH v9 0/2] mac80211: TXQ dequeue path rework Toke Høiland-Jørgensen
@ 2016-09-22 17:04               ` Toke Høiland-Jørgensen
  2016-09-30 11:13                 ` Johannes Berg
  2016-09-22 17:04               ` [PATCH v9 2/2] mac80211: Move reorder-sensitive TX handlers to after TXQ dequeue Toke Høiland-Jørgensen
  4 siblings, 1 reply; 51+ messages in thread
From: Toke Høiland-Jørgensen @ 2016-09-22 17:04 UTC (permalink / raw)
  To: make-wifi-fast, linux-wireless; +Cc: Toke Høiland-Jørgensen

The TXQ path restructure requires ieee80211_tx_dequeue() to call TX
handlers and parts of the xmit_fast path. Move the function to later in
tx.c in preparation for this.

Signed-off-by: Toke Høiland-Jørgensen <toke@toke.dk>
---
 net/mac80211/tx.c | 90 +++++++++++++++++++++++++++----------------------------
 1 file changed, 45 insertions(+), 45 deletions(-)

diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 61d302d..e8c9964 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -1476,51 +1476,6 @@ void ieee80211_txq_teardown_flows(struct ieee80211=
_local *local)
 	spin_unlock_bh(&fq->lock);
 }
=20
-struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw,
-				     struct ieee80211_txq *txq)
-{
-	struct ieee80211_local *local =3D hw_to_local(hw);
-	struct txq_info *txqi =3D container_of(txq, struct txq_info, txq);
-	struct ieee80211_hdr *hdr;
-	struct sk_buff *skb =3D NULL;
-	struct fq *fq =3D &local->fq;
-	struct fq_tin *tin =3D &txqi->tin;
-
-	spin_lock_bh(&fq->lock);
-
-	if (test_bit(IEEE80211_TXQ_STOP, &txqi->flags))
-		goto out;
-
-	skb =3D fq_tin_dequeue(fq, tin, fq_tin_dequeue_func);
-	if (!skb)
-		goto out;
-
-	ieee80211_set_skb_vif(skb, txqi);
-
-	hdr =3D (struct ieee80211_hdr *)skb->data;
-	if (txq->sta && ieee80211_is_data_qos(hdr->frame_control)) {
-		struct sta_info *sta =3D container_of(txq->sta, struct sta_info,
-						    sta);
-		struct ieee80211_tx_info *info =3D IEEE80211_SKB_CB(skb);
-
-		hdr->seq_ctrl =3D ieee80211_tx_next_seq(sta, txq->tid);
-		if (test_bit(IEEE80211_TXQ_AMPDU, &txqi->flags))
-			info->flags |=3D IEEE80211_TX_CTL_AMPDU;
-		else
-			info->flags &=3D ~IEEE80211_TX_CTL_AMPDU;
-	}
-
-out:
-	spin_unlock_bh(&fq->lock);
-
-	if (skb && skb_has_frag_list(skb) &&
-	    !ieee80211_hw_check(&local->hw, TX_FRAG_LIST))
-		skb_linearize(skb);
-
-	return skb;
-}
-EXPORT_SYMBOL(ieee80211_tx_dequeue);
-
 static bool ieee80211_tx_frags(struct ieee80211_local *local,
 			       struct ieee80211_vif *vif,
 			       struct ieee80211_sta *sta,
@@ -3311,6 +3266,51 @@ static bool ieee80211_xmit_fast(struct ieee80211_s=
ub_if_data *sdata,
 	return true;
 }
=20
+struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw,
+				     struct ieee80211_txq *txq)
+{
+	struct ieee80211_local *local =3D hw_to_local(hw);
+	struct txq_info *txqi =3D container_of(txq, struct txq_info, txq);
+	struct ieee80211_hdr *hdr;
+	struct sk_buff *skb =3D NULL;
+	struct fq *fq =3D &local->fq;
+	struct fq_tin *tin =3D &txqi->tin;
+
+	spin_lock_bh(&fq->lock);
+
+	if (test_bit(IEEE80211_TXQ_STOP, &txqi->flags))
+		goto out;
+
+	skb =3D fq_tin_dequeue(fq, tin, fq_tin_dequeue_func);
+	if (!skb)
+		goto out;
+
+	ieee80211_set_skb_vif(skb, txqi);
+
+	hdr =3D (struct ieee80211_hdr *)skb->data;
+	if (txq->sta && ieee80211_is_data_qos(hdr->frame_control)) {
+		struct sta_info *sta =3D container_of(txq->sta, struct sta_info,
+						    sta);
+		struct ieee80211_tx_info *info =3D IEEE80211_SKB_CB(skb);
+
+		hdr->seq_ctrl =3D ieee80211_tx_next_seq(sta, txq->tid);
+		if (test_bit(IEEE80211_TXQ_AMPDU, &txqi->flags))
+			info->flags |=3D IEEE80211_TX_CTL_AMPDU;
+		else
+			info->flags &=3D ~IEEE80211_TX_CTL_AMPDU;
+	}
+
+out:
+	spin_unlock_bh(&fq->lock);
+
+	if (skb && skb_has_frag_list(skb) &&
+	    !ieee80211_hw_check(&local->hw, TX_FRAG_LIST))
+		skb_linearize(skb);
+
+	return skb;
+}
+EXPORT_SYMBOL(ieee80211_tx_dequeue);
+
 void __ieee80211_subif_start_xmit(struct sk_buff *skb,
 				  struct net_device *dev,
 				  u32 info_flags)
--=20
2.9.3

^ permalink raw reply related	[flat|nested] 51+ messages in thread

* [PATCH v9 2/2] mac80211: Move reorder-sensitive TX handlers to after TXQ dequeue
  2016-09-06 11:44             ` [PATCH v8] " Toke Høiland-Jørgensen
                                 ` (3 preceding siblings ...)
  2016-09-22 17:04               ` [PATCH v9 1/2] mac80211: Move ieee802111_tx_dequeue() to later in tx.c Toke Høiland-Jørgensen
@ 2016-09-22 17:04               ` Toke Høiland-Jørgensen
  2016-09-30 10:27                 ` Johannes Berg
  2016-09-30 12:49                 ` Johannes Berg
  4 siblings, 2 replies; 51+ messages in thread
From: Toke Høiland-Jørgensen @ 2016-09-22 17:04 UTC (permalink / raw)
  To: make-wifi-fast, linux-wireless; +Cc: Toke Høiland-Jørgensen

The TXQ intermediate queues can cause packet reordering when more than
one flow is active to a single station. Since some of the wifi-specific
packet handling (notably sequence number and encryption handling) is
sensitive to re-ordering, things break if they are applied before the
TXQ.

This splits up the TX handlers and fast_xmit logic into two parts: An
early part and a late part. The former is applied before TXQ enqueue,
and the latter after dequeue. The non-TXQ path just applies both parts
at once.

Because fragments shouldn't be split up or reordered, the fragmentation
handler is run after dequeue. Any fragments are then kept in the TXQ and
on subsequent dequeues they take precedence over dequeueing from the FQ
structure.

This approach avoids having to scatter special cases all over the place
for when TXQ is enabled, at the cost of making the fast_xmit and TX
handler code slightly more complex.

Signed-off-by: Toke Høiland-Jørgensen <toke@toke.dk>
---
 include/net/mac80211.h     |   2 +
 net/mac80211/ieee80211_i.h |   8 ++
 net/mac80211/rx.c          |   4 +-
 net/mac80211/sta_info.c    |  10 +-
 net/mac80211/tx.c          | 287 +++++++++++++++++++++++++++++++++------------
 net/mac80211/util.c        |  11 +-
 6 files changed, 232 insertions(+), 90 deletions(-)

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 5296100..9463039 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -715,6 +715,7 @@ enum mac80211_tx_info_flags {
  *	frame (PS-Poll or uAPSD).
  * @IEEE80211_TX_CTRL_RATE_INJECT: This frame is injected with rate info=
rmation
  * @IEEE80211_TX_CTRL_AMSDU: This frame is an A-MSDU frame
+ * @IEEE80211_TX_CTRL_FAST_XMIT: This frame is going through the fast_xm=
it path
  *
  * These flags are used in tx_info->control.flags.
  */
@@ -723,6 +724,7 @@ enum mac80211_tx_control_flags {
 	IEEE80211_TX_CTRL_PS_RESPONSE		=3D BIT(1),
 	IEEE80211_TX_CTRL_RATE_INJECT		=3D BIT(2),
 	IEEE80211_TX_CTRL_AMSDU			=3D BIT(3),
+	IEEE80211_TX_CTRL_FAST_XMIT		=3D BIT(4),
 };
=20
 /*
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index c71c735..caca265 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -813,12 +813,14 @@ enum txq_info_flags {
  * @def_flow: used as a fallback flow when a packet destined to @tin has=
hes to
  *	a fq_flow which is already owned by a different tin
  * @def_cvars: codel vars for @def_flow
+ * @frags: used to keep fragments created after dequeue
  */
 struct txq_info {
 	struct fq_tin tin;
 	struct fq_flow def_flow;
 	struct codel_vars def_cvars;
 	struct codel_stats cstats;
+	struct sk_buff_head frags;
 	unsigned long flags;
=20
 	/* keep last! */
@@ -1481,6 +1483,12 @@ static inline struct txq_info *to_txq_info(struct =
ieee80211_txq *txq)
 	return container_of(txq, struct txq_info, txq);
 }
=20
+static inline bool txq_has_queue(struct ieee80211_txq *txq)
+{
+	struct txq_info *txqi =3D to_txq_info(txq);
+	return !(skb_queue_empty(&txqi->frags) && !txqi->tin.backlog_packets);
+}
+
 static inline int ieee80211_bssid_match(const u8 *raddr, const u8 *addr)
 {
 	return ether_addr_equal(raddr, addr) ||
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index e796060..ae5786b8 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -1323,9 +1323,7 @@ static void sta_ps_start(struct sta_info *sta)
 		return;
=20
 	for (tid =3D 0; tid < ARRAY_SIZE(sta->sta.txq); tid++) {
-		struct txq_info *txqi =3D to_txq_info(sta->sta.txq[tid]);
-
-		if (txqi->tin.backlog_packets)
+		if (txq_has_queue(sta->sta.txq[tid]))
 			set_bit(tid, &sta->txq_buffered_tids);
 		else
 			clear_bit(tid, &sta->txq_buffered_tids);
diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c
index 1b1b28f..167bff0 100644
--- a/net/mac80211/sta_info.c
+++ b/net/mac80211/sta_info.c
@@ -1212,12 +1212,10 @@ void ieee80211_sta_ps_deliver_wakeup(struct sta_i=
nfo *sta)
=20
 	if (sta->sta.txq[0]) {
 		for (i =3D 0; i < ARRAY_SIZE(sta->sta.txq); i++) {
-			struct txq_info *txqi =3D to_txq_info(sta->sta.txq[i]);
-
-			if (!txqi->tin.backlog_packets)
+			if (!txq_has_queue(sta->sta.txq[i]))
 				continue;
=20
-			drv_wake_tx_queue(local, txqi);
+			drv_wake_tx_queue(local, to_txq_info(sta->sta.txq[i]));
 		}
 	}
=20
@@ -1649,9 +1647,7 @@ ieee80211_sta_ps_deliver_response(struct sta_info *=
sta,
 			return;
=20
 		for (tid =3D 0; tid < ARRAY_SIZE(sta->sta.txq); tid++) {
-			struct txq_info *txqi =3D to_txq_info(sta->sta.txq[tid]);
-
-			if (!(tids & BIT(tid)) || txqi->tin.backlog_packets)
+			if (!(tids & BIT(tid)) || txq_has_queue(sta->sta.txq[tid]))
 				continue;
=20
 			sta_info_recalc_tim(sta);
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index e8c9964..75e6adf 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -853,8 +853,7 @@ ieee80211_tx_h_sequence(struct ieee80211_tx_data *tx)
 	tid =3D *qc & IEEE80211_QOS_CTL_TID_MASK;
 	tx->sta->tx_stats.msdu[tid]++;
=20
-	if (!tx->sta->sta.txq[0])
-		hdr->seq_ctrl =3D ieee80211_tx_next_seq(tx->sta, tid);
+	hdr->seq_ctrl =3D ieee80211_tx_next_seq(tx->sta, tid);
=20
 	return TX_CONTINUE;
 }
@@ -1404,6 +1403,7 @@ void ieee80211_txq_init(struct ieee80211_sub_if_dat=
a *sdata,
 	fq_flow_init(&txqi->def_flow);
 	codel_vars_init(&txqi->def_cvars);
 	codel_stats_init(&txqi->cstats);
+	__skb_queue_head_init(&txqi->frags);
=20
 	txqi->txq.vif =3D &sdata->vif;
=20
@@ -1426,6 +1426,7 @@ void ieee80211_txq_purge(struct ieee80211_local *lo=
cal,
 	struct fq_tin *tin =3D &txqi->tin;
=20
 	fq_tin_reset(fq, tin, fq_skb_free_func);
+	ieee80211_purge_tx_queue(&local->hw, &txqi->frags);
 }
=20
 int ieee80211_txq_setup_flows(struct ieee80211_local *local)
@@ -1476,6 +1477,47 @@ void ieee80211_txq_teardown_flows(struct ieee80211=
_local *local)
 	spin_unlock_bh(&fq->lock);
 }
=20
+static bool ieee80211_queue_skb(struct ieee80211_local *local,
+				struct ieee80211_sub_if_data *sdata,
+				struct sta_info *sta,
+				struct sk_buff *skb)
+{
+	struct ieee80211_tx_info *info =3D IEEE80211_SKB_CB(skb);
+	struct fq *fq =3D &local->fq;
+	struct ieee80211_vif *vif;
+	struct txq_info *txqi;
+	struct ieee80211_sta *pubsta;
+
+	if (!local->ops->wake_tx_queue ||
+	    sdata->vif.type =3D=3D NL80211_IFTYPE_MONITOR)
+		return false;
+
+	if (sta && sta->uploaded)
+		pubsta =3D &sta->sta;
+	else
+		pubsta =3D NULL;
+
+	if (sdata->vif.type =3D=3D NL80211_IFTYPE_AP_VLAN)
+		sdata =3D container_of(sdata->bss,
+				     struct ieee80211_sub_if_data, u.ap);
+
+	vif =3D &sdata->vif;
+	txqi =3D ieee80211_get_txq(local, vif, pubsta, skb);
+
+	if (!txqi)
+		return false;
+
+	info->control.vif =3D vif;
+
+	spin_lock_bh(&fq->lock);
+	ieee80211_txq_enqueue(local, txqi, skb);
+	spin_unlock_bh(&fq->lock);
+
+	drv_wake_tx_queue(local, txqi);
+
+	return true;
+}
+
 static bool ieee80211_tx_frags(struct ieee80211_local *local,
 			       struct ieee80211_vif *vif,
 			       struct ieee80211_sta *sta,
@@ -1483,9 +1525,7 @@ static bool ieee80211_tx_frags(struct ieee80211_loc=
al *local,
 			       bool txpending)
 {
 	struct ieee80211_tx_control control =3D {};
-	struct fq *fq =3D &local->fq;
 	struct sk_buff *skb, *tmp;
-	struct txq_info *txqi;
 	unsigned long flags;
=20
 	skb_queue_walk_safe(skbs, skb, tmp) {
@@ -1500,21 +1540,6 @@ static bool ieee80211_tx_frags(struct ieee80211_lo=
cal *local,
 		}
 #endif
=20
-		txqi =3D ieee80211_get_txq(local, vif, sta, skb);
-		if (txqi) {
-			info->control.vif =3D vif;
-
-			__skb_unlink(skb, skbs);
-
-			spin_lock_bh(&fq->lock);
-			ieee80211_txq_enqueue(local, txqi, skb);
-			spin_unlock_bh(&fq->lock);
-
-			drv_wake_tx_queue(local, txqi);
-
-			continue;
-		}
-
 		spin_lock_irqsave(&local->queue_stop_reason_lock, flags);
 		if (local->queue_stop_reasons[q] ||
 		    (!txpending && !skb_queue_empty(&local->pending[q]))) {
@@ -1635,10 +1660,13 @@ static bool __ieee80211_tx(struct ieee80211_local=
 *local,
 /*
  * Invoke TX handlers, return 0 on success and non-zero if the
  * frame was dropped or queued.
+ *
+ * The handlers are split into an early and late part. The latter is eve=
rything
+ * that can be sensitive to reordering, and will be deferred to after pa=
ckets
+ * are dequeued from the intermediate queues (when they are enabled).
  */
-static int invoke_tx_handlers(struct ieee80211_tx_data *tx)
+static int invoke_tx_handlers_early(struct ieee80211_tx_data *tx)
 {
-	struct ieee80211_tx_info *info =3D IEEE80211_SKB_CB(tx->skb);
 	ieee80211_tx_result res =3D TX_DROP;
=20
 #define CALL_TXH(txh) \
@@ -1656,6 +1684,31 @@ static int invoke_tx_handlers(struct ieee80211_tx_=
data *tx)
 	if (!ieee80211_hw_check(&tx->local->hw, HAS_RATE_CONTROL))
 		CALL_TXH(ieee80211_tx_h_rate_ctrl);
=20
+ txh_done:
+	if (unlikely(res =3D=3D TX_DROP)) {
+		I802_DEBUG_INC(tx->local->tx_handlers_drop);
+		if (tx->skb)
+			ieee80211_free_txskb(&tx->local->hw, tx->skb);
+		else
+			ieee80211_purge_tx_queue(&tx->local->hw, &tx->skbs);
+		return -1;
+	} else if (unlikely(res =3D=3D TX_QUEUED)) {
+		I802_DEBUG_INC(tx->local->tx_handlers_queued);
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Late handlers can be called while the sta lock is held. Handlers that=
 can
+ * cause packets to be generated will cause deadlock!
+ */
+static int invoke_tx_handlers_late(struct ieee80211_tx_data *tx)
+{
+	struct ieee80211_tx_info *info =3D IEEE80211_SKB_CB(tx->skb);
+	ieee80211_tx_result res =3D TX_CONTINUE;
+
 	if (unlikely(info->flags & IEEE80211_TX_INTFL_RETRANSMISSION)) {
 		__skb_queue_tail(&tx->skbs, tx->skb);
 		tx->skb =3D NULL;
@@ -1688,6 +1741,15 @@ static int invoke_tx_handlers(struct ieee80211_tx_=
data *tx)
 	return 0;
 }
=20
+static int invoke_tx_handlers(struct ieee80211_tx_data *tx)
+{
+	int r =3D invoke_tx_handlers_early(tx);
+
+	if (r)
+		return r;
+	return invoke_tx_handlers_late(tx);
+}
+
 bool ieee80211_tx_prepare_skb(struct ieee80211_hw *hw,
 			      struct ieee80211_vif *vif, struct sk_buff *skb,
 			      int band, struct ieee80211_sta **sta)
@@ -1762,7 +1824,13 @@ static bool ieee80211_tx(struct ieee80211_sub_if_d=
ata *sdata,
 		info->hw_queue =3D
 			sdata->vif.hw_queue[skb_get_queue_mapping(skb)];
=20
-	if (!invoke_tx_handlers(&tx))
+	if (invoke_tx_handlers_early(&tx))
+		return false;
+
+	if (ieee80211_queue_skb(local, sdata, tx.sta, tx.skb))
+		return true;
+
+	if (!invoke_tx_handlers_late(&tx))
 		result =3D __ieee80211_tx(local, &tx.skbs, led_len,
 					tx.sta, txpending);
=20
@@ -3106,8 +3174,73 @@ out:
 	return ret;
 }
=20
+/*
+ * Can be called while the sta lock is held. Anything that can cause pac=
kets to
+ * be generated will cause deadlock!
+ */
+static bool ieee80211_xmit_fast_finish(struct ieee80211_sub_if_data *sda=
ta,
+				       struct sta_info *sta, u8 pn_offs,
+				       struct ieee80211_key *key,
+				       struct sk_buff *skb)
+{
+	struct ieee80211_tx_info *info =3D IEEE80211_SKB_CB(skb);
+	struct ieee80211_hdr *hdr =3D (void *)skb->data;
+	u8 tid =3D IEEE80211_NUM_TIDS;
+
+	if (key)
+		info->control.hw_key =3D &key->conf;
+
+	ieee80211_tx_stats(skb->dev, skb->len);
+
+	if (hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_QOS_DATA)) {
+		tid =3D skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK;
+		*ieee80211_get_qos_ctl(hdr) =3D tid;
+		hdr->seq_ctrl =3D ieee80211_tx_next_seq(sta, tid);
+	} else {
+		info->flags |=3D IEEE80211_TX_CTL_ASSIGN_SEQ;
+		hdr->seq_ctrl =3D cpu_to_le16(sdata->sequence_number);
+		sdata->sequence_number +=3D 0x10;
+	}
+
+	if (skb_shinfo(skb)->gso_size)
+		sta->tx_stats.msdu[tid] +=3D
+			DIV_ROUND_UP(skb->len, skb_shinfo(skb)->gso_size);
+	else
+		sta->tx_stats.msdu[tid]++;
+
+	info->hw_queue =3D sdata->vif.hw_queue[skb_get_queue_mapping(skb)];
+
+	/* statistics normally done by ieee80211_tx_h_stats (but that
+	 * has to consider fragmentation, so is more complex)
+	 */
+	sta->tx_stats.bytes[skb_get_queue_mapping(skb)] +=3D skb->len;
+	sta->tx_stats.packets[skb_get_queue_mapping(skb)]++;
+
+	if (pn_offs) {
+		u64 pn;
+		u8 *crypto_hdr =3D skb->data + pn_offs;
+
+		switch (key->conf.cipher) {
+		case WLAN_CIPHER_SUITE_CCMP:
+		case WLAN_CIPHER_SUITE_CCMP_256:
+		case WLAN_CIPHER_SUITE_GCMP:
+		case WLAN_CIPHER_SUITE_GCMP_256:
+			pn =3D atomic64_inc_return(&key->conf.tx_pn);
+			crypto_hdr[0] =3D pn;
+			crypto_hdr[1] =3D pn >> 8;
+			crypto_hdr[4] =3D pn >> 16;
+			crypto_hdr[5] =3D pn >> 24;
+			crypto_hdr[6] =3D pn >> 32;
+			crypto_hdr[7] =3D pn >> 40;
+			break;
+		}
+	}
+
+	return true;
+}
+
 static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata,
-				struct net_device *dev, struct sta_info *sta,
+				struct sta_info *sta,
 				struct ieee80211_fast_tx *fast_tx,
 				struct sk_buff *skb)
 {
@@ -3158,8 +3291,6 @@ static bool ieee80211_xmit_fast(struct ieee80211_su=
b_if_data *sdata,
 			return true;
 	}
=20
-	ieee80211_tx_stats(dev, skb->len + extra_head);
-
 	if ((hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_QOS_DATA)) &&
 	    ieee80211_amsdu_aggregate(sdata, sta, fast_tx, skb))
 		return true;
@@ -3188,24 +3319,7 @@ static bool ieee80211_xmit_fast(struct ieee80211_s=
ub_if_data *sdata,
 	info->flags =3D IEEE80211_TX_CTL_FIRST_FRAGMENT |
 		      IEEE80211_TX_CTL_DONTFRAG |
 		      (tid_tx ? IEEE80211_TX_CTL_AMPDU : 0);
-
-	if (hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_QOS_DATA)) {
-		*ieee80211_get_qos_ctl(hdr) =3D tid;
-		if (!sta->sta.txq[0])
-			hdr->seq_ctrl =3D ieee80211_tx_next_seq(sta, tid);
-	} else {
-		info->flags |=3D IEEE80211_TX_CTL_ASSIGN_SEQ;
-		hdr->seq_ctrl =3D cpu_to_le16(sdata->sequence_number);
-		sdata->sequence_number +=3D 0x10;
-	}
-
-	if (skb_shinfo(skb)->gso_size)
-		sta->tx_stats.msdu[tid] +=3D
-			DIV_ROUND_UP(skb->len, skb_shinfo(skb)->gso_size);
-	else
-		sta->tx_stats.msdu[tid]++;
-
-	info->hw_queue =3D sdata->vif.hw_queue[skb_get_queue_mapping(skb)];
+	info->control.flags =3D IEEE80211_TX_CTRL_FAST_XMIT;
=20
 	__skb_queue_head_init(&tx.skbs);
=20
@@ -3215,9 +3329,6 @@ static bool ieee80211_xmit_fast(struct ieee80211_su=
b_if_data *sdata,
 	tx.sta =3D sta;
 	tx.key =3D fast_tx->key;
=20
-	if (fast_tx->key)
-		info->control.hw_key =3D &fast_tx->key->conf;
-
 	if (!ieee80211_hw_check(&local->hw, HAS_RATE_CONTROL)) {
 		tx.skb =3D skb;
 		r =3D ieee80211_tx_h_rate_ctrl(&tx);
@@ -3231,31 +3342,11 @@ static bool ieee80211_xmit_fast(struct ieee80211_=
sub_if_data *sdata,
 		}
 	}
=20
-	/* statistics normally done by ieee80211_tx_h_stats (but that
-	 * has to consider fragmentation, so is more complex)
-	 */
-	sta->tx_stats.bytes[skb_get_queue_mapping(skb)] +=3D skb->len;
-	sta->tx_stats.packets[skb_get_queue_mapping(skb)]++;
-
-	if (fast_tx->pn_offs) {
-		u64 pn;
-		u8 *crypto_hdr =3D skb->data + fast_tx->pn_offs;
+	if (ieee80211_queue_skb(local, sdata, sta, skb))
+		return true;
=20
-		switch (fast_tx->key->conf.cipher) {
-		case WLAN_CIPHER_SUITE_CCMP:
-		case WLAN_CIPHER_SUITE_CCMP_256:
-		case WLAN_CIPHER_SUITE_GCMP:
-		case WLAN_CIPHER_SUITE_GCMP_256:
-			pn =3D atomic64_inc_return(&fast_tx->key->conf.tx_pn);
-			crypto_hdr[0] =3D pn;
-			crypto_hdr[1] =3D pn >> 8;
-			crypto_hdr[4] =3D pn >> 16;
-			crypto_hdr[5] =3D pn >> 24;
-			crypto_hdr[6] =3D pn >> 32;
-			crypto_hdr[7] =3D pn >> 40;
-			break;
-		}
-	}
+	ieee80211_xmit_fast_finish(sdata, sta, fast_tx->pn_offs,
+				   fast_tx->key, skb);
=20
 	if (sdata->vif.type =3D=3D NL80211_IFTYPE_AP_VLAN)
 		sdata =3D container_of(sdata->bss,
@@ -3275,12 +3366,22 @@ struct sk_buff *ieee80211_tx_dequeue(struct ieee8=
0211_hw *hw,
 	struct sk_buff *skb =3D NULL;
 	struct fq *fq =3D &local->fq;
 	struct fq_tin *tin =3D &txqi->tin;
+	struct ieee80211_tx_info *info;
+	struct ieee80211_tx_data tx;
+	ieee80211_tx_result r;
+
=20
 	spin_lock_bh(&fq->lock);
=20
 	if (test_bit(IEEE80211_TXQ_STOP, &txqi->flags))
 		goto out;
=20
+	/* Make sure fragments stay together. */
+	skb =3D __skb_dequeue(&txqi->frags);
+	if (skb)
+		goto out;
+
+begin:
 	skb =3D fq_tin_dequeue(fq, tin, fq_tin_dequeue_func);
 	if (!skb)
 		goto out;
@@ -3288,16 +3389,46 @@ struct sk_buff *ieee80211_tx_dequeue(struct ieee8=
0211_hw *hw,
 	ieee80211_set_skb_vif(skb, txqi);
=20
 	hdr =3D (struct ieee80211_hdr *)skb->data;
-	if (txq->sta && ieee80211_is_data_qos(hdr->frame_control)) {
+	info =3D IEEE80211_SKB_CB(skb);
+
+	memset(&tx, 0, sizeof(tx));
+	__skb_queue_head_init(&tx.skbs);
+	tx.local =3D local;
+	tx.skb =3D skb;
+	tx.sdata =3D vif_to_sdata(info->control.vif);
+
+	if (txq->sta)
+		tx.sta =3D container_of(txq->sta, struct sta_info, sta);
+
+	/*
+	 * The key can be removed while the packet was queued, so need to call
+	 * this here to get the current key.
+	 */
+	r =3D ieee80211_tx_h_select_key(&tx);
+	if (r !=3D TX_CONTINUE) {
+		ieee80211_free_txskb(&local->hw, skb);
+		goto begin;
+	}
+
+	if (txq->sta && info->control.flags & IEEE80211_TX_CTRL_FAST_XMIT) {
 		struct sta_info *sta =3D container_of(txq->sta, struct sta_info,
 						    sta);
-		struct ieee80211_tx_info *info =3D IEEE80211_SKB_CB(skb);
+		u8 pn_offs =3D 0;
=20
-		hdr->seq_ctrl =3D ieee80211_tx_next_seq(sta, txq->tid);
-		if (test_bit(IEEE80211_TXQ_AMPDU, &txqi->flags))
-			info->flags |=3D IEEE80211_TX_CTL_AMPDU;
-		else
-			info->flags &=3D ~IEEE80211_TX_CTL_AMPDU;
+		if (tx.key &&
+		    (tx.key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_IV))
+			pn_offs =3D ieee80211_hdrlen(hdr->frame_control);
+
+		ieee80211_xmit_fast_finish(sta->sdata, sta, pn_offs,
+					   tx.key, skb);
+	} else {
+		if (invoke_tx_handlers_late(&tx))
+			goto begin;
+
+		skb =3D __skb_dequeue(&tx.skbs);
+
+		if (!skb_queue_empty(&tx.skbs))
+			skb_queue_splice_tail(&tx.skbs, &txqi->frags);
 	}
=20
 out:
@@ -3335,7 +3466,7 @@ void __ieee80211_subif_start_xmit(struct sk_buff *s=
kb,
 		fast_tx =3D rcu_dereference(sta->fast_tx);
=20
 		if (fast_tx &&
-		    ieee80211_xmit_fast(sdata, dev, sta, fast_tx, skb))
+		    ieee80211_xmit_fast(sdata, sta, fast_tx, skb))
 			goto out;
 	}
=20
diff --git a/net/mac80211/util.c b/net/mac80211/util.c
index b6865d8..8006f9a 100644
--- a/net/mac80211/util.c
+++ b/net/mac80211/util.c
@@ -3393,11 +3393,18 @@ void ieee80211_txq_get_depth(struct ieee80211_txq=
 *txq,
 			     unsigned long *byte_cnt)
 {
 	struct txq_info *txqi =3D to_txq_info(txq);
+	u32 frag_cnt =3D 0, frag_bytes =3D 0;
+	struct sk_buff *skb;
+
+	skb_queue_walk(&txqi->frags, skb) {
+		frag_cnt++;
+		frag_bytes +=3D skb->len;
+	}
=20
 	if (frame_cnt)
-		*frame_cnt =3D txqi->tin.backlog_packets;
+		*frame_cnt =3D txqi->tin.backlog_packets + frag_cnt;
=20
 	if (byte_cnt)
-		*byte_cnt =3D txqi->tin.backlog_bytes;
+		*byte_cnt =3D txqi->tin.backlog_bytes + frag_bytes;
 }
 EXPORT_SYMBOL(ieee80211_txq_get_depth);
--=20
2.9.3

^ permalink raw reply related	[flat|nested] 51+ messages in thread

* Re: [PATCH v9 2/2] mac80211: Move reorder-sensitive TX handlers to after TXQ dequeue
  2016-09-22 17:04               ` [PATCH v9 2/2] mac80211: Move reorder-sensitive TX handlers to after TXQ dequeue Toke Høiland-Jørgensen
@ 2016-09-30 10:27                 ` Johannes Berg
  2016-09-30 12:39                   ` Toke Høiland-Jørgensen
  2016-09-30 12:49                 ` Johannes Berg
  1 sibling, 1 reply; 51+ messages in thread
From: Johannes Berg @ 2016-09-30 10:27 UTC (permalink / raw)
  To: Toke Høiland-Jørgensen, make-wifi-fast, linux-wireless

Hi Toke,

Sorry for the delay reviewing this.

I think I still have a few comments/questions.

> +static inline bool txq_has_queue(struct ieee80211_txq *txq)
> +{
> +	struct txq_info *txqi = to_txq_info(txq);
> +	return !(skb_queue_empty(&txqi->frags) && !txqi->tin.backlog_packets);
> +}

Tiny nit - there should probably be a blank line between the two lines
here, but I could just fix that when I apply if you don't resend anyway
for some other reason.

[snip helper stuff that looks fine]

> -	if (!tx->sta->sta.txq[0])
> -		hdr->seq_ctrl = ieee80211_tx_next_seq(tx->sta, tid);
> +	hdr->seq_ctrl = ieee80211_tx_next_seq(tx->sta, tid);

Just to make sure I get this right - this is because the handler is now
run on dequeue, so the special case is no longer needed?

>  #define CALL_TXH(txh) \
> @@ -1656,6 +1684,31 @@ static int invoke_tx_handlers(struct ieee80211_tx_data *tx)
>  	if (!ieee80211_hw_check(&tx->local->hw, HAS_RATE_CONTROL))
>  		CALL_TXH(ieee80211_tx_h_rate_ctrl);

Just for reference - the code block here that's unchanged contains
this:

        CALL_TXH(ieee80211_tx_h_dynamic_ps);
        CALL_TXH(ieee80211_tx_h_check_assoc);
        CALL_TXH(ieee80211_tx_h_ps_buf);
        CALL_TXH(ieee80211_tx_h_check_control_port_protocol);
        CALL_TXH(ieee80211_tx_h_select_key);
        if (!ieee80211_hw_check(&tx->local->hw, HAS_RATE_CONTROL))
                CALL_TXH(ieee80211_tx_h_rate_ctrl);

> +static int invoke_tx_handlers_late(struct ieee80211_tx_data *tx)
> +{
> +	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb);
> +	ieee80211_tx_result res = TX_CONTINUE;
> +
>  	if (unlikely(info->flags &
> IEEE80211_TX_INTFL_RETRANSMISSION)) {
>  		__skb_queue_tail(&tx->skbs, tx->skb);
>  		tx->skb = NULL;

And this code here is also unchanged from the original TX handler
invocation, so contains this:

        if (unlikely(info->flags & IEEE80211_TX_INTFL_RETRANSMISSION)) {
                __skb_queue_tail(&tx->skbs, tx->skb);
                tx->skb = NULL;
                goto txh_done;
        }

        CALL_TXH(ieee80211_tx_h_michael_mic_add);
        CALL_TXH(ieee80211_tx_h_sequence);
        CALL_TXH(ieee80211_tx_h_fragment);
        /* handlers after fragment must be aware of tx info fragmentation! */
        CALL_TXH(ieee80211_tx_h_stats);
        CALL_TXH(ieee80211_tx_h_encrypt);
        if (!ieee80211_hw_check(&tx->local->hw, HAS_RATE_CONTROL))
                CALL_TXH(ieee80211_tx_h_calculate_duration);

But now you have a problem (that you solved) that the key pointer can
be invalidated while you have the packet queued between the two points,
and then the tx_h_michael_mic_add and/or tx_h_encrypt would crash.

You solve this by re-running tx_h_select_key() on dequeue, but it's not
clear to me why you didn't move that to the late handlers instead?

I *think* it should commute with the rate control handler, but even so,
wouldn't it make more sense to have rate control late? Assuming the
packets are queued for some amount of time, having rate control
information queued with them would get stale.

Similarly, it seems to me that checking the control port protocol later
(or perhaps duplicating that?) would be a good idea?


> +/*
> + * Can be called while the sta lock is held. Anything that can cause
> packets to
> + * be generated will cause deadlock!
> + */
> +static bool ieee80211_xmit_fast_finish(struct ieee80211_sub_if_data
> *sdata,
> +				       struct sta_info *sta, u8
> pn_offs,
> +				       struct ieee80211_key *key,
> +				       struct sk_buff *skb)

That should be a void function now, you never check the return value
and only return true anyway.

> +	struct ieee80211_tx_info *info;
> +	struct ieee80211_tx_data tx;
> +	ieee80211_tx_result r;
> +

nit: extra blank line

>  	spin_lock_bh(&fq->lock);
>  
>  	if (test_bit(IEEE80211_TXQ_STOP, &txqi->flags))
>  		goto out;
>  
> +	/* Make sure fragments stay together. */
> +	skb = __skb_dequeue(&txqi->frags);
> +	if (skb)
> +		goto out;
> +
> +begin:

I guess now that you introduced that anyway, we should consider making
the skb_linearize() failure go there. Should be a follow-up patch
though.

> +	/*
> +	 * The key can be removed while the packet was queued, so
> need to call
> +	 * this here to get the current key.
> +	 */
> +	r = ieee80211_tx_h_select_key(&tx);
> +	if (r != TX_CONTINUE) {
> +		ieee80211_free_txskb(&local->hw, skb);
> +		goto begin;
> +	}
> +
> +	if (txq->sta && info->control.flags & IEEE80211_TX_CTRL_FAST_XMIT) {

It's a bit unfortunate that you lose fast-xmit here completely for the
key stuff, but I don't see a good way to avoid that, other than
completely rejiggering all the (possibly affected) queues when keys
change... might be very complex to do that, certainly a follow-up patch
if it's desired.

This check seems a bit weird though - how could fast-xmit be set
without a TXQ station?

> +++ b/net/mac80211/util.c
> @@ -3393,11 +3393,18 @@ void ieee80211_txq_get_depth(struct
> ieee80211_txq *txq,
>  			     unsigned long *byte_cnt)
>  {
>  	struct txq_info *txqi = to_txq_info(txq);
> +	u32 frag_cnt = 0, frag_bytes = 0;
> +	struct sk_buff *skb;
> +
> +	skb_queue_walk(&txqi->frags, skb) {
> +		frag_cnt++;
> +		frag_bytes += skb->len;
> +	}

I hope this is called infrequently :)

johannes

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH v9 1/2] mac80211: Move ieee802111_tx_dequeue() to later in tx.c
  2016-09-22 17:04               ` [PATCH v9 1/2] mac80211: Move ieee802111_tx_dequeue() to later in tx.c Toke Høiland-Jørgensen
@ 2016-09-30 11:13                 ` Johannes Berg
  0 siblings, 0 replies; 51+ messages in thread
From: Johannes Berg @ 2016-09-30 11:13 UTC (permalink / raw)
  To: Toke Høiland-Jørgensen, make-wifi-fast, linux-wireless

On Thu, 2016-09-22 at 19:04 +0200, Toke Høiland-Jørgensen wrote:
> The TXQ path restructure requires ieee80211_tx_dequeue() to call TX
> handlers and parts of the xmit_fast path. Move the function to later
> in tx.c in preparation for this.
> 
Applied.

johannes

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH v9 2/2] mac80211: Move reorder-sensitive TX handlers to after TXQ dequeue
  2016-09-30 10:27                 ` Johannes Berg
@ 2016-09-30 12:39                   ` Toke Høiland-Jørgensen
  2016-09-30 12:43                     ` Johannes Berg
  0 siblings, 1 reply; 51+ messages in thread
From: Toke Høiland-Jørgensen @ 2016-09-30 12:39 UTC (permalink / raw)
  To: Johannes Berg; +Cc: make-wifi-fast, linux-wireless

Johannes Berg <johannes@sipsolutions.net> writes:

> Hi Toke,
>
> Sorry for the delay reviewing this.
>
> I think I still have a few comments/questions.

No worries. And not terribly surprised ;)

>> +static inline bool txq_has_queue(struct ieee80211_txq *txq)
>> +{
>> +	struct txq_info *txqi = to_txq_info(txq);
>> +	return !(skb_queue_empty(&txqi->frags) && !txqi->tin.backlog_packets);
>> +}
>
> Tiny nit - there should probably be a blank line between the two lines
> here, but I could just fix that when I apply if you don't resend anyway
> for some other reason.

Noted.

> [snip helper stuff that looks fine]
>
>> -	if (!tx->sta->sta.txq[0])
>> -		hdr->seq_ctrl = ieee80211_tx_next_seq(tx->sta, tid);
>> +	hdr->seq_ctrl = ieee80211_tx_next_seq(tx->sta, tid);
>
> Just to make sure I get this right - this is because the handler is now
> run on dequeue, so the special case is no longer needed?

Yup. The same change is made in xmit_fast (but obscured by the moving of
the surrounding code into _finish()).

>>  #define CALL_TXH(txh) \
>> @@ -1656,6 +1684,31 @@ static int invoke_tx_handlers(struct ieee80211_tx_data *tx)
>>  	if (!ieee80211_hw_check(&tx->local->hw, HAS_RATE_CONTROL))
>>  		CALL_TXH(ieee80211_tx_h_rate_ctrl);
>
> Just for reference - the code block here that's unchanged contains
> this:
>
>         CALL_TXH(ieee80211_tx_h_dynamic_ps);
>         CALL_TXH(ieee80211_tx_h_check_assoc);
>         CALL_TXH(ieee80211_tx_h_ps_buf);
>         CALL_TXH(ieee80211_tx_h_check_control_port_protocol);
>         CALL_TXH(ieee80211_tx_h_select_key);
>         if (!ieee80211_hw_check(&tx->local->hw, HAS_RATE_CONTROL))
>                 CALL_TXH(ieee80211_tx_h_rate_ctrl);
>
>> +static int invoke_tx_handlers_late(struct ieee80211_tx_data *tx)
>> +{
>> +	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb);
>> +	ieee80211_tx_result res = TX_CONTINUE;
>> +
>>  	if (unlikely(info->flags &
>> IEEE80211_TX_INTFL_RETRANSMISSION)) {
>>  		__skb_queue_tail(&tx->skbs, tx->skb);
>>  		tx->skb = NULL;
>
> And this code here is also unchanged from the original TX handler
> invocation, so contains this:
>
>         if (unlikely(info->flags & IEEE80211_TX_INTFL_RETRANSMISSION)) {
>                 __skb_queue_tail(&tx->skbs, tx->skb);
>                 tx->skb = NULL;
>                 goto txh_done;
>         }
>
>         CALL_TXH(ieee80211_tx_h_michael_mic_add);
>         CALL_TXH(ieee80211_tx_h_sequence);
>         CALL_TXH(ieee80211_tx_h_fragment);
>         /* handlers after fragment must be aware of tx info fragmentation! */
>         CALL_TXH(ieee80211_tx_h_stats);
>         CALL_TXH(ieee80211_tx_h_encrypt);
>         if (!ieee80211_hw_check(&tx->local->hw, HAS_RATE_CONTROL))
>                 CALL_TXH(ieee80211_tx_h_calculate_duration);
>
> But now you have a problem (that you solved) that the key pointer can
> be invalidated while you have the packet queued between the two points,
> and then the tx_h_michael_mic_add and/or tx_h_encrypt would crash.
>
> You solve this by re-running tx_h_select_key() on dequeue, but it's not
> clear to me why you didn't move that to the late handlers instead?

Because I need to run it anyway for the xmit_fast path on dequeue. I
thought doing it this way simplifies the code (at the cost of the
handler getting called twice when xmit_fast is not active).

> I *think* it should commute with the rate control handler, but even
> so, wouldn't it make more sense to have rate control late? Assuming
> the packets are queued for some amount of time, having rate control
> information queued with them would get stale.

Yes, having rate control run at dequeue would be good, and that's what I
did initially. However, I found that this would lead to a deadlock
because the rate control handler would send out packets in some cases (I
forget the details but can go back and check if needed). And since the
dequeue function is called with the driver TXQ lock held, that would
lead to a deadlock when those packets made it to the driver TX path.

So I decided to just keep it this way for now; I plan to go poking into
the rate controller later anyway, so moving the handler to later could
be part of that.

> Similarly, it seems to me that checking the control port protocol later
> (or perhaps duplicating that?) would be a good idea?

But that handler only sets a few flags? Is
tx->sdata->control_port_protocol likely to change while the packet is
queued?

>> +/*
>> + * Can be called while the sta lock is held. Anything that can cause
>> packets to
>> + * be generated will cause deadlock!
>> + */
>> +static bool ieee80211_xmit_fast_finish(struct ieee80211_sub_if_data
>> *sdata,
>> +				       struct sta_info *sta, u8
>> pn_offs,
>> +				       struct ieee80211_key *key,
>> +				       struct sk_buff *skb)
>
> That should be a void function now, you never check the return value
> and only return true anyway.

Noted.

>> +	struct ieee80211_tx_info *info;
>> +	struct ieee80211_tx_data tx;
>> +	ieee80211_tx_result r;
>> +
>
> nit: extra blank line

The horror ;) (thought I got rid of all those; ah well, will fix)
>
>>  	spin_lock_bh(&fq->lock);
>>  
>>  	if (test_bit(IEEE80211_TXQ_STOP, &txqi->flags))
>>  		goto out;
>>  
>> +	/* Make sure fragments stay together. */
>> +	skb = __skb_dequeue(&txqi->frags);
>> +	if (skb)
>> +		goto out;
>> +
>> +begin:
>
> I guess now that you introduced that anyway, we should consider making
> the skb_linearize() failure go there. Should be a follow-up patch
> though.

Can do.

>
>> +	/*
>> +	 * The key can be removed while the packet was queued, so
>> need to call
>> +	 * this here to get the current key.
>> +	 */
>> +	r = ieee80211_tx_h_select_key(&tx);
>> +	if (r != TX_CONTINUE) {
>> +		ieee80211_free_txskb(&local->hw, skb);
>> +		goto begin;
>> +	}
>> +
>> +	if (txq->sta && info->control.flags & IEEE80211_TX_CTRL_FAST_XMIT) {
>
> It's a bit unfortunate that you lose fast-xmit here completely for the
> key stuff, but I don't see a good way to avoid that, other than
> completely rejiggering all the (possibly affected) queues when keys
> change... might be very complex to do that, certainly a follow-up
> patch if it's desired.

Yeah, figured it was better to have something that's correct and then go
back and change it if the performance hit turns out to be too high.

> This check seems a bit weird though - how could fast-xmit be set
> without a TXQ station?

I think that is probably just left over from before I introduced the
control flag. Should be fine to remove it.

>> +++ b/net/mac80211/util.c
>> @@ -3393,11 +3393,18 @@ void ieee80211_txq_get_depth(struct
>> ieee80211_txq *txq,
>>  			     unsigned long *byte_cnt)
>>  {
>>  	struct txq_info *txqi = to_txq_info(txq);
>> +	u32 frag_cnt = 0, frag_bytes = 0;
>> +	struct sk_buff *skb;
>> +
>> +	skb_queue_walk(&txqi->frags, skb) {
>> +		frag_cnt++;
>> +		frag_bytes += skb->len;
>> +	}
>
> I hope this is called infrequently :)

Well, ath10k is the only user. It does get called on each wake_tx_queue,
though, so not that infrequently. My reasoning was that since the frags
queue is never going to have more than a fairly small number of packets
in it (those produced from a single split packet), counting this way is
acceptable instead of keeping a state variable up to date. Can change it
if you disagree :)


Not sure if you want a v10, or if you're satisfied with the above
comments and will just fix up the nits on merging?

-Toke

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH v9 2/2] mac80211: Move reorder-sensitive TX handlers to after TXQ dequeue
  2016-09-30 12:39                   ` Toke Høiland-Jørgensen
@ 2016-09-30 12:43                     ` Johannes Berg
  2016-09-30 12:45                       ` Toke Høiland-Jørgensen
  0 siblings, 1 reply; 51+ messages in thread
From: Johannes Berg @ 2016-09-30 12:43 UTC (permalink / raw)
  To: Toke Høiland-Jørgensen; +Cc: make-wifi-fast, linux-wireless

> Because I need to run it anyway for the xmit_fast path on dequeue. I
> thought doing it this way simplifies the code (at the cost of the
> handler getting called twice when xmit_fast is not active).

Ok, that's fair.

> > I *think* it should commute with the rate control handler, but even
> > so, wouldn't it make more sense to have rate control late? Assuming
> > the packets are queued for some amount of time, having rate control
> > information queued with them would get stale.
> 
> Yes, having rate control run at dequeue would be good, and that's
> what I did initially. However, I found that this would lead to a
> deadlock because the rate control handler would send out packets in
> some cases (I forget the details but can go back and check if
> needed). And since the dequeue function is called with the driver TXQ
> lock held, that would lead to a deadlock when those packets made it
> to the driver TX path.

That seems really odd, but I can see how a deadlock happens then.

> So I decided to just keep it this way for now; I plan to go poking
> into the rate controller later anyway, so moving the handler to later
> could be part of that.

Sure, that's fair.

> But that handler only sets a few flags? Is
> tx->sdata->control_port_protocol likely to change while the packet is
> queued?

Oh right, I confused things there. We check the controlled port much
earlier, but anyway that should be OK.

> > It's a bit unfortunate that you lose fast-xmit here completely for
> > the key stuff, but I don't see a good way to avoid that, other than
> > completely rejiggering all the (possibly affected) queues when keys
> > change... might be very complex to do that, certainly a follow-up
> > patch if it's desired.
> 
> Yeah, figured it was better to have something that's correct and then
> go back and change it if the performance hit turns out to be too
> high.

Makes sense.

> > This check seems a bit weird though - how could fast-xmit be set
> > without a TXQ station?
> 
> I think that is probably just left over from before I introduced the
> control flag. Should be fine to remove it.

Ok.

> > 
> > > 
> > > +++ b/net/mac80211/util.c
> > > @@ -3393,11 +3393,18 @@ void ieee80211_txq_get_depth(struct
> > > ieee80211_txq *txq,
> > >  			     unsigned long *byte_cnt)
> > >  {
> > >  	struct txq_info *txqi = to_txq_info(txq);
> > > +	u32 frag_cnt = 0, frag_bytes = 0;
> > > +	struct sk_buff *skb;
> > > +
> > > +	skb_queue_walk(&txqi->frags, skb) {
> > > +		frag_cnt++;
> > > +		frag_bytes += skb->len;
> > > +	}
> > 
> > I hope this is called infrequently :)
> 
> Well, ath10k is the only user. It does get called on each
> wake_tx_queue, though, so not that infrequently. My reasoning was
> that since the frags queue is never going to have more than a fairly
> small number of packets in it (those produced from a single split
> packet), counting this way is acceptable instead of keeping a state
> variable up to date. Can change it if you disagree :)

No, I guess you're right, it can't be a long queue.

> Not sure if you want a v10, or if you're satisfied with the above
> comments and will just fix up the nits on merging?
> 

I'll fix it up. Thanks!

johannes

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH v9 2/2] mac80211: Move reorder-sensitive TX handlers to after TXQ dequeue
  2016-09-30 12:43                     ` Johannes Berg
@ 2016-09-30 12:45                       ` Toke Høiland-Jørgensen
  0 siblings, 0 replies; 51+ messages in thread
From: Toke Høiland-Jørgensen @ 2016-09-30 12:45 UTC (permalink / raw)
  To: Johannes Berg; +Cc: make-wifi-fast, linux-wireless

Johannes Berg <johannes@sipsolutions.net> writes:

>> Not sure if you want a v10, or if you're satisfied with the above
>> comments and will just fix up the nits on merging?
>> 
>
> I'll fix it up. Thanks!

Cool, thanks :)

-Toke

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH v9 2/2] mac80211: Move reorder-sensitive TX handlers to after TXQ dequeue
  2016-09-22 17:04               ` [PATCH v9 2/2] mac80211: Move reorder-sensitive TX handlers to after TXQ dequeue Toke Høiland-Jørgensen
  2016-09-30 10:27                 ` Johannes Berg
@ 2016-09-30 12:49                 ` Johannes Berg
  2016-09-30 14:01                   ` Toke Høiland-Jørgensen
  1 sibling, 1 reply; 51+ messages in thread
From: Johannes Berg @ 2016-09-30 12:49 UTC (permalink / raw)
  To: Toke Høiland-Jørgensen, make-wifi-fast, linux-wireless

Applied, with the nits fixed as discussed.

Come to think of it, if somebody is bored ;-) perhaps a hwsim option to
use TXQs (should be optional I guess) would be nice so we can exercise
this code with the wpa_supplicant hwsim tests. That would have caught
the TKIP issues etc. pretty early on too, I think.

johannes

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH v9 2/2] mac80211: Move reorder-sensitive TX handlers to after TXQ dequeue
  2016-09-30 12:49                 ` Johannes Berg
@ 2016-09-30 14:01                   ` Toke Høiland-Jørgensen
  0 siblings, 0 replies; 51+ messages in thread
From: Toke Høiland-Jørgensen @ 2016-09-30 14:01 UTC (permalink / raw)
  To: Johannes Berg; +Cc: make-wifi-fast, linux-wireless

Johannes Berg <johannes@sipsolutions.net> writes:

> Applied, with the nits fixed as discussed.

Awesome, thanks!

> Come to think of it, if somebody is bored ;-) perhaps a hwsim option
> to use TXQs (should be optional I guess) would be nice so we can
> exercise this code with the wpa_supplicant hwsim tests. That would
> have caught the TKIP issues etc. pretty early on too, I think.

Noted. I'll look into that the next time I'm bored ;)

-Toke

^ permalink raw reply	[flat|nested] 51+ messages in thread

end of thread, other threads:[~2016-09-30 14:01 UTC | newest]

Thread overview: 51+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-08-17 12:58 [PATCH] mac80211: Move crypto IV generation to after TXQ dequeue Toke Høiland-Jørgensen
2016-08-17 13:08 ` Johannes Berg
2016-08-17 13:16   ` Toke Høiland-Jørgensen
2016-08-17 13:18     ` Johannes Berg
2016-08-17 13:23       ` Toke Høiland-Jørgensen
2016-08-17 14:45 ` [PATCH v2] " Toke Høiland-Jørgensen
2016-08-17 19:49   ` Johannes Berg
2016-08-17 20:07     ` [Make-wifi-fast] " Dave Taht
2016-08-17 20:43       ` Johannes Berg
2016-08-22 14:47         ` Toke Høiland-Jørgensen
2016-08-26  8:38           ` Johannes Berg
2016-08-26  8:54             ` Toke Høiland-Jørgensen
2016-08-24 16:20   ` [PATCH v3] mac80211: Move reorder-sensitive TX handlers " Toke Høiland-Jørgensen
2016-08-30 13:15     ` [PATCH v4] " Toke Høiland-Jørgensen
2016-08-31 21:06       ` Johannes Berg
2016-09-01  8:23         ` Toke Høiland-Jørgensen
2016-09-01  8:34           ` Johannes Berg
2016-09-01  8:38             ` Toke Høiland-Jørgensen
2016-09-01  9:07               ` Johannes Berg
2016-09-01  9:20                 ` Toke Høiland-Jørgensen
2016-09-01  9:27                   ` Johannes Berg
2016-09-01  9:42                     ` Toke Høiland-Jørgensen
2016-09-01 16:03       ` [PATCH v5] " Toke Høiland-Jørgensen
2016-09-01 17:59         ` Johannes Berg
2016-09-01 18:30           ` Toke Høiland-Jørgensen
2016-09-01 18:35             ` Johannes Berg
2016-09-02  2:48         ` Jason Andryuk
2016-09-02  9:27           ` Toke Høiland-Jørgensen
2016-09-02 13:41         ` [PATCH v6] " Toke Høiland-Jørgensen
2016-09-02 14:44           ` Toke Høiland-Jørgensen
2016-09-05 11:30           ` [PATCH v7] " Toke Høiland-Jørgensen
2016-09-05 17:49             ` Felix Fietkau
2016-09-05 17:59               ` Toke Høiland-Jørgensen
2016-09-05 18:44                 ` Felix Fietkau
2016-09-06 11:43             ` Toke Høiland-Jørgensen
2016-09-06 11:45               ` Toke Høiland-Jørgensen
2016-09-06 11:44             ` [PATCH v8] " Toke Høiland-Jørgensen
2016-09-06 22:04               ` Felix Fietkau
2016-09-12 12:35               ` Johannes Berg
2016-09-12 13:08                 ` Toke Høiland-Jørgensen
2016-09-12 13:19                   ` Johannes Berg
2016-09-22 17:04               ` [PATCH v9 0/2] mac80211: TXQ dequeue path rework Toke Høiland-Jørgensen
2016-09-22 17:04               ` [PATCH v9 1/2] mac80211: Move ieee802111_tx_dequeue() to later in tx.c Toke Høiland-Jørgensen
2016-09-30 11:13                 ` Johannes Berg
2016-09-22 17:04               ` [PATCH v9 2/2] mac80211: Move reorder-sensitive TX handlers to after TXQ dequeue Toke Høiland-Jørgensen
2016-09-30 10:27                 ` Johannes Berg
2016-09-30 12:39                   ` Toke Høiland-Jørgensen
2016-09-30 12:43                     ` Johannes Berg
2016-09-30 12:45                       ` Toke Høiland-Jørgensen
2016-09-30 12:49                 ` Johannes Berg
2016-09-30 14:01                   ` Toke Høiland-Jørgensen

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.