* [PATCH 1/3] bpf: Use spinlock_t in bpf_lru_list
@ 2019-04-10 14:30 Sebastian Andrzej Siewior
  2019-04-10 14:30 ` [PATCH 2/3] bpf: Use spinlock_t in hashtab Sebastian Andrzej Siewior
                   ` (2 more replies)
  0 siblings, 3 replies; 12+ messages in thread
From: Sebastian Andrzej Siewior @ 2019-04-10 14:30 UTC (permalink / raw)
  To: bpf
  Cc: Alexei Starovoitov, Daniel Borkmann, Martin KaFai Lau, Song Liu,
	Yonghong Song, tglx, Steven Rostedt, Sebastian Andrzej Siewior

There is no difference between spinlock_t and raw_spinlock_t for !RT
kernels. bpf_lru_list_pop_free_to_local() may invoke at least three
list walks which look unbounded, so it probably makes sense to use
spinlock_t.

Make bpf_lru_list use a spinlock_t.

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
 kernel/bpf/bpf_lru_list.c | 38 +++++++++++++++++++-------------------
 kernel/bpf/bpf_lru_list.h |  4 ++--
 2 files changed, 21 insertions(+), 21 deletions(-)

diff --git a/kernel/bpf/bpf_lru_list.c b/kernel/bpf/bpf_lru_list.c
index e6ef4401a1380..40f47210c3817 100644
--- a/kernel/bpf/bpf_lru_list.c
+++ b/kernel/bpf/bpf_lru_list.c
@@ -313,9 +313,9 @@ static void bpf_lru_list_push_free(struct bpf_lru_list *l,
 	if (WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(node->type)))
 		return;
 
-	raw_spin_lock_irqsave(&l->lock, flags);
+	spin_lock_irqsave(&l->lock, flags);
 	__bpf_lru_node_move(l, node, BPF_LRU_LIST_T_FREE);
-	raw_spin_unlock_irqrestore(&l->lock, flags);
+	spin_unlock_irqrestore(&l->lock, flags);
 }
 
 static void bpf_lru_list_pop_free_to_local(struct bpf_lru *lru,
@@ -325,7 +325,7 @@ static void bpf_lru_list_pop_free_to_local(struct bpf_lru *lru,
 	struct bpf_lru_node *node, *tmp_node;
 	unsigned int nfree = 0;
 
-	raw_spin_lock(&l->lock);
+	spin_lock(&l->lock);
 
 	__local_list_flush(l, loc_l);
 
@@ -344,7 +344,7 @@ static void bpf_lru_list_pop_free_to_local(struct bpf_lru *lru,
 				      local_free_list(loc_l),
 				      BPF_LRU_LOCAL_LIST_T_FREE);
 
-	raw_spin_unlock(&l->lock);
+	spin_unlock(&l->lock);
 }
 
 static void __local_list_add_pending(struct bpf_lru *lru,
@@ -410,7 +410,7 @@ static struct bpf_lru_node *bpf_percpu_lru_pop_free(struct bpf_lru *lru,
 
 	l = per_cpu_ptr(lru->percpu_lru, cpu);
 
-	raw_spin_lock_irqsave(&l->lock, flags);
+	spin_lock_irqsave(&l->lock, flags);
 
 	__bpf_lru_list_rotate(lru, l);
 
@@ -426,7 +426,7 @@ static struct bpf_lru_node *bpf_percpu_lru_pop_free(struct bpf_lru *lru,
 		__bpf_lru_node_move(l, node, BPF_LRU_LIST_T_INACTIVE);
 	}
 
-	raw_spin_unlock_irqrestore(&l->lock, flags);
+	spin_unlock_irqrestore(&l->lock, flags);
 
 	return node;
 }
@@ -443,7 +443,7 @@ static struct bpf_lru_node *bpf_common_lru_pop_free(struct bpf_lru *lru,
 
 	loc_l = per_cpu_ptr(clru->local_list, cpu);
 
-	raw_spin_lock_irqsave(&loc_l->lock, flags);
+	spin_lock_irqsave(&loc_l->lock, flags);
 
 	node = __local_list_pop_free(loc_l);
 	if (!node) {
@@ -454,7 +454,7 @@ static struct bpf_lru_node *bpf_common_lru_pop_free(struct bpf_lru *lru,
 	if (node)
 		__local_list_add_pending(lru, loc_l, cpu, node, hash);
 
-	raw_spin_unlock_irqrestore(&loc_l->lock, flags);
+	spin_unlock_irqrestore(&loc_l->lock, flags);
 
 	if (node)
 		return node;
@@ -472,13 +472,13 @@ static struct bpf_lru_node *bpf_common_lru_pop_free(struct bpf_lru *lru,
 	do {
 		steal_loc_l = per_cpu_ptr(clru->local_list, steal);
 
-		raw_spin_lock_irqsave(&steal_loc_l->lock, flags);
+		spin_lock_irqsave(&steal_loc_l->lock, flags);
 
 		node = __local_list_pop_free(steal_loc_l);
 		if (!node)
 			node = __local_list_pop_pending(lru, steal_loc_l);
 
-		raw_spin_unlock_irqrestore(&steal_loc_l->lock, flags);
+		spin_unlock_irqrestore(&steal_loc_l->lock, flags);
 
 		steal = get_next_cpu(steal);
 	} while (!node && steal != first_steal);
@@ -486,9 +486,9 @@ static struct bpf_lru_node *bpf_common_lru_pop_free(struct bpf_lru *lru,
 	loc_l->next_steal = steal;
 
 	if (node) {
-		raw_spin_lock_irqsave(&loc_l->lock, flags);
+		spin_lock_irqsave(&loc_l->lock, flags);
 		__local_list_add_pending(lru, loc_l, cpu, node, hash);
-		raw_spin_unlock_irqrestore(&loc_l->lock, flags);
+		spin_unlock_irqrestore(&loc_l->lock, flags);
 	}
 
 	return node;
@@ -516,10 +516,10 @@ static void bpf_common_lru_push_free(struct bpf_lru *lru,
 
 		loc_l = per_cpu_ptr(lru->common_lru.local_list, node->cpu);
 
-		raw_spin_lock_irqsave(&loc_l->lock, flags);
+		spin_lock_irqsave(&loc_l->lock, flags);
 
 		if (unlikely(node->type != BPF_LRU_LOCAL_LIST_T_PENDING)) {
-			raw_spin_unlock_irqrestore(&loc_l->lock, flags);
+			spin_unlock_irqrestore(&loc_l->lock, flags);
 			goto check_lru_list;
 		}
 
@@ -527,7 +527,7 @@ static void bpf_common_lru_push_free(struct bpf_lru *lru,
 		node->ref = 0;
 		list_move(&node->list, local_free_list(loc_l));
 
-		raw_spin_unlock_irqrestore(&loc_l->lock, flags);
+		spin_unlock_irqrestore(&loc_l->lock, flags);
 		return;
 	}
 
@@ -543,11 +543,11 @@ static void bpf_percpu_lru_push_free(struct bpf_lru *lru,
 
 	l = per_cpu_ptr(lru->percpu_lru, node->cpu);
 
-	raw_spin_lock_irqsave(&l->lock, flags);
+	spin_lock_irqsave(&l->lock, flags);
 
 	__bpf_lru_node_move(l, node, BPF_LRU_LIST_T_FREE);
 
-	raw_spin_unlock_irqrestore(&l->lock, flags);
+	spin_unlock_irqrestore(&l->lock, flags);
 }
 
 void bpf_lru_push_free(struct bpf_lru *lru, struct bpf_lru_node *node)
@@ -627,7 +627,7 @@ static void bpf_lru_locallist_init(struct bpf_lru_locallist *loc_l, int cpu)
 
 	loc_l->next_steal = cpu;
 
-	raw_spin_lock_init(&loc_l->lock);
+	spin_lock_init(&loc_l->lock);
 }
 
 static void bpf_lru_list_init(struct bpf_lru_list *l)
@@ -642,7 +642,7 @@ static void bpf_lru_list_init(struct bpf_lru_list *l)
 
 	l->next_inactive_rotation = &l->lists[BPF_LRU_LIST_T_INACTIVE];
 
-	raw_spin_lock_init(&l->lock);
+	spin_lock_init(&l->lock);
 }
 
 int bpf_lru_init(struct bpf_lru *lru, bool percpu, u32 hash_offset,
diff --git a/kernel/bpf/bpf_lru_list.h b/kernel/bpf/bpf_lru_list.h
index 7d4f89b7cb841..4e1e4608f1bb0 100644
--- a/kernel/bpf/bpf_lru_list.h
+++ b/kernel/bpf/bpf_lru_list.h
@@ -36,13 +36,13 @@ struct bpf_lru_list {
 	/* The next inacitve list rotation starts from here */
 	struct list_head *next_inactive_rotation;
 
-	raw_spinlock_t lock ____cacheline_aligned_in_smp;
+	spinlock_t lock ____cacheline_aligned_in_smp;
 };
 
 struct bpf_lru_locallist {
 	struct list_head lists[NR_BPF_LRU_LOCAL_LIST_T];
 	u16 next_steal;
-	raw_spinlock_t lock;
+	spinlock_t lock;
 };
 
 struct bpf_common_lru {
-- 
2.20.1



* [PATCH 2/3] bpf: Use spinlock_t in hashtab
  2019-04-10 14:30 [PATCH 1/3] bpf: Use spinlock_t in bpf_lru_list Sebastian Andrzej Siewior
@ 2019-04-10 14:30 ` Sebastian Andrzej Siewior
  2019-04-10 14:30 ` [PATCH 3/3] bpf: Use spinlock_t in lpm_trie Sebastian Andrzej Siewior
  2019-04-10 17:08 ` [PATCH 1/3] bpf: Use spinlock_t in bpf_lru_list Yonghong Song
  2 siblings, 0 replies; 12+ messages in thread
From: Sebastian Andrzej Siewior @ 2019-04-10 14:30 UTC (permalink / raw)
  To: bpf
  Cc: Alexei Starovoitov, Daniel Borkmann, Martin KaFai Lau, Song Liu,
	Yonghong Song, tglx, Steven Rostedt, Sebastian Andrzej Siewior

There is no difference between spinlock_t and raw_spinlock_t for !RT
kernels. htab_map_update_elem() may allocate memory while it is holding
the ->lock which is not possible on RT kernels.

Make the ->lock a spinlock_t.

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
 kernel/bpf/hashtab.c | 32 ++++++++++++++++----------------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index f9274114c88d3..b4f903a5ef36e 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -27,7 +27,7 @@
 
 struct bucket {
 	struct hlist_nulls_head head;
-	raw_spinlock_t lock;
+	spinlock_t lock;
 };
 
 struct bpf_htab {
@@ -385,7 +385,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 
 	for (i = 0; i < htab->n_buckets; i++) {
 		INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i);
-		raw_spin_lock_init(&htab->buckets[i].lock);
+		spin_lock_init(&htab->buckets[i].lock);
 	}
 
 	if (prealloc) {
@@ -580,7 +580,7 @@ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)
 	b = __select_bucket(htab, tgt_l->hash);
 	head = &b->head;
 
-	raw_spin_lock_irqsave(&b->lock, flags);
+	spin_lock_irqsave(&b->lock, flags);
 
 	hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
 		if (l == tgt_l) {
@@ -588,7 +588,7 @@ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)
 			break;
 		}
 
-	raw_spin_unlock_irqrestore(&b->lock, flags);
+	spin_unlock_irqrestore(&b->lock, flags);
 
 	return l == tgt_l;
 }
@@ -842,7 +842,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
 	head = &b->head;
 
 	/* bpf_map_update_elem() can be called in_irq() */
-	raw_spin_lock_irqsave(&b->lock, flags);
+	spin_lock_irqsave(&b->lock, flags);
 
 	l_old = lookup_elem_raw(head, hash, key, key_size);
 
@@ -869,7 +869,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
 	}
 	ret = 0;
 err:
-	raw_spin_unlock_irqrestore(&b->lock, flags);
+	spin_unlock_irqrestore(&b->lock, flags);
 	return ret;
 }
 
@@ -908,7 +908,7 @@ static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
 	memcpy(l_new->key + round_up(map->key_size, 8), value, map->value_size);
 
 	/* bpf_map_update_elem() can be called in_irq() */
-	raw_spin_lock_irqsave(&b->lock, flags);
+	spin_lock_irqsave(&b->lock, flags);
 
 	l_old = lookup_elem_raw(head, hash, key, key_size);
 
@@ -927,7 +927,7 @@ static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
 	ret = 0;
 
 err:
-	raw_spin_unlock_irqrestore(&b->lock, flags);
+	spin_unlock_irqrestore(&b->lock, flags);
 
 	if (ret)
 		bpf_lru_push_free(&htab->lru, &l_new->lru_node);
@@ -963,7 +963,7 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
 	head = &b->head;
 
 	/* bpf_map_update_elem() can be called in_irq() */
-	raw_spin_lock_irqsave(&b->lock, flags);
+	spin_lock_irqsave(&b->lock, flags);
 
 	l_old = lookup_elem_raw(head, hash, key, key_size);
 
@@ -986,7 +986,7 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
 	}
 	ret = 0;
 err:
-	raw_spin_unlock_irqrestore(&b->lock, flags);
+	spin_unlock_irqrestore(&b->lock, flags);
 	return ret;
 }
 
@@ -1027,7 +1027,7 @@ static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
 	}
 
 	/* bpf_map_update_elem() can be called in_irq() */
-	raw_spin_lock_irqsave(&b->lock, flags);
+	spin_lock_irqsave(&b->lock, flags);
 
 	l_old = lookup_elem_raw(head, hash, key, key_size);
 
@@ -1049,7 +1049,7 @@ static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
 	}
 	ret = 0;
 err:
-	raw_spin_unlock_irqrestore(&b->lock, flags);
+	spin_unlock_irqrestore(&b->lock, flags);
 	if (l_new)
 		bpf_lru_push_free(&htab->lru, &l_new->lru_node);
 	return ret;
@@ -1087,7 +1087,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key)
 	b = __select_bucket(htab, hash);
 	head = &b->head;
 
-	raw_spin_lock_irqsave(&b->lock, flags);
+	spin_lock_irqsave(&b->lock, flags);
 
 	l = lookup_elem_raw(head, hash, key, key_size);
 
@@ -1097,7 +1097,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key)
 		ret = 0;
 	}
 
-	raw_spin_unlock_irqrestore(&b->lock, flags);
+	spin_unlock_irqrestore(&b->lock, flags);
 	return ret;
 }
 
@@ -1119,7 +1119,7 @@ static int htab_lru_map_delete_elem(struct bpf_map *map, void *key)
 	b = __select_bucket(htab, hash);
 	head = &b->head;
 
-	raw_spin_lock_irqsave(&b->lock, flags);
+	spin_lock_irqsave(&b->lock, flags);
 
 	l = lookup_elem_raw(head, hash, key, key_size);
 
@@ -1128,7 +1128,7 @@ static int htab_lru_map_delete_elem(struct bpf_map *map, void *key)
 		ret = 0;
 	}
 
-	raw_spin_unlock_irqrestore(&b->lock, flags);
+	spin_unlock_irqrestore(&b->lock, flags);
 	if (l)
 		bpf_lru_push_free(&htab->lru, &l->lru_node);
 	return ret;
-- 
2.20.1



* [PATCH 3/3] bpf: Use spinlock_t in lpm_trie
  2019-04-10 14:30 [PATCH 1/3] bpf: Use spinlock_t in bpf_lru_list Sebastian Andrzej Siewior
  2019-04-10 14:30 ` [PATCH 2/3] bpf: Use spinlock_t in hashtab Sebastian Andrzej Siewior
@ 2019-04-10 14:30 ` Sebastian Andrzej Siewior
  2019-04-11 20:36   ` Song Liu
  2019-04-10 17:08 ` [PATCH 1/3] bpf: Use spinlock_t in bpf_lru_list Yonghong Song
  2 siblings, 1 reply; 12+ messages in thread
From: Sebastian Andrzej Siewior @ 2019-04-10 14:30 UTC (permalink / raw)
  To: bpf
  Cc: Alexei Starovoitov, Daniel Borkmann, Martin KaFai Lau, Song Liu,
	Yonghong Song, tglx, Steven Rostedt, Sebastian Andrzej Siewior

There is no difference between spinlock_t and raw_spinlock_t for !RT
kernels. trie_update_elem() will allocate memory while holding ->lock
which is not possible on RT kernels.

Make the ->lock a spinlock_t.

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
 kernel/bpf/lpm_trie.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index 93a5cbbde421c..2ceb32452b594 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -37,7 +37,7 @@ struct lpm_trie {
 	size_t				n_entries;
 	size_t				max_prefixlen;
 	size_t				data_size;
-	raw_spinlock_t			lock;
+	spinlock_t			lock;
 };
 
 /* This trie implements a longest prefix match algorithm that can be used to
@@ -318,7 +318,7 @@ static int trie_update_elem(struct bpf_map *map,
 	if (key->prefixlen > trie->max_prefixlen)
 		return -EINVAL;
 
-	raw_spin_lock_irqsave(&trie->lock, irq_flags);
+	spin_lock_irqsave(&trie->lock, irq_flags);
 
 	/* Allocate and fill a new node */
 
@@ -425,7 +425,7 @@ static int trie_update_elem(struct bpf_map *map,
 		kfree(im_node);
 	}
 
-	raw_spin_unlock_irqrestore(&trie->lock, irq_flags);
+	spin_unlock_irqrestore(&trie->lock, irq_flags);
 
 	return ret;
 }
@@ -445,7 +445,7 @@ static int trie_delete_elem(struct bpf_map *map, void *_key)
 	if (key->prefixlen > trie->max_prefixlen)
 		return -EINVAL;
 
-	raw_spin_lock_irqsave(&trie->lock, irq_flags);
+	spin_lock_irqsave(&trie->lock, irq_flags);
 
 	/* Walk the tree looking for an exact key/length match and keeping
 	 * track of the path we traverse.  We will need to know the node
@@ -521,7 +521,7 @@ static int trie_delete_elem(struct bpf_map *map, void *_key)
 	kfree_rcu(node, rcu);
 
 out:
-	raw_spin_unlock_irqrestore(&trie->lock, irq_flags);
+	spin_unlock_irqrestore(&trie->lock, irq_flags);
 
 	return ret;
 }
@@ -583,7 +583,7 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr)
 	if (ret)
 		goto out_err;
 
-	raw_spin_lock_init(&trie->lock);
+	spin_lock_init(&trie->lock);
 
 	return &trie->map;
 out_err:
-- 
2.20.1



* Re: [PATCH 1/3] bpf: Use spinlock_t in bpf_lru_list
  2019-04-10 14:30 [PATCH 1/3] bpf: Use spinlock_t in bpf_lru_list Sebastian Andrzej Siewior
  2019-04-10 14:30 ` [PATCH 2/3] bpf: Use spinlock_t in hashtab Sebastian Andrzej Siewior
  2019-04-10 14:30 ` [PATCH 3/3] bpf: Use spinlock_t in lpm_trie Sebastian Andrzej Siewior
@ 2019-04-10 17:08 ` Yonghong Song
  2019-04-10 17:19   ` Daniel Borkmann
  2 siblings, 1 reply; 12+ messages in thread
From: Yonghong Song @ 2019-04-10 17:08 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior, bpf
  Cc: Alexei Starovoitov, Daniel Borkmann, Martin Lau, Song Liu, tglx,
	Steven Rostedt



On 4/10/19 7:30 AM, Sebastian Andrzej Siewior wrote:
> There is no difference between spinlock_t and raw_spinlock_t for !RT
> kernels. bpf_lru_list_pop_free_to_local() may invoke at least three
> list walks which look unbounded, so it probably makes sense to use
> spinlock_t.
> 
> Make bpf_lru_list use a spinlock_t.

Could you add a cover letter for the patch set since you have
more than one patch?

> 
> Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
> ---
>   kernel/bpf/bpf_lru_list.c | 38 +++++++++++++++++++-------------------
>   kernel/bpf/bpf_lru_list.h |  4 ++--
>   2 files changed, 21 insertions(+), 21 deletions(-)
> 
> diff --git a/kernel/bpf/bpf_lru_list.c b/kernel/bpf/bpf_lru_list.c
> index e6ef4401a1380..40f47210c3817 100644
> --- a/kernel/bpf/bpf_lru_list.c
> +++ b/kernel/bpf/bpf_lru_list.c
> @@ -313,9 +313,9 @@ static void bpf_lru_list_push_free(struct bpf_lru_list *l,
>   	if (WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(node->type)))
>   		return;
>   
> -	raw_spin_lock_irqsave(&l->lock, flags);
> +	spin_lock_irqsave(&l->lock, flags);
>   	__bpf_lru_node_move(l, node, BPF_LRU_LIST_T_FREE);
> -	raw_spin_unlock_irqrestore(&l->lock, flags);
> +	spin_unlock_irqrestore(&l->lock, flags);
>   }

This function (plus many others) is called by bpf program helpers which
cannot sleep. Is it possible that under RT spin_lock_irqsave() could
sleep, making the bpf subsystem unusable under RT?

>   
>   static void bpf_lru_list_pop_free_to_local(struct bpf_lru *lru,
> @@ -325,7 +325,7 @@ static void bpf_lru_list_pop_free_to_local(struct bpf_lru *lru,
>   	struct bpf_lru_node *node, *tmp_node;
>   	unsigned int nfree = 0;
>   
> -	raw_spin_lock(&l->lock);
> +	spin_lock(&l->lock);
>   
>   	__local_list_flush(l, loc_l);
>   
> @@ -344,7 +344,7 @@ static void bpf_lru_list_pop_free_to_local(struct bpf_lru *lru,
>   				      local_free_list(loc_l),
>   				      BPF_LRU_LOCAL_LIST_T_FREE);
>   
> -	raw_spin_unlock(&l->lock);
> +	spin_unlock(&l->lock);
>   }
>   
>   static void __local_list_add_pending(struct bpf_lru *lru,
[...]
>   static void bpf_lru_list_init(struct bpf_lru_list *l)
> @@ -642,7 +642,7 @@ static void bpf_lru_list_init(struct bpf_lru_list *l)
>   
>   	l->next_inactive_rotation = &l->lists[BPF_LRU_LIST_T_INACTIVE];
>   
> -	raw_spin_lock_init(&l->lock);
> +	spin_lock_init(&l->lock);
>   }
>   
>   int bpf_lru_init(struct bpf_lru *lru, bool percpu, u32 hash_offset,
> diff --git a/kernel/bpf/bpf_lru_list.h b/kernel/bpf/bpf_lru_list.h
> index 7d4f89b7cb841..4e1e4608f1bb0 100644
> --- a/kernel/bpf/bpf_lru_list.h
> +++ b/kernel/bpf/bpf_lru_list.h
> @@ -36,13 +36,13 @@ struct bpf_lru_list {
>   	/* The next inacitve list rotation starts from here */
>   	struct list_head *next_inactive_rotation;
>   
> -	raw_spinlock_t lock ____cacheline_aligned_in_smp;
> +	spinlock_t lock ____cacheline_aligned_in_smp;
>   };
>   
>   struct bpf_lru_locallist {
>   	struct list_head lists[NR_BPF_LRU_LOCAL_LIST_T];
>   	u16 next_steal;
> -	raw_spinlock_t lock;
> +	spinlock_t lock;
>   };
>   
>   struct bpf_common_lru {
> 


* Re: [PATCH 1/3] bpf: Use spinlock_t in bpf_lru_list
  2019-04-10 17:08 ` [PATCH 1/3] bpf: Use spinlock_t in bpf_lru_list Yonghong Song
@ 2019-04-10 17:19   ` Daniel Borkmann
  2019-04-10 17:44     ` Sebastian Andrzej Siewior
  0 siblings, 1 reply; 12+ messages in thread
From: Daniel Borkmann @ 2019-04-10 17:19 UTC (permalink / raw)
  To: Yonghong Song, Sebastian Andrzej Siewior, bpf
  Cc: Alexei Starovoitov, Martin Lau, Song Liu, tglx, Steven Rostedt

On 04/10/2019 07:08 PM, Yonghong Song wrote:
> On 4/10/19 7:30 AM, Sebastian Andrzej Siewior wrote:
>> There is no difference between spinlock_t and raw_spinlock_t for !RT
>> kernels. bpf_lru_list_pop_free_to_local() may invoke at least three
>> list walks which look unbounded, so it probably makes sense to use
>> spinlock_t.
>>
>> Make bpf_lru_list use a spinlock_t.
> 
> Could you add a cover letter for the patch set since you have
> more than one patch?
> 
>> Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
>> ---
>>   kernel/bpf/bpf_lru_list.c | 38 +++++++++++++++++++-------------------
>>   kernel/bpf/bpf_lru_list.h |  4 ++--
>>   2 files changed, 21 insertions(+), 21 deletions(-)
>>
>> diff --git a/kernel/bpf/bpf_lru_list.c b/kernel/bpf/bpf_lru_list.c
>> index e6ef4401a1380..40f47210c3817 100644
>> --- a/kernel/bpf/bpf_lru_list.c
>> +++ b/kernel/bpf/bpf_lru_list.c
>> @@ -313,9 +313,9 @@ static void bpf_lru_list_push_free(struct bpf_lru_list *l,
>>   	if (WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(node->type)))
>>   		return;
>>   
>> -	raw_spin_lock_irqsave(&l->lock, flags);
>> +	spin_lock_irqsave(&l->lock, flags);
>>   	__bpf_lru_node_move(l, node, BPF_LRU_LIST_T_FREE);
>> -	raw_spin_unlock_irqrestore(&l->lock, flags);
>> +	spin_unlock_irqrestore(&l->lock, flags);
>>   }
> 
> This function (plus many others) is called by bpf program helpers which
> cannot sleep. Is it possible that under RT spin_lock_irqsave() could
> sleep, making the bpf subsystem unusable under RT?

Back then in ac00881f9221 ("bpf: convert hashtab lock to raw lock")
Yang Shi converted spinlock_t to raw_spinlock_t due to exactly the
above mentioned issue. I presume that hasn't changed, right?

>>   static void bpf_lru_list_pop_free_to_local(struct bpf_lru *lru,
>> @@ -325,7 +325,7 @@ static void bpf_lru_list_pop_free_to_local(struct bpf_lru *lru,
>>   	struct bpf_lru_node *node, *tmp_node;
>>   	unsigned int nfree = 0;
>>   
>> -	raw_spin_lock(&l->lock);
>> +	spin_lock(&l->lock);
>>   
>>   	__local_list_flush(l, loc_l);
>>   
>> @@ -344,7 +344,7 @@ static void bpf_lru_list_pop_free_to_local(struct bpf_lru *lru,
>>   				      local_free_list(loc_l),
>>   				      BPF_LRU_LOCAL_LIST_T_FREE);
>>   
>> -	raw_spin_unlock(&l->lock);
>> +	spin_unlock(&l->lock);
>>   }
>>   
>>   static void __local_list_add_pending(struct bpf_lru *lru,
> [...]
>>   static void bpf_lru_list_init(struct bpf_lru_list *l)
>> @@ -642,7 +642,7 @@ static void bpf_lru_list_init(struct bpf_lru_list *l)
>>   
>>   	l->next_inactive_rotation = &l->lists[BPF_LRU_LIST_T_INACTIVE];
>>   
>> -	raw_spin_lock_init(&l->lock);
>> +	spin_lock_init(&l->lock);
>>   }
>>   
>>   int bpf_lru_init(struct bpf_lru *lru, bool percpu, u32 hash_offset,
>> diff --git a/kernel/bpf/bpf_lru_list.h b/kernel/bpf/bpf_lru_list.h
>> index 7d4f89b7cb841..4e1e4608f1bb0 100644
>> --- a/kernel/bpf/bpf_lru_list.h
>> +++ b/kernel/bpf/bpf_lru_list.h
>> @@ -36,13 +36,13 @@ struct bpf_lru_list {
>>   	/* The next inacitve list rotation starts from here */
>>   	struct list_head *next_inactive_rotation;
>>   
>> -	raw_spinlock_t lock ____cacheline_aligned_in_smp;
>> +	spinlock_t lock ____cacheline_aligned_in_smp;
>>   };
>>   
>>   struct bpf_lru_locallist {
>>   	struct list_head lists[NR_BPF_LRU_LOCAL_LIST_T];
>>   	u16 next_steal;
>> -	raw_spinlock_t lock;
>> +	spinlock_t lock;
>>   };
>>   
>>   struct bpf_common_lru {
>>



* Re: [PATCH 1/3] bpf: Use spinlock_t in bpf_lru_list
  2019-04-10 17:19   ` Daniel Borkmann
@ 2019-04-10 17:44     ` Sebastian Andrzej Siewior
  2019-04-10 18:35       ` Yonghong Song
  2019-04-10 19:44       ` Daniel Borkmann
  0 siblings, 2 replies; 12+ messages in thread
From: Sebastian Andrzej Siewior @ 2019-04-10 17:44 UTC (permalink / raw)
  To: Daniel Borkmann
  Cc: Yonghong Song, bpf, Alexei Starovoitov, Martin Lau, Song Liu,
	tglx, Steven Rostedt

On 2019-04-10 19:19:27 [+0200], Daniel Borkmann wrote:
> On 04/10/2019 07:08 PM, Yonghong Song wrote:
> > On 4/10/19 7:30 AM, Sebastian Andrzej Siewior wrote:
> >> There is no difference between spinlock_t and raw_spinlock_t for !RT
> >> kernels. bpf_lru_list_pop_free_to_local() may invoke at least three
> >> list walks which look unbounded, so it probably makes sense to use
> >> spinlock_t.
> >>
> >> Make bpf_lru_list use a spinlock_t.
> > 
> > Could you add a cover letter for the patch set since you have
> > more than one patch?

yes.

> >> Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
> >> ---
> >>   kernel/bpf/bpf_lru_list.c | 38 +++++++++++++++++++-------------------
> >>   kernel/bpf/bpf_lru_list.h |  4 ++--
> >>   2 files changed, 21 insertions(+), 21 deletions(-)
> >>
> >> diff --git a/kernel/bpf/bpf_lru_list.c b/kernel/bpf/bpf_lru_list.c
> >> index e6ef4401a1380..40f47210c3817 100644
> >> --- a/kernel/bpf/bpf_lru_list.c
> >> +++ b/kernel/bpf/bpf_lru_list.c
> >> @@ -313,9 +313,9 @@ static void bpf_lru_list_push_free(struct bpf_lru_list *l,
> >>   	if (WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(node->type)))
> >>   		return;
> >>   
> >> -	raw_spin_lock_irqsave(&l->lock, flags);
> >> +	spin_lock_irqsave(&l->lock, flags);
> >>   	__bpf_lru_node_move(l, node, BPF_LRU_LIST_T_FREE);
> >> -	raw_spin_unlock_irqrestore(&l->lock, flags);
> >> +	spin_unlock_irqrestore(&l->lock, flags);
> >>   }
> > 
> > This function (plus many others) is called by bpf program helpers which
> > cannot sleep. Is it possible that under RT spin_lock_irqsave() could
> > sleep, making the bpf subsystem unusable under RT?
> 
> Back then in ac00881f9221 ("bpf: convert hashtab lock to raw lock")
> Yang Shi converted spinlock_t to raw_spinlock_t due to exactly the
> above mentioned issue. I presume that hasn't changed, right?

Ah. I checked one or two of those and it looked like it was raw since
the beginning. Anyway, it would be nice to Cc: the RT developer while
fiddling with something that only concerns RT.

That usage pattern that is mentioned in ac00881f9221, is it true for all
data structure algorithms? In bpf_lru_list I was concerned about the
list loops. However hashtab and lpm_trie may perform memory allocations
while holding the lock and this isn't going to work.

Sebastian


* Re: [PATCH 1/3] bpf: Use spinlock_t in bpf_lru_list
  2019-04-10 17:44     ` Sebastian Andrzej Siewior
@ 2019-04-10 18:35       ` Yonghong Song
  2019-04-12 15:38         ` Sebastian Andrzej Siewior
  2019-04-10 19:44       ` Daniel Borkmann
  1 sibling, 1 reply; 12+ messages in thread
From: Yonghong Song @ 2019-04-10 18:35 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior, Daniel Borkmann
  Cc: bpf, Alexei Starovoitov, Martin Lau, Song Liu, tglx, Steven Rostedt



On 4/10/19 10:44 AM, Sebastian Andrzej Siewior wrote:
> On 2019-04-10 19:19:27 [+0200], Daniel Borkmann wrote:
>> On 04/10/2019 07:08 PM, Yonghong Song wrote:
>>> On 4/10/19 7:30 AM, Sebastian Andrzej Siewior wrote:
>>>> There is no difference between spinlock_t and raw_spinlock_t for !RT
>>>> kernels. bpf_lru_list_pop_free_to_local() may invoke at least three
>>>> list walks which look unbounded, so it probably makes sense to use
>>>> spinlock_t.
>>>>
>>>> Make bpf_lru_list use a spinlock_t.
>>>
>>> Could you add a cover letter for the patch set since you have
>>> more than one patch?
> 
> yes.
> 
>>>> Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
>>>> ---
>>>>    kernel/bpf/bpf_lru_list.c | 38 +++++++++++++++++++-------------------
>>>>    kernel/bpf/bpf_lru_list.h |  4 ++--
>>>>    2 files changed, 21 insertions(+), 21 deletions(-)
>>>>
>>>> diff --git a/kernel/bpf/bpf_lru_list.c b/kernel/bpf/bpf_lru_list.c
>>>> index e6ef4401a1380..40f47210c3817 100644
>>>> --- a/kernel/bpf/bpf_lru_list.c
>>>> +++ b/kernel/bpf/bpf_lru_list.c
>>>> @@ -313,9 +313,9 @@ static void bpf_lru_list_push_free(struct bpf_lru_list *l,
>>>>    	if (WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(node->type)))
>>>>    		return;
>>>>    
>>>> -	raw_spin_lock_irqsave(&l->lock, flags);
>>>> +	spin_lock_irqsave(&l->lock, flags);
>>>>    	__bpf_lru_node_move(l, node, BPF_LRU_LIST_T_FREE);
>>>> -	raw_spin_unlock_irqrestore(&l->lock, flags);
>>>> +	spin_unlock_irqrestore(&l->lock, flags);
>>>>    }
>>>
>>> This function (plus many others) is called by bpf program helpers which
>>> cannot sleep. Is it possible that under RT spin_lock_irqsave() could
>>> sleep, making the bpf subsystem unusable under RT?
>>
>> Back then in ac00881f9221 ("bpf: convert hashtab lock to raw lock")
>> Yang Shi converted spinlock_t to raw_spinlock_t due to exactly the
>> above mentioned issue. I presume that hasn't changed, right?
> 
> Ah. I checked one or two of those and it looked like it was raw since
> the beginning. Anyway, it would be nice to Cc: the RT developer while
> fiddling with something that only concerns RT.
> 
> That usage pattern that is mentioned in ac00881f9221, is it true for all
> data structure algorithms? In bpf_lru_list I was concerned about the
> list loops. 
For some skewed hash tables with a lot of elements, yes, it is possible
that the time to traverse the loop could be a little bit longer...

> However hashtab and lpm_trie may perform memory allocations
> while holding the lock and this isn't going to work.

The memory allocation in these two places uses GFP_ATOMIC which should 
not sleep/block.
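
For reference, a minimal sketch of the pattern being discussed (names are
illustrative only, not the actual hashtab code): an element is allocated
with GFP_ATOMIC while a per-bucket raw spinlock is held, so the request must
be satisfiable without sleeping.

  #include <linux/slab.h>
  #include <linux/spinlock.h>
  #include <linux/list.h>
  #include <linux/errno.h>

  struct example_elem {
      struct hlist_node node;
  };

  struct example_bucket {
      struct hlist_head head;
      raw_spinlock_t lock;
  };

  static int example_update(struct example_bucket *b)
  {
      struct example_elem *new;
      unsigned long flags;

      raw_spin_lock_irqsave(&b->lock, flags);
      /* allocation happens inside the locked, non-sleeping section */
      new = kmalloc(sizeof(*new), GFP_ATOMIC);
      if (!new) {
          raw_spin_unlock_irqrestore(&b->lock, flags);
          return -ENOMEM;
      }
      hlist_add_head(&new->node, &b->head);
      raw_spin_unlock_irqrestore(&b->lock, flags);
      return 0;
  }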

> 
> Sebastian
> 


* Re: [PATCH 1/3] bpf: Use spinlock_t in bpf_lru_list
  2019-04-10 17:44     ` Sebastian Andrzej Siewior
  2019-04-10 18:35       ` Yonghong Song
@ 2019-04-10 19:44       ` Daniel Borkmann
  2019-04-12 16:14         ` Sebastian Andrzej Siewior
  1 sibling, 1 reply; 12+ messages in thread
From: Daniel Borkmann @ 2019-04-10 19:44 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior
  Cc: Yonghong Song, bpf, Alexei Starovoitov, Martin Lau, Song Liu,
	tglx, Steven Rostedt

On 04/10/2019 07:44 PM, Sebastian Andrzej Siewior wrote:
> On 2019-04-10 19:19:27 [+0200], Daniel Borkmann wrote:
>> On 04/10/2019 07:08 PM, Yonghong Song wrote:
>>> On 4/10/19 7:30 AM, Sebastian Andrzej Siewior wrote:
>>>> There is no difference between spinlock_t and raw_spinlock_t for !RT
>>>> kernels. bpf_lru_list_pop_free_to_local() may invoke at least three
>>>> list walks which look unbounded, so it probably makes sense to use
>>>> spinlock_t.
>>>>
>>>> Make bpf_lru_list use a spinlock_t.
>>>
>>> Could you add a cover letter for the patch set since you have
>>> more than one patch?
> 
> yes.
> 
>>>> Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
>>>> ---
>>>>   kernel/bpf/bpf_lru_list.c | 38 +++++++++++++++++++-------------------
>>>>   kernel/bpf/bpf_lru_list.h |  4 ++--
>>>>   2 files changed, 21 insertions(+), 21 deletions(-)
>>>>
>>>> diff --git a/kernel/bpf/bpf_lru_list.c b/kernel/bpf/bpf_lru_list.c
>>>> index e6ef4401a1380..40f47210c3817 100644
>>>> --- a/kernel/bpf/bpf_lru_list.c
>>>> +++ b/kernel/bpf/bpf_lru_list.c
>>>> @@ -313,9 +313,9 @@ static void bpf_lru_list_push_free(struct bpf_lru_list *l,
>>>>   	if (WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(node->type)))
>>>>   		return;
>>>>   
>>>> -	raw_spin_lock_irqsave(&l->lock, flags);
>>>> +	spin_lock_irqsave(&l->lock, flags);
>>>>   	__bpf_lru_node_move(l, node, BPF_LRU_LIST_T_FREE);
>>>> -	raw_spin_unlock_irqrestore(&l->lock, flags);
>>>> +	spin_unlock_irqrestore(&l->lock, flags);
>>>>   }
>>>
>>> This function (plus many others) is called by bpf program helpers which
>>> cannot sleep. Is it possible that under RT spin_lock_irqsave() could
>>> sleep, making the bpf subsystem unusable under RT?
>>
>> Back then in ac00881f9221 ("bpf: convert hashtab lock to raw lock")
>> Yang Shi converted spinlock_t to raw_spinlock_t due to exactly the
>> above mentioned issue. I presume that hasn't changed, right?
> 
> Ah. I checked one or two of those and it looked like it was raw since
> the beginning. Anyway, it would be nice to Cc: the RT developer while

The later ones like the LRU may have just been adapted to use the
same after the conversion from ac00881f9221, but I presume there might
unfortunately be little to no testing on RT kernels currently. Hopefully
we'll get there such that at min bots would yell if something is off
so it can be fixed.

> fiddling with something that only concerns RT.

I think that was the case back then, see discussion / Cc list:

https://lore.kernel.org/netdev/1446243386-26582-1-git-send-email-yang.shi@linaro.org/T/

> That usage pattern that is mentioned in ac00881f9221, is it true for all
> data structure algorithms? In bpf_lru_list I was concerned about the
> list loops. However hashtab and lpm_trie may perform memory allocations
> while holding the lock and this isn't going to work.

Do you have some document or guide for such typical patterns and how to
make them behave better for RT?

Thanks,
Daniel


* Re: [PATCH 3/3] bpf: Use spinlock_t in lpm_trie
  2019-04-10 14:30 ` [PATCH 3/3] bpf: Use spinlock_t in lpm_trie Sebastian Andrzej Siewior
@ 2019-04-11 20:36   ` Song Liu
  2019-04-12 16:23     ` Sebastian Andrzej Siewior
  0 siblings, 1 reply; 12+ messages in thread
From: Song Liu @ 2019-04-11 20:36 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior
  Cc: bpf, Alexei Starovoitov, Daniel Borkmann, Martin KaFai Lau,
	Song Liu, Yonghong Song, Thomas Gleixner, Steven Rostedt

Hi,

On Wed, Apr 10, 2019 at 7:31 AM Sebastian Andrzej Siewior
<bigeasy@linutronix.de> wrote:
>
> There is no difference between spinlock_t and raw_spinlock_t for !RT
> kernels. trie_update_elem() will allocate memory while holding ->lock
> which is not possible on RT kernels.

I am new to the RT kernel. For a !RT kernel, it is OK to hold a lock and
call kmalloc() with GFP_ATOMIC. Is this different on an RT kernel?

Thanks,
Song

>
> Make the ->lock a spinlock_t.
>
> Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
> ---
>  kernel/bpf/lpm_trie.c | 12 ++++++------
>  1 file changed, 6 insertions(+), 6 deletions(-)
>
> diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
> index 93a5cbbde421c..2ceb32452b594 100644
> --- a/kernel/bpf/lpm_trie.c
> +++ b/kernel/bpf/lpm_trie.c
> @@ -37,7 +37,7 @@ struct lpm_trie {
>         size_t                          n_entries;
>         size_t                          max_prefixlen;
>         size_t                          data_size;
> -       raw_spinlock_t                  lock;
> +       spinlock_t                      lock;
>  };
>
>  /* This trie implements a longest prefix match algorithm that can be used to
> @@ -318,7 +318,7 @@ static int trie_update_elem(struct bpf_map *map,
>         if (key->prefixlen > trie->max_prefixlen)
>                 return -EINVAL;
>
> -       raw_spin_lock_irqsave(&trie->lock, irq_flags);
> +       spin_lock_irqsave(&trie->lock, irq_flags);
>
>         /* Allocate and fill a new node */
>
> @@ -425,7 +425,7 @@ static int trie_update_elem(struct bpf_map *map,
>                 kfree(im_node);
>         }
>
> -       raw_spin_unlock_irqrestore(&trie->lock, irq_flags);
> +       spin_unlock_irqrestore(&trie->lock, irq_flags);
>
>         return ret;
>  }
> @@ -445,7 +445,7 @@ static int trie_delete_elem(struct bpf_map *map, void *_key)
>         if (key->prefixlen > trie->max_prefixlen)
>                 return -EINVAL;
>
> -       raw_spin_lock_irqsave(&trie->lock, irq_flags);
> +       spin_lock_irqsave(&trie->lock, irq_flags);
>
>         /* Walk the tree looking for an exact key/length match and keeping
>          * track of the path we traverse.  We will need to know the node
> @@ -521,7 +521,7 @@ static int trie_delete_elem(struct bpf_map *map, void *_key)
>         kfree_rcu(node, rcu);
>
>  out:
> -       raw_spin_unlock_irqrestore(&trie->lock, irq_flags);
> +       spin_unlock_irqrestore(&trie->lock, irq_flags);
>
>         return ret;
>  }
> @@ -583,7 +583,7 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr)
>         if (ret)
>                 goto out_err;
>
> -       raw_spin_lock_init(&trie->lock);
> +       spin_lock_init(&trie->lock);
>
>         return &trie->map;
>  out_err:
> --
> 2.20.1
>


* Re: [PATCH 1/3] bpf: Use spinlock_t in bpf_lru_list
  2019-04-10 18:35       ` Yonghong Song
@ 2019-04-12 15:38         ` Sebastian Andrzej Siewior
  0 siblings, 0 replies; 12+ messages in thread
From: Sebastian Andrzej Siewior @ 2019-04-12 15:38 UTC (permalink / raw)
  To: Yonghong Song
  Cc: Daniel Borkmann, bpf, Alexei Starovoitov, Martin Lau, Song Liu,
	tglx, Steven Rostedt

On 2019-04-10 18:35:21 [+0000], Yonghong Song wrote:
> > Ah. I checked one or two of those and it looked like it was raw since
> > the beginning. Anyway, it would be nice to Cc: the RT developer while
> > fiddling with something that only concerns RT.
> > 
> > That usage pattern that is mentioned in ac00881f9221, is it true for all
> > data structure algorithms? In bpf_lru_list I was concerned about the
> > list loops. 
> For some skewed hash tables with a lot of elements, yes, it is possible
> that the time to traverse the loop could be a little bit longer...

okay. So all that adds to your max latency which we try to keep low. If
this is debugging only one could argue that it is okay. However if it is
intended to be used in production then we usually try to keep as low as
possible (i.e. have an upper limit).

> > However hashtab and lpm_trie may perform memory allocations
> > while holding the lock and this isn't going to work.
> 
> The memory allocation in these two places uses GFP_ATOMIC which should 
> not sleep/block.

That is not true for -RT usage. GFP_ATOMIC enables the usage of
emergency memory pools (which are in general precious) and won't
schedule() in order to free memory. This doesn't change on -RT.
However, SLUB may call into the page allocator in order to get a fresh
page and this does not work on -RT. This means you can't allocate
memory from a preempt_disable() / local_irq_disable() section which
includes your raw_spin_lock(). It works from spin_lock_irq() because
that one is a sleeping lock on -RT.
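
To make the distinction concrete, here is a minimal sketch (hypothetical lock
variables, not code from the patches) of the two cases described above: on
-RT the spin_lock_irqsave() section stays preemptible because spinlock_t is a
sleeping lock, so an allocation that may call into the page allocator is
still fine there, while the same allocation under a raw_spin_lock() is not.

  #include <linux/slab.h>
  #include <linux/spinlock.h>

  static void rt_allocation_contrast(spinlock_t *slock, raw_spinlock_t *rlock)
  {
      unsigned long flags;
      void *p;

      spin_lock_irqsave(slock, flags);      /* a sleeping lock on -RT */
      p = kmalloc(64, GFP_ATOMIC);          /* allowed: context stays preemptible on -RT */
      kfree(p);
      spin_unlock_irqrestore(slock, flags);

      raw_spin_lock_irqsave(rlock, flags);  /* truly atomic, also on -RT */
      /* kmalloc(..., GFP_ATOMIC) here would be broken on -RT */
      raw_spin_unlock_irqrestore(rlock, flags);
  }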

Sebastian


* Re: [PATCH 1/3] bpf: Use spinlock_t in bpf_lru_list
  2019-04-10 19:44       ` Daniel Borkmann
@ 2019-04-12 16:14         ` Sebastian Andrzej Siewior
  0 siblings, 0 replies; 12+ messages in thread
From: Sebastian Andrzej Siewior @ 2019-04-12 16:14 UTC (permalink / raw)
  To: Daniel Borkmann
  Cc: Yonghong Song, bpf, Alexei Starovoitov, Martin Lau, Song Liu,
	tglx, Steven Rostedt

On 2019-04-10 21:44:22 [+0200], Daniel Borkmann wrote:
> > Ah. I checked one or two of those and it looked like it was raw since
> > the beginning. Anyway, it would be nice to Cc: the RT developer while
> 
> The later ones like the LRU may have just been adapted to use the
> same after the conversion from ac00881f9221, but I presume there might
> unfortunately be little to no testing on RT kernels currently. Hopefully
> we'll get there such that at min bots would yell if something is off
> so it can be fixed.

Thanks. The boot part is what made me look at lpm_trie.

> > fiddling with something that only concerns RT.
> 
> I think that was the case back then, see discussion / Cc list:
> 
> https://lore.kernel.org/netdev/1446243386-26582-1-git-send-email-yang.shi@linaro.org/T/

So there was a discussion and I somehow missed it. Fair enough.

This memory allocation under the lock: is that new or was it not seen
back then?

> > That usage pattern that is mentioned in ac00881f9221, is it true for all
> > data structure algorithms? In bpf_lru_list I was concerned about the
> > list loops. However hashtab and lpm_trie may perform memory allocations
> > while holding the lock and this isn't going to work.
> 
> Do you have some document or guide for such typical patterns and how to
> make them behave better for RT?

Let me put something simple together, and once I have the pieces in
lockdep I hope there will also be a document explaining things in
more detail.
For now: try to stay preemptible.
- nesting spin_lock() -> raw_spin_lock() is correct but
  raw_spin_lock() -> spin_lock() is not (see the sketch after this list).

- interrupt handlers run threaded (as with the "threadirqs" command line option).
  Most code therefore never really disables interrupts. This includes
  spin_lock_irq().
  Therefore local_irq_disable() + spin_lock() != spin_lock_irq()

- preempt_disable(), local_irq_disable() and raw_spin_lock() enter atomic
  context on -RT, which makes scheduling impossible.

- in atomic context

  - memory allocations are not possible (including GFP_ATOMIC).

  - a spin_lock() can neither be acquired nor released.

  - unbounded loops add to the task's max latency and should be avoided.

- the architecture's core code runs with disabled interrupts and the aim is
  to keep this part short. Even hrtimer callbacks are not invoked with
  disabled interrupts.

- core code uses raw_spin_lock() if it needs to protect something. If
  you think you need such a lock, try to measure the worst case with
  cyclictest and see how it behaves. If it is visible, a different
  design that shrinks the atomic section should probably be used in
  order to become more deterministic.

- debugging code often increases latency (lockdep even by a few ms).
  Please document whether a newly introduced unbounded atomic section is
  intended only for debugging.
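
A minimal sketch of the nesting rule from the first bullet (hypothetical
locks, not taken from the patches):

  #include <linux/spinlock.h>

  static DEFINE_SPINLOCK(outer_lock);      /* spinlock_t: a sleeping lock on -RT */
  static DEFINE_RAW_SPINLOCK(inner_lock);  /* raw_spinlock_t: always spins */

  static void nesting_ok(void)
  {
      spin_lock(&outer_lock);
      raw_spin_lock(&inner_lock);  /* raw lock nested inside spinlock_t: fine */
      raw_spin_unlock(&inner_lock);
      spin_unlock(&outer_lock);
  }

  static void nesting_broken(void)
  {
      raw_spin_lock(&inner_lock);  /* atomic context on -RT from here on */
      /* spin_lock(&outer_lock) here would try to sleep on -RT: not allowed */
      raw_spin_unlock(&inner_lock);
  }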
 
> Thanks,
> Daniel

Sebastian


* Re: [PATCH 3/3] bpf: Use spinlock_t in lpm_trie
  2019-04-11 20:36   ` Song Liu
@ 2019-04-12 16:23     ` Sebastian Andrzej Siewior
  0 siblings, 0 replies; 12+ messages in thread
From: Sebastian Andrzej Siewior @ 2019-04-12 16:23 UTC (permalink / raw)
  To: Song Liu
  Cc: bpf, Alexei Starovoitov, Daniel Borkmann, Martin KaFai Lau,
	Song Liu, Yonghong Song, Thomas Gleixner, Steven Rostedt

On 2019-04-11 13:36:38 [-0700], Song Liu wrote:
> Hi,
Hi,

> On Wed, Apr 10, 2019 at 7:31 AM Sebastian Andrzej Siewior
> <bigeasy@linutronix.de> wrote:
> >
> > There is no difference between spinlock_t and raw_spinlock_t for !RT
> > kernels. trie_update_elem() will allocate memory while holding ->lock
> > which is not possible on RT kernels.
> 
> I am new to RT kernel. For !RT kernel, it is OK to hold a lock and call
> malloc with GFP_ATOMIC. Is this different for RT kernel?

Yes, please see
	https://lore.kernel.org/bpf/20190412153819.6sh2b2cwddpgnepq@linutronix.de/

> Thanks,
> Song

Sebastian

