* [PATCH v2 01/33] maple_tree: Be more cautious about dead nodes
From: Suren Baghdasaryan @ 2023-01-27 19:40 UTC
To: akpm
Cc: michel, jglisse, mhocko, vbabka, hannes, mgorman, dave, willy,
liam.howlett, peterz, ldufour, paulmck, mingo, will, luto,
songliubraving, peterx, david, dhowells, hughd, bigeasy,
kent.overstreet, punit.agrawal, lstoakes, peterjung1337,
rientjes, axelrasmussen, joelaf, minchan, rppt, jannh, shakeelb,
tatashin, edumazet, gthelen, gurua, arjunroy, soheil, leewalsh,
posk, linux-mm, linux-arm-kernel, linuxppc-dev, x86,
linux-kernel, kernel-team, surenb, Liam Howlett
From: Liam Howlett <Liam.Howlett@oracle.com>
ma_pivots() and ma_data_end() may be called with a dead node. Ensure
that the node isn't dead before using the returned values.
This is necessary for RCU mode of the maple tree.
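The pattern applied below, in outline (a sketch using the existing
helpers; on a dead node the caller restarts the walk instead of
trusting stale values):

        /* Read first, then re-validate the node before use. */
        pivots = ma_pivots(node, type);         /* may be NULL for a dead node */
        end = ma_data_end(node, type, pivots, mas->max);
        if (unlikely(ma_dead_node(node)))
                return 1;                       /* node died under us: retry */
        /* pivots[] and end now predate any concurrent node death */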
Fixes: 54a611b60590 ("Maple Tree: add new data structure")
Signed-off-by: Liam Howlett <Liam.Howlett@oracle.com>
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
---
lib/maple_tree.c | 52 +++++++++++++++++++++++++++++++++++++++---------
1 file changed, 43 insertions(+), 9 deletions(-)
diff --git a/lib/maple_tree.c b/lib/maple_tree.c
index 5e9703189259..e5eeecd14eee 100644
--- a/lib/maple_tree.c
+++ b/lib/maple_tree.c
@@ -537,6 +537,7 @@ static inline bool ma_dead_node(const struct maple_node *node)
return (parent == node);
}
+
/*
* mte_dead_node() - check if the @enode is dead.
* @enode: The encoded maple node
@@ -618,6 +619,8 @@ static inline unsigned int mas_alloc_req(const struct ma_state *mas)
* @node - the maple node
* @type - the node type
*
+ * In the event of a dead node, this array may be %NULL
+ *
* Return: A pointer to the maple node pivots
*/
static inline unsigned long *ma_pivots(struct maple_node *node,
@@ -1089,8 +1092,11 @@ static int mas_ascend(struct ma_state *mas)
a_type = mas_parent_enum(mas, p_enode);
a_node = mte_parent(p_enode);
a_slot = mte_parent_slot(p_enode);
- pivots = ma_pivots(a_node, a_type);
a_enode = mt_mk_node(a_node, a_type);
+ pivots = ma_pivots(a_node, a_type);
+
+ if (unlikely(ma_dead_node(a_node)))
+ return 1;
if (!set_min && a_slot) {
set_min = true;
@@ -1394,6 +1400,9 @@ static inline unsigned char ma_data_end(struct maple_node *node,
{
unsigned char offset;
+ if (!pivots)
+ return 0;
+
if (type == maple_arange_64)
return ma_meta_end(node, type);
@@ -1429,6 +1438,9 @@ static inline unsigned char mas_data_end(struct ma_state *mas)
return ma_meta_end(node, type);
pivots = ma_pivots(node, type);
+ if (unlikely(ma_dead_node(node)))
+ return 0;
+
offset = mt_pivots[type] - 1;
if (likely(!pivots[offset]))
return ma_meta_end(node, type);
@@ -4498,6 +4510,9 @@ static inline int mas_prev_node(struct ma_state *mas, unsigned long min)
node = mas_mn(mas);
slots = ma_slots(node, mt);
pivots = ma_pivots(node, mt);
+ if (unlikely(ma_dead_node(node)))
+ return 1;
+
mas->max = pivots[offset];
if (offset)
mas->min = pivots[offset - 1] + 1;
@@ -4519,6 +4534,9 @@ static inline int mas_prev_node(struct ma_state *mas, unsigned long min)
slots = ma_slots(node, mt);
pivots = ma_pivots(node, mt);
offset = ma_data_end(node, mt, pivots, mas->max);
+ if (unlikely(ma_dead_node(node)))
+ return 1;
+
if (offset)
mas->min = pivots[offset - 1] + 1;
@@ -4567,6 +4585,7 @@ static inline int mas_next_node(struct ma_state *mas, struct maple_node *node,
struct maple_enode *enode;
int level = 0;
unsigned char offset;
+ unsigned char node_end;
enum maple_type mt;
void __rcu **slots;
@@ -4590,7 +4609,11 @@ static inline int mas_next_node(struct ma_state *mas, struct maple_node *node,
node = mas_mn(mas);
mt = mte_node_type(mas->node);
pivots = ma_pivots(node, mt);
- } while (unlikely(offset == ma_data_end(node, mt, pivots, mas->max)));
+ node_end = ma_data_end(node, mt, pivots, mas->max);
+ if (unlikely(ma_dead_node(node)))
+ return 1;
+
+ } while (unlikely(offset == node_end));
slots = ma_slots(node, mt);
pivot = mas_safe_pivot(mas, pivots, ++offset, mt);
@@ -4606,6 +4629,9 @@ static inline int mas_next_node(struct ma_state *mas, struct maple_node *node,
mt = mte_node_type(mas->node);
slots = ma_slots(node, mt);
pivots = ma_pivots(node, mt);
+ if (unlikely(ma_dead_node(node)))
+ return 1;
+
offset = 0;
pivot = pivots[0];
}
@@ -4652,11 +4678,14 @@ static inline void *mas_next_nentry(struct ma_state *mas,
return NULL;
}
- pivots = ma_pivots(node, type);
slots = ma_slots(node, type);
- mas->index = mas_safe_min(mas, pivots, mas->offset);
+ pivots = ma_pivots(node, type);
count = ma_data_end(node, type, pivots, mas->max);
- if (ma_dead_node(node))
+ if (unlikely(ma_dead_node(node)))
+ return NULL;
+
+ mas->index = mas_safe_min(mas, pivots, mas->offset);
+ if (unlikely(ma_dead_node(node)))
return NULL;
if (mas->index > max)
@@ -4810,6 +4839,11 @@ static inline void *mas_prev_nentry(struct ma_state *mas, unsigned long limit,
slots = ma_slots(mn, mt);
pivots = ma_pivots(mn, mt);
+ if (unlikely(ma_dead_node(mn))) {
+ mas_rewalk(mas, index);
+ goto retry;
+ }
+
if (offset == mt_pivots[mt])
pivot = mas->max;
else
@@ -6624,11 +6658,11 @@ static inline void *mas_first_entry(struct ma_state *mas, struct maple_node *mn,
while (likely(!ma_is_leaf(mt))) {
MT_BUG_ON(mas->tree, mte_dead_node(mas->node));
slots = ma_slots(mn, mt);
- pivots = ma_pivots(mn, mt);
- max = pivots[0];
entry = mas_slot(mas, slots, 0);
+ pivots = ma_pivots(mn, mt);
if (unlikely(ma_dead_node(mn)))
return NULL;
+ max = pivots[0];
mas->node = entry;
mn = mas_mn(mas);
mt = mte_node_type(mas->node);
@@ -6648,13 +6682,13 @@ static inline void *mas_first_entry(struct ma_state *mas, struct maple_node *mn,
if (likely(entry))
return entry;
- pivots = ma_pivots(mn, mt);
- mas->index = pivots[0] + 1;
mas->offset = 1;
entry = mas_slot(mas, slots, 1);
+ pivots = ma_pivots(mn, mt);
if (unlikely(ma_dead_node(mn)))
return NULL;
+ mas->index = pivots[0] + 1;
if (mas->index > limit)
goto none;
--
2.39.1
* [PATCH v2 02/33] maple_tree: Detect dead nodes in mas_start()
From: Suren Baghdasaryan @ 2023-01-27 19:40 UTC
To: akpm
Cc: michel, jglisse, mhocko, vbabka, hannes, mgorman, dave, willy,
liam.howlett, peterz, ldufour, paulmck, mingo, will, luto,
songliubraving, peterx, david, dhowells, hughd, bigeasy,
kent.overstreet, punit.agrawal, lstoakes, peterjung1337,
rientjes, axelrasmussen, joelaf, minchan, rppt, jannh, shakeelb,
tatashin, edumazet, gthelen, gurua, arjunroy, soheil, leewalsh,
posk, linux-mm, linux-arm-kernel, linuxppc-dev, x86,
linux-kernel, kernel-team, surenb, Liam Howlett
From: Liam Howlett <Liam.Howlett@oracle.com>
When initially starting a search, the root node may already be in the
process of being replaced in RCU mode. Detect and restart the walk if
this is the case. This is necessary for RCU mode of the maple tree.
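In outline (a sketch; mas_root() re-reads the tree root under RCU on
each pass, so the loop only repeats while a replacement is in flight):

        retry:
                root = mas_root(mas);
                if (likely(xa_is_node(root))) {
                        mas->node = mte_safe_root(root);
                        if (mte_dead_node(mas->node))
                                goto retry;     /* root replaced: re-read it */
                }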
Fixes: 54a611b60590 ("Maple Tree: add new data structure")
Signed-off-by: Liam Howlett <Liam.Howlett@oracle.com>
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
---
lib/maple_tree.c | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/lib/maple_tree.c b/lib/maple_tree.c
index e5eeecd14eee..482e17a460cb 100644
--- a/lib/maple_tree.c
+++ b/lib/maple_tree.c
@@ -1353,12 +1353,16 @@ static inline struct maple_enode *mas_start(struct ma_state *mas)
mas->max = ULONG_MAX;
mas->depth = 0;
+retry:
root = mas_root(mas);
/* Tree with nodes */
if (likely(xa_is_node(root))) {
mas->depth = 1;
mas->node = mte_safe_root(root);
mas->offset = 0;
+ if (mte_dead_node(mas->node))
+ goto retry;
+
return NULL;
}
--
2.39.1
* [PATCH v2 03/33] maple_tree: Fix freeing of nodes in rcu mode
From: Suren Baghdasaryan @ 2023-01-27 19:40 UTC
To: akpm
Cc: michel, jglisse, mhocko, vbabka, hannes, mgorman, dave, willy,
liam.howlett, peterz, ldufour, paulmck, mingo, will, luto,
songliubraving, peterx, david, dhowells, hughd, bigeasy,
kent.overstreet, punit.agrawal, lstoakes, peterjung1337,
rientjes, axelrasmussen, joelaf, minchan, rppt, jannh, shakeelb,
tatashin, edumazet, gthelen, gurua, arjunroy, soheil, leewalsh,
posk, linux-mm, linux-arm-kernel, linuxppc-dev, x86,
linux-kernel, kernel-team, surenb, Liam Howlett
From: Liam Howlett <Liam.Howlett@oracle.com>
The walk to destroy the nodes was not always setting the node type,
which could result in a destroy method treating stored values as nodes.
Avoid this by setting the correct node types. This is necessary for the
RCU mode of the maple tree.
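The core of the fix, in outline (a sketch of the destroy loop after
this change; the encoded type is stashed in the node itself so the
teardown helpers operate on real nodes):

        node->type = mte_node_type(mas.node);   /* record the type first */
        node->slot_len = mas_dead_leaves(&mas, slots, node->type);
        if (free)
                mt_free_bulk(node->slot_len, slots);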
Fixes: 54a611b60590 ("Maple Tree: add new data structure")
Signed-off-by: Liam Howlett <Liam.Howlett@oracle.com>
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
---
lib/maple_tree.c | 73 ++++++++++++++++++++++++++++++++++++++++--------
1 file changed, 62 insertions(+), 11 deletions(-)
diff --git a/lib/maple_tree.c b/lib/maple_tree.c
index 482e17a460cb..73917dd2c608 100644
--- a/lib/maple_tree.c
+++ b/lib/maple_tree.c
@@ -895,6 +895,44 @@ static inline void ma_set_meta(struct maple_node *mn, enum maple_type mt,
meta->end = end;
}
+/*
+ * mas_clear_meta() - clear the metadata information of a node, if it exists
+ * @mas: The maple state
+ * @mn: The maple node
+ * @mt: The maple node type
+ * @offset: The offset of the highest sub-gap in this node.
+ * @end: The end of the data in this node.
+ */
+static inline void mas_clear_meta(struct ma_state *mas, struct maple_node *mn,
+ enum maple_type mt)
+{
+ struct maple_metadata *meta;
+ unsigned long *pivots;
+ void __rcu **slots;
+ void *next;
+
+ switch (mt) {
+ case maple_range_64:
+ pivots = mn->mr64.pivot;
+ if (unlikely(pivots[MAPLE_RANGE64_SLOTS - 2])) {
+ slots = mn->mr64.slot;
+ next = mas_slot_locked(mas, slots,
+ MAPLE_RANGE64_SLOTS - 1);
+ if (unlikely((mte_to_node(next) && mte_node_type(next))))
+ return; /* The last slot is a node, no metadata */
+ }
+ fallthrough;
+ case maple_arange_64:
+ meta = ma_meta(mn, mt);
+ break;
+ default:
+ return;
+ }
+
+ meta->gap = 0;
+ meta->end = 0;
+}
+
/*
* ma_meta_end() - Get the data end of a node from the metadata
* @mn: The maple node
@@ -5448,20 +5486,22 @@ static inline int mas_rev_alloc(struct ma_state *mas, unsigned long min,
* mas_dead_leaves() - Mark all leaves of a node as dead.
* @mas: The maple state
* @slots: Pointer to the slot array
+ * @type: The maple node type
*
* Must hold the write lock.
*
* Return: The number of leaves marked as dead.
*/
static inline
-unsigned char mas_dead_leaves(struct ma_state *mas, void __rcu **slots)
+unsigned char mas_dead_leaves(struct ma_state *mas, void __rcu **slots,
+ enum maple_type mt)
{
struct maple_node *node;
enum maple_type type;
void *entry;
int offset;
- for (offset = 0; offset < mt_slot_count(mas->node); offset++) {
+ for (offset = 0; offset < mt_slots[mt]; offset++) {
entry = mas_slot_locked(mas, slots, offset);
type = mte_node_type(entry);
node = mte_to_node(entry);
@@ -5480,14 +5520,13 @@ unsigned char mas_dead_leaves(struct ma_state *mas, void __rcu **slots)
static void __rcu **mas_dead_walk(struct ma_state *mas, unsigned char offset)
{
- struct maple_node *node, *next;
+ struct maple_node *next;
void __rcu **slots = NULL;
next = mas_mn(mas);
do {
- mas->node = ma_enode_ptr(next);
- node = mas_mn(mas);
- slots = ma_slots(node, node->type);
+ mas->node = mt_mk_node(next, next->type);
+ slots = ma_slots(next, next->type);
next = mas_slot_locked(mas, slots, offset);
offset = 0;
} while (!ma_is_leaf(next->type));
@@ -5551,11 +5590,14 @@ static inline void __rcu **mas_destroy_descend(struct ma_state *mas,
node = mas_mn(mas);
slots = ma_slots(node, mte_node_type(mas->node));
next = mas_slot_locked(mas, slots, 0);
- if ((mte_dead_node(next)))
+ if ((mte_dead_node(next))) {
+ mte_to_node(next)->type = mte_node_type(next);
next = mas_slot_locked(mas, slots, 1);
+ }
mte_set_node_dead(mas->node);
node->type = mte_node_type(mas->node);
+ mas_clear_meta(mas, node, node->type);
node->piv_parent = prev;
node->parent_slot = offset;
offset = 0;
@@ -5575,13 +5617,18 @@ static void mt_destroy_walk(struct maple_enode *enode, unsigned char ma_flags,
MA_STATE(mas, &mt, 0, 0);
- if (mte_is_leaf(enode))
+ mas.node = enode;
+ if (mte_is_leaf(enode)) {
+ node->type = mte_node_type(enode);
goto free_leaf;
+ }
+ ma_flags &= ~MT_FLAGS_LOCK_MASK;
mt_init_flags(&mt, ma_flags);
mas_lock(&mas);
- mas.node = start = enode;
+ mte_to_node(enode)->ma_flags = ma_flags;
+ start = enode;
slots = mas_destroy_descend(&mas, start, 0);
node = mas_mn(&mas);
do {
@@ -5589,7 +5636,8 @@ static void mt_destroy_walk(struct maple_enode *enode, unsigned char ma_flags,
unsigned char offset;
struct maple_enode *parent, *tmp;
- node->slot_len = mas_dead_leaves(&mas, slots);
+ node->type = mte_node_type(mas.node);
+ node->slot_len = mas_dead_leaves(&mas, slots, node->type);
if (free)
mt_free_bulk(node->slot_len, slots);
offset = node->parent_slot + 1;
@@ -5613,7 +5661,8 @@ static void mt_destroy_walk(struct maple_enode *enode, unsigned char ma_flags,
} while (start != mas.node);
node = mas_mn(&mas);
- node->slot_len = mas_dead_leaves(&mas, slots);
+ node->type = mte_node_type(mas.node);
+ node->slot_len = mas_dead_leaves(&mas, slots, node->type);
if (free)
mt_free_bulk(node->slot_len, slots);
@@ -5623,6 +5672,8 @@ static void mt_destroy_walk(struct maple_enode *enode, unsigned char ma_flags,
free_leaf:
if (free)
mt_free_rcu(&node->rcu);
+ else
+ mas_clear_meta(&mas, node, node->type);
}
/*
--
2.39.1
* [PATCH v2 04/33] maple_tree: remove extra smp_wmb() from mas_dead_leaves()
From: Suren Baghdasaryan @ 2023-01-27 19:40 UTC
To: akpm
Cc: michel, jglisse, mhocko, vbabka, hannes, mgorman, dave, willy,
liam.howlett, peterz, ldufour, paulmck, mingo, will, luto,
songliubraving, peterx, david, dhowells, hughd, bigeasy,
kent.overstreet, punit.agrawal, lstoakes, peterjung1337,
rientjes, axelrasmussen, joelaf, minchan, rppt, jannh, shakeelb,
tatashin, edumazet, gthelen, gurua, arjunroy, soheil, leewalsh,
posk, linux-mm, linux-arm-kernel, linuxppc-dev, x86,
linux-kernel, kernel-team, surenb, Liam Howlett
From: Liam Howlett <Liam.Howlett@oracle.com>
mte_set_node_dead(), which is called before this smp_wmb(), already
issues its own smp_wmb(), so the extra barrier is not needed. This is
an optimization for the RCU mode of the maple tree.
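For reference, the helper already issues the barrier itself; it reads
roughly as follows (from lib/maple_tree.c):

        static inline void mte_set_node_dead(struct maple_enode *mn)
        {
                mte_to_node(mn)->parent = ma_parent_ptr(mte_to_node(mn));
                smp_wmb(); /* Needed for RCU */
        }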
Fixes: 54a611b60590 ("Maple Tree: add new data structure")
Signed-off-by: Liam Howlett <Liam.Howlett@oracle.com>
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
---
lib/maple_tree.c | 1 -
1 file changed, 1 deletion(-)
diff --git a/lib/maple_tree.c b/lib/maple_tree.c
index 73917dd2c608..75cce2d4d5da 100644
--- a/lib/maple_tree.c
+++ b/lib/maple_tree.c
@@ -5510,7 +5510,6 @@ unsigned char mas_dead_leaves(struct ma_state *mas, void __rcu **slots,
break;
mte_set_node_dead(entry);
- smp_wmb(); /* Needed for RCU */
node->type = type;
rcu_assign_pointer(slots[offset], node);
}
--
2.39.1
* [PATCH v2 05/33] maple_tree: Fix write memory barrier of nodes once dead for RCU mode
From: Suren Baghdasaryan @ 2023-01-27 19:40 UTC
To: akpm
Cc: michel, jglisse, mhocko, vbabka, hannes, mgorman, dave, willy,
liam.howlett, peterz, ldufour, paulmck, mingo, will, luto,
songliubraving, peterx, david, dhowells, hughd, bigeasy,
kent.overstreet, punit.agrawal, lstoakes, peterjung1337,
rientjes, axelrasmussen, joelaf, minchan, rppt, jannh, shakeelb,
tatashin, edumazet, gthelen, gurua, arjunroy, soheil, leewalsh,
posk, linux-mm, linux-arm-kernel, linuxppc-dev, x86,
linux-kernel, kernel-team, surenb, Liam R. Howlett
From: "Liam R. Howlett" <Liam.Howlett@oracle.com>
During the development of the maple tree, the strategy of freeing
multiple nodes changed and, in the process, the pivots were reused to
store pointers to dead nodes. To ensure the readers see accurate
pivots, the writers need to mark the nodes as dead and call smp_wmb() to
ensure any readers can identify the node as dead before using the pivot
values.
There were two places where the old method of marking the node as dead
without smp_wmb() was still being used, which resulted in RCU readers
seeing the wrong pivot value before seeing that the node was dead. Fix
this race condition by using mte_set_node_dead(), which has the
smp_wmb() call, to ensure the race is closed.
Add a WARN_ON() to ma_free_rcu() to check that all nodes being freed
are marked as dead, ensuring there are no call paths besides the two
updated ones.
This is necessary for the RCU mode of the maple tree.
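The writer-side ordering, in outline (a sketch; readers re-check
ma_dead_node() after reading pivots, see the following patch):

        /* Publish "dead" before the pivots can be reused: */
        mte_set_node_dead(old_enode);   /* parent = self, then smp_wmb() */
        mas_free(mas, old_enode);       /* pivots may now carry free-list data */

        /* ma_free_rcu() now asserts the caller marked the node first: */
        static void ma_free_rcu(struct maple_node *node)
        {
                WARN_ON(node->parent != ma_parent_ptr(node));
                call_rcu(&node->rcu, mt_free_rcu);
        }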
Fixes: 54a611b60590 ("Maple Tree: add new data structure")
Signed-off-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
---
lib/maple_tree.c | 7 +++++--
tools/testing/radix-tree/maple.c | 16 ++++++++++++++++
2 files changed, 21 insertions(+), 2 deletions(-)
diff --git a/lib/maple_tree.c b/lib/maple_tree.c
index 75cce2d4d5da..49e399e8afaa 100644
--- a/lib/maple_tree.c
+++ b/lib/maple_tree.c
@@ -178,7 +178,7 @@ static void mt_free_rcu(struct rcu_head *head)
*/
static void ma_free_rcu(struct maple_node *node)
{
- node->parent = ma_parent_ptr(node);
+ WARN_ON(node->parent != ma_parent_ptr(node));
call_rcu(&node->rcu, mt_free_rcu);
}
@@ -1771,8 +1771,10 @@ static inline void mas_replace(struct ma_state *mas, bool advanced)
rcu_assign_pointer(slots[offset], mas->node);
}
- if (!advanced)
+ if (!advanced) {
+ mte_set_node_dead(old_enode);
mas_free(mas, old_enode);
+ }
}
/*
@@ -4211,6 +4213,7 @@ static inline bool mas_wr_node_store(struct ma_wr_state *wr_mas)
done:
mas_leaf_set_meta(mas, newnode, dst_pivots, maple_leaf_64, new_end);
if (in_rcu) {
+ mte_set_node_dead(mas->node);
mas->node = mt_mk_node(newnode, wr_mas->type);
mas_replace(mas, false);
} else {
diff --git a/tools/testing/radix-tree/maple.c b/tools/testing/radix-tree/maple.c
index 958ee9bdb316..4c89ff333f6f 100644
--- a/tools/testing/radix-tree/maple.c
+++ b/tools/testing/radix-tree/maple.c
@@ -108,6 +108,7 @@ static noinline void check_new_node(struct maple_tree *mt)
MT_BUG_ON(mt, mn->slot[1] != NULL);
MT_BUG_ON(mt, mas_allocated(&mas) != 0);
+ mn->parent = ma_parent_ptr(mn);
ma_free_rcu(mn);
mas.node = MAS_START;
mas_nomem(&mas, GFP_KERNEL);
@@ -160,6 +161,7 @@ static noinline void check_new_node(struct maple_tree *mt)
MT_BUG_ON(mt, mas_allocated(&mas) != i);
MT_BUG_ON(mt, !mn);
MT_BUG_ON(mt, not_empty(mn));
+ mn->parent = ma_parent_ptr(mn);
ma_free_rcu(mn);
}
@@ -192,6 +194,7 @@ static noinline void check_new_node(struct maple_tree *mt)
MT_BUG_ON(mt, not_empty(mn));
MT_BUG_ON(mt, mas_allocated(&mas) != i - 1);
MT_BUG_ON(mt, !mn);
+ mn->parent = ma_parent_ptr(mn);
ma_free_rcu(mn);
}
@@ -210,6 +213,7 @@ static noinline void check_new_node(struct maple_tree *mt)
mn = mas_pop_node(&mas);
MT_BUG_ON(mt, not_empty(mn));
MT_BUG_ON(mt, mas_allocated(&mas) != j - 1);
+ mn->parent = ma_parent_ptr(mn);
ma_free_rcu(mn);
}
MT_BUG_ON(mt, mas_allocated(&mas) != 0);
@@ -233,6 +237,7 @@ static noinline void check_new_node(struct maple_tree *mt)
MT_BUG_ON(mt, mas_allocated(&mas) != i - j);
mn = mas_pop_node(&mas);
MT_BUG_ON(mt, not_empty(mn));
+ mn->parent = ma_parent_ptr(mn);
ma_free_rcu(mn);
MT_BUG_ON(mt, mas_allocated(&mas) != i - j - 1);
}
@@ -269,6 +274,7 @@ static noinline void check_new_node(struct maple_tree *mt)
mn = mas_pop_node(&mas); /* get the next node. */
MT_BUG_ON(mt, mn == NULL);
MT_BUG_ON(mt, not_empty(mn));
+ mn->parent = ma_parent_ptr(mn);
ma_free_rcu(mn);
}
MT_BUG_ON(mt, mas_allocated(&mas) != 0);
@@ -294,6 +300,7 @@ static noinline void check_new_node(struct maple_tree *mt)
mn = mas_pop_node(&mas2); /* get the next node. */
MT_BUG_ON(mt, mn == NULL);
MT_BUG_ON(mt, not_empty(mn));
+ mn->parent = ma_parent_ptr(mn);
ma_free_rcu(mn);
}
MT_BUG_ON(mt, mas_allocated(&mas2) != 0);
@@ -334,10 +341,12 @@ static noinline void check_new_node(struct maple_tree *mt)
MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS + 2);
mn = mas_pop_node(&mas);
MT_BUG_ON(mt, not_empty(mn));
+ mn->parent = ma_parent_ptr(mn);
ma_free_rcu(mn);
for (i = 1; i <= MAPLE_ALLOC_SLOTS + 1; i++) {
mn = mas_pop_node(&mas);
MT_BUG_ON(mt, not_empty(mn));
+ mn->parent = ma_parent_ptr(mn);
ma_free_rcu(mn);
}
MT_BUG_ON(mt, mas_allocated(&mas) != 0);
@@ -375,6 +384,7 @@ static noinline void check_new_node(struct maple_tree *mt)
mas_node_count(&mas, i); /* Request */
mas_nomem(&mas, GFP_KERNEL); /* Fill request */
mn = mas_pop_node(&mas); /* get the next node. */
+ mn->parent = ma_parent_ptr(mn);
ma_free_rcu(mn);
mas_destroy(&mas);
@@ -382,10 +392,13 @@ static noinline void check_new_node(struct maple_tree *mt)
mas_node_count(&mas, i); /* Request */
mas_nomem(&mas, GFP_KERNEL); /* Fill request */
mn = mas_pop_node(&mas); /* get the next node. */
+ mn->parent = ma_parent_ptr(mn);
ma_free_rcu(mn);
mn = mas_pop_node(&mas); /* get the next node. */
+ mn->parent = ma_parent_ptr(mn);
ma_free_rcu(mn);
mn = mas_pop_node(&mas); /* get the next node. */
+ mn->parent = ma_parent_ptr(mn);
ma_free_rcu(mn);
mas_destroy(&mas);
}
@@ -35369,6 +35382,7 @@ static noinline void check_prealloc(struct maple_tree *mt)
MT_BUG_ON(mt, allocated != 1 + height * 3);
mn = mas_pop_node(&mas);
MT_BUG_ON(mt, mas_allocated(&mas) != allocated - 1);
+ mn->parent = ma_parent_ptr(mn);
ma_free_rcu(mn);
MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0);
mas_destroy(&mas);
@@ -35386,6 +35400,7 @@ static noinline void check_prealloc(struct maple_tree *mt)
mas_destroy(&mas);
allocated = mas_allocated(&mas);
MT_BUG_ON(mt, allocated != 0);
+ mn->parent = ma_parent_ptr(mn);
ma_free_rcu(mn);
MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0);
@@ -35756,6 +35771,7 @@ void farmer_tests(void)
tree.ma_root = mt_mk_node(node, maple_leaf_64);
mt_dump(&tree);
+ node->parent = ma_parent_ptr(node);
ma_free_rcu(node);
/* Check things that will make lockdep angry */
--
2.39.1
* [PATCH v2 06/33] maple_tree: Add smp_rmb() to dead node detection
From: Suren Baghdasaryan @ 2023-01-27 19:40 UTC
To: akpm
Cc: michel, jglisse, mhocko, vbabka, hannes, mgorman, dave, willy,
liam.howlett, peterz, ldufour, paulmck, mingo, will, luto,
songliubraving, peterx, david, dhowells, hughd, bigeasy,
kent.overstreet, punit.agrawal, lstoakes, peterjung1337,
rientjes, axelrasmussen, joelaf, minchan, rppt, jannh, shakeelb,
tatashin, edumazet, gthelen, gurua, arjunroy, soheil, leewalsh,
posk, linux-mm, linux-arm-kernel, linuxppc-dev, x86,
linux-kernel, kernel-team, surenb, Liam R. Howlett
From: "Liam R. Howlett" <Liam.Howlett@oracle.com>
Add an smp_rmb() before reading the parent pointer to ensure that
anything read from the node prior to the parent pointer hasn't been
reordered ahead of this check.
This is necessary for RCU mode.
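The barrier pairs with the smp_wmb() issued by mte_set_node_dead() on
the writer side; in outline (a sketch):

        /* Writer, marking a node dead: */
        node->parent = ma_parent_ptr(node);     /* parent now points to self */
        smp_wmb();
        /* ...node contents may be reused afterwards... */

        /* Reader, e.g. ma_dead_node(): */
        /* ...reads of pivots/slots above... */
        smp_rmb();                              /* order those reads first */
        parent = (void *)((unsigned long)node->parent & ~MAPLE_NODE_MASK);
        if (parent == node)
                return true;    /* dead: caller discards the earlier reads */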
Fixes: 54a611b60590 ("Maple Tree: add new data structure")
Signed-off-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
---
lib/maple_tree.c | 8 ++++++--
1 file changed, 6 insertions(+), 2 deletions(-)
diff --git a/lib/maple_tree.c b/lib/maple_tree.c
index 49e399e8afaa..859303d2da90 100644
--- a/lib/maple_tree.c
+++ b/lib/maple_tree.c
@@ -532,9 +532,11 @@ static inline struct maple_node *mte_parent(const struct maple_enode *enode)
*/
static inline bool ma_dead_node(const struct maple_node *node)
{
- struct maple_node *parent = (void *)((unsigned long)
- node->parent & ~MAPLE_NODE_MASK);
+ struct maple_node *parent;
+ /* Do not reorder reads from the node prior to the parent check */
+ smp_rmb();
+ parent = (void *)((unsigned long) node->parent & ~MAPLE_NODE_MASK);
return (parent == node);
}
@@ -549,6 +551,8 @@ static inline bool mte_dead_node(const struct maple_enode *enode)
struct maple_node *parent, *node;
node = mte_to_node(enode);
+ /* Do not reorder reads from the node prior to the parent check */
+ smp_rmb();
parent = mte_parent(enode);
return (parent == node);
}
--
2.39.1
* [PATCH v2 07/33] mm: Enable maple tree RCU mode by default.
From: Suren Baghdasaryan @ 2023-01-27 19:40 UTC
To: akpm
Cc: michel, jglisse, mhocko, vbabka, hannes, mgorman, dave, willy,
liam.howlett, peterz, ldufour, paulmck, mingo, will, luto,
songliubraving, peterx, david, dhowells, hughd, bigeasy,
kent.overstreet, punit.agrawal, lstoakes, peterjung1337,
rientjes, axelrasmussen, joelaf, minchan, rppt, jannh, shakeelb,
tatashin, edumazet, gthelen, gurua, arjunroy, soheil, leewalsh,
posk, linux-mm, linux-arm-kernel, linuxppc-dev, x86,
linux-kernel, kernel-team, surenb, Liam R. Howlett
From: "Liam R. Howlett" <Liam.Howlett@oracle.com>
Use the maple tree in RCU mode for VMA tracking. This is necessary for
the use of per-VMA locking. RCU mode is enabled by default but disabled
when exiting an mm and for the new tree during a fork.
Also enable RCU for the tree used in munmap operations to ensure the
nodes remain valid for readers.
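The fork-time window, in outline (a sketch of the dup_mmap() change;
the new tree has no readers yet, so its nodes can be freed immediately):

        mt_clear_in_rcu(vmi.mas.tree);          /* new tree: no RCU readers */
        for_each_vma(old_vmi, mpnt) {
                /* ...copy each VMA into the new tree... */
        }
        if (!retval)
                mt_set_in_rcu(vmi.mas.tree);    /* tree complete: RCU mode on */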
Signed-off-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
---
include/linux/mm_types.h | 3 ++-
kernel/fork.c | 3 +++
mm/mmap.c | 4 +++-
3 files changed, 8 insertions(+), 2 deletions(-)
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index da983aedb741..8410c3052148 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -768,7 +768,8 @@ struct mm_struct {
unsigned long cpu_bitmap[];
};
-#define MM_MT_FLAGS (MT_FLAGS_ALLOC_RANGE | MT_FLAGS_LOCK_EXTERN)
+#define MM_MT_FLAGS (MT_FLAGS_ALLOC_RANGE | MT_FLAGS_LOCK_EXTERN | \
+ MT_FLAGS_USE_RCU)
extern struct mm_struct init_mm;
/* Pointer magic because the dynamic array size confuses some compilers. */
diff --git a/kernel/fork.c b/kernel/fork.c
index 5e3029ea8e1e..5f23d5e03362 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -617,6 +617,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
if (retval)
goto out;
+ mt_clear_in_rcu(vmi.mas.tree);
for_each_vma(old_vmi, mpnt) {
struct file *file;
@@ -700,6 +701,8 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
retval = arch_dup_mmap(oldmm, mm);
loop_out:
vma_iter_free(&vmi);
+ if (!retval)
+ mt_set_in_rcu(vmi.mas.tree);
out:
mmap_write_unlock(mm);
flush_tlb_mm(oldmm);
diff --git a/mm/mmap.c b/mm/mmap.c
index 9d0dadb55e9a..9efe13d36df7 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2277,7 +2277,8 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
int count = 0;
int error = -ENOMEM;
MA_STATE(mas_detach, &mt_detach, 0, 0);
- mt_init_flags(&mt_detach, MT_FLAGS_LOCK_EXTERN);
+ mt_init_flags(&mt_detach, vmi->mas.tree->ma_flags &
+ (MT_FLAGS_LOCK_MASK | MT_FLAGS_USE_RCU));
mt_set_external_lock(&mt_detach, &mm->mmap_lock);
/*
@@ -3042,6 +3043,7 @@ void exit_mmap(struct mm_struct *mm)
*/
set_bit(MMF_OOM_SKIP, &mm->flags);
mmap_write_lock(mm);
+ mt_clear_in_rcu(&mm->mm_mt);
free_pgtables(&tlb, &mm->mm_mt, vma, FIRST_USER_ADDRESS,
USER_PGTABLES_CEILING);
tlb_finish_mmu(&tlb);
--
2.39.1
* [PATCH v2 08/33] mm: introduce CONFIG_PER_VMA_LOCK
From: Suren Baghdasaryan @ 2023-01-27 19:40 UTC
To: akpm
Cc: michel, jglisse, mhocko, vbabka, hannes, mgorman, dave, willy,
liam.howlett, peterz, ldufour, paulmck, mingo, will, luto,
songliubraving, peterx, david, dhowells, hughd, bigeasy,
kent.overstreet, punit.agrawal, lstoakes, peterjung1337,
rientjes, axelrasmussen, joelaf, minchan, rppt, jannh, shakeelb,
tatashin, edumazet, gthelen, gurua, arjunroy, soheil, leewalsh,
posk, linux-mm, linux-arm-kernel, linuxppc-dev, x86,
linux-kernel, kernel-team, surenb
This configuration variable will be used to build the support for VMA
locking during page fault handling.
This is enabled on supported architectures with SMP and MMU set.
The architecture support is needed since the page fault handler is called
from the architecture's page faulting code, which needs modifications to
handle faults under the VMA lock.
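An architecture opts in from its own Kconfig (illustrative; the
arch-specific patches later in this series do this for the supported
architectures):

        config X86_64
                ...
                select ARCH_SUPPORTS_PER_VMA_LOCK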
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
---
mm/Kconfig | 12 ++++++++++++
1 file changed, 12 insertions(+)
diff --git a/mm/Kconfig b/mm/Kconfig
index ca98b2072df5..2e4a7e61768a 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -1211,6 +1211,18 @@ config LRU_GEN_STATS
This option has a per-memcg and per-node memory overhead.
# }
+config ARCH_SUPPORTS_PER_VMA_LOCK
+ def_bool n
+
+config PER_VMA_LOCK
+ def_bool y
+ depends on ARCH_SUPPORTS_PER_VMA_LOCK && MMU && SMP
+ help
+ Allow per-vma locking during page fault handling.
+
+ This feature allows locking each virtual memory area separately when
+ handling page faults instead of taking mmap_lock.
+
source "mm/damon/Kconfig"
endmenu
--
2.39.1
* [PATCH v2 09/33] mm: rcu safe VMA freeing
From: Suren Baghdasaryan @ 2023-01-27 19:40 UTC
To: akpm
Cc: michel, jglisse, mhocko, vbabka, hannes, mgorman, dave, willy,
liam.howlett, peterz, ldufour, paulmck, mingo, will, luto,
songliubraving, peterx, david, dhowells, hughd, bigeasy,
kent.overstreet, punit.agrawal, lstoakes, peterjung1337,
rientjes, axelrasmussen, joelaf, minchan, rppt, jannh, shakeelb,
tatashin, edumazet, gthelen, gurua, arjunroy, soheil, leewalsh,
posk, linux-mm, linux-arm-kernel, linuxppc-dev, x86,
linux-kernel, kernel-team, surenb
From: Michel Lespinasse <michel@lespinasse.org>
This prepares for page fault handling under the VMA lock, looking up VMAs
under the protection of an RCU read lock instead of the usual mmap read lock.
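The intended reader-side pattern, in outline (a sketch; the per-VMA
read lock and the lookup caller are introduced by later patches in this
series):

        rcu_read_lock();
        vma = mas_walk(&mas);                   /* RCU-safe maple tree lookup */
        if (vma && !vma_start_read(vma))        /* added in a later patch */
                vma = NULL;                     /* contended: use mmap_lock */
        rcu_read_unlock();
        /* a non-NULL vma is read-locked and cannot be freed under us */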
Signed-off-by: Michel Lespinasse <michel@lespinasse.org>
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
---
include/linux/mm_types.h | 13 ++++++++++---
kernel/fork.c | 20 +++++++++++++++++++-
2 files changed, 29 insertions(+), 4 deletions(-)
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 8410c3052148..62e413f84011 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -480,9 +480,16 @@ struct anon_vma_name {
struct vm_area_struct {
/* The first cache line has the info for VMA tree walking. */
- unsigned long vm_start; /* Our start address within vm_mm. */
- unsigned long vm_end; /* The first byte after our end address
- within vm_mm. */
+ union {
+ struct {
+ /* VMA covers [vm_start; vm_end) addresses within mm */
+ unsigned long vm_start;
+ unsigned long vm_end;
+ };
+#ifdef CONFIG_PER_VMA_LOCK
+ struct rcu_head vm_rcu; /* Used for deferred freeing. */
+#endif
+ };
struct mm_struct *vm_mm; /* The address space we belong to. */
diff --git a/kernel/fork.c b/kernel/fork.c
index 5f23d5e03362..314d51eb91da 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -479,12 +479,30 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
return new;
}
-void vm_area_free(struct vm_area_struct *vma)
+static void __vm_area_free(struct vm_area_struct *vma)
{
free_anon_vma_name(vma);
kmem_cache_free(vm_area_cachep, vma);
}
+#ifdef CONFIG_PER_VMA_LOCK
+static void vm_area_free_rcu_cb(struct rcu_head *head)
+{
+ struct vm_area_struct *vma = container_of(head, struct vm_area_struct,
+ vm_rcu);
+ __vm_area_free(vma);
+}
+#endif
+
+void vm_area_free(struct vm_area_struct *vma)
+{
+#ifdef CONFIG_PER_VMA_LOCK
+ call_rcu(&vma->vm_rcu, vm_area_free_rcu_cb);
+#else
+ __vm_area_free(vma);
+#endif
+}
+
static void account_kernel_stack(struct task_struct *tsk, int account)
{
if (IS_ENABLED(CONFIG_VMAP_STACK)) {
--
2.39.1
* [PATCH v2 10/33] mm: move mmap_lock assert function definitions
From: Suren Baghdasaryan @ 2023-01-27 19:40 UTC
To: akpm
Cc: michel, jglisse, mhocko, vbabka, hannes, mgorman, dave, willy,
liam.howlett, peterz, ldufour, paulmck, mingo, will, luto,
songliubraving, peterx, david, dhowells, hughd, bigeasy,
kent.overstreet, punit.agrawal, lstoakes, peterjung1337,
rientjes, axelrasmussen, joelaf, minchan, rppt, jannh, shakeelb,
tatashin, edumazet, gthelen, gurua, arjunroy, soheil, leewalsh,
posk, linux-mm, linux-arm-kernel, linuxppc-dev, x86,
linux-kernel, kernel-team, surenb
Move mmap_lock assert function definitions up so that they can be used
by other mmap_lock routines.
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
---
include/linux/mmap_lock.h | 24 ++++++++++++------------
1 file changed, 12 insertions(+), 12 deletions(-)
diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h
index 96e113e23d04..e49ba91bb1f0 100644
--- a/include/linux/mmap_lock.h
+++ b/include/linux/mmap_lock.h
@@ -60,6 +60,18 @@ static inline void __mmap_lock_trace_released(struct mm_struct *mm, bool write)
#endif /* CONFIG_TRACING */
+static inline void mmap_assert_locked(struct mm_struct *mm)
+{
+ lockdep_assert_held(&mm->mmap_lock);
+ VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_lock), mm);
+}
+
+static inline void mmap_assert_write_locked(struct mm_struct *mm)
+{
+ lockdep_assert_held_write(&mm->mmap_lock);
+ VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_lock), mm);
+}
+
static inline void mmap_init_lock(struct mm_struct *mm)
{
init_rwsem(&mm->mmap_lock);
@@ -150,18 +162,6 @@ static inline void mmap_read_unlock_non_owner(struct mm_struct *mm)
up_read_non_owner(&mm->mmap_lock);
}
-static inline void mmap_assert_locked(struct mm_struct *mm)
-{
- lockdep_assert_held(&mm->mmap_lock);
- VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_lock), mm);
-}
-
-static inline void mmap_assert_write_locked(struct mm_struct *mm)
-{
- lockdep_assert_held_write(&mm->mmap_lock);
- VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_lock), mm);
-}
-
static inline int mmap_lock_is_contended(struct mm_struct *mm)
{
return rwsem_is_contended(&mm->mmap_lock);
--
2.39.1
* [PATCH v2 11/33] mm: add per-VMA lock and helper functions to control it
From: Suren Baghdasaryan @ 2023-01-27 19:40 UTC
To: akpm
Cc: michel, jglisse, mhocko, vbabka, hannes, mgorman, dave, willy,
liam.howlett, peterz, ldufour, paulmck, mingo, will, luto,
songliubraving, peterx, david, dhowells, hughd, bigeasy,
kent.overstreet, punit.agrawal, lstoakes, peterjung1337,
rientjes, axelrasmussen, joelaf, minchan, rppt, jannh, shakeelb,
tatashin, edumazet, gthelen, gurua, arjunroy, soheil, leewalsh,
posk, linux-mm, linux-arm-kernel, linuxppc-dev, x86,
linux-kernel, kernel-team, surenb
Introduce per-VMA locking. The lock implementation relies on per-vma
and per-mm sequence counters to note exclusive locking (a usage sketch
follows the list):
- read lock - (implemented by vma_start_read) requires the vma
(vm_lock_seq) and mm (mm_lock_seq) sequence counters to differ.
If they match then there must be a vma exclusive lock held somewhere.
- read unlock - (implemented by vma_end_read) is a trivial vma->lock
unlock.
- write lock - (vma_start_write) requires the mmap_lock to be held
exclusively and the current mm counter is assigned to the vma counter.
This will allow multiple vmas to be locked under a single mmap_lock
write lock (e.g. during vma merging). The vma counter is modified
under exclusive vma lock.
- write unlock - (vma_end_write_all) is a batch release of all vma
locks held. It doesn't pair with a specific vma_start_write! It is
done before exclusive mmap_lock is released by incrementing mm
sequence counter (mm_lock_seq).
- write downgrade - if the mmap_lock is downgraded to the read lock, all
vma write locks are released as well (effectively the same as write
unlock).
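A usage sketch (reader and writer sides; the reader-side caller is
added by later patches in this series):

        /* Reader, e.g. a page fault fast path: */
        if (!vma_start_read(vma))
                goto fallback;          /* write-locked or raced: use mmap_lock */
        /* ...handle the fault against a stable VMA... */
        vma_end_read(vma);

        /* Writer, mmap_lock held for write: */
        vma_start_write(vma);           /* waits out readers, marks the VMA */
        /* ...modify the VMA... */
        mmap_write_unlock(mm);          /* vma_end_write_all() bumps mm_lock_seq */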
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
---
include/linux/mm.h | 82 +++++++++++++++++++++++++++++++++++++++
include/linux/mm_types.h | 8 ++++
include/linux/mmap_lock.h | 13 +++++++
kernel/fork.c | 4 ++
mm/init-mm.c | 3 ++
5 files changed, 110 insertions(+)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index dd295c020e85..fee08e8fdce7 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -617,6 +617,87 @@ struct vm_operations_struct {
unsigned long addr);
};
+#ifdef CONFIG_PER_VMA_LOCK
+static inline void vma_init_lock(struct vm_area_struct *vma)
+{
+ init_rwsem(&vma->lock);
+ vma->vm_lock_seq = -1;
+}
+
+/*
+ * Try to read-lock a vma. The function is allowed to occasionally yield false
+ * locked result to avoid performance overhead, in which case we fall back to
+ * using mmap_lock. The function should never yield false unlocked result.
+ */
+static inline bool vma_start_read(struct vm_area_struct *vma)
+{
+ /* Check before locking. A race might cause false locked result. */
+ if (vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq))
+ return false;
+
+ if (unlikely(down_read_trylock(&vma->lock) == 0))
+ return false;
+
+ /*
+ * Overflow might produce false locked result.
+ * False unlocked result is impossible because we modify and check
+ * vma->vm_lock_seq under vma->lock protection and mm->mm_lock_seq
+ * modification invalidates all existing locks.
+ */
+ if (unlikely(vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq))) {
+ up_read(&vma->lock);
+ return false;
+ }
+ return true;
+}
+
+static inline void vma_end_read(struct vm_area_struct *vma)
+{
+ rcu_read_lock(); /* keeps vma alive till the end of up_read */
+ up_read(&vma->lock);
+ rcu_read_unlock();
+}
+
+static inline void vma_start_write(struct vm_area_struct *vma)
+{
+ int mm_lock_seq;
+
+ mmap_assert_write_locked(vma->vm_mm);
+
+ /*
+ * current task is holding mmap_write_lock, both vma->vm_lock_seq and
+ * mm->mm_lock_seq can't be concurrently modified.
+ */
+ mm_lock_seq = READ_ONCE(vma->vm_mm->mm_lock_seq);
+ if (vma->vm_lock_seq == mm_lock_seq)
+ return;
+
+ down_write(&vma->lock);
+ vma->vm_lock_seq = mm_lock_seq;
+ up_write(&vma->lock);
+}
+
+static inline void vma_assert_write_locked(struct vm_area_struct *vma)
+{
+ mmap_assert_write_locked(vma->vm_mm);
+ /*
+ * current task is holding mmap_write_lock, both vma->vm_lock_seq and
+ * mm->mm_lock_seq can't be concurrently modified.
+ */
+ VM_BUG_ON_VMA(vma->vm_lock_seq != READ_ONCE(vma->vm_mm->mm_lock_seq), vma);
+}
+
+#else /* CONFIG_PER_VMA_LOCK */
+
+static inline void vma_init_lock(struct vm_area_struct *vma) {}
+static inline bool vma_start_read(struct vm_area_struct *vma)
+ { return false; }
+static inline void vma_end_read(struct vm_area_struct *vma) {}
+static inline void vma_start_write(struct vm_area_struct *vma) {}
+static inline void vma_assert_write_locked(struct vm_area_struct *vma) {}
+
+#endif /* CONFIG_PER_VMA_LOCK */
+
static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
{
static const struct vm_operations_struct dummy_vm_ops = {};
@@ -625,6 +706,7 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
vma->vm_mm = mm;
vma->vm_ops = &dummy_vm_ops;
INIT_LIST_HEAD(&vma->anon_vma_chain);
+ vma_init_lock(vma);
}
/* Use when VMA is not part of the VMA tree and needs no locking */
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 62e413f84011..88619c6a29a3 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -508,6 +508,11 @@ struct vm_area_struct {
vm_flags_t __private __vm_flags;
};
+#ifdef CONFIG_PER_VMA_LOCK
+ int vm_lock_seq;
+ struct rw_semaphore lock;
+#endif
+
/*
* For areas with an address space and backing store,
* linkage into the address_space->i_mmap interval tree.
@@ -633,6 +638,9 @@ struct mm_struct {
* init_mm.mmlist, and are protected
* by mmlist_lock
*/
+#ifdef CONFIG_PER_VMA_LOCK
+ int mm_lock_seq;
+#endif
unsigned long hiwater_rss; /* High-watermark of RSS usage */
diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h
index e49ba91bb1f0..aab8f1b28d26 100644
--- a/include/linux/mmap_lock.h
+++ b/include/linux/mmap_lock.h
@@ -72,6 +72,17 @@ static inline void mmap_assert_write_locked(struct mm_struct *mm)
VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_lock), mm);
}
+#ifdef CONFIG_PER_VMA_LOCK
+static inline void vma_end_write_all(struct mm_struct *mm)
+{
+ mmap_assert_write_locked(mm);
+ /* No races during update due to exclusive mmap_lock being held */
+ WRITE_ONCE(mm->mm_lock_seq, mm->mm_lock_seq + 1);
+}
+#else
+static inline void vma_end_write_all(struct mm_struct *mm) {}
+#endif
+
static inline void mmap_init_lock(struct mm_struct *mm)
{
init_rwsem(&mm->mmap_lock);
@@ -114,12 +125,14 @@ static inline bool mmap_write_trylock(struct mm_struct *mm)
static inline void mmap_write_unlock(struct mm_struct *mm)
{
__mmap_lock_trace_released(mm, true);
+ vma_end_write_all(mm);
up_write(&mm->mmap_lock);
}
static inline void mmap_write_downgrade(struct mm_struct *mm)
{
__mmap_lock_trace_acquire_returned(mm, false, true);
+ vma_end_write_all(mm);
downgrade_write(&mm->mmap_lock);
}
diff --git a/kernel/fork.c b/kernel/fork.c
index 314d51eb91da..9141427a98b2 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -474,6 +474,7 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
*/
data_race(memcpy(new, orig, sizeof(*new)));
INIT_LIST_HEAD(&new->anon_vma_chain);
+ vma_init_lock(new);
dup_anon_vma_name(orig, new);
}
return new;
@@ -1147,6 +1148,9 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
seqcount_init(&mm->write_protect_seq);
mmap_init_lock(mm);
INIT_LIST_HEAD(&mm->mmlist);
+#ifdef CONFIG_PER_VMA_LOCK
+ mm->mm_lock_seq = 0;
+#endif
mm_pgtables_bytes_init(mm);
mm->map_count = 0;
mm->locked_vm = 0;
diff --git a/mm/init-mm.c b/mm/init-mm.c
index c9327abb771c..33269314e060 100644
--- a/mm/init-mm.c
+++ b/mm/init-mm.c
@@ -37,6 +37,9 @@ struct mm_struct init_mm = {
.page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock),
.arg_lock = __SPIN_LOCK_UNLOCKED(init_mm.arg_lock),
.mmlist = LIST_HEAD_INIT(init_mm.mmlist),
+#ifdef CONFIG_PER_VMA_LOCK
+ .mm_lock_seq = 0,
+#endif
.user_ns = &init_user_ns,
.cpu_bitmap = CPU_BITS_NONE,
#ifdef CONFIG_IOMMU_SVA
--
2.39.1
* [PATCH v2 12/33] mm: mark VMA as being written when changing vm_flags
From: Suren Baghdasaryan @ 2023-01-27 19:40 UTC
To: akpm
Cc: michel, jglisse, mhocko, vbabka, hannes, mgorman, dave, willy,
liam.howlett, peterz, ldufour, paulmck, mingo, will, luto,
songliubraving, peterx, david, dhowells, hughd, bigeasy,
kent.overstreet, punit.agrawal, lstoakes, peterjung1337,
rientjes, axelrasmussen, joelaf, minchan, rppt, jannh, shakeelb,
tatashin, edumazet, gthelen, gurua, arjunroy, soheil, leewalsh,
posk, linux-mm, linux-arm-kernel, linuxppc-dev, x86,
linux-kernel, kernel-team, surenb
Updates to vm_flags have to be done with the VMA marked as being
written, to prevent concurrent page faults or other modifications.
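Callers are unchanged; for example (illustrative):

        mmap_write_lock(mm);
        vm_flags_set(vma, VM_LOCKED);   /* now write-locks the VMA first */
        mmap_write_unlock(mm);          /* vma_end_write_all() drops VMA locks */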
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
---
include/linux/mm.h | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index fee08e8fdce7..66dca140695e 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -720,21 +720,21 @@ static inline void vm_flags_init(struct vm_area_struct *vma,
static inline void vm_flags_reset(struct vm_area_struct *vma,
vm_flags_t flags)
{
- mmap_assert_write_locked(vma->vm_mm);
+ vma_start_write(vma);
vm_flags_init(vma, flags);
}
static inline void vm_flags_set(struct vm_area_struct *vma,
vm_flags_t flags)
{
- mmap_assert_write_locked(vma->vm_mm);
+ vma_start_write(vma);
ACCESS_PRIVATE(vma, __vm_flags) |= flags;
}
static inline void vm_flags_clear(struct vm_area_struct *vma,
vm_flags_t flags)
{
- mmap_assert_write_locked(vma->vm_mm);
+ vma_start_write(vma);
ACCESS_PRIVATE(vma, __vm_flags) &= ~flags;
}
@@ -755,7 +755,7 @@ static inline void __vm_flags_mod(struct vm_area_struct *vma,
static inline void vm_flags_mod(struct vm_area_struct *vma,
vm_flags_t set, vm_flags_t clear)
{
- mmap_assert_write_locked(vma->vm_mm);
+ vma_start_write(vma);
__vm_flags_mod(vma, set, clear);
}
--
2.39.1
* [PATCH v2 13/33] mm/mmap: move VMA locking before vma_adjust_trans_huge call
From: Suren Baghdasaryan @ 2023-01-27 19:40 UTC
To: akpm
Cc: michel, jglisse, mhocko, vbabka, hannes, mgorman, dave, willy,
liam.howlett, peterz, ldufour, paulmck, mingo, will, luto,
songliubraving, peterx, david, dhowells, hughd, bigeasy,
kent.overstreet, punit.agrawal, lstoakes, peterjung1337,
rientjes, axelrasmussen, joelaf, minchan, rppt, jannh, shakeelb,
tatashin, edumazet, gthelen, gurua, arjunroy, soheil, leewalsh,
posk, linux-mm, linux-arm-kernel, linuxppc-dev, x86,
linux-kernel, kernel-team, surenb
vma_adjust_trans_huge() modifies the VMA and such modifications should
be done after the VMA is marked as being written. Therefore, move the
VMA flag modifications before the vma_adjust_trans_huge() call so that
the VMA is marked before all these modifications.
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
---
mm/mmap.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/mm/mmap.c b/mm/mmap.c
index 9efe13d36df7..7467d691e357 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2910,11 +2910,12 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma,
if (vma_iter_prealloc(vmi))
goto unacct_fail;
+ /* Set flags first to implicitly lock the VMA before updates */
+ vm_flags_set(vma, VM_SOFTDIRTY);
vma_adjust_trans_huge(vma, vma->vm_start, addr + len, 0);
init_vma_prep(&vp, vma);
vma_prepare(&vp);
vma->vm_end = addr + len;
- vm_flags_set(vma, VM_SOFTDIRTY);
vma_iter_store(vmi, vma);
vma_complete(&vp, vmi, mm);
--
2.39.1
* [PATCH v2 14/33] mm/khugepaged: write-lock VMA while collapsing a huge page
From: Suren Baghdasaryan @ 2023-01-27 19:40 UTC
To: akpm
Cc: michel, jglisse, mhocko, vbabka, hannes, mgorman, dave, willy,
liam.howlett, peterz, ldufour, paulmck, mingo, will, luto,
songliubraving, peterx, david, dhowells, hughd, bigeasy,
kent.overstreet, punit.agrawal, lstoakes, peterjung1337,
rientjes, axelrasmussen, joelaf, minchan, rppt, jannh, shakeelb,
tatashin, edumazet, gthelen, gurua, arjunroy, soheil, leewalsh,
posk, linux-mm, linux-arm-kernel, linuxppc-dev, x86,
linux-kernel, kernel-team, surenb
Protect the VMA from concurrent page fault handlers while collapsing a
huge page. The page fault handler needs a stable PMD to use the PTL and
relies on the per-VMA lock to prevent concurrent PMD changes. pmdp_collapse_flush(),
set_huge_pmd() and collapse_and_free_pmd() can modify a PMD, which will
not be detected by a page fault handler without proper locking.
Before this patch, page tables can be walked under any one of the
mmap_lock, the mapping lock, and the anon_vma lock; so when khugepaged
unlinks and frees page tables, it must ensure that all of those either
are locked or don't exist. This patch adds a fourth lock under which
page tables can be traversed, and so khugepaged must also lock out that
one.
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
---
mm/khugepaged.c | 5 +++++
mm/rmap.c | 31 ++++++++++++++++---------------
2 files changed, 21 insertions(+), 15 deletions(-)
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index eb38bd1b1b2f..b938b286cdc3 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1134,6 +1134,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
if (result != SCAN_SUCCEED)
goto out_up_write;
+ vma_start_write(vma);
anon_vma_lock_write(vma->anon_vma);
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, address,
@@ -1601,6 +1602,9 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
goto drop_hpage;
}
+ /* Lock the vma before taking i_mmap and page table locks */
+ vma_start_write(vma);
+
/*
* We need to lock the mapping so that from here on, only GUP-fast and
* hardware page walks can access the parts of the page tables that
@@ -1806,6 +1810,7 @@ static int retract_page_tables(struct address_space *mapping, pgoff_t pgoff,
result = SCAN_PTE_UFFD_WP;
goto unlock_next;
}
+ vma_start_write(vma);
collapse_and_free_pmd(mm, vma, addr, pmd);
if (!cc->is_khugepaged && is_target)
result = set_huge_pmd(vma, addr, pmd, hpage);
diff --git a/mm/rmap.c b/mm/rmap.c
index 15ae24585fc4..8e1a2ad9ca53 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -25,21 +25,22 @@
* mapping->invalidate_lock (in filemap_fault)
* page->flags PG_locked (lock_page)
* hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share, see hugetlbfs below)
- * mapping->i_mmap_rwsem
- * anon_vma->rwsem
- * mm->page_table_lock or pte_lock
- * swap_lock (in swap_duplicate, swap_info_get)
- * mmlist_lock (in mmput, drain_mmlist and others)
- * mapping->private_lock (in block_dirty_folio)
- * folio_lock_memcg move_lock (in block_dirty_folio)
- * i_pages lock (widely used)
- * lruvec->lru_lock (in folio_lruvec_lock_irq)
- * inode->i_lock (in set_page_dirty's __mark_inode_dirty)
- * bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty)
- * sb_lock (within inode_lock in fs/fs-writeback.c)
- * i_pages lock (widely used, in set_page_dirty,
- * in arch-dependent flush_dcache_mmap_lock,
- * within bdi.wb->list_lock in __sync_single_inode)
+ * vma_start_write
+ * mapping->i_mmap_rwsem
+ * anon_vma->rwsem
+ * mm->page_table_lock or pte_lock
+ * swap_lock (in swap_duplicate, swap_info_get)
+ * mmlist_lock (in mmput, drain_mmlist and others)
+ * mapping->private_lock (in block_dirty_folio)
+ * folio_lock_memcg move_lock (in block_dirty_folio)
+ * i_pages lock (widely used)
+ * lruvec->lru_lock (in folio_lruvec_lock_irq)
+ * inode->i_lock (in set_page_dirty's __mark_inode_dirty)
+ * bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty)
+ * sb_lock (within inode_lock in fs/fs-writeback.c)
+ * i_pages lock (widely used, in set_page_dirty,
+ * in arch-dependent flush_dcache_mmap_lock,
+ * within bdi.wb->list_lock in __sync_single_inode)
*
* anon_vma->rwsem,mapping->i_mmap_rwsem (memory_failure, collect_procs_anon)
* ->tasklist_lock
--
2.39.1
* [PATCH v2 15/33] mm/mmap: write-lock VMAs before merging, splitting or expanding them
From: Suren Baghdasaryan @ 2023-01-27 19:40 UTC
To: akpm
Cc: michel, jglisse, mhocko, vbabka, hannes, mgorman, dave, willy,
liam.howlett, peterz, ldufour, paulmck, mingo, will, luto,
songliubraving, peterx, david, dhowells, hughd, bigeasy,
kent.overstreet, punit.agrawal, lstoakes, peterjung1337,
rientjes, axelrasmussen, joelaf, minchan, rppt, jannh, shakeelb,
tatashin, edumazet, gthelen, gurua, arjunroy, soheil, leewalsh,
posk, linux-mm, linux-arm-kernel, linuxppc-dev, x86,
linux-kernel, kernel-team, surenb
Decisions about whether VMAs can be merged, split or expanded must be
made while the VMAs are protected from changes which can affect that
decision. For example, vma_merge() uses vma->anon_vma when deciding
whether the VMA can be merged, while the page fault handler changes
vma->anon_vma during a COW operation.
Write-lock all VMAs which might be affected by a merge or split
operation before deciding how such operations should be performed.
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
---
mm/mmap.c | 23 ++++++++++++++++++++---
1 file changed, 20 insertions(+), 3 deletions(-)
diff --git a/mm/mmap.c b/mm/mmap.c
index 7467d691e357..6fff76334177 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -269,8 +269,11 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
*/
vma_iter_init(&vmi, mm, oldbrk);
next = vma_find(&vmi, newbrk + PAGE_SIZE + stack_guard_gap);
- if (next && newbrk + PAGE_SIZE > vm_start_gap(next))
- goto out;
+ if (next) {
+ vma_start_write(next);
+ if (newbrk + PAGE_SIZE > vm_start_gap(next))
+ goto out;
+ }
brkvma = vma_prev_limit(&vmi, mm->start_brk);
/* Ok, looks good - let it rip. */
@@ -912,10 +915,17 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm,
if (vm_flags & VM_SPECIAL)
return NULL;
+ if (prev)
+ vma_start_write(prev);
next = find_vma(mm, prev ? prev->vm_end : 0);
+ if (next)
+ vma_start_write(next);
mid = next;
- if (next && next->vm_end == end) /* cases 6, 7, 8 */
+ if (next && next->vm_end == end) { /* cases 6, 7, 8 */
next = find_vma(mm, next->vm_end);
+ if (next)
+ vma_start_write(next);
+ }
/* verify some invariant that must be enforced by the caller */
VM_WARN_ON(prev && addr <= prev->vm_start);
@@ -2163,6 +2173,7 @@ int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
WARN_ON(vma->vm_start >= addr);
WARN_ON(vma->vm_end <= addr);
+ vma_start_write(vma);
if (vma->vm_ops && vma->vm_ops->may_split) {
err = vma->vm_ops->may_split(vma, addr);
if (err)
@@ -2518,6 +2529,8 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
/* Attempt to expand an old mapping */
/* Check next */
+ if (next)
+ vma_start_write(next);
if (next && next->vm_start == end && !vma_policy(next) &&
can_vma_merge_before(next, vm_flags, NULL, file, pgoff+pglen,
NULL_VM_UFFD_CTX, NULL)) {
@@ -2527,6 +2540,8 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
}
/* Check prev */
+ if (prev)
+ vma_start_write(prev);
if (prev && prev->vm_end == addr && !vma_policy(prev) &&
(vma ? can_vma_merge_after(prev, vm_flags, vma->anon_vma, file,
pgoff, vma->vm_userfaultfd_ctx, NULL) :
@@ -2900,6 +2915,8 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma,
if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT))
return -ENOMEM;
+ if (vma)
+ vma_start_write(vma);
/*
* Expand the existing vma if possible; Note that singular lists do not
* occur after forking, so the expand will only happen on new VMAs.
--
2.39.1
* [PATCH v2 16/33] mm/mmap: write-lock VMA before shrinking or expanding it
2023-01-27 19:40 [PATCH v2 00/33] Per-VMA locks Suren Baghdasaryan
` (14 preceding siblings ...)
2023-01-27 19:40 ` [PATCH v2 15/33] mm/mmap: write-lock VMAs before merging, splitting or expanding them Suren Baghdasaryan
@ 2023-01-27 19:40 ` Suren Baghdasaryan
2023-01-27 19:40 ` [PATCH v2 17/33] mm/mremap: write-lock VMA while remapping it to a new address range Suren Baghdasaryan
` (18 subsequent siblings)
34 siblings, 0 replies; 42+ messages in thread
From: Suren Baghdasaryan @ 2023-01-27 19:40 UTC (permalink / raw)
To: akpm
Cc: michel, jglisse, mhocko, vbabka, hannes, mgorman, dave, willy,
liam.howlett, peterz, ldufour, paulmck, mingo, will, luto,
songliubraving, peterx, david, dhowells, hughd, bigeasy,
kent.overstreet, punit.agrawal, lstoakes, peterjung1337,
rientjes, axelrasmussen, joelaf, minchan, rppt, jannh, shakeelb,
tatashin, edumazet, gthelen, gurua, arjunroy, soheil, leewalsh,
posk, linux-mm, linux-arm-kernel, linuxppc-dev, x86,
linux-kernel, kernel-team, surenb
vma_expand() and vma_shrink() change VMA boundaries. Expansion might
also result in the freeing of an adjacent VMA. Write-lock the affected
VMAs to prevent concurrent page faults.
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
---
mm/mmap.c | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/mm/mmap.c b/mm/mmap.c
index 6fff76334177..60038c24d836 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -674,6 +674,9 @@ int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma,
ret = dup_anon_vma(vma, next);
if (ret)
return ret;
+
+ /* Lock the VMA before removing it */
+ vma_start_write(next);
}
init_multi_vma_prep(&vp, vma, NULL, remove_next ? next : NULL, NULL);
@@ -686,6 +689,7 @@ int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma,
if (vma_iter_prealloc(vmi))
goto nomem;
+ vma_start_write(vma);
vma_adjust_trans_huge(vma, start, end, 0);
/* VMA iterator points to previous, so set to start if necessary */
if (vma_iter_addr(vmi) != start)
@@ -725,6 +729,7 @@ int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma,
if (vma_iter_prealloc(vmi))
return -ENOMEM;
+ vma_start_write(vma);
init_vma_prep(&vp, vma);
vma_adjust_trans_huge(vma, start, end, 0);
vma_prepare(&vp);
--
2.39.1
* [PATCH v2 17/33] mm/mremap: write-lock VMA while remapping it to a new address range
2023-01-27 19:40 [PATCH v2 00/33] Per-VMA locks Suren Baghdasaryan
` (15 preceding siblings ...)
2023-01-27 19:40 ` [PATCH v2 16/33] mm/mmap: write-lock VMA before shrinking or expanding it Suren Baghdasaryan
@ 2023-01-27 19:40 ` Suren Baghdasaryan
2023-01-27 19:40 ` [PATCH v2 18/33] mm: write-lock VMAs before removing them from VMA tree Suren Baghdasaryan
` (17 subsequent siblings)
34 siblings, 0 replies; 42+ messages in thread
From: Suren Baghdasaryan @ 2023-01-27 19:40 UTC (permalink / raw)
To: akpm
Cc: michel, jglisse, mhocko, vbabka, hannes, mgorman, dave, willy,
liam.howlett, peterz, ldufour, paulmck, mingo, will, luto,
songliubraving, peterx, david, dhowells, hughd, bigeasy,
kent.overstreet, punit.agrawal, lstoakes, peterjung1337,
rientjes, axelrasmussen, joelaf, minchan, rppt, jannh, shakeelb,
tatashin, edumazet, gthelen, gurua, arjunroy, soheil, leewalsh,
posk, linux-mm, linux-arm-kernel, linuxppc-dev, x86,
linux-kernel, kernel-team, surenb, Laurent Dufour
Write-lock a VMA before copying it and write-lock the new VMA when
copy_vma() produces one.
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Reviewed-by: Laurent Dufour <laurent.dufour@fr.ibm.com>
---
mm/mmap.c | 1 +
mm/mremap.c | 1 +
2 files changed, 2 insertions(+)
diff --git a/mm/mmap.c b/mm/mmap.c
index 60038c24d836..b3c247073aa0 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -3202,6 +3202,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
get_file(new_vma->vm_file);
if (new_vma->vm_ops && new_vma->vm_ops->open)
new_vma->vm_ops->open(new_vma);
+ vma_start_write(new_vma);
if (vma_link(mm, new_vma))
goto out_vma_link;
*need_rmap_locks = false;
diff --git a/mm/mremap.c b/mm/mremap.c
index 411a85682b58..dd541e59edda 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -623,6 +623,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
return -ENOMEM;
}
+ vma_start_write(vma);
new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT);
new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff,
&need_rmap_locks);
--
2.39.1
* [PATCH v2 18/33] mm: write-lock VMAs before removing them from VMA tree
2023-01-27 19:40 [PATCH v2 00/33] Per-VMA locks Suren Baghdasaryan
` (16 preceding siblings ...)
2023-01-27 19:40 ` [PATCH v2 17/33] mm/mremap: write-lock VMA while remapping it to a new address range Suren Baghdasaryan
@ 2023-01-27 19:40 ` Suren Baghdasaryan
2023-01-27 19:40 ` [PATCH v2 19/33] mm: conditionally write-lock VMA in free_pgtables Suren Baghdasaryan
` (16 subsequent siblings)
34 siblings, 0 replies; 42+ messages in thread
From: Suren Baghdasaryan @ 2023-01-27 19:40 UTC (permalink / raw)
To: akpm
Cc: michel, jglisse, mhocko, vbabka, hannes, mgorman, dave, willy,
liam.howlett, peterz, ldufour, paulmck, mingo, will, luto,
songliubraving, peterx, david, dhowells, hughd, bigeasy,
kent.overstreet, punit.agrawal, lstoakes, peterjung1337,
rientjes, axelrasmussen, joelaf, minchan, rppt, jannh, shakeelb,
tatashin, edumazet, gthelen, gurua, arjunroy, soheil, leewalsh,
posk, linux-mm, linux-arm-kernel, linuxppc-dev, x86,
linux-kernel, kernel-team, surenb
Write-locking VMAs before isolating them ensures that page fault
handlers don't operate on isolated VMAs.
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
---
mm/mmap.c | 1 +
mm/nommu.c | 5 +++++
2 files changed, 6 insertions(+)
diff --git a/mm/mmap.c b/mm/mmap.c
index b3c247073aa0..5bdfd087b632 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2261,6 +2261,7 @@ int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
static inline int munmap_sidetree(struct vm_area_struct *vma,
struct ma_state *mas_detach)
{
+ vma_start_write(vma);
mas_set_range(mas_detach, vma->vm_start, vma->vm_end - 1);
if (mas_store_gfp(mas_detach, vma, GFP_KERNEL))
return -ENOMEM;
diff --git a/mm/nommu.c b/mm/nommu.c
index 57ba243c6a37..2ab162d773e2 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -588,6 +588,7 @@ static int delete_vma_from_mm(struct vm_area_struct *vma)
current->pid);
return -ENOMEM;
}
+ vma_start_write(vma);
cleanup_vma_from_mm(vma);
/* remove from the MM's tree and list */
@@ -1519,6 +1520,10 @@ void exit_mmap(struct mm_struct *mm)
*/
mmap_write_lock(mm);
for_each_vma(vmi, vma) {
+ /*
+ * No need to lock VMA because this is the only mm user and no
+ * page fault handler can race with it.
+ */
cleanup_vma_from_mm(vma);
delete_vma(mm, vma);
cond_resched();
--
2.39.1
* [PATCH v2 19/33] mm: conditionally write-lock VMA in free_pgtables
2023-01-27 19:40 [PATCH v2 00/33] Per-VMA locks Suren Baghdasaryan
` (17 preceding siblings ...)
2023-01-27 19:40 ` [PATCH v2 18/33] mm: write-lock VMAs before removing them from VMA tree Suren Baghdasaryan
@ 2023-01-27 19:40 ` Suren Baghdasaryan
2023-01-27 19:40 ` [PATCH v2 20/33] mm/mmap: write-lock adjacent VMAs if they can grow into unmapped area Suren Baghdasaryan
` (15 subsequent siblings)
34 siblings, 0 replies; 42+ messages in thread
From: Suren Baghdasaryan @ 2023-01-27 19:40 UTC (permalink / raw)
To: akpm
Cc: michel, jglisse, mhocko, vbabka, hannes, mgorman, dave, willy,
liam.howlett, peterz, ldufour, paulmck, mingo, will, luto,
songliubraving, peterx, david, dhowells, hughd, bigeasy,
kent.overstreet, punit.agrawal, lstoakes, peterjung1337,
rientjes, axelrasmussen, joelaf, minchan, rppt, jannh, shakeelb,
tatashin, edumazet, gthelen, gurua, arjunroy, soheil, leewalsh,
posk, linux-mm, linux-arm-kernel, linuxppc-dev, x86,
linux-kernel, kernel-team, surenb
Normally free_pgtables() needs to lock the affected VMAs, except when
the VMAs were already isolated under VMA write-lock. munmap() does just
that: it isolates the VMAs while holding the appropriate locks, then
downgrades mmap_lock and drops the per-VMA locks before freeing page
tables.
Add a parameter to free_pgtables() for such a scenario.
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
---
mm/internal.h | 2 +-
mm/memory.c | 6 +++++-
mm/mmap.c | 5 +++--
3 files changed, 9 insertions(+), 4 deletions(-)
diff --git a/mm/internal.h b/mm/internal.h
index 90bb2078444c..52d7e9c2e58f 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -105,7 +105,7 @@ void folio_activate(struct folio *folio);
void free_pgtables(struct mmu_gather *tlb, struct maple_tree *mt,
struct vm_area_struct *start_vma, unsigned long floor,
- unsigned long ceiling);
+ unsigned long ceiling, bool mm_wr_locked);
void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte);
struct zap_details;
diff --git a/mm/memory.c b/mm/memory.c
index 7a04a1130ec1..d48c76e9fa57 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -348,7 +348,7 @@ void free_pgd_range(struct mmu_gather *tlb,
void free_pgtables(struct mmu_gather *tlb, struct maple_tree *mt,
struct vm_area_struct *vma, unsigned long floor,
- unsigned long ceiling)
+ unsigned long ceiling, bool mm_wr_locked)
{
MA_STATE(mas, mt, vma->vm_end, vma->vm_end);
@@ -366,6 +366,8 @@ void free_pgtables(struct mmu_gather *tlb, struct maple_tree *mt,
* Hide vma from rmap and truncate_pagecache before freeing
* pgtables
*/
+ if (mm_wr_locked)
+ vma_start_write(vma);
unlink_anon_vmas(vma);
unlink_file_vma(vma);
@@ -380,6 +382,8 @@ void free_pgtables(struct mmu_gather *tlb, struct maple_tree *mt,
&& !is_vm_hugetlb_page(next)) {
vma = next;
next = mas_find(&mas, ceiling - 1);
+ if (mm_wr_locked)
+ vma_start_write(vma);
unlink_anon_vmas(vma);
unlink_file_vma(vma);
}
diff --git a/mm/mmap.c b/mm/mmap.c
index 5bdfd087b632..57cb3a2ac9b1 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2157,7 +2157,8 @@ static void unmap_region(struct mm_struct *mm, struct maple_tree *mt,
update_hiwater_rss(mm);
unmap_vmas(&tlb, mt, vma, start, end, mm_wr_locked);
free_pgtables(&tlb, mt, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
- next ? next->vm_start : USER_PGTABLES_CEILING);
+ next ? next->vm_start : USER_PGTABLES_CEILING,
+ mm_wr_locked);
tlb_finish_mmu(&tlb);
}
@@ -3069,7 +3070,7 @@ void exit_mmap(struct mm_struct *mm)
mmap_write_lock(mm);
mt_clear_in_rcu(&mm->mm_mt);
free_pgtables(&tlb, &mm->mm_mt, vma, FIRST_USER_ADDRESS,
- USER_PGTABLES_CEILING);
+ USER_PGTABLES_CEILING, true);
tlb_finish_mmu(&tlb);
/*
--
2.39.1
* [PATCH v2 20/33] mm/mmap: write-lock adjacent VMAs if they can grow into unmapped area
2023-01-27 19:40 [PATCH v2 00/33] Per-VMA locks Suren Baghdasaryan
` (18 preceding siblings ...)
2023-01-27 19:40 ` [PATCH v2 19/33] mm: conditionally write-lock VMA in free_pgtables Suren Baghdasaryan
@ 2023-01-27 19:40 ` Suren Baghdasaryan
2023-01-27 19:40 ` [PATCH v2 21/33] kernel/fork: assert no VMA readers during its destruction Suren Baghdasaryan
` (14 subsequent siblings)
34 siblings, 0 replies; 42+ messages in thread
From: Suren Baghdasaryan @ 2023-01-27 19:40 UTC (permalink / raw)
To: akpm
Cc: michel, jglisse, mhocko, vbabka, hannes, mgorman, dave, willy,
liam.howlett, peterz, ldufour, paulmck, mingo, will, luto,
songliubraving, peterx, david, dhowells, hughd, bigeasy,
kent.overstreet, punit.agrawal, lstoakes, peterjung1337,
rientjes, axelrasmussen, joelaf, minchan, rppt, jannh, shakeelb,
tatashin, edumazet, gthelen, gurua, arjunroy, soheil, leewalsh,
posk, linux-mm, linux-arm-kernel, linuxppc-dev, x86,
linux-kernel, kernel-team, surenb
While unmapping VMAs, adjacent VMAs might be able to grow into the area
being unmapped. In such cases, write-lock the adjacent VMAs to prevent
this growth.
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
---
mm/mmap.c | 8 +++++---
1 file changed, 5 insertions(+), 3 deletions(-)
diff --git a/mm/mmap.c b/mm/mmap.c
index 57cb3a2ac9b1..3baf218836bb 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2399,11 +2399,13 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
* down_read(mmap_lock) and collide with the VMA we are about to unmap.
*/
if (downgrade) {
- if (next && (next->vm_flags & VM_GROWSDOWN))
+ if (next && (next->vm_flags & VM_GROWSDOWN)) {
+ vma_start_write(next);
downgrade = false;
- else if (prev && (prev->vm_flags & VM_GROWSUP))
+ } else if (prev && (prev->vm_flags & VM_GROWSUP)) {
+ vma_start_write(prev);
downgrade = false;
- else
+ } else
mmap_write_downgrade(mm);
}
--
2.39.1
* [PATCH v2 21/33] kernel/fork: assert no VMA readers during its destruction
2023-01-27 19:40 [PATCH v2 00/33] Per-VMA locks Suren Baghdasaryan
` (19 preceding siblings ...)
2023-01-27 19:40 ` [PATCH v2 20/33] mm/mmap: write-lock adjacent VMAs if they can grow into unmapped area Suren Baghdasaryan
@ 2023-01-27 19:40 ` Suren Baghdasaryan
2023-01-27 19:40 ` [PATCH v2 22/33] mm/mmap: prevent pagefault handler from racing with mmu_notifier registration Suren Baghdasaryan
` (13 subsequent siblings)
34 siblings, 0 replies; 42+ messages in thread
From: Suren Baghdasaryan @ 2023-01-27 19:40 UTC (permalink / raw)
To: akpm
Cc: michel, jglisse, mhocko, vbabka, hannes, mgorman, dave, willy,
liam.howlett, peterz, ldufour, paulmck, mingo, will, luto,
songliubraving, peterx, david, dhowells, hughd, bigeasy,
kent.overstreet, punit.agrawal, lstoakes, peterjung1337,
rientjes, axelrasmussen, joelaf, minchan, rppt, jannh, shakeelb,
tatashin, edumazet, gthelen, gurua, arjunroy, soheil, leewalsh,
posk, linux-mm, linux-arm-kernel, linuxppc-dev, x86,
linux-kernel, kernel-team, surenb
Assert that there are no holders of the VMA lock for reading when the
VMA is about to be destroyed.
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
---
kernel/fork.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/kernel/fork.c b/kernel/fork.c
index 9141427a98b2..a08cc0e2bfde 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -491,6 +491,9 @@ static void vm_area_free_rcu_cb(struct rcu_head *head)
{
struct vm_area_struct *vma = container_of(head, struct vm_area_struct,
vm_rcu);
+
+ /* The vma should not be locked while being destroyed. */
+ VM_BUG_ON_VMA(rwsem_is_locked(&vma->lock), vma);
__vm_area_free(vma);
}
#endif
--
2.39.1
* [PATCH v2 22/33] mm/mmap: prevent pagefault handler from racing with mmu_notifier registration
2023-01-27 19:40 [PATCH v2 00/33] Per-VMA locks Suren Baghdasaryan
` (20 preceding siblings ...)
2023-01-27 19:40 ` [PATCH v2 21/33] kernel/fork: assert no VMA readers during its destruction Suren Baghdasaryan
@ 2023-01-27 19:40 ` Suren Baghdasaryan
2023-01-27 19:41 ` [PATCH v2 23/33] mm: introduce lock_vma_under_rcu to be used from arch-specific code Suren Baghdasaryan
` (12 subsequent siblings)
34 siblings, 0 replies; 42+ messages in thread
From: Suren Baghdasaryan @ 2023-01-27 19:40 UTC (permalink / raw)
To: akpm
Cc: michel, jglisse, mhocko, vbabka, hannes, mgorman, dave, willy,
liam.howlett, peterz, ldufour, paulmck, mingo, will, luto,
songliubraving, peterx, david, dhowells, hughd, bigeasy,
kent.overstreet, punit.agrawal, lstoakes, peterjung1337,
rientjes, axelrasmussen, joelaf, minchan, rppt, jannh, shakeelb,
tatashin, edumazet, gthelen, gurua, arjunroy, soheil, leewalsh,
posk, linux-mm, linux-arm-kernel, linuxppc-dev, x86,
linux-kernel, kernel-team, surenb
Page fault handlers might need to fire MMU notifications while a new
notifier is being registered. Modify mm_take_all_locks() to write-lock
all VMAs and prevent this race with page fault handlers that would hold
VMA locks. VMAs are locked before i_mmap_rwsem and anon_vma to keep the
same locking order as in the page fault handlers.
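The resulting acquisition order inside mm_take_all_locks() is roughly
the following (sketch only; signal checks and the hugetlb special case
are omitted):

	mutex_lock(&mm_all_locks_mutex);
	mas_for_each(&mas, vma, ULONG_MAX)
		vma_start_write(vma);		/* new: VMA locks first */
	mas_set(&mas, 0);
	mas_for_each(&mas, vma, ULONG_MAX)
		vm_lock_mapping(mm, ...);	/* then i_mmap_rwsem */
	mas_set(&mas, 0);
	mas_for_each(&mas, vma, ULONG_MAX)
		vm_lock_anon_vma(mm, ...);	/* then anon_vma->rwsem */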
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
---
mm/mmap.c | 9 +++++++++
1 file changed, 9 insertions(+)
diff --git a/mm/mmap.c b/mm/mmap.c
index 3baf218836bb..3d0cfbc92745 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -3501,6 +3501,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
* of mm/rmap.c:
* - all hugetlbfs_i_mmap_rwsem_key locks (aka mapping->i_mmap_rwsem for
* hugetlb mapping);
+ * - all vmas marked locked
* - all i_mmap_rwsem locks;
* - all anon_vma->rwseml
*
@@ -3523,6 +3524,13 @@ int mm_take_all_locks(struct mm_struct *mm)
mutex_lock(&mm_all_locks_mutex);
+ mas_for_each(&mas, vma, ULONG_MAX) {
+ if (signal_pending(current))
+ goto out_unlock;
+ vma_start_write(vma);
+ }
+
+ mas_set(&mas, 0);
mas_for_each(&mas, vma, ULONG_MAX) {
if (signal_pending(current))
goto out_unlock;
@@ -3612,6 +3620,7 @@ void mm_drop_all_locks(struct mm_struct *mm)
if (vma->vm_file && vma->vm_file->f_mapping)
vm_unlock_mapping(vma->vm_file->f_mapping);
}
+ vma_end_write_all(mm);
mutex_unlock(&mm_all_locks_mutex);
}
--
2.39.1
* [PATCH v2 23/33] mm: introduce lock_vma_under_rcu to be used from arch-specific code
2023-01-27 19:40 [PATCH v2 00/33] Per-VMA locks Suren Baghdasaryan
` (21 preceding siblings ...)
2023-01-27 19:40 ` [PATCH v2 22/33] mm/mmap: prevent pagefault handler from racing with mmu_notifier registration Suren Baghdasaryan
@ 2023-01-27 19:41 ` Suren Baghdasaryan
2023-01-27 19:41 ` [PATCH v2 24/33] mm: fall back to mmap_lock if vma->anon_vma is not yet set Suren Baghdasaryan
` (11 subsequent siblings)
34 siblings, 0 replies; 42+ messages in thread
From: Suren Baghdasaryan @ 2023-01-27 19:41 UTC (permalink / raw)
To: akpm
Cc: michel, jglisse, mhocko, vbabka, hannes, mgorman, dave, willy,
liam.howlett, peterz, ldufour, paulmck, mingo, will, luto,
songliubraving, peterx, david, dhowells, hughd, bigeasy,
kent.overstreet, punit.agrawal, lstoakes, peterjung1337,
rientjes, axelrasmussen, joelaf, minchan, rppt, jannh, shakeelb,
tatashin, edumazet, gthelen, gurua, arjunroy, soheil, leewalsh,
posk, linux-mm, linux-arm-kernel, linuxppc-dev, x86,
linux-kernel, kernel-team, surenb
Introduce the lock_vma_under_rcu() function to look up and lock a VMA
during page fault handling. When the VMA is not found, cannot be locked
or changes after being locked, the function returns NULL. The lookup is
performed under RCU protection to prevent the found VMA from being
destroyed before the VMA lock is acquired. VMA lock statistics are
updated according to the results.
For now only anonymous VMAs can be searched this way. In all other cases
the function returns NULL.
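A caller is expected to look roughly like the following sketch
(simplified from the arch fault-handler patches later in this series;
FAULT_FLAG_VMA_LOCK is introduced in a subsequent patch):

	vma = lock_vma_under_rcu(mm, address);
	if (!vma)
		goto lock_mmap;		/* fall back to the mmap_lock path */

	fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs);
	vma_end_read(vma);

	if (!(fault & VM_FAULT_RETRY)) {
		count_vm_vma_lock_event(VMA_LOCK_SUCCESS);
		goto done;		/* handled without mmap_lock */
	}
	count_vm_vma_lock_event(VMA_LOCK_RETRY);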
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
---
include/linux/mm.h | 3 +++
mm/memory.c | 51 ++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 54 insertions(+)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 66dca140695e..fa2b9d6e665e 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -687,6 +687,9 @@ static inline void vma_assert_write_locked(struct vm_area_struct *vma)
VM_BUG_ON_VMA(vma->vm_lock_seq != READ_ONCE(vma->vm_mm->mm_lock_seq), vma);
}
+struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
+ unsigned long address);
+
#else /* CONFIG_PER_VMA_LOCK */
static inline void vma_init_lock(struct vm_area_struct *vma) {}
diff --git a/mm/memory.c b/mm/memory.c
index d48c76e9fa57..5568fcb0a46b 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -5222,6 +5222,57 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
}
EXPORT_SYMBOL_GPL(handle_mm_fault);
+#ifdef CONFIG_PER_VMA_LOCK
+/*
+ * Lookup and lock a VMA under RCU protection. Returned VMA is guaranteed to be
+ * stable and not isolated. If the VMA is not found or is being modified the
+ * function returns NULL.
+ */
+struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
+ unsigned long address)
+{
+ MA_STATE(mas, &mm->mm_mt, address, address);
+ struct vm_area_struct *vma, *validate;
+
+ rcu_read_lock();
+ vma = mas_walk(&mas);
+retry:
+ if (!vma)
+ goto inval;
+
+ /* Only anonymous vmas are supported for now */
+ if (!vma_is_anonymous(vma))
+ goto inval;
+
+ if (!vma_start_read(vma))
+ goto inval;
+
+ /* Check since vm_start/vm_end might change before we lock the VMA */
+ if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
+ vma_end_read(vma);
+ goto inval;
+ }
+
+ /* Check if the VMA got isolated after we found it */
+ mas.index = address;
+ validate = mas_walk(&mas);
+ if (validate != vma) {
+ vma_end_read(vma);
+ count_vm_vma_lock_event(VMA_LOCK_MISS);
+ /* The area was replaced with another one. */
+ vma = validate;
+ goto retry;
+ }
+
+ rcu_read_unlock();
+ return vma;
+inval:
+ rcu_read_unlock();
+ count_vm_vma_lock_event(VMA_LOCK_ABORT);
+ return NULL;
+}
+#endif /* CONFIG_PER_VMA_LOCK */
+
#ifndef __PAGETABLE_P4D_FOLDED
/*
* Allocate p4d page table.
--
2.39.1
* [PATCH v2 24/33] mm: fall back to mmap_lock if vma->anon_vma is not yet set
2023-01-27 19:40 [PATCH v2 00/33] Per-VMA locks Suren Baghdasaryan
` (22 preceding siblings ...)
2023-01-27 19:41 ` [PATCH v2 23/33] mm: introduce lock_vma_under_rcu to be used from arch-specific code Suren Baghdasaryan
@ 2023-01-27 19:41 ` Suren Baghdasaryan
2023-01-27 19:41 ` [PATCH v2 25/33] mm: add FAULT_FLAG_VMA_LOCK flag Suren Baghdasaryan
` (10 subsequent siblings)
34 siblings, 0 replies; 42+ messages in thread
From: Suren Baghdasaryan @ 2023-01-27 19:41 UTC (permalink / raw)
To: akpm
Cc: michel, jglisse, mhocko, vbabka, hannes, mgorman, dave, willy,
liam.howlett, peterz, ldufour, paulmck, mingo, will, luto,
songliubraving, peterx, david, dhowells, hughd, bigeasy,
kent.overstreet, punit.agrawal, lstoakes, peterjung1337,
rientjes, axelrasmussen, joelaf, minchan, rppt, jannh, shakeelb,
tatashin, edumazet, gthelen, gurua, arjunroy, soheil, leewalsh,
posk, linux-mm, linux-arm-kernel, linuxppc-dev, x86,
linux-kernel, kernel-team, surenb
When vma->anon_vma is not set, the page fault handler will set it by
either reusing the anon_vma of an adjacent VMA if the VMAs are
compatible or by allocating a new one. find_mergeable_anon_vma() walks
the VMA tree to find a compatible adjacent VMA, and that requires not
only the faulting VMA to be stable but also the tree structure and the
other VMAs inside that tree. Therefore locking just the faulting VMA is
not enough for this search.
Fall back to taking mmap_lock when vma->anon_vma is not set. This
situation happens only on the first page fault and should not affect
overall performance.
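The expected flow on the first and subsequent faults is then roughly
(sketch only):

	/* first fault on the VMA: vma->anon_vma is still NULL */
	vma = lock_vma_under_rcu(mm, addr);	/* returns NULL */
	/* caller falls back: mmap_read_lock() + handle_mm_fault(),
	 * which sets vma->anon_vma via anon_vma_prepare() */

	/* subsequent faults: vma->anon_vma is set */
	vma = lock_vma_under_rcu(mm, addr);	/* per-VMA lock path */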
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
---
mm/memory.c | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/mm/memory.c b/mm/memory.c
index 5568fcb0a46b..593548f24007 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -5244,6 +5244,10 @@ struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
if (!vma_is_anonymous(vma))
goto inval;
+ /* find_mergeable_anon_vma uses adjacent vmas which are not locked */
+ if (!vma->anon_vma)
+ goto inval;
+
if (!vma_start_read(vma))
goto inval;
--
2.39.1
* [PATCH v2 25/33] mm: add FAULT_FLAG_VMA_LOCK flag
2023-01-27 19:40 [PATCH v2 00/33] Per-VMA locks Suren Baghdasaryan
` (23 preceding siblings ...)
2023-01-27 19:41 ` [PATCH v2 24/33] mm: fall back to mmap_lock if vma->anon_vma is not yet set Suren Baghdasaryan
@ 2023-01-27 19:41 ` Suren Baghdasaryan
2023-01-27 19:41 ` [PATCH v2 26/33] mm: prevent do_swap_page from handling page faults under VMA lock Suren Baghdasaryan
` (9 subsequent siblings)
34 siblings, 0 replies; 42+ messages in thread
From: Suren Baghdasaryan @ 2023-01-27 19:41 UTC (permalink / raw)
To: akpm
Cc: michel, jglisse, mhocko, vbabka, hannes, mgorman, dave, willy,
liam.howlett, peterz, ldufour, paulmck, mingo, will, luto,
songliubraving, peterx, david, dhowells, hughd, bigeasy,
kent.overstreet, punit.agrawal, lstoakes, peterjung1337,
rientjes, axelrasmussen, joelaf, minchan, rppt, jannh, shakeelb,
tatashin, edumazet, gthelen, gurua, arjunroy, soheil, leewalsh,
posk, linux-mm, linux-arm-kernel, linuxppc-dev, x86,
linux-kernel, kernel-team, surenb, Laurent Dufour
Add a new flag to distinguish page faults handled under the protection
of a per-VMA lock.
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Reviewed-by: Laurent Dufour <laurent.dufour@fr.ibm.com>
---
include/linux/mm.h | 3 ++-
include/linux/mm_types.h | 1 +
2 files changed, 3 insertions(+), 1 deletion(-)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index fa2b9d6e665e..a6de58bb40c7 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -472,7 +472,8 @@ static inline bool fault_flag_allow_retry_first(enum fault_flag flags)
{ FAULT_FLAG_USER, "USER" }, \
{ FAULT_FLAG_REMOTE, "REMOTE" }, \
{ FAULT_FLAG_INSTRUCTION, "INSTRUCTION" }, \
- { FAULT_FLAG_INTERRUPTIBLE, "INTERRUPTIBLE" }
+ { FAULT_FLAG_INTERRUPTIBLE, "INTERRUPTIBLE" }, \
+ { FAULT_FLAG_VMA_LOCK, "VMA_LOCK" }
/*
* vm_fault is filled by the pagefault handler and passed to the vma's
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 88619c6a29a3..c4c43f10344a 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -1053,6 +1053,7 @@ enum fault_flag {
FAULT_FLAG_INTERRUPTIBLE = 1 << 9,
FAULT_FLAG_UNSHARE = 1 << 10,
FAULT_FLAG_ORIG_PTE_VALID = 1 << 11,
+ FAULT_FLAG_VMA_LOCK = 1 << 12,
};
typedef unsigned int __bitwise zap_flags_t;
--
2.39.1
* [PATCH v2 26/33] mm: prevent do_swap_page from handling page faults under VMA lock
2023-01-27 19:40 [PATCH v2 00/33] Per-VMA locks Suren Baghdasaryan
` (24 preceding siblings ...)
2023-01-27 19:41 ` [PATCH v2 25/33] mm: add FAULT_FLAG_VMA_LOCK flag Suren Baghdasaryan
@ 2023-01-27 19:41 ` Suren Baghdasaryan
2023-01-27 19:41 ` [PATCH v2 27/33] mm: prevent userfaults to be handled under per-vma lock Suren Baghdasaryan
` (8 subsequent siblings)
34 siblings, 0 replies; 42+ messages in thread
From: Suren Baghdasaryan @ 2023-01-27 19:41 UTC (permalink / raw)
To: akpm
Cc: michel, jglisse, mhocko, vbabka, hannes, mgorman, dave, willy,
liam.howlett, peterz, ldufour, paulmck, mingo, will, luto,
songliubraving, peterx, david, dhowells, hughd, bigeasy,
kent.overstreet, punit.agrawal, lstoakes, peterjung1337,
rientjes, axelrasmussen, joelaf, minchan, rppt, jannh, shakeelb,
tatashin, edumazet, gthelen, gurua, arjunroy, soheil, leewalsh,
posk, linux-mm, linux-arm-kernel, linuxppc-dev, x86,
linux-kernel, kernel-team, surenb, Laurent Dufour
Because do_swap_page() might drop mmap_lock, abort fault handling under
the VMA lock and retry while holding mmap_lock. This can be handled more
gracefully in the future.
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Reviewed-by: Laurent Dufour <laurent.dufour@fr.ibm.com>
---
mm/memory.c | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/mm/memory.c b/mm/memory.c
index 593548f24007..33ecc850d3cb 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3690,6 +3690,11 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
if (!pte_unmap_same(vmf))
goto out;
+ if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
+ ret = VM_FAULT_RETRY;
+ goto out;
+ }
+
entry = pte_to_swp_entry(vmf->orig_pte);
if (unlikely(non_swap_entry(entry))) {
if (is_migration_entry(entry)) {
--
2.39.1
* [PATCH v2 27/33] mm: prevent userfaults to be handled under per-vma lock
2023-01-27 19:40 [PATCH v2 00/33] Per-VMA locks Suren Baghdasaryan
` (25 preceding siblings ...)
2023-01-27 19:41 ` [PATCH v2 26/33] mm: prevent do_swap_page from handling page faults under VMA lock Suren Baghdasaryan
@ 2023-01-27 19:41 ` Suren Baghdasaryan
2023-01-27 19:41 ` [PATCH v2 28/33] mm: introduce per-VMA lock statistics Suren Baghdasaryan
` (7 subsequent siblings)
34 siblings, 0 replies; 42+ messages in thread
From: Suren Baghdasaryan @ 2023-01-27 19:41 UTC (permalink / raw)
To: akpm
Cc: michel, jglisse, mhocko, vbabka, hannes, mgorman, dave, willy,
liam.howlett, peterz, ldufour, paulmck, mingo, will, luto,
songliubraving, peterx, david, dhowells, hughd, bigeasy,
kent.overstreet, punit.agrawal, lstoakes, peterjung1337,
rientjes, axelrasmussen, joelaf, minchan, rppt, jannh, shakeelb,
tatashin, edumazet, gthelen, gurua, arjunroy, soheil, leewalsh,
posk, linux-mm, linux-arm-kernel, linuxppc-dev, x86,
linux-kernel, kernel-team, surenb
Because handle_userfault() might drop mmap_lock, avoid fault handling
under the VMA lock and retry while holding mmap_lock. This can be
handled more gracefully in the future.
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Suggested-by: Peter Xu <peterx@redhat.com>
---
mm/memory.c | 9 +++++++++
1 file changed, 9 insertions(+)
diff --git a/mm/memory.c b/mm/memory.c
index 33ecc850d3cb..55582c6fa2fd 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -5256,6 +5256,15 @@ struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
if (!vma_start_read(vma))
goto inval;
+ /*
+ * Due to the possibility of userfault handler dropping mmap_lock, avoid
+ * it for now and fall back to page fault handling under mmap_lock.
+ */
+ if (userfaultfd_armed(vma)) {
+ vma_end_read(vma);
+ goto inval;
+ }
+
/* Check since vm_start/vm_end might change before we lock the VMA */
if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
vma_end_read(vma);
--
2.39.1
* [PATCH v2 28/33] mm: introduce per-VMA lock statistics
2023-01-27 19:40 [PATCH v2 00/33] Per-VMA locks Suren Baghdasaryan
` (26 preceding siblings ...)
2023-01-27 19:41 ` [PATCH v2 27/33] mm: prevent userfaults to be handled under per-vma lock Suren Baghdasaryan
@ 2023-01-27 19:41 ` Suren Baghdasaryan
2023-01-27 19:41 ` [PATCH v2 29/33] x86/mm: try VMA lock-based page fault handling first Suren Baghdasaryan
` (6 subsequent siblings)
34 siblings, 0 replies; 42+ messages in thread
From: Suren Baghdasaryan @ 2023-01-27 19:41 UTC (permalink / raw)
To: akpm
Cc: michel, jglisse, mhocko, vbabka, hannes, mgorman, dave, willy,
liam.howlett, peterz, ldufour, paulmck, mingo, will, luto,
songliubraving, peterx, david, dhowells, hughd, bigeasy,
kent.overstreet, punit.agrawal, lstoakes, peterjung1337,
rientjes, axelrasmussen, joelaf, minchan, rppt, jannh, shakeelb,
tatashin, edumazet, gthelen, gurua, arjunroy, soheil, leewalsh,
posk, linux-mm, linux-arm-kernel, linuxppc-dev, x86,
linux-kernel, kernel-team, surenb
Add a new CONFIG_PER_VMA_LOCK_STATS config option to dump extra
statistics about handling page faults under the VMA lock.
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
---
include/linux/vm_event_item.h | 6 ++++++
include/linux/vmstat.h | 6 ++++++
mm/Kconfig.debug | 7 +++++++
mm/vmstat.c | 6 ++++++
4 files changed, 25 insertions(+)
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index 7f5d1caf5890..8abfa1240040 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -149,6 +149,12 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
#ifdef CONFIG_X86
DIRECT_MAP_LEVEL2_SPLIT,
DIRECT_MAP_LEVEL3_SPLIT,
+#endif
+#ifdef CONFIG_PER_VMA_LOCK_STATS
+ VMA_LOCK_SUCCESS,
+ VMA_LOCK_ABORT,
+ VMA_LOCK_RETRY,
+ VMA_LOCK_MISS,
#endif
NR_VM_EVENT_ITEMS
};
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index 19cf5b6892ce..fed855bae6d8 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -125,6 +125,12 @@ static inline void vm_events_fold_cpu(int cpu)
#define count_vm_tlb_events(x, y) do { (void)(y); } while (0)
#endif
+#ifdef CONFIG_PER_VMA_LOCK_STATS
+#define count_vm_vma_lock_event(x) count_vm_event(x)
+#else
+#define count_vm_vma_lock_event(x) do {} while (0)
+#endif
+
#define __count_zid_vm_events(item, zid, delta) \
__count_vm_events(item##_NORMAL - ZONE_NORMAL + zid, delta)
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index 9191897e76af..4965a7333a3f 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -278,3 +278,10 @@ config DEBUG_KMEMLEAK_AUTO_SCAN
memory leaks.
If unsure, say Y.
+
+config PER_VMA_LOCK_STATS
+ bool "Statistics for per-vma locks"
+ depends on PER_VMA_LOCK
+ default y
+ help
+ Statistics for per-vma locks.
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 1ea6a5ce1c41..4f1089a1860e 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1399,6 +1399,12 @@ const char * const vmstat_text[] = {
"direct_map_level2_splits",
"direct_map_level3_splits",
#endif
+#ifdef CONFIG_PER_VMA_LOCK_STATS
+ "vma_lock_success",
+ "vma_lock_abort",
+ "vma_lock_retry",
+ "vma_lock_miss",
+#endif
#endif /* CONFIG_VM_EVENT_COUNTERS || CONFIG_MEMCG */
};
#endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA || CONFIG_MEMCG */
--
2.39.1
* [PATCH v2 29/33] x86/mm: try VMA lock-based page fault handling first
2023-01-27 19:40 [PATCH v2 00/33] Per-VMA locks Suren Baghdasaryan
` (27 preceding siblings ...)
2023-01-27 19:41 ` [PATCH v2 28/33] mm: introduce per-VMA lock statistics Suren Baghdasaryan
@ 2023-01-27 19:41 ` Suren Baghdasaryan
2023-01-27 19:41 ` [PATCH v2 30/33] arm64/mm: " Suren Baghdasaryan
` (5 subsequent siblings)
34 siblings, 0 replies; 42+ messages in thread
From: Suren Baghdasaryan @ 2023-01-27 19:41 UTC (permalink / raw)
To: akpm
Cc: michel, jglisse, mhocko, vbabka, hannes, mgorman, dave, willy,
liam.howlett, peterz, ldufour, paulmck, mingo, will, luto,
songliubraving, peterx, david, dhowells, hughd, bigeasy,
kent.overstreet, punit.agrawal, lstoakes, peterjung1337,
rientjes, axelrasmussen, joelaf, minchan, rppt, jannh, shakeelb,
tatashin, edumazet, gthelen, gurua, arjunroy, soheil, leewalsh,
posk, linux-mm, linux-arm-kernel, linuxppc-dev, x86,
linux-kernel, kernel-team, surenb
Attempt VMA lock-based page fault handling first, and fall back to the
existing mmap_lock-based handling if that fails.
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
---
arch/x86/Kconfig | 1 +
arch/x86/mm/fault.c | 36 ++++++++++++++++++++++++++++++++++++
2 files changed, 37 insertions(+)
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 3604074a878b..3647f7bdb110 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -27,6 +27,7 @@ config X86_64
# Options that are inherently 64-bit kernel only:
select ARCH_HAS_GIGANTIC_PAGE
select ARCH_SUPPORTS_INT128 if CC_HAS_INT128
+ select ARCH_SUPPORTS_PER_VMA_LOCK
select ARCH_USE_CMPXCHG_LOCKREF
select HAVE_ARCH_SOFT_DIRTY
select MODULES_USE_ELF_RELA
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index a498ae1fbe66..122c8fe4af12 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -19,6 +19,7 @@
#include <linux/uaccess.h> /* faulthandler_disabled() */
#include <linux/efi.h> /* efi_crash_gracefully_on_page_fault()*/
#include <linux/mm_types.h>
+#include <linux/mm.h> /* lock_vma_under_rcu() */
#include <asm/cpufeature.h> /* boot_cpu_has, ... */
#include <asm/traps.h> /* dotraplinkage, ... */
@@ -1333,6 +1334,38 @@ void do_user_addr_fault(struct pt_regs *regs,
}
#endif
+#ifdef CONFIG_PER_VMA_LOCK
+ if (!(flags & FAULT_FLAG_USER) || atomic_read(&mm->mm_users) == 1)
+ goto lock_mmap;
+
+ vma = lock_vma_under_rcu(mm, address);
+ if (!vma)
+ goto lock_mmap;
+
+ if (unlikely(access_error(error_code, vma))) {
+ vma_end_read(vma);
+ goto lock_mmap;
+ }
+ fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs);
+ vma_end_read(vma);
+
+ if (!(fault & VM_FAULT_RETRY)) {
+ count_vm_vma_lock_event(VMA_LOCK_SUCCESS);
+ goto done;
+ }
+ count_vm_vma_lock_event(VMA_LOCK_RETRY);
+
+ /* Quick path to respond to signals */
+ if (fault_signal_pending(fault, regs)) {
+ if (!user_mode(regs))
+ kernelmode_fixup_or_oops(regs, error_code, address,
+ SIGBUS, BUS_ADRERR,
+ ARCH_DEFAULT_PKEY);
+ return;
+ }
+lock_mmap:
+#endif /* CONFIG_PER_VMA_LOCK */
+
/*
* Kernel-mode access to the user address space should only occur
* on well-defined single instructions listed in the exception
@@ -1433,6 +1466,9 @@ void do_user_addr_fault(struct pt_regs *regs,
}
mmap_read_unlock(mm);
+#ifdef CONFIG_PER_VMA_LOCK
+done:
+#endif
if (likely(!(fault & VM_FAULT_ERROR)))
return;
--
2.39.1
* [PATCH v2 30/33] arm64/mm: try VMA lock-based page fault handling first
2023-01-27 19:40 [PATCH v2 00/33] Per-VMA locks Suren Baghdasaryan
` (28 preceding siblings ...)
2023-01-27 19:41 ` [PATCH v2 29/33] x86/mm: try VMA lock-based page fault handling first Suren Baghdasaryan
@ 2023-01-27 19:41 ` Suren Baghdasaryan
2023-01-27 19:41 ` [PATCH v2 31/33] powerpc/mm: " Suren Baghdasaryan
` (4 subsequent siblings)
34 siblings, 0 replies; 42+ messages in thread
From: Suren Baghdasaryan @ 2023-01-27 19:41 UTC (permalink / raw)
To: akpm
Cc: michel, jglisse, mhocko, vbabka, hannes, mgorman, dave, willy,
liam.howlett, peterz, ldufour, paulmck, mingo, will, luto,
songliubraving, peterx, david, dhowells, hughd, bigeasy,
kent.overstreet, punit.agrawal, lstoakes, peterjung1337,
rientjes, axelrasmussen, joelaf, minchan, rppt, jannh, shakeelb,
tatashin, edumazet, gthelen, gurua, arjunroy, soheil, leewalsh,
posk, linux-mm, linux-arm-kernel, linuxppc-dev, x86,
linux-kernel, kernel-team, surenb
Attempt VMA lock-based page fault handling first, and fall back to the
existing mmap_lock-based handling if that fails.
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
---
arch/arm64/Kconfig | 1 +
arch/arm64/mm/fault.c | 36 ++++++++++++++++++++++++++++++++++++
2 files changed, 37 insertions(+)
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index c5ccca26a408..9f2c0e352da3 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -95,6 +95,7 @@ config ARM64
select ARCH_SUPPORTS_INT128 if CC_HAS_INT128
select ARCH_SUPPORTS_NUMA_BALANCING
select ARCH_SUPPORTS_PAGE_TABLE_CHECK
+ select ARCH_SUPPORTS_PER_VMA_LOCK
select ARCH_WANT_COMPAT_IPC_PARSE_VERSION if COMPAT
select ARCH_WANT_DEFAULT_BPF_JIT
select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index f4cb0f85ccf4..16588dda9258 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -535,6 +535,9 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr,
unsigned long vm_flags;
unsigned int mm_flags = FAULT_FLAG_DEFAULT;
unsigned long addr = untagged_addr(far);
+#ifdef CONFIG_PER_VMA_LOCK
+ struct vm_area_struct *vma;
+#endif
if (kprobe_page_fault(regs, esr))
return 0;
@@ -585,6 +588,36 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr,
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr);
+#ifdef CONFIG_PER_VMA_LOCK
+ if (!(mm_flags & FAULT_FLAG_USER) || atomic_read(&mm->mm_users) == 1)
+ goto lock_mmap;
+
+ vma = lock_vma_under_rcu(mm, addr);
+ if (!vma)
+ goto lock_mmap;
+
+ if (!(vma->vm_flags & vm_flags)) {
+ vma_end_read(vma);
+ goto lock_mmap;
+ }
+ fault = handle_mm_fault(vma, addr & PAGE_MASK,
+ mm_flags | FAULT_FLAG_VMA_LOCK, regs);
+ vma_end_read(vma);
+
+ if (!(fault & VM_FAULT_RETRY)) {
+ count_vm_vma_lock_event(VMA_LOCK_SUCCESS);
+ goto done;
+ }
+ count_vm_vma_lock_event(VMA_LOCK_RETRY);
+
+ /* Quick path to respond to signals */
+ if (fault_signal_pending(fault, regs)) {
+ if (!user_mode(regs))
+ goto no_context;
+ return 0;
+ }
+lock_mmap:
+#endif /* CONFIG_PER_VMA_LOCK */
/*
* As per x86, we may deadlock here. However, since the kernel only
* validly references user space from well defined areas of the code,
@@ -628,6 +661,9 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr,
}
mmap_read_unlock(mm);
+#ifdef CONFIG_PER_VMA_LOCK
+done:
+#endif
/*
* Handle the "normal" (no error) case first.
*/
--
2.39.1
* [PATCH v2 31/33] powerpc/mm: try VMA lock-based page fault handling first
2023-01-27 19:40 [PATCH v2 00/33] Per-VMA locks Suren Baghdasaryan
` (29 preceding siblings ...)
2023-01-27 19:41 ` [PATCH v2 30/33] arm64/mm: " Suren Baghdasaryan
@ 2023-01-27 19:41 ` Suren Baghdasaryan
2023-01-27 19:41 ` [PATCH v2 32/33] mm/mmap: free vm_area_struct without call_rcu in exit_mmap Suren Baghdasaryan
` (3 subsequent siblings)
34 siblings, 0 replies; 42+ messages in thread
From: Suren Baghdasaryan @ 2023-01-27 19:41 UTC (permalink / raw)
To: akpm
Cc: michel, jglisse, mhocko, vbabka, hannes, mgorman, dave, willy,
liam.howlett, peterz, ldufour, paulmck, mingo, will, luto,
songliubraving, peterx, david, dhowells, hughd, bigeasy,
kent.overstreet, punit.agrawal, lstoakes, peterjung1337,
rientjes, axelrasmussen, joelaf, minchan, rppt, jannh, shakeelb,
tatashin, edumazet, gthelen, gurua, arjunroy, soheil, leewalsh,
posk, linux-mm, linux-arm-kernel, linuxppc-dev, x86,
linux-kernel, kernel-team, surenb
From: Laurent Dufour <ldufour@linux.ibm.com>
Attempt VMA lock-based page fault handling first, and fall back to the
existing mmap_lock-based handling if that fails.
Copied from "x86/mm: try VMA lock-based page fault handling first"
Signed-off-by: Laurent Dufour <ldufour@linux.ibm.com>
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
---
arch/powerpc/mm/fault.c | 41 ++++++++++++++++++++++++++
arch/powerpc/platforms/powernv/Kconfig | 1 +
arch/powerpc/platforms/pseries/Kconfig | 1 +
3 files changed, 43 insertions(+)
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 2bef19cc1b98..cab229e75018 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -469,6 +469,44 @@ static int ___do_page_fault(struct pt_regs *regs, unsigned long address,
if (is_exec)
flags |= FAULT_FLAG_INSTRUCTION;
+#ifdef CONFIG_PER_VMA_LOCK
+ if (!(flags & FAULT_FLAG_USER) || atomic_read(&mm->mm_users) == 1)
+ goto lock_mmap;
+
+ vma = lock_vma_under_rcu(mm, address);
+ if (!vma)
+ goto lock_mmap;
+
+ if (unlikely(access_pkey_error(is_write, is_exec,
+ (error_code & DSISR_KEYFAULT), vma))) {
+ int rc = bad_access_pkey(regs, address, vma);
+
+ vma_end_read(vma);
+ return rc;
+ }
+
+ if (unlikely(access_error(is_write, is_exec, vma))) {
+ int rc = bad_access(regs, address);
+
+ vma_end_read(vma);
+ return rc;
+ }
+
+ fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs);
+ vma_end_read(vma);
+
+ if (!(fault & VM_FAULT_RETRY)) {
+ count_vm_vma_lock_event(VMA_LOCK_SUCCESS);
+ goto done;
+ }
+ count_vm_vma_lock_event(VMA_LOCK_RETRY);
+
+ if (fault_signal_pending(fault, regs))
+ return user_mode(regs) ? 0 : SIGBUS;
+
+lock_mmap:
+#endif /* CONFIG_PER_VMA_LOCK */
+
/* When running in the kernel we expect faults to occur only to
* addresses in user space. All other faults represent errors in the
* kernel and should generate an OOPS. Unfortunately, in the case of an
@@ -545,6 +583,9 @@ static int ___do_page_fault(struct pt_regs *regs, unsigned long address,
mmap_read_unlock(current->mm);
+#ifdef CONFIG_PER_VMA_LOCK
+done:
+#endif
if (unlikely(fault & VM_FAULT_ERROR))
return mm_fault_error(regs, address, fault);
diff --git a/arch/powerpc/platforms/powernv/Kconfig b/arch/powerpc/platforms/powernv/Kconfig
index ae248a161b43..70a46acc70d6 100644
--- a/arch/powerpc/platforms/powernv/Kconfig
+++ b/arch/powerpc/platforms/powernv/Kconfig
@@ -16,6 +16,7 @@ config PPC_POWERNV
select PPC_DOORBELL
select MMU_NOTIFIER
select FORCE_SMP
+ select ARCH_SUPPORTS_PER_VMA_LOCK
default y
config OPAL_PRD
diff --git a/arch/powerpc/platforms/pseries/Kconfig b/arch/powerpc/platforms/pseries/Kconfig
index a3b4d99567cb..e036a04ff1ca 100644
--- a/arch/powerpc/platforms/pseries/Kconfig
+++ b/arch/powerpc/platforms/pseries/Kconfig
@@ -21,6 +21,7 @@ config PPC_PSERIES
select HOTPLUG_CPU
select FORCE_SMP
select SWIOTLB
+ select ARCH_SUPPORTS_PER_VMA_LOCK
default y
config PARAVIRT
--
2.39.1
* [PATCH v2 32/33] mm/mmap: free vm_area_struct without call_rcu in exit_mmap
2023-01-27 19:40 [PATCH v2 00/33] Per-VMA locks Suren Baghdasaryan
` (30 preceding siblings ...)
2023-01-27 19:41 ` [PATCH v2 31/33] powerpc/mm: " Suren Baghdasaryan
@ 2023-01-27 19:41 ` Suren Baghdasaryan
2023-01-27 19:41 ` [PATCH v2 33/33] mm: separate vma->lock from vm_area_struct Suren Baghdasaryan
` (2 subsequent siblings)
34 siblings, 0 replies; 42+ messages in thread
From: Suren Baghdasaryan @ 2023-01-27 19:41 UTC (permalink / raw)
To: akpm
Cc: michel, jglisse, mhocko, vbabka, hannes, mgorman, dave, willy,
liam.howlett, peterz, ldufour, paulmck, mingo, will, luto,
songliubraving, peterx, david, dhowells, hughd, bigeasy,
kent.overstreet, punit.agrawal, lstoakes, peterjung1337,
rientjes, axelrasmussen, joelaf, minchan, rppt, jannh, shakeelb,
tatashin, edumazet, gthelen, gurua, arjunroy, soheil, leewalsh,
posk, linux-mm, linux-arm-kernel, linuxppc-dev, x86,
linux-kernel, kernel-team, surenb
call_rcu() can take a long time when callback offloading is enabled.
Its use in vm_area_free() can cause regressions in the exit path when
multiple VMAs are being freed.
Because exit_mmap() is called only after the last mm user drops its
refcount, the page fault handlers can't be racing with it. Any other
possible users, like the oom-reaper or process_mrelease, are already
synchronized using mmap_lock. Therefore exit_mmap() can free VMAs
directly, without the use of call_rcu().
Expose __vm_area_free() and use it from exit_mmap() to avoid possible
call_rcu() floods and the performance regressions caused by them.
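For context, with CONFIG_PER_VMA_LOCK the regular free path introduced
earlier in the series defers freeing to RCU, which is what exit_mmap()
now bypasses (shown here in simplified form):

	void vm_area_free(struct vm_area_struct *vma)
	{
	#ifdef CONFIG_PER_VMA_LOCK
		call_rcu(&vma->vm_rcu, vm_area_free_rcu_cb);
	#else
		__vm_area_free(vma);
	#endif
	}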
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
---
include/linux/mm.h | 2 ++
kernel/fork.c | 2 +-
mm/mmap.c | 11 +++++++----
3 files changed, 10 insertions(+), 5 deletions(-)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index a6de58bb40c7..1c4ddcd6fd84 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -250,6 +250,8 @@ void setup_initial_init_mm(void *start_code, void *end_code,
struct vm_area_struct *vm_area_alloc(struct mm_struct *);
struct vm_area_struct *vm_area_dup(struct vm_area_struct *);
void vm_area_free(struct vm_area_struct *);
+/* Use only if VMA has no other users */
+void __vm_area_free(struct vm_area_struct *vma);
#ifndef CONFIG_MMU
extern struct rb_root nommu_region_tree;
diff --git a/kernel/fork.c b/kernel/fork.c
index a08cc0e2bfde..d0999de82f94 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -480,7 +480,7 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
return new;
}
-static void __vm_area_free(struct vm_area_struct *vma)
+void __vm_area_free(struct vm_area_struct *vma)
{
free_anon_vma_name(vma);
kmem_cache_free(vm_area_cachep, vma);
diff --git a/mm/mmap.c b/mm/mmap.c
index 3d0cfbc92745..1028fe131bb7 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -133,7 +133,7 @@ void unlink_file_vma(struct vm_area_struct *vma)
/*
* Close a vm structure and free it.
*/
-static void remove_vma(struct vm_area_struct *vma)
+static void remove_vma(struct vm_area_struct *vma, bool unreachable)
{
might_sleep();
if (vma->vm_ops && vma->vm_ops->close)
@@ -141,7 +141,10 @@ static void remove_vma(struct vm_area_struct *vma)
if (vma->vm_file)
fput(vma->vm_file);
mpol_put(vma_policy(vma));
- vm_area_free(vma);
+ if (unreachable)
+ __vm_area_free(vma);
+ else
+ vm_area_free(vma);
}
static inline struct vm_area_struct *vma_prev_limit(struct vma_iterator *vmi,
@@ -2134,7 +2137,7 @@ static inline void remove_mt(struct mm_struct *mm, struct ma_state *mas)
if (vma->vm_flags & VM_ACCOUNT)
nr_accounted += nrpages;
vm_stat_account(mm, vma->vm_flags, -nrpages);
- remove_vma(vma);
+ remove_vma(vma, false);
}
vm_unacct_memory(nr_accounted);
validate_mm(mm);
@@ -3083,7 +3086,7 @@ void exit_mmap(struct mm_struct *mm)
do {
if (vma->vm_flags & VM_ACCOUNT)
nr_accounted += vma_pages(vma);
- remove_vma(vma);
+ remove_vma(vma, true);
count++;
cond_resched();
} while ((vma = mas_find(&mas, ULONG_MAX)) != NULL);
--
2.39.1
* [PATCH v2 33/33] mm: separate vma->lock from vm_area_struct
2023-01-27 19:40 [PATCH v2 00/33] Per-VMA locks Suren Baghdasaryan
` (31 preceding siblings ...)
2023-01-27 19:41 ` [PATCH v2 32/33] mm/mmap: free vm_area_struct without call_rcu in exit_mmap Suren Baghdasaryan
@ 2023-01-27 19:41 ` Suren Baghdasaryan
2023-01-27 22:51 ` [PATCH v2 00/33] Per-VMA locks Andrew Morton
2023-02-15 17:32 ` [External] " Punit Agrawal
34 siblings, 0 replies; 42+ messages in thread
From: Suren Baghdasaryan @ 2023-01-27 19:41 UTC (permalink / raw)
To: akpm
Cc: michel, jglisse, mhocko, vbabka, hannes, mgorman, dave, willy,
liam.howlett, peterz, ldufour, paulmck, mingo, will, luto,
songliubraving, peterx, david, dhowells, hughd, bigeasy,
kent.overstreet, punit.agrawal, lstoakes, peterjung1337,
rientjes, axelrasmussen, joelaf, minchan, rppt, jannh, shakeelb,
tatashin, edumazet, gthelen, gurua, arjunroy, soheil, leewalsh,
posk, linux-mm, linux-arm-kernel, linuxppc-dev, x86,
linux-kernel, kernel-team, surenb
vma->lock being part of the vm_area_struct causes a performance
regression during page faults because, under contention, its count and
owner fields are constantly updated, and having other parts of
vm_area_struct that are used during page fault handling next to them
causes constant cache line bouncing. Fix that by moving the lock
outside of the vm_area_struct.
All attempts to keep vma->lock inside vm_area_struct in a separate
cache line still produce a performance regression, especially on NUMA
machines. The smallest regression was achieved when the lock was placed
in the fourth cache line, but that bloats vm_area_struct to 256 bytes.
Considering the performance and memory impact, a separate lock looks
like the best option. It increases the memory footprint of each VMA but
that can be optimized later if the new size causes issues.
Note that after this change vma_init() does not allocate or initialize
vma->lock anymore. A number of drivers allocate a pseudo VMA on the
stack, but they never use the VMA's lock, so it does not need to be
allocated. Future drivers which might need the VMA lock should use
vm_area_alloc()/vm_area_free() to allocate the VMA.
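After this change the allocation/free pairing is roughly (sketch only):

	vma = vm_area_alloc(mm);	/* allocates vma and vma->vm_lock */
	...
	vm_area_free(vma);		/* RCU-delayed; __vm_area_free() also
					   frees vma->vm_lock */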
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
---
include/linux/mm.h | 23 ++++++-------
include/linux/mm_types.h | 6 +++-
kernel/fork.c | 73 ++++++++++++++++++++++++++++++++--------
3 files changed, 74 insertions(+), 28 deletions(-)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 1c4ddcd6fd84..52e048c31239 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -621,12 +621,6 @@ struct vm_operations_struct {
};
#ifdef CONFIG_PER_VMA_LOCK
-static inline void vma_init_lock(struct vm_area_struct *vma)
-{
- init_rwsem(&vma->lock);
- vma->vm_lock_seq = -1;
-}
-
/*
* Try to read-lock a vma. The function is allowed to occasionally yield false
* locked result to avoid performance overhead, in which case we fall back to
@@ -638,17 +632,17 @@ static inline bool vma_start_read(struct vm_area_struct *vma)
if (vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq))
return false;
- if (unlikely(down_read_trylock(&vma->lock) == 0))
+ if (unlikely(down_read_trylock(&vma->vm_lock->lock) == 0))
return false;
/*
* Overflow might produce false locked result.
* False unlocked result is impossible because we modify and check
- * vma->vm_lock_seq under vma->lock protection and mm->mm_lock_seq
+ * vma->vm_lock_seq under vma->vm_lock protection and mm->mm_lock_seq
* modification invalidates all existing locks.
*/
if (unlikely(vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq))) {
- up_read(&vma->lock);
+ up_read(&vma->vm_lock->lock);
return false;
}
return true;
@@ -657,7 +651,7 @@ static inline bool vma_start_read(struct vm_area_struct *vma)
static inline void vma_end_read(struct vm_area_struct *vma)
{
rcu_read_lock(); /* keeps vma alive till the end of up_read */
- up_read(&vma->lock);
+ up_read(&vma->vm_lock->lock);
rcu_read_unlock();
}
@@ -675,9 +669,9 @@ static inline void vma_start_write(struct vm_area_struct *vma)
if (vma->vm_lock_seq == mm_lock_seq)
return;
- down_write(&vma->lock);
+ down_write(&vma->vm_lock->lock);
vma->vm_lock_seq = mm_lock_seq;
- up_write(&vma->lock);
+ up_write(&vma->vm_lock->lock);
}
static inline void vma_assert_write_locked(struct vm_area_struct *vma)
@@ -704,6 +698,10 @@ static inline void vma_assert_write_locked(struct vm_area_struct *vma) {}
#endif /* CONFIG_PER_VMA_LOCK */
+/*
+ * WARNING: vma_init does not initialize vma->vm_lock.
+ * Use vm_area_alloc()/vm_area_free() if vma needs locking.
+ */
static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
{
static const struct vm_operations_struct dummy_vm_ops = {};
@@ -712,7 +710,6 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
vma->vm_mm = mm;
vma->vm_ops = &dummy_vm_ops;
INIT_LIST_HEAD(&vma->anon_vma_chain);
- vma_init_lock(vma);
}
/* Use when VMA is not part of the VMA tree and needs no locking */
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index c4c43f10344a..1e97bb98197c 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -471,6 +471,10 @@ struct anon_vma_name {
char name[];
};
+struct vma_lock {
+ struct rw_semaphore lock;
+};
+
/*
* This struct describes a virtual memory area. There is one of these
* per VM-area/task. A VM area is any part of the process virtual memory
@@ -510,7 +514,7 @@ struct vm_area_struct {
#ifdef CONFIG_PER_VMA_LOCK
int vm_lock_seq;
- struct rw_semaphore lock;
+ struct vma_lock *vm_lock;
#endif
/*
diff --git a/kernel/fork.c b/kernel/fork.c
index d0999de82f94..a152804faa14 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -451,13 +451,49 @@ static struct kmem_cache *vm_area_cachep;
/* SLAB cache for mm_struct structures (tsk->mm) */
static struct kmem_cache *mm_cachep;
+#ifdef CONFIG_PER_VMA_LOCK
+
+/* SLAB cache for vm_area_struct.lock */
+static struct kmem_cache *vma_lock_cachep;
+
+static bool vma_lock_alloc(struct vm_area_struct *vma)
+{
+ vma->vm_lock = kmem_cache_alloc(vma_lock_cachep, GFP_KERNEL);
+ if (!vma->vm_lock)
+ return false;
+
+ init_rwsem(&vma->vm_lock->lock);
+ vma->vm_lock_seq = -1;
+
+ return true;
+}
+
+static inline void vma_lock_free(struct vm_area_struct *vma)
+{
+ kmem_cache_free(vma_lock_cachep, vma->vm_lock);
+}
+
+#else /* CONFIG_PER_VMA_LOCK */
+
+static inline bool vma_lock_alloc(struct vm_area_struct *vma) { return true; }
+static inline void vma_lock_free(struct vm_area_struct *vma) {}
+
+#endif /* CONFIG_PER_VMA_LOCK */
+
struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
{
struct vm_area_struct *vma;
vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
- if (vma)
- vma_init(vma, mm);
+ if (!vma)
+ return NULL;
+
+ vma_init(vma, mm);
+ if (!vma_lock_alloc(vma)) {
+ kmem_cache_free(vm_area_cachep, vma);
+ return NULL;
+ }
+
return vma;
}
@@ -465,24 +501,30 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
{
struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
- if (new) {
- ASSERT_EXCLUSIVE_WRITER(orig->vm_flags);
- ASSERT_EXCLUSIVE_WRITER(orig->vm_file);
- /*
- * orig->shared.rb may be modified concurrently, but the clone
- * will be reinitialized.
- */
- data_race(memcpy(new, orig, sizeof(*new)));
- INIT_LIST_HEAD(&new->anon_vma_chain);
- vma_init_lock(new);
- dup_anon_vma_name(orig, new);
+ if (!new)
+ return NULL;
+
+ ASSERT_EXCLUSIVE_WRITER(orig->vm_flags);
+ ASSERT_EXCLUSIVE_WRITER(orig->vm_file);
+ /*
+ * orig->shared.rb may be modified concurrently, but the clone
+ * will be reinitialized.
+ */
+ data_race(memcpy(new, orig, sizeof(*new)));
+ if (!vma_lock_alloc(new)) {
+ kmem_cache_free(vm_area_cachep, new);
+ return NULL;
}
+ INIT_LIST_HEAD(&new->anon_vma_chain);
+ dup_anon_vma_name(orig, new);
+
return new;
}
void __vm_area_free(struct vm_area_struct *vma)
{
free_anon_vma_name(vma);
+ vma_lock_free(vma);
kmem_cache_free(vm_area_cachep, vma);
}
@@ -493,7 +535,7 @@ static void vm_area_free_rcu_cb(struct rcu_head *head)
vm_rcu);
/* The vma should not be locked while being destroyed. */
- VM_BUG_ON_VMA(rwsem_is_locked(&vma->lock), vma);
+ VM_BUG_ON_VMA(rwsem_is_locked(&vma->vm_lock->lock), vma);
__vm_area_free(vma);
}
#endif
@@ -3089,6 +3131,9 @@ void __init proc_caches_init(void)
NULL);
vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT);
+#ifdef CONFIG_PER_VMA_LOCK
+ vma_lock_cachep = KMEM_CACHE(vma_lock, SLAB_PANIC|SLAB_ACCOUNT);
+#endif
mmap_init();
nsproxy_cache_init();
}
--
2.39.1
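For readers skimming the hunks, the locking protocol this patch leaves in
place condenses to the sketch below. It is assembled from the diff above
rather than copied verbatim: error paths are elided and, for brevity, the
mm_lock_seq value is passed to vma_start_write() as a parameter instead of
being derived from vma->vm_mm under mmap_lock as the real code does.

struct vma_lock {
	struct rw_semaphore lock;	/* now separately allocated, one per VMA */
};

/* Page-fault (reader) side: trylock only, so a fault never sleeps
 * waiting for a writer -- it falls back to the mmap_lock path instead.
 */
static inline bool vma_start_read(struct vm_area_struct *vma)
{
	if (unlikely(down_read_trylock(&vma->vm_lock->lock) == 0))
		return false;

	/* Matching sequence numbers mean the VMA was write-locked after
	 * we found it; back off and use the mmap_lock path.
	 */
	if (unlikely(vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq))) {
		up_read(&vma->vm_lock->lock);
		return false;
	}
	return true;
}

/* Writer side, called with mmap_lock held for write. The rwsem is
 * released as soon as the sequence number is set: the
 * vm_lock_seq == mm_lock_seq check above, not the rwsem, keeps readers
 * out until mmap_lock is dropped, which bumps mm_lock_seq and thereby
 * "unlocks" every write-marked VMA at once.
 */
static inline void vma_start_write(struct vm_area_struct *vma, int mm_lock_seq)
{
	if (vma->vm_lock_seq == mm_lock_seq)	/* already marked this cycle */
		return;

	down_write(&vma->vm_lock->lock);	/* drain existing readers */
	vma->vm_lock_seq = mm_lock_seq;
	up_write(&vma->vm_lock->lock);
}

Note the asymmetry this buys: page faults never block behind an
address-space writer, and write-locking a VMA is a short critical section
whose effect is undone in O(1) for all VMAs when mmap_lock is released.
Elsewhere in the series, a lock_vma_under_rcu() helper combines the RCU
VMA lookup with vma_start_read(); it is not shown in this patch.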
* Re: [PATCH v2 00/33] Per-VMA locks
2023-01-27 19:40 [PATCH v2 00/33] Per-VMA locks Suren Baghdasaryan
` (32 preceding siblings ...)
2023-01-27 19:41 ` [PATCH v2 33/33] mm: separate vma->lock from vm_area_struct Suren Baghdasaryan
@ 2023-01-27 22:51 ` Andrew Morton
2023-01-27 23:26 ` Matthew Wilcox
2023-02-15 17:32 ` [External] " Punit Agrawal
34 siblings, 1 reply; 42+ messages in thread
From: Andrew Morton @ 2023-01-27 22:51 UTC (permalink / raw)
To: Suren Baghdasaryan
Cc: michel, jglisse, mhocko, vbabka, hannes, mgorman, dave, willy,
liam.howlett, peterz, ldufour, paulmck, mingo, will, luto,
songliubraving, peterx, david, dhowells, hughd, bigeasy,
kent.overstreet, punit.agrawal, lstoakes, peterjung1337,
rientjes, axelrasmussen, joelaf, minchan, rppt, jannh, shakeelb,
tatashin, edumazet, gthelen, gurua, arjunroy, soheil, leewalsh,
posk, linux-mm, linux-arm-kernel, linuxppc-dev, x86,
linux-kernel, kernel-team
On Fri, 27 Jan 2023 11:40:37 -0800 Suren Baghdasaryan <surenb@google.com> wrote:
> Per-vma locks idea that was discussed during SPF [1] discussion at LSF/MM
> last year [2], which concluded with suggestion that “a reader/writer
> semaphore could be put into the VMA itself; that would have the effect of
> using the VMA as a sort of range lock. There would still be contention at
> the VMA level, but it would be an improvement.” This patchset implements
> this suggested approach.
I think I'll await reviewer/tester input for a while.
> The patchset implements per-VMA locking only for anonymous pages which
> are not in swap and avoids userfaultfd, as its implementation is more
> complex. Additional support for file-backed page faults, swapped and user
> pages can be added incrementally.
This is a significant risk. How can we be confident that these as yet
unimplemented parts are implementable and that the result will be good?
* Re: [PATCH v2 00/33] Per-VMA locks
2023-01-27 22:51 ` [PATCH v2 00/33] Per-VMA locks Andrew Morton
@ 2023-01-27 23:26 ` Matthew Wilcox
2023-01-28 0:00 ` Suren Baghdasaryan
0 siblings, 1 reply; 42+ messages in thread
From: Matthew Wilcox @ 2023-01-27 23:26 UTC (permalink / raw)
To: Andrew Morton
Cc: Suren Baghdasaryan, michel, jglisse, mhocko, vbabka, hannes,
mgorman, dave, liam.howlett, peterz, ldufour, paulmck, mingo,
will, luto, songliubraving, peterx, david, dhowells, hughd,
bigeasy, kent.overstreet, punit.agrawal, lstoakes, peterjung1337,
rientjes, axelrasmussen, joelaf, minchan, rppt, jannh, shakeelb,
tatashin, edumazet, gthelen, gurua, arjunroy, soheil, leewalsh,
posk, linux-mm, linux-arm-kernel, linuxppc-dev, x86,
linux-kernel, kernel-team
On Fri, Jan 27, 2023 at 02:51:38PM -0800, Andrew Morton wrote:
> On Fri, 27 Jan 2023 11:40:37 -0800 Suren Baghdasaryan <surenb@google.com> wrote:
>
> > Per-vma locks idea that was discussed during SPF [1] discussion at LSF/MM
> > last year [2], which concluded with suggestion that “a reader/writer
> > semaphore could be put into the VMA itself; that would have the effect of
> > using the VMA as a sort of range lock. There would still be contention at
> > the VMA level, but it would be an improvement.” This patchset implements
> > this suggested approach.
>
> I think I'll await reviewer/tester input for a while.
>
> > The patchset implements per-VMA locking only for anonymous pages which
> > are not in swap and avoids userfaultfd, as its implementation is more
> > complex. Additional support for file-backed page faults, swapped and user
> > pages can be added incrementally.
>
> This is a significant risk. How can we be confident that these as yet
> unimplemented parts are implementable and that the result will be good?
They don't need to be implementable for this patchset to be evaluated
on its own terms. This patchset improves scalability for anon pages
without making file/swap/uffd pages worse (or if it does, I haven't
seen the benchmarks to prove it).
That said, I'm confident that I have a good handle on how to make
file-backed page faults work under RCU.
* Re: [PATCH v2 00/33] Per-VMA locks
2023-01-27 23:26 ` Matthew Wilcox
@ 2023-01-28 0:00 ` Suren Baghdasaryan
2023-02-14 16:47 ` Suren Baghdasaryan
0 siblings, 1 reply; 42+ messages in thread
From: Suren Baghdasaryan @ 2023-01-28 0:00 UTC (permalink / raw)
To: Matthew Wilcox
Cc: Andrew Morton, michel, jglisse, mhocko, vbabka, hannes, mgorman,
dave, liam.howlett, peterz, ldufour, paulmck, mingo, will, luto,
songliubraving, peterx, david, dhowells, hughd, bigeasy,
kent.overstreet, punit.agrawal, lstoakes, peterjung1337,
rientjes, axelrasmussen, joelaf, minchan, rppt, jannh, shakeelb,
tatashin, edumazet, gthelen, gurua, arjunroy, soheil, leewalsh,
posk, linux-mm, linux-arm-kernel, linuxppc-dev, x86,
linux-kernel, kernel-team
On Fri, Jan 27, 2023 at 3:26 PM Matthew Wilcox <willy@infradead.org> wrote:
>
> On Fri, Jan 27, 2023 at 02:51:38PM -0800, Andrew Morton wrote:
> > On Fri, 27 Jan 2023 11:40:37 -0800 Suren Baghdasaryan <surenb@google.com> wrote:
> >
> > > Per-vma locks idea that was discussed during SPF [1] discussion at LSF/MM
> > > last year [2], which concluded with suggestion that “a reader/writer
> > > semaphore could be put into the VMA itself; that would have the effect of
> > > using the VMA as a sort of range lock. There would still be contention at
> > > the VMA level, but it would be an improvement.” This patchset implements
> > > this suggested approach.
> >
> > I think I'll await reviewer/tester input for a while.
Sure, I don't expect the review to be very quick considering the
complexity; however, I would appreciate any testing that can be done.
> >
> > > The patchset implements per-VMA locking only for anonymous pages which
> > > are not in swap and avoids userfaultfd, as its implementation is more
> > > complex. Additional support for file-backed page faults, swapped and user
> > > pages can be added incrementally.
> >
> > This is a significant risk. How can we be confident that these as yet
> > unimplemented parts are implementable and that the result will be good?
>
> They don't need to be implementable for this patchset to be evaluated
> on its own terms. This patchset improves scalability for anon pages
> without making file/swap/uffd pages worse (or if it does, I haven't
> seen the benchmarks to prove it).
Making it work for all kinds of page faults would require much more
time, so this incremental approach, in which we tackle the mmap_lock
scalability problem part by part, seems more doable. Even with
anonymous-only support, the patchset shows considerable improvements.
Therefore I would argue that it is viable even if it does not
support the above-mentioned cases.
>
> That said, I'm confident that I have a good handle on how to make
> file-backed page faults work under RCU.
Looking forward to collaborating on that!
Thanks,
Suren.
* Re: [PATCH v2 00/33] Per-VMA locks
2023-01-28 0:00 ` Suren Baghdasaryan
@ 2023-02-14 16:47 ` Suren Baghdasaryan
0 siblings, 0 replies; 42+ messages in thread
From: Suren Baghdasaryan @ 2023-02-14 16:47 UTC (permalink / raw)
To: Matthew Wilcox
Cc: Andrew Morton, michel, jglisse, mhocko, vbabka, hannes, mgorman,
dave, liam.howlett, peterz, ldufour, paulmck, mingo, will, luto,
songliubraving, peterx, david, dhowells, hughd, bigeasy,
kent.overstreet, punit.agrawal, lstoakes, peterjung1337,
rientjes, axelrasmussen, joelaf, minchan, rppt, jannh, shakeelb,
tatashin, edumazet, gthelen, gurua, arjunroy, soheil, leewalsh,
posk, linux-mm, linux-arm-kernel, linuxppc-dev, x86,
linux-kernel, kernel-team
On Fri, Jan 27, 2023 at 4:00 PM Suren Baghdasaryan <surenb@google.com> wrote:
>
> On Fri, Jan 27, 2023 at 3:26 PM Matthew Wilcox <willy@infradead.org> wrote:
> >
> > On Fri, Jan 27, 2023 at 02:51:38PM -0800, Andrew Morton wrote:
> > > On Fri, 27 Jan 2023 11:40:37 -0800 Suren Baghdasaryan <surenb@google.com> wrote:
> > >
> > > > Per-vma locks idea that was discussed during SPF [1] discussion at LSF/MM
> > > > last year [2], which concluded with suggestion that “a reader/writer
> > > > semaphore could be put into the VMA itself; that would have the effect of
> > > > using the VMA as a sort of range lock. There would still be contention at
> > > > the VMA level, but it would be an improvement.” This patchset implements
> > > > this suggested approach.
> > >
> > > I think I'll await reviewer/tester input for a while.
Over the last two weeks I did not receive any feedback on the mailing
list, but off-list a couple of people reported positive results in
their tests, and Punit reported a regression on his NUMA machine when
running the pft-threads workload. I found the source of that regression
and have two small fixes which were confirmed to improve the
performance (hopefully Punit will share the results here).
I'm planning to post v3 sometime this week. If anyone has additional
feedback, please let me know soon so that I can address it in v3.
Thanks,
Suren.
>
> Sure, I don't expect the review to be very quick considering the
> complexity; however, I would appreciate any testing that can be done.
>
> > >
> > > > The patchset implements per-VMA locking only for anonymous pages which
> > > > are not in swap and avoids userfaultfd, as its implementation is more
> > > > complex. Additional support for file-backed page faults, swapped and user
> > > > pages can be added incrementally.
> > >
> > > This is a significant risk. How can we be confident that these as yet
> > > unimplemented parts are implementable and that the result will be good?
> >
> > They don't need to be implementable for this patchset to be evaluated
> > on its own terms. This patchset improves scalability for anon pages
> > without making file/swap/uffd pages worse (or if it does, I haven't
> > seen the benchmarks to prove it).
>
> Making it work for all kinds of page faults would require much more
> time, so this incremental approach, in which we tackle the mmap_lock
> scalability problem part by part, seems more doable. Even with
> anonymous-only support, the patchset shows considerable improvements.
> Therefore I would argue that it is viable even if it does not
> support the above-mentioned cases.
>
> >
> > That said, I'm confident that I have a good handle on how to make
> > file-backed page faults work under RCU.
>
> Looking forward to collaborating on that!
> Thanks,
> Suren.
* Re: [External] [PATCH v2 00/33] Per-VMA locks
2023-01-27 19:40 [PATCH v2 00/33] Per-VMA locks Suren Baghdasaryan
` (33 preceding siblings ...)
2023-01-27 22:51 ` [PATCH v2 00/33] Per-VMA locks Andrew Morton
@ 2023-02-15 17:32 ` Punit Agrawal
2023-02-15 17:39 ` Suren Baghdasaryan
2023-02-28 12:06 ` Punit Agrawal
34 siblings, 2 replies; 42+ messages in thread
From: Punit Agrawal @ 2023-02-15 17:32 UTC (permalink / raw)
To: Suren Baghdasaryan
Cc: akpm, michel, jglisse, mhocko, vbabka, hannes, mgorman, dave,
willy, liam.howlett, peterz, ldufour, paulmck, mingo, will, luto,
songliubraving, peterx, david, dhowells, hughd, bigeasy,
kent.overstreet, punit.agrawal, lstoakes, peterjung1337,
rientjes, axelrasmussen, joelaf, minchan, rppt, jannh, shakeelb,
tatashin, edumazet, gthelen, gurua, arjunroy, soheil, leewalsh,
posk, linux-mm, linux-arm-kernel, linuxppc-dev, x86,
linux-kernel, kernel-team
Suren Baghdasaryan <surenb@google.com> writes:
> Previous version:
> v1: https://lore.kernel.org/all/20230109205336.3665937-1-surenb@google.com/
> RFC: https://lore.kernel.org/all/20220901173516.702122-1-surenb@google.com/
>
> LWN article describing the feature:
> https://lwn.net/Articles/906852/
>
> Per-vma locks idea that was discussed during SPF [1] discussion at LSF/MM
> last year [2], which concluded with suggestion that “a reader/writer
> semaphore could be put into the VMA itself; that would have the effect of
> using the VMA as a sort of range lock. There would still be contention at
> the VMA level, but it would be an improvement.” This patchset implements
> this suggested approach.
I took the patches for a spin on a 2-socket, 32-core (64-thread) system
with Intel 8336C (Ice Lake) CPUs and 512GB of RAM.

For the initial testing, "pft-threads" from the mm-tests suite[0] was
used. The test mmaps a memory region (~100GB on the test system) and
triggers accesses from a number of threads executing in parallel. For
each degree of parallelism, the test is repeated 10 times to get a
better feel for the behaviour. Below is an excerpt of the harmonic
means reported by the 'compare-kernels' script[1] included with mm-tests.

The first column shows results for mm-unstable as of 2023-02-10, the
second column the patches posted here, while the third column includes
optimizations to reclaim some of the observed regression.

From the results, there is a drop in page faults/second at low CPU
counts but a good improvement at higher CPU counts.
6.2.0-rc4 6.2.0-rc4 6.2.0-rc4
mm-unstable-20230210 pvl-v2 pvl-v2+opt
Hmean faults/cpu-1 898792.9338 ( 0.00%) 894597.0474 * -0.47%* 895933.2782 * -0.32%*
Hmean faults/cpu-4 751903.9803 ( 0.00%) 677764.2975 * -9.86%* 688643.8163 * -8.41%*
Hmean faults/cpu-7 612275.5663 ( 0.00%) 565363.4137 * -7.66%* 597538.9396 * -2.41%*
Hmean faults/cpu-12 434460.9074 ( 0.00%) 410974.2708 * -5.41%* 452501.4290 * 4.15%*
Hmean faults/cpu-21 291475.5165 ( 0.00%) 293936.8460 ( 0.84%) 308712.2434 * 5.91%*
Hmean faults/cpu-30 218021.3980 ( 0.00%) 228265.0559 * 4.70%* 241897.5225 * 10.95%*
Hmean faults/cpu-48 141798.5030 ( 0.00%) 162322.5972 * 14.47%* 166081.9459 * 17.13%*
Hmean faults/cpu-79 90060.9577 ( 0.00%) 107028.7779 * 18.84%* 109810.4488 * 21.93%*
Hmean faults/cpu-110 64729.3561 ( 0.00%) 80597.7246 * 24.51%* 83134.0679 * 28.43%*
Hmean faults/cpu-128 55740.1334 ( 0.00%) 68395.4426 * 22.70%* 69248.2836 * 24.23%*
Hmean faults/sec-1 898781.7694 ( 0.00%) 894247.3174 * -0.50%* 894440.3118 * -0.48%*
Hmean faults/sec-4 2965588.9697 ( 0.00%) 2683651.5664 * -9.51%* 2726450.9710 * -8.06%*
Hmean faults/sec-7 4144512.3996 ( 0.00%) 3891644.2128 * -6.10%* 4099918.8601 ( -1.08%)
Hmean faults/sec-12 4969513.6934 ( 0.00%) 4829731.4355 * -2.81%* 5264682.7371 * 5.94%*
Hmean faults/sec-21 5814379.4789 ( 0.00%) 5941405.3116 * 2.18%* 6263716.3903 * 7.73%*
Hmean faults/sec-30 6153685.3709 ( 0.00%) 6489311.6634 * 5.45%* 6910843.5858 * 12.30%*
Hmean faults/sec-48 6197953.1327 ( 0.00%) 7216320.7727 * 16.43%* 7412782.2927 * 19.60%*
Hmean faults/sec-79 6167135.3738 ( 0.00%) 7425927.1022 * 20.41%* 7637042.2198 * 23.83%*
Hmean faults/sec-110 6264768.2247 ( 0.00%) 7813329.3863 * 24.72%* 7984344.4005 * 27.45%*
Hmean faults/sec-128 6460727.8216 ( 0.00%) 7875664.8999 * 21.90%* 8049910.3601 * 24.60%*
[0] https://github.com/gormanm/mmtests
[1] https://github.com/gormanm/mmtests/blob/master/compare-kernels.sh
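To make the fault pattern concrete, what pft-threads exercises boils down
to the sketch below -- a deliberately simplified stand-in, not the mm-tests
source: one large anonymous mapping (hence a single VMA) whose pages are
first-touched by many threads in parallel. Build with gcc -O2 -pthread.

#include <pthread.h>
#include <sys/mman.h>
#include <unistd.h>

#define NTHREADS 8
static char *region;
static size_t chunk;

static void *toucher(void *arg)
{
	char *p = region + (long)arg * chunk;
	long pagesz = sysconf(_SC_PAGESIZE);

	/* The first write to each page takes an anonymous page fault;
	 * with a single VMA, every thread contends on the same VMA lock.
	 */
	for (size_t off = 0; off < chunk; off += pagesz)
		p[off] = 1;
	return NULL;
}

int main(void)
{
	pthread_t tid[NTHREADS];
	size_t len = (size_t)1 << 30;	/* 1 GiB here; pft uses ~100 GB */
	long i;

	region = mmap(NULL, len, PROT_READ | PROT_WRITE,
		      MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (region == MAP_FAILED)
		return 1;
	chunk = len / NTHREADS;

	for (i = 0; i < NTHREADS; i++)
		pthread_create(&tid[i], NULL, toucher, (void *)i);
	for (i = 0; i < NTHREADS; i++)
		pthread_join(tid[i], NULL);
	return 0;
}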
* Re: [External] [PATCH v2 00/33] Per-VMA locks
2023-02-15 17:32 ` [External] " Punit Agrawal
@ 2023-02-15 17:39 ` Suren Baghdasaryan
2023-02-28 12:06 ` Punit Agrawal
1 sibling, 0 replies; 42+ messages in thread
From: Suren Baghdasaryan @ 2023-02-15 17:39 UTC (permalink / raw)
To: Punit Agrawal
Cc: akpm, michel, jglisse, mhocko, vbabka, hannes, mgorman, dave,
willy, liam.howlett, peterz, ldufour, paulmck, mingo, will, luto,
songliubraving, peterx, david, dhowells, hughd, bigeasy,
kent.overstreet, lstoakes, peterjung1337, rientjes,
axelrasmussen, joelaf, minchan, rppt, jannh, shakeelb, tatashin,
edumazet, gthelen, gurua, arjunroy, soheil, leewalsh, posk,
linux-mm, linux-arm-kernel, linuxppc-dev, x86, linux-kernel,
kernel-team
On Wed, Feb 15, 2023 at 9:33 AM Punit Agrawal
<punit.agrawal@bytedance.com> wrote:
>
> Suren Baghdasaryan <surenb@google.com> writes:
>
> > [... cover letter and links trimmed ...]
>
> I took the patches for a spin on a 2-socket, 32-core (64-thread) system
> with Intel 8336C (Ice Lake) CPUs and 512GB of RAM.
>
> For the initial testing, "pft-threads" from the mm-tests suite[0] was
> used. The test mmaps a memory region (~100GB on the test system) and
> triggers accesses from a number of threads executing in parallel. For
> each degree of parallelism, the test is repeated 10 times to get a
> better feel for the behaviour. Below is an excerpt of the harmonic
> means reported by the 'compare-kernels' script[1] included with mm-tests.
>
> The first column shows results for mm-unstable as of 2023-02-10, the
> second column the patches posted here, while the third column includes
> optimizations to reclaim some of the observed regression.
>
> From the results, there is a drop in page faults/second at low CPU
> counts but a good improvement at higher CPU counts.
>
> [... pft-threads results table trimmed; see Punit's message above ...]
Thanks for summarizing the findings, Punit! It looks like the latest
fixes I sent you for testing (pvl-v2+opt) bring the regression down
quite a bit. The faults/sec-4 case is still regressing, but the rest
look quite good. I'll incorporate those fixes and post v3 shortly. Thanks!
>
> [0] https://github.com/gormanm/mmtests
> [1] https://github.com/gormanm/mmtests/blob/master/compare-kernels.sh
* Re: [PATCH v2 00/33] Per-VMA locks
2023-02-15 17:32 ` [External] " Punit Agrawal
2023-02-15 17:39 ` Suren Baghdasaryan
@ 2023-02-28 12:06 ` Punit Agrawal
2023-02-28 18:08 ` Suren Baghdasaryan
1 sibling, 1 reply; 42+ messages in thread
From: Punit Agrawal @ 2023-02-28 12:06 UTC (permalink / raw)
To: Punit Agrawal
Cc: Suren Baghdasaryan, akpm, michel, jglisse, mhocko, vbabka,
hannes, mgorman, dave, willy, liam.howlett, peterz, ldufour,
paulmck, mingo, will, luto, songliubraving, peterx, david,
dhowells, hughd, bigeasy, kent.overstreet, lstoakes,
peterjung1337, rientjes, axelrasmussen, joelaf, minchan, rppt,
jannh, shakeelb, tatashin, edumazet, gthelen, gurua, arjunroy,
soheil, leewalsh, posk, linux-mm, linux-arm-kernel, linuxppc-dev,
x86, linux-kernel, kernel-team
Punit Agrawal <punit.agrawal@bytedance.com> writes:
> Suren Baghdasaryan <surenb@google.com> writes:
>
>> [... cover letter and links trimmed ...]
>
> I took the patches for a spin on a 2-socket, 32-core (64-thread) system
> with Intel 8336C (Ice Lake) CPUs and 512GB of RAM.
>
> For the initial testing, "pft-threads" from the mm-tests suite[0] was
> used. The test mmaps a memory region (~100GB on the test system) and
> triggers accesses from a number of threads executing in parallel. For
> each degree of parallelism, the test is repeated 10 times to get a
> better feel for the behaviour. Below is an excerpt of the harmonic
> means reported by the 'compare-kernels' script[1] included with mm-tests.
>
> The first column shows results for mm-unstable as of 2023-02-10, the
> second column the patches posted here, while the third column includes
> optimizations to reclaim some of the observed regression.
>
> From the results, there is a drop in page faults/second at low CPU
> counts but a good improvement at higher CPU counts.
>
> [... single-VMA pft-threads results table trimmed; see the earlier message ...]
The above workload represents the worst case with regard to per-VMA
locks, as it creates a single large VMA. As a follow-up, I modified
pft[2] to create a VMA per thread, to understand the behaviour in
scenarios where per-VMA locks should show the most benefit.
6.2.0-rc4 6.2.0-rc4 6.2.0-rc4
mm-unstable-20230210 pvl-v2 pvl-v2+opt
Hmean faults/cpu-1 905497.4354 ( 0.00%) 888736.5570 * -1.85%* 888695.2675 * -1.86%*
Hmean faults/cpu-4 758519.2719 ( 0.00%) 812103.1991 * 7.06%* 825077.9277 * 8.77%*
Hmean faults/cpu-7 617153.8038 ( 0.00%) 729943.4518 * 18.28%* 770872.3161 * 24.91%*
Hmean faults/cpu-12 424848.5266 ( 0.00%) 550357.2856 * 29.54%* 597478.5634 * 40.63%*
Hmean faults/cpu-21 290142.9988 ( 0.00%) 383668.3190 * 32.23%* 433376.8959 * 49.37%*
Hmean faults/cpu-30 218705.2915 ( 0.00%) 299888.5533 * 37.12%* 342640.6153 * 56.67%*
Hmean faults/cpu-48 142842.3372 ( 0.00%) 206498.2605 * 44.56%* 240306.3442 * 68.23%*
Hmean faults/cpu-79 90706.1425 ( 0.00%) 160006.6800 * 76.40%* 185298.4326 * 104.28%*
Hmean faults/cpu-110 67011.9297 ( 0.00%) 143536.0062 * 114.19%* 162688.8015 * 142.78%*
Hmean faults/cpu-128 55986.4986 ( 0.00%) 136550.8760 * 143.90%* 152718.8713 * 172.78%*
Hmean faults/sec-1 905492.1265 ( 0.00%) 887244.6592 * -2.02%* 887775.6079 * -1.96%*
Hmean faults/sec-4 2994284.4204 ( 0.00%) 3154236.9408 * 5.34%* 3221994.8465 * 7.60%*
Hmean faults/sec-7 4177411.3461 ( 0.00%) 4933286.4045 * 18.09%* 5202347.2077 * 24.54%*
Hmean faults/sec-12 4892848.3633 ( 0.00%) 6054577.0988 * 23.74%* 6511987.1142 * 33.09%*
Hmean faults/sec-21 5823534.1820 ( 0.00%) 7637637.4162 * 31.15%* 8553362.3513 * 46.88%*
Hmean faults/sec-30 6247210.8414 ( 0.00%) 8598150.6717 * 37.63%* 9799696.0945 * 56.87%*
Hmean faults/sec-48 6274617.1419 ( 0.00%) 9467132.3699 * 50.88%* 11049401.9072 * 76.10%*
Hmean faults/sec-79 6187291.4971 ( 0.00%) 11919062.5284 * 92.64%* 13420825.3820 * 116.91%*
Hmean faults/sec-110 6454542.3239 ( 0.00%) 15050228.1869 * 133.17%* 16667873.7618 * 158.23%*
Hmean faults/sec-128 6472970.8548 ( 0.00%) 16647275.6575 * 157.18%* 18680029.3714 * 188.59%*
As expected, the tests highlight the improved scalability as core count
increases.
> [0] https://github.com/gormanm/mmtests
> [1] https://github.com/gormanm/mmtests/blob/master/compare-kernels.sh
[2] https://github.com/gormanm/pft/pull/1/commits/8fe554a3d8b4f5947cd00d4b46f97178b8ba8752
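The one-VMA-per-thread variant changes only the setup relative to the
sketch shown after the earlier results: each thread creates and faults in
its own private mapping, so concurrent faults take different per-VMA locks.
Again a sketch of the idea rather than the actual pft change at [2]; note
that adjacent anonymous mappings with identical flags can be merged into a
single VMA, so a real test must keep them apart (e.g. with guard gaps).

/* Per-thread worker: one private anonymous mapping -- one VMA -- each. */
static void *toucher_own_vma(void *arg)
{
	long pagesz = sysconf(_SC_PAGESIZE);
	char *p = mmap(NULL, chunk, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	(void)arg;
	if (p == MAP_FAILED)
		return NULL;
	/* Faults on this range contend only on this thread's VMA lock. */
	for (size_t off = 0; off < chunk; off += pagesz)
		p[off] = 1;
	munmap(p, chunk);
	return NULL;
}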
* Re: [PATCH v2 00/33] Per-VMA locks
2023-02-28 12:06 ` Punit Agrawal
@ 2023-02-28 18:08 ` Suren Baghdasaryan
0 siblings, 0 replies; 42+ messages in thread
From: Suren Baghdasaryan @ 2023-02-28 18:08 UTC (permalink / raw)
To: Punit Agrawal
Cc: akpm, michel, jglisse, mhocko, vbabka, hannes, mgorman, dave,
willy, liam.howlett, peterz, ldufour, paulmck, mingo, will, luto,
songliubraving, peterx, david, dhowells, hughd, bigeasy,
kent.overstreet, lstoakes, peterjung1337, rientjes,
axelrasmussen, joelaf, minchan, rppt, jannh, shakeelb, tatashin,
edumazet, gthelen, gurua, arjunroy, soheil, leewalsh, posk,
linux-mm, linux-arm-kernel, linuxppc-dev, x86, linux-kernel,
kernel-team
On Tue, Feb 28, 2023 at 4:06 AM Punit Agrawal
<punit.agrawal@bytedance.com> wrote:
>
> Punit Agrawal <punit.agrawal@bytedance.com> writes:
>
> > Suren Baghdasaryan <surenb@google.com> writes:
> >
> >> [... cover letter and links trimmed ...]
> >
> > [... test setup and single-VMA pft-threads results trimmed; see earlier in the thread ...]
>
>
> The above workload represents the worst case with regard to per-VMA
> locks, as it creates a single large VMA. As a follow-up, I modified
> pft[2] to create a VMA per thread, to understand the behaviour in
> scenarios where per-VMA locks should show the most benefit.
>
> [... per-thread-VMA results table trimmed; see the previous message ...]
>
> As expected, the tests highlight the improved scalability as core count
> increases.
Thanks for trying this, Punit! This is very encouraging.
>
> > [0] https://github.com/gormanm/mmtests
> > [1] https://github.com/gormanm/mmtests/blob/master/compare-kernels.sh
>
> [2] https://github.com/gormanm/pft/pull/1/commits/8fe554a3d8b4f5947cd00d4b46f97178b8ba8752