Hi Michal,

I noticed that this patch does not hold mmap_sem while migrating pages in
do_move_pages_to_node(). A simple fix below moves mmap_sem from
add_page_for_migration() to the outermost do_pages_move():

diff --git a/mm/migrate.c b/mm/migrate.c
index 5d0dc7b85f90..28b9e126cb38 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1487,7 +1487,6 @@ static int add_page_for_migration(struct mm_struct *mm, unsigned long addr,
 	unsigned int follflags;
 	int err;
 
-	down_read(&mm->mmap_sem);
 	err = -EFAULT;
 	vma = find_vma(mm, addr);
 	if (!vma || addr < vma->vm_start || !vma_migratable(vma))
@@ -1540,7 +1539,6 @@ static int add_page_for_migration(struct mm_struct *mm, unsigned long addr,
 	 */
 	put_page(page);
 out:
-	up_read(&mm->mmap_sem);
 	return err;
 }
 
@@ -1561,6 +1559,7 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
 
 	migrate_prep();
 
+	down_read(&mm->mmap_sem);
 	for (i = start = 0; i < nr_pages; i++) {
 		const void __user *p;
 		unsigned long addr;
@@ -1628,6 +1627,7 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
 	if (!err)
 		err = err1;
 out:
+	up_read(&mm->mmap_sem);
 	return err;
 }

--
Best Regards,
Yan Zi

On 3 Jan 2018, at 3:25, Michal Hocko wrote:

> From: Michal Hocko
>
> do_pages_move() is supposed to move user-defined memory (an array of
> addresses) to user-defined numa nodes (an array of nodes, one for each
> address). The user-provided status array then contains the resulting
> numa node for each address, or an error. The semantics of this function
> are a little bit confusing because only some errors are reported back.
> Notably, a migrate_pages() error is only reported via the return value.
> This patch doesn't try to address these semantic nuances but rather
> changes the underlying implementation.
>
> Currently we are processing user input (which can be really large)
> in batches which are stored in a temporarily allocated page. Each
> address is resolved to its struct page and stored in a page_to_node
> structure along with the requested target numa node. The array of these
> structures is then conveyed down the page migration path via the private
> argument. new_page_node() then finds the corresponding structure and
> allocates the proper target page.
>
> What is the problem with the current implementation and why change
> it? Apart from being quite ugly, it also doesn't cope with unexpected
> pages showing up on the migration list inside the migrate_pages() path.
> That doesn't happen currently, but the follow-up patch would like to
> make the THP migration code clearer, and that would need to split a
> THP onto the list in some cases.
>
> How does the new implementation work? Well, instead of batching into a
> fixed-size array, we simply batch all pages that should be migrated to
> the same node and isolate all of them onto a linked list, which doesn't
> require any additional storage. This should work reasonably well because
> page migration usually migrates larger ranges of memory to a specific
> node. So the common case should work equally well as the current
> implementation. Even if somebody constructs an input where the target
> numa nodes are interleaved, we shouldn't see a large performance impact,
> because page migration alone doesn't really benefit from batching.
> mmap_sem batching for the lookup is quite questionable, and
> isolate_lru_page(), which would benefit from batching, does not use it
> even in the current implementation.
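
For readers following along: the user-facing entry point whose implementation
is being reworked here is the move_pages(2) syscall. A minimal sketch of the
calling convention it implements (illustrative only, not part of the patch;
the hypothetical helper below assumes libnuma's <numaif.h> wrapper, link with
-lnuma):

    #include <numaif.h>     /* move_pages(), MPOL_MF_MOVE */
    #include <stdio.h>

    /* Hypothetical example, not from the patch. */
    void move_two_pages(void *addr0, void *addr1)
    {
            void *pages[2] = { addr0, addr1 };  /* user addresses, one per page */
            int nodes[2] = { 1, 1 };            /* requested target node per page */
            int status[2];                      /* out: resulting node or -errno */

            /* pid 0 means the calling process; MPOL_MF_MOVE moves only pages
             * mapped exclusively by this process (MPOL_MF_MOVE_ALL needs
             * CAP_SYS_NICE). */
            if (move_pages(0, 2, pages, nodes, status, MPOL_MF_MOVE) < 0)
                    perror("move_pages");
    }
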
> Acked-by: Kirill A. Shutemov
> Signed-off-by: Michal Hocko
> ---
>  mm/internal.h  |   1 +
>  mm/mempolicy.c |   5 +-
>  mm/migrate.c   | 306 +++++++++++++++++++++++++--------------------------
>  3 files changed, 138 insertions(+), 174 deletions(-)
>
> diff --git a/mm/internal.h b/mm/internal.h
> index 3e5dc95dc259..745e247aca9c 100644
> --- a/mm/internal.h
> +++ b/mm/internal.h
> @@ -540,4 +540,5 @@ static inline bool is_migrate_highatomic_page(struct page *page)
>  }
>  
>  void setup_zone_pageset(struct zone *zone);
> +extern struct page *alloc_new_node_page(struct page *page, unsigned long node, int **x);
>  #endif /* __MM_INTERNAL_H */
> diff --git a/mm/mempolicy.c b/mm/mempolicy.c
> index f604b22ebb65..66c9c79b21be 100644
> --- a/mm/mempolicy.c
> +++ b/mm/mempolicy.c
> @@ -942,7 +942,8 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist,
>  	}
>  }
>  
> -static struct page *new_node_page(struct page *page, unsigned long node, int **x)
> +/* page allocation callback for NUMA node migration */
> +struct page *alloc_new_node_page(struct page *page, unsigned long node, int **x)
>  {
>  	if (PageHuge(page))
>  		return alloc_huge_page_node(page_hstate(compound_head(page)),
> @@ -986,7 +987,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
>  			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
>  
>  	if (!list_empty(&pagelist)) {
> -		err = migrate_pages(&pagelist, new_node_page, NULL, dest,
> +		err = migrate_pages(&pagelist, alloc_new_node_page, NULL, dest,
>  					MIGRATE_SYNC, MR_SYSCALL);
>  		if (err)
>  			putback_movable_pages(&pagelist);
> diff --git a/mm/migrate.c b/mm/migrate.c
> index 4d0be47a322a..8fb90bcd44a7 100644
> --- a/mm/migrate.c
> +++ b/mm/migrate.c
> @@ -1444,141 +1444,103 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
>  }
>  
>  #ifdef CONFIG_NUMA
> -/*
> - * Move a list of individual pages
> - */
> -struct page_to_node {
> -	unsigned long addr;
> -	struct page *page;
> -	int node;
> -	int status;
> -};
>  
> -static struct page *new_page_node(struct page *p, unsigned long private,
> -		int **result)
> +static int store_status(int __user *status, int start, int value, int nr)
>  {
> -	struct page_to_node *pm = (struct page_to_node *)private;
> -
> -	while (pm->node != MAX_NUMNODES && pm->page != p)
> -		pm++;
> +	while (nr-- > 0) {
> +		if (put_user(value, status + start))
> +			return -EFAULT;
> +		start++;
> +	}
>  
> -	if (pm->node == MAX_NUMNODES)
> -		return NULL;
> +	return 0;
> +}
>  
> -	*result = &pm->status;
> +static int do_move_pages_to_node(struct mm_struct *mm,
> +		struct list_head *pagelist, int node)
> +{
> +	int err;
>  
> -	if (PageHuge(p))
> -		return alloc_huge_page_node(page_hstate(compound_head(p)),
> -					pm->node);
> -	else if (thp_migration_supported() && PageTransHuge(p)) {
> -		struct page *thp;
> +	if (list_empty(pagelist))
> +		return 0;
>  
> -		thp = alloc_pages_node(pm->node,
> -			(GFP_TRANSHUGE | __GFP_THISNODE) & ~__GFP_RECLAIM,
> -			HPAGE_PMD_ORDER);
> -		if (!thp)
> -			return NULL;
> -		prep_transhuge_page(thp);
> -		return thp;
> -	} else
> -		return __alloc_pages_node(pm->node,
> -				GFP_HIGHUSER_MOVABLE | __GFP_THISNODE, 0);
> +	err = migrate_pages(pagelist, alloc_new_node_page, NULL, node,
> +			MIGRATE_SYNC, MR_SYSCALL);
> +	if (err)
> +		putback_movable_pages(pagelist);
> +	return err;
>  }
>  
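
As an aside, store_status() above is what gives move_pages(2) its per-page
result reporting: each status slot ends up holding either the node the page
now resides on or a negative errno. A hypothetical userspace helper to
decode it (sketch only, not part of the patch):

    #include <stdio.h>
    #include <string.h>

    /* Decode the status array move_pages(2) filled in: a non-negative
     * entry is the node the page ended up on, a negative entry is the
     * -errno for that particular page. */
    void report_status(const int *status, unsigned long count)
    {
            for (unsigned long i = 0; i < count; i++) {
                    if (status[i] < 0)
                            fprintf(stderr, "page %lu: %s\n",
                                    i, strerror(-status[i]));
                    else
                            printf("page %lu: on node %d\n", i, status[i]);
            }
    }
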
>  /*
> - * Move a set of pages as indicated in the pm array. The addr
> - * field must be set to the virtual address of the page to be moved
> - * and the node number must contain a valid target node.
> - * The pm array ends with node = MAX_NUMNODES.
> + * Resolves the given address to a struct page, isolates it from the LRU and
> + * puts it to the given pagelist.
> + * Returns -errno if the page cannot be found/isolated or 0 when it has been
> + * queued or the page doesn't need to be migrated because it is already on
> + * the target node
>  */
> -static int do_move_page_to_node_array(struct mm_struct *mm,
> -				struct page_to_node *pm,
> -				int migrate_all)
> +static int add_page_for_migration(struct mm_struct *mm, unsigned long addr,
> +		int node, struct list_head *pagelist, bool migrate_all)
>  {
> +	struct vm_area_struct *vma;
> +	struct page *page;
> +	unsigned int follflags;
>  	int err;
> -	struct page_to_node *pp;
> -	LIST_HEAD(pagelist);
>  
>  	down_read(&mm->mmap_sem);
> +	err = -EFAULT;
> +	vma = find_vma(mm, addr);
> +	if (!vma || addr < vma->vm_start || !vma_migratable(vma))
> +		goto out;
>  
> -	/*
> -	 * Build a list of pages to migrate
> -	 */
> -	for (pp = pm; pp->node != MAX_NUMNODES; pp++) {
> -		struct vm_area_struct *vma;
> -		struct page *page;
> -		struct page *head;
> -		unsigned int follflags;
> -
> -		err = -EFAULT;
> -		vma = find_vma(mm, pp->addr);
> -		if (!vma || pp->addr < vma->vm_start || !vma_migratable(vma))
> -			goto set_status;
> -
> -		/* FOLL_DUMP to ignore special (like zero) pages */
> -		follflags = FOLL_GET | FOLL_DUMP;
> -		if (!thp_migration_supported())
> -			follflags |= FOLL_SPLIT;
> -		page = follow_page(vma, pp->addr, follflags);
> +	/* FOLL_DUMP to ignore special (like zero) pages */
> +	follflags = FOLL_GET | FOLL_DUMP;
> +	if (!thp_migration_supported())
> +		follflags |= FOLL_SPLIT;
> +	page = follow_page(vma, addr, follflags);
>  
> -		err = PTR_ERR(page);
> -		if (IS_ERR(page))
> -			goto set_status;
> +	err = PTR_ERR(page);
> +	if (IS_ERR(page))
> +		goto out;
>  
> -		err = -ENOENT;
> -		if (!page)
> -			goto set_status;
> +	err = -ENOENT;
> +	if (!page)
> +		goto out;
>  
> -		err = page_to_nid(page);
> +	err = 0;
> +	if (page_to_nid(page) == node)
> +		goto out_putpage;
>  
> -		if (err == pp->node)
> -			/*
> -			 * Node already in the right place
> -			 */
> -			goto put_and_set;
> +	err = -EACCES;
> +	if (page_mapcount(page) > 1 && !migrate_all)
> +		goto out_putpage;
>  
> -		err = -EACCES;
> -		if (page_mapcount(page) > 1 &&
> -				!migrate_all)
> -			goto put_and_set;
> -
> -		if (PageHuge(page)) {
> -			if (PageHead(page)) {
> -				isolate_huge_page(page, &pagelist);
> -				err = 0;
> -				pp->page = page;
> -			}
> -			goto put_and_set;
> +	if (PageHuge(page)) {
> +		if (PageHead(page)) {
> +			isolate_huge_page(page, pagelist);
> +			err = 0;
>  		}
> +	} else {
> +		struct page *head;
>  
> -		pp->page = compound_head(page);
>  		head = compound_head(page);
>  		err = isolate_lru_page(head);
> -		if (!err) {
> -			list_add_tail(&head->lru, &pagelist);
> -			mod_node_page_state(page_pgdat(head),
> -				NR_ISOLATED_ANON + page_is_file_cache(head),
> -				hpage_nr_pages(head));
> -		}
> -put_and_set:
> -		/*
> -		 * Either remove the duplicate refcount from
> -		 * isolate_lru_page() or drop the page ref if it was
> -		 * not isolated.
> -		 */
> -		put_page(page);
> -set_status:
> -		pp->status = err;
> -	}
> -
> -	err = 0;
> -	if (!list_empty(&pagelist)) {
> -		err = migrate_pages(&pagelist, new_page_node, NULL,
> -				(unsigned long)pm, MIGRATE_SYNC, MR_SYSCALL);
>  		if (err)
> -			putback_movable_pages(&pagelist);
> -	}
> +			goto out_putpage;
>  
> +		err = 0;
> +		list_add_tail(&head->lru, pagelist);
> +		mod_node_page_state(page_pgdat(head),
> +			NR_ISOLATED_ANON + page_is_file_cache(head),
> +			hpage_nr_pages(head));
> +	}
> +out_putpage:
> +	/*
> +	 * Either remove the duplicate refcount from
> +	 * isolate_lru_page() or drop the page ref if it was
> +	 * not isolated.
> +	 */
> +	put_page(page);
> +out:
>  	up_read(&mm->mmap_sem);
>  	return err;
>  }
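
Note that add_page_for_migration() also returns 0 when the page is already
on the target node, so userspace only learns the final placement from the
status array. One way to double-check placement independently is
get_mempolicy(2) with MPOL_F_NODE | MPOL_F_ADDR; a sketch (hypothetical
helper, not from the patch; the page at addr must be faulted in):

    #include <numaif.h>     /* get_mempolicy(), MPOL_F_NODE, MPOL_F_ADDR */
    #include <stdio.h>

    /* Query which node currently backs the page at addr, e.g. to verify
     * a migration result after move_pages(2) returns. */
    int node_of(void *addr)
    {
            int node = -1;

            if (get_mempolicy(&node, NULL, 0, addr,
                              MPOL_F_NODE | MPOL_F_ADDR))
                    perror("get_mempolicy");
            return node;
    }
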
> @@ -1593,79 +1555,79 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
>  			const int __user *nodes,
>  			int __user *status, int flags)
>  {
> -	struct page_to_node *pm;
> -	unsigned long chunk_nr_pages;
> -	unsigned long chunk_start;
> -	int err;
> -
> -	err = -ENOMEM;
> -	pm = (struct page_to_node *)__get_free_page(GFP_KERNEL);
> -	if (!pm)
> -		goto out;
> +	int current_node = NUMA_NO_NODE;
> +	LIST_HEAD(pagelist);
> +	int start, i;
> +	int err = 0, err1;
>  
>  	migrate_prep();
>  
> -	/*
> -	 * Store a chunk of page_to_node array in a page,
> -	 * but keep the last one as a marker
> -	 */
> -	chunk_nr_pages = (PAGE_SIZE / sizeof(struct page_to_node)) - 1;
> -
> -	for (chunk_start = 0;
> -	     chunk_start < nr_pages;
> -	     chunk_start += chunk_nr_pages) {
> -		int j;
> +	for (i = start = 0; i < nr_pages; i++) {
> +		const void __user *p;
> +		unsigned long addr;
> +		int node;
>  
> -		if (chunk_start + chunk_nr_pages > nr_pages)
> -			chunk_nr_pages = nr_pages - chunk_start;
> -
> -		/* fill the chunk pm with addrs and nodes from user-space */
> -		for (j = 0; j < chunk_nr_pages; j++) {
> -			const void __user *p;
> -			int node;
> -
> -			err = -EFAULT;
> -			if (get_user(p, pages + j + chunk_start))
> -				goto out_pm;
> -			pm[j].addr = (unsigned long) p;
> -
> -			if (get_user(node, nodes + j + chunk_start))
> -				goto out_pm;
> -
> -			err = -ENODEV;
> -			if (node < 0 || node >= MAX_NUMNODES)
> -				goto out_pm;
> -
> -			if (!node_state(node, N_MEMORY))
> -				goto out_pm;
> -
> -			err = -EACCES;
> -			if (!node_isset(node, task_nodes))
> -				goto out_pm;
> +		err = -EFAULT;
> +		if (get_user(p, pages + i))
> +			goto out_flush;
> +		if (get_user(node, nodes + i))
> +			goto out_flush;
> +		addr = (unsigned long)p;
> +
> +		err = -ENODEV;
> +		if (node < 0 || node >= MAX_NUMNODES)
> +			goto out_flush;
> +		if (!node_state(node, N_MEMORY))
> +			goto out_flush;
>  
> -			pm[j].node = node;
> +		err = -EACCES;
> +		if (!node_isset(node, task_nodes))
> +			goto out_flush;
> +
> +		if (current_node == NUMA_NO_NODE) {
> +			current_node = node;
> +			start = i;
> +		} else if (node != current_node) {
> +			err = do_move_pages_to_node(mm, &pagelist, current_node);
> +			if (err)
> +				goto out;
> +			err = store_status(status, start, current_node, i - start);
> +			if (err)
> +				goto out;
> +			start = i;
> +			current_node = node;
>  		}
>  
> -		/* End marker for this chunk */
> -		pm[chunk_nr_pages].node = MAX_NUMNODES;
> +		/*
> +		 * Errors in the page lookup or isolation are not fatal and we simply
> +		 * report them via status
> +		 */
> +		err = add_page_for_migration(mm, addr, current_node,
> +				&pagelist, flags & MPOL_MF_MOVE_ALL);
> +		if (!err)
> +			continue;
>  
> -		/* Migrate this chunk */
> -		err = do_move_page_to_node_array(mm, pm,
> -						 flags & MPOL_MF_MOVE_ALL);
> -		if (err < 0)
> -			goto out_pm;
> +		err = store_status(status, i, err, 1);
> +		if (err)
> +			goto out_flush;
>  
> -		/* Return status information */
> -		for (j = 0; j < chunk_nr_pages; j++)
> -			if (put_user(pm[j].status, status + j + chunk_start)) {
> -				err = -EFAULT;
> -				goto out_pm;
> -			}
> +		err = do_move_pages_to_node(mm, &pagelist, current_node);
> +		if (err)
> +			goto out;
> +		if (i > start) {
> +			err = store_status(status, start, current_node, i - start);
> +			if (err)
> +				goto out;
> +		}
> +		current_node = NUMA_NO_NODE;
>  	}
> -	err = 0;
> -
> -out_pm:
> -	free_page((unsigned long)pm);
> +out_flush:
> +	/* Make sure we do not overwrite the existing error */
> +	err1 = do_move_pages_to_node(mm, &pagelist, current_node);
> +	if (!err1)
> +		err1 = store_status(status, start, current_node, i - start);
> +	if (!err)
> +		err = err1;
>  out:
>  	return err;
>  }
> -- 
> 2.15.1
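
To exercise the worst case the changelog mentions (interleaved target nodes,
so every iteration flushes the pagelist), something like the following
hypothetical test could be used. It is illustrative only, assumes at least
two memory nodes (0 and 1), and links with -lnuma:

    #include <numaif.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/mman.h>
    #include <unistd.h>

    int main(void)
    {
            long psz = sysconf(_SC_PAGESIZE);
            enum { NPAGES = 8 };
            void *pages[NPAGES];
            int nodes[NPAGES], status[NPAGES], i;
            char *buf = mmap(NULL, NPAGES * psz, PROT_READ | PROT_WRITE,
                             MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

            if (buf == MAP_FAILED)
                    return 1;
            for (i = 0; i < NPAGES; i++) {
                    buf[i * psz] = 1;       /* fault the page in */
                    pages[i] = buf + i * psz;
                    nodes[i] = i % 2;       /* alternate targets: 0,1,0,1,... */
            }
            if (move_pages(0, NPAGES, pages, nodes, status, MPOL_MF_MOVE) < 0)
                    perror("move_pages");
            for (i = 0; i < NPAGES; i++) {
                    if (status[i] < 0)
                            printf("page %d: %s\n", i, strerror(-status[i]));
                    else
                            printf("page %d: now on node %d\n", i, status[i]);
            }
            return 0;
    }
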