Re: [PATCH v1 3/3] mm: per-process reclaim

linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

* Re: [PATCH v1 3/3] mm: per-process reclaim
       [not found] <040501d1c55a$81d51910$857f4b30$@alibaba-inc.com>
@ 2016-06-13 10:07 ` Hillf Danton
  2016-06-15  0:46   ` Minchan Kim
  0 siblings, 1 reply; 12+ messages in thread
From: Hillf Danton @ 2016-06-13 10:07 UTC (permalink / raw)
  To: 'Minchan Kim'; +Cc: linux-kernel, linux-mm

> +static ssize_t reclaim_write(struct file *file, const char __user *buf,
> +				size_t count, loff_t *ppos)
> +{
> +	struct task_struct *task;
> +	char buffer[PROC_NUMBUF];
> +	struct mm_struct *mm;
> +	struct vm_area_struct *vma;
> +	int itype;
> +	int rv;
> +	enum reclaim_type type;
> +
> +	memset(buffer, 0, sizeof(buffer));
> +	if (count > sizeof(buffer) - 1)
> +		count = sizeof(buffer) - 1;
> +	if (copy_from_user(buffer, buf, count))
> +		return -EFAULT;
> +	rv = kstrtoint(strstrip(buffer), 10, &itype);
> +	if (rv < 0)
> +		return rv;
> +	type = (enum reclaim_type)itype;
> +	if (type < RECLAIM_FILE || type > RECLAIM_ALL)
> +		return -EINVAL;
> +
> +	task = get_proc_task(file->f_path.dentry->d_inode);
> +	if (!task)
> +		return -ESRCH;
> +
> +	mm = get_task_mm(task);
> +	if (mm) {
> +		struct mm_walk reclaim_walk = {
> +			.pmd_entry = reclaim_pte_range,
> +			.mm = mm,
> +		};
> +
> +		down_read(&mm->mmap_sem);
> +		for (vma = mm->mmap; vma; vma = vma->vm_next) {
> +			reclaim_walk.private = vma;
> +
> +			if (is_vm_hugetlb_page(vma))
> +				continue;
> +
> +			if (!vma_is_anonymous(vma) && !(type & RECLAIM_FILE))
> +				continue;
> +
> +			if (vma_is_anonymous(vma) && !(type & RECLAIM_ANON))
> +				continue;
> +
> +			walk_page_range(vma->vm_start, vma->vm_end,
> +					&reclaim_walk);

Check fatal signal after reclaiming a mapping?

> +		}
> +		flush_tlb_mm(mm);
> +		up_read(&mm->mmap_sem);
> +		mmput(mm);
> +	}
> +	put_task_struct(task);
> +
> +	return count;
> +}

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v1 3/3] mm: per-process reclaim
  2016-06-13 10:07 ` [PATCH v1 3/3] mm: per-process reclaim Hillf Danton
@ 2016-06-15  0:46   ` Minchan Kim
  0 siblings, 0 replies; 12+ messages in thread
From: Minchan Kim @ 2016-06-15  0:46 UTC (permalink / raw)
  To: Hillf Danton; +Cc: linux-kernel, linux-mm

On Mon, Jun 13, 2016 at 06:07:09PM +0800, Hillf Danton wrote:
> > +static ssize_t reclaim_write(struct file *file, const char __user *buf,
> > +				size_t count, loff_t *ppos)
> > +{
> > +	struct task_struct *task;
> > +	char buffer[PROC_NUMBUF];
> > +	struct mm_struct *mm;
> > +	struct vm_area_struct *vma;
> > +	int itype;
> > +	int rv;
> > +	enum reclaim_type type;
> > +
> > +	memset(buffer, 0, sizeof(buffer));
> > +	if (count > sizeof(buffer) - 1)
> > +		count = sizeof(buffer) - 1;
> > +	if (copy_from_user(buffer, buf, count))
> > +		return -EFAULT;
> > +	rv = kstrtoint(strstrip(buffer), 10, &itype);
> > +	if (rv < 0)
> > +		return rv;
> > +	type = (enum reclaim_type)itype;
> > +	if (type < RECLAIM_FILE || type > RECLAIM_ALL)
> > +		return -EINVAL;
> > +
> > +	task = get_proc_task(file->f_path.dentry->d_inode);
> > +	if (!task)
> > +		return -ESRCH;
> > +
> > +	mm = get_task_mm(task);
> > +	if (mm) {
> > +		struct mm_walk reclaim_walk = {
> > +			.pmd_entry = reclaim_pte_range,
> > +			.mm = mm,
> > +		};
> > +
> > +		down_read(&mm->mmap_sem);
> > +		for (vma = mm->mmap; vma; vma = vma->vm_next) {
> > +			reclaim_walk.private = vma;
> > +
> > +			if (is_vm_hugetlb_page(vma))
> > +				continue;
> > +
> > +			if (!vma_is_anonymous(vma) && !(type & RECLAIM_FILE))
> > +				continue;
> > +
> > +			if (vma_is_anonymous(vma) && !(type & RECLAIM_ANON))
> > +				continue;
> > +
> > +			walk_page_range(vma->vm_start, vma->vm_end,
> > +					&reclaim_walk);
> 
> Check fatal signal after reclaiming a mapping?

Yeb, We might need it in page_walker.

Thanks.

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v1 3/3] mm: per-process reclaim
  2016-06-17  7:24     ` Balbir Singh
@ 2016-06-17  7:57       ` Vinayak Menon
  0 siblings, 0 replies; 12+ messages in thread
From: Vinayak Menon @ 2016-06-17  7:57 UTC (permalink / raw)
  To: Balbir Singh, Johannes Weiner, Minchan Kim
  Cc: Andrew Morton, linux-kernel, linux-mm, Rik van Riel, Sangwoo Park

On 6/17/2016 12:54 PM, Balbir Singh wrote:
>
> On 14/06/16 01:06, Johannes Weiner wrote:
>> Hi Minchan,
>>
>> On Mon, Jun 13, 2016 at 04:50:58PM +0900, Minchan Kim wrote:
>>> These day, there are many platforms available in the embedded market
>>> and sometime, they has more hints about workingset than kernel so
>>> they want to involve memory management more heavily like android's
>>> lowmemory killer and ashmem or user-daemon with lowmemory notifier.
>>>
>>> This patch adds add new method for userspace to manage memory
>>> efficiently via knob "/proc/<pid>/reclaim" so platform can reclaim
>>> any process anytime.
>> Cgroups are our canonical way to control system resources on a per
>> process or group-of-processes level. I don't like the idea of adding
>> ad-hoc interfaces for single-use cases like this.
>>
>> For this particular case, you can already stick each app into its own
>> cgroup and use memory.force_empty to target-reclaim them.
>>
>> Or better yet, set the soft limits / memory.low to guide physical
>> memory pressure, once it actually occurs, toward the least-important
>> apps? We usually prefer doing work on-demand rather than proactively.
>>
>> The one-cgroup-per-app model would give Android much more control and
>> would also remove a *lot* of overhead during task switches, see this:
>> https://lkml.org/lkml/2014/12/19/358
> Yes, I'd agree. cgroups can group many tasks, but the group size can be
> 1 as well. Could you try the same test with the recommended approach and
> see if it works as desired? 
>
With cgroup v2, IIUC there can be only a single hierarchy where all controllers exist, and
a process can be part of only one cgroup. If that is true, with per task cgroup, a task can
be present only in its own cgroup. That being the case would it be feasible to have other
parallel controllers like CPU which would not be able to work efficiently with per task cgroup ?

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v1 3/3] mm: per-process reclaim
  2016-06-13 15:06   ` Johannes Weiner
  2016-06-15  0:40     ` Minchan Kim
@ 2016-06-17  7:24     ` Balbir Singh
  2016-06-17  7:57       ` Vinayak Menon
  1 sibling, 1 reply; 12+ messages in thread
From: Balbir Singh @ 2016-06-17  7:24 UTC (permalink / raw)
  To: Johannes Weiner, Minchan Kim
  Cc: Andrew Morton, linux-kernel, linux-mm, Rik van Riel, Sangwoo Park



On 14/06/16 01:06, Johannes Weiner wrote:
> Hi Minchan,
> 
> On Mon, Jun 13, 2016 at 04:50:58PM +0900, Minchan Kim wrote:
>> These day, there are many platforms available in the embedded market
>> and sometime, they has more hints about workingset than kernel so
>> they want to involve memory management more heavily like android's
>> lowmemory killer and ashmem or user-daemon with lowmemory notifier.
>>
>> This patch adds add new method for userspace to manage memory
>> efficiently via knob "/proc/<pid>/reclaim" so platform can reclaim
>> any process anytime.
> 
> Cgroups are our canonical way to control system resources on a per
> process or group-of-processes level. I don't like the idea of adding
> ad-hoc interfaces for single-use cases like this.
> 
> For this particular case, you can already stick each app into its own
> cgroup and use memory.force_empty to target-reclaim them.
> 
> Or better yet, set the soft limits / memory.low to guide physical
> memory pressure, once it actually occurs, toward the least-important
> apps? We usually prefer doing work on-demand rather than proactively.
> 
> The one-cgroup-per-app model would give Android much more control and
> would also remove a *lot* of overhead during task switches, see this:
> https://lkml.org/lkml/2014/12/19/358

Yes, I'd agree. cgroups can group many tasks, but the group size can be
1 as well. Could you try the same test with the recommended approach and
see if it works as desired? 

Balbir Singh

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v1 3/3] mm: per-process reclaim
  2016-06-16 14:41       ` Johannes Weiner
@ 2016-06-17  6:43         ` Minchan Kim
  0 siblings, 0 replies; 12+ messages in thread
From: Minchan Kim @ 2016-06-17  6:43 UTC (permalink / raw)
  To: Johannes Weiner
  Cc: Andrew Morton, linux-kernel, linux-mm, Rik van Riel, Sangwoo Park

Hi Hannes,

On Thu, Jun 16, 2016 at 10:41:02AM -0400, Johannes Weiner wrote:
> On Wed, Jun 15, 2016 at 09:40:27AM +0900, Minchan Kim wrote:
> > A question is it seems cgroup2 doesn't have per-cgroup swappiness.
> > Why?
> > 
> > I think we need it in one-cgroup-per-app model.
> 
> Can you explain why you think that?
> 
> As we have talked about this recently in the LRU balancing thread,
> swappiness is the cost factor between file IO and swapping, so the
> only situation I can imagine you'd need a memcg swappiness setting is
> when you have different cgroups use different storage devices that do
> not have comparable speeds.
> 
> So I'm not sure I understand the relationship to an app-group model.

Sorry for the lack of information. I should have written more clearly.
In fact, what we need is *per-memcg-swap-device*.

What I want is to avoid killing background applications even when memory
is exhausted, because cold launching of an app takes a very long time
compared to resume (i.e., just switching). I also want to keep an amount
of free pages in memory so that new application startup cannot
be stuck by reclaim activities.

To get free memory, I want to reclaim less important apps rather than
killing them. In this case, we can support two swap devices.

One is zram, the other is slow storage but much bigger than the zram size.
Then, we can use storage swap to reclaim pages for unimportant apps
while we can use zram swap for important apps (e.g., foreground app,
system services, daemons and so on).

IOW, we want to support multiple swap devices with one-cgroup-per-app,
where the storage speeds are totally different.

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v1 3/3] mm: per-process reclaim
  2016-06-15  0:40     ` Minchan Kim
  2016-06-16 11:07       ` Michal Hocko
@ 2016-06-16 14:41       ` Johannes Weiner
  2016-06-17  6:43         ` Minchan Kim
  1 sibling, 1 reply; 12+ messages in thread
From: Johannes Weiner @ 2016-06-16 14:41 UTC (permalink / raw)
  To: Minchan Kim
  Cc: Andrew Morton, linux-kernel, linux-mm, Rik van Riel, Sangwoo Park

On Wed, Jun 15, 2016 at 09:40:27AM +0900, Minchan Kim wrote:
> A question is it seems cgroup2 doesn't have per-cgroup swappiness.
> Why?
> 
> I think we need it in one-cgroup-per-app model.

Can you explain why you think that?

As we have talked about this recently in the LRU balancing thread,
swappiness is the cost factor between file IO and swapping, so the
only situation I can imagine you'd need a memcg swappiness setting is
when you have different cgroups use different storage devices that do
not have comparable speeds.

So I'm not sure I understand the relationship to an app-group model.

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v1 3/3] mm: per-process reclaim
  2016-06-15  0:40     ` Minchan Kim
@ 2016-06-16 11:07       ` Michal Hocko
  2016-06-16 14:41       ` Johannes Weiner
  1 sibling, 0 replies; 12+ messages in thread
From: Michal Hocko @ 2016-06-16 11:07 UTC (permalink / raw)
  To: Minchan Kim
  Cc: Johannes Weiner, Andrew Morton, linux-kernel, linux-mm,
	Rik van Riel, Sangwoo Park

On Wed 15-06-16 09:40:27, Minchan Kim wrote:
[...]
> A question is it seems cgroup2 doesn't have per-cgroup swappiness.
> Why?

There was no strong use case for it AFAICT.
 
> I think we need it in one-cgroup-per-app model.

I wouldn't be opposed if it is really needed.
-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v1 3/3] mm: per-process reclaim
  2016-06-13 17:06   ` Rik van Riel
@ 2016-06-15  1:01     ` Minchan Kim
  0 siblings, 0 replies; 12+ messages in thread
From: Minchan Kim @ 2016-06-15  1:01 UTC (permalink / raw)
  To: Rik van Riel; +Cc: Andrew Morton, linux-kernel, linux-mm, Sangwoo Park

On Mon, Jun 13, 2016 at 01:06:35PM -0400, Rik van Riel wrote:
> On Mon, 2016-06-13 at 16:50 +0900, Minchan Kim wrote:
> > These day, there are many platforms available in the embedded market
> > and sometime, they has more hints about workingset than kernel so
> > they want to involve memory management more heavily like android's
> > lowmemory killer and ashmem or user-daemon with lowmemory notifier.
> > 
> > This patch adds add new method for userspace to manage memory
> > efficiently via knob "/proc/<pid>/reclaim" so platform can reclaim
> > any process anytime.
> > 
> 
> Could it make sense to invoke this automatically,
> perhaps from the Android low memory killer code?

It's doable. In fact, it was the first internal implementation in our
product. However, I wanted to use it on platforms which don't have
a lowmemory killer. :)

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v1 3/3] mm: per-process reclaim
  2016-06-13 15:06   ` Johannes Weiner
@ 2016-06-15  0:40     ` Minchan Kim
  2016-06-16 11:07       ` Michal Hocko
  2016-06-16 14:41       ` Johannes Weiner
  2016-06-17  7:24     ` Balbir Singh
  1 sibling, 2 replies; 12+ messages in thread
From: Minchan Kim @ 2016-06-15  0:40 UTC (permalink / raw)
  To: Johannes Weiner
  Cc: Andrew Morton, linux-kernel, linux-mm, Rik van Riel, Sangwoo Park

Hi Johannes,

On Mon, Jun 13, 2016 at 11:06:53AM -0400, Johannes Weiner wrote:
> Hi Minchan,
> 
> On Mon, Jun 13, 2016 at 04:50:58PM +0900, Minchan Kim wrote:
> > These day, there are many platforms available in the embedded market
> > and sometime, they has more hints about workingset than kernel so
> > they want to involve memory management more heavily like android's
> > lowmemory killer and ashmem or user-daemon with lowmemory notifier.
> > 
> > This patch adds add new method for userspace to manage memory
> > efficiently via knob "/proc/<pid>/reclaim" so platform can reclaim
> > any process anytime.
> 
> Cgroups are our canonical way to control system resources on a per
> process or group-of-processes level. I don't like the idea of adding
> ad-hoc interfaces for single-use cases like this.
> 
> For this particular case, you can already stick each app into its own
> cgroup and use memory.force_empty to target-reclaim them.
> 
> Or better yet, set the soft limits / memory.low to guide physical
> memory pressure, once it actually occurs, toward the least-important
> apps? We usually prefer doing work on-demand rather than proactively.
> 
> The one-cgroup-per-app model would give Android much more control and
> would also remove a *lot* of overhead during task switches, see this:
> https://lkml.org/lkml/2014/12/19/358

I didn't notice that. Thanks for the pointer.
I read the thread you pointed out and read the memcg code.

Firstly, I thought one-cgroup-per-app model is abuse of memcg but now
I feel your suggestion does make sense that it's right direction for
control memory from the userspace. Just a concern is that not sure
how hard we can map memory management model from global memory pressure
to per-app pressure model smoothly.

A question is it seems cgroup2 doesn't have per-cgroup swappiness.
Why?

I think we need it in one-cgroup-per-app model.

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v1 3/3] mm: per-process reclaim
  2016-06-13  7:50 ` [PATCH v1 3/3] mm: " Minchan Kim
  2016-06-13 15:06   ` Johannes Weiner
@ 2016-06-13 17:06   ` Rik van Riel
  2016-06-15  1:01     ` Minchan Kim
  1 sibling, 1 reply; 12+ messages in thread
From: Rik van Riel @ 2016-06-13 17:06 UTC (permalink / raw)
  To: Minchan Kim, Andrew Morton; +Cc: linux-kernel, linux-mm, Sangwoo Park

[-- Attachment #1: Type: text/plain, Size: 638 bytes --]

On Mon, 2016-06-13 at 16:50 +0900, Minchan Kim wrote:
> These day, there are many platforms available in the embedded market
> and sometime, they has more hints about workingset than kernel so
> they want to involve memory management more heavily like android's
> lowmemory killer and ashmem or user-daemon with lowmemory notifier.
> 
> This patch adds add new method for userspace to manage memory
> efficiently via knob "/proc/<pid>/reclaim" so platform can reclaim
> any process anytime.
> 

Could it make sense to invoke this automatically,
perhaps from the Android low memory killer code?

-- 
All Rights Reversed.


[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 473 bytes --]

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v1 3/3] mm: per-process reclaim
  2016-06-13  7:50 ` [PATCH v1 3/3] mm: " Minchan Kim
@ 2016-06-13 15:06   ` Johannes Weiner
  2016-06-15  0:40     ` Minchan Kim
  2016-06-17  7:24     ` Balbir Singh
  2016-06-13 17:06   ` Rik van Riel
  1 sibling, 2 replies; 12+ messages in thread
From: Johannes Weiner @ 2016-06-13 15:06 UTC (permalink / raw)
  To: Minchan Kim
  Cc: Andrew Morton, linux-kernel, linux-mm, Rik van Riel, Sangwoo Park

Hi Minchan,

On Mon, Jun 13, 2016 at 04:50:58PM +0900, Minchan Kim wrote:
> These day, there are many platforms available in the embedded market
> and sometime, they has more hints about workingset than kernel so
> they want to involve memory management more heavily like android's
> lowmemory killer and ashmem or user-daemon with lowmemory notifier.
> 
> This patch adds add new method for userspace to manage memory
> efficiently via knob "/proc/<pid>/reclaim" so platform can reclaim
> any process anytime.

Cgroups are our canonical way to control system resources on a per
process or group-of-processes level. I don't like the idea of adding
ad-hoc interfaces for single-use cases like this.

For this particular case, you can already stick each app into its own
cgroup and use memory.force_empty to target-reclaim them.

Or better yet, set the soft limits / memory.low to guide physical
memory pressure, once it actually occurs, toward the least-important
apps? We usually prefer doing work on-demand rather than proactively.

The one-cgroup-per-app model would give Android much more control and
would also remove a *lot* of overhead during task switches, see this:
https://lkml.org/lkml/2014/12/19/358

^ permalink raw reply	[flat|nested] 12+ messages in thread

* [PATCH v1 3/3] mm: per-process reclaim
  2016-06-13  7:50 [PATCH v1 0/3] " Minchan Kim
@ 2016-06-13  7:50 ` Minchan Kim
  2016-06-13 15:06   ` Johannes Weiner
  2016-06-13 17:06   ` Rik van Riel
  0 siblings, 2 replies; 12+ messages in thread
From: Minchan Kim @ 2016-06-13  7:50 UTC (permalink / raw)
  To: Andrew Morton
  Cc: linux-kernel, linux-mm, Rik van Riel, Minchan Kim, Sangwoo Park

These days, there are many platforms available in the embedded market
and sometimes they have more hints about the working set than the kernel,
so they want to be involved in memory management more heavily, like Android's
lowmemory killer and ashmem or a user daemon with a lowmemory notifier.

This patch adds a new method for userspace to manage memory
efficiently via the knob "/proc/<pid>/reclaim" so the platform can reclaim
any process at any time.

One useful use case is to avoid killing processes to get free
memory on Android, which was a really terrible experience because I
lost my best-ever game score after switching to a phone call
while playing the game, and also suffered slow start-up from cold launching.

We have used it in a real product.

Quote from Sangwoo Park <sangwoo2.park@lge.com>
Thanks for the data, Sangwoo!
"
- Test scenario
  - platform: android
  - target: MSM8952, 2G DDR, 16G eMMC
  - scenario
    retry app launch and Back Home with 16 apps and 16 turns
    (total app launch count is 256)
  - result:
			  resume count   |  cold launching count
-----------------------------------------------------------------
 vanilla           |           85        |          171
 perproc reclaim   |           184       |           72
"

A higher resume count is better because cold launching needs to load
lots of resource data, which takes more than 15~20 seconds for some
games, while a successful resume takes just 1~5 seconds.

With per-process reclaim and the new management policy, we could reduce
cold launches a lot (i.e., from 171 to 72), which reduces app startup
time a lot.

Another useful function of this feature is that it makes swapout easy to
trigger, which is useful for testing swapout stress and workloads.

Interface:

Reclaim file-backed pages only.
	echo 1 > /proc/<pid>/reclaim
Reclaim anonymous pages only.
	echo 2 > /proc/<pid>/reclaim
Reclaim all pages
	echo 3 > /proc/<pid>/reclaim

bit 1 : file, bit 2 : anon, bit 1 & 2 : all

Note:
If a page is shared by other processes(i.e., page_mapcount(page) > 1),
it couldn't be reclaimed.

Cc: Sangwoo Park <sangwoo2.park@lge.com>
Signed-off-by: Minchan Kim <minchan@kernel.org>
---
 Documentation/filesystems/proc.txt |  15 ++++
 fs/proc/base.c                     |   1 +
 fs/proc/internal.h                 |   1 +
 fs/proc/task_mmu.c                 | 149 +++++++++++++++++++++++++++++++++++++
 include/linux/rmap.h               |   4 +
 mm/vmscan.c                        |  40 ++++++++++
 6 files changed, 210 insertions(+)

diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index 50fcf48f4d58..3b6adf370f3c 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -138,6 +138,7 @@ Table 1-1: Process specific entries in /proc
  maps		Memory maps to executables and library files	(2.4)
  mem		Memory held by this process
  root		Link to the root directory of this process
+ reclaim	Reclaim pages in this process
  stat		Process status
  statm		Process memory status information
  status		Process status in human readable form
@@ -536,6 +537,20 @@ To reset the peak resident set size ("high water mark") to the process's
 
 Any other value written to /proc/PID/clear_refs will have no effect.
 
+The file /proc/PID/reclaim is used to reclaim pages in this process.
+bit 1: file, bit 2: anon, bit 1 & 2: all
+
+To reclaim file-backed pages,
+    > echo 1 > /proc/PID/reclaim
+
+To reclaim anonymous pages,
+    > echo 2 > /proc/PID/reclaim
+
+To reclaim all pages,
+    > echo 3 > /proc/PID/reclaim
+
+If a page is shared by several processes, it cannot be reclaimed.
+
 The /proc/pid/pagemap gives the PFN, which can be used to find the pageflags
 using /proc/kpageflags and number of times a page is mapped using
 /proc/kpagecount. For detailed explanation, see Documentation/vm/pagemap.txt.
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 93e7754fd5b2..b957d929516d 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2848,6 +2848,7 @@ static const struct pid_entry tgid_base_stuff[] = {
 	REG("mounts",     S_IRUGO, proc_mounts_operations),
 	REG("mountinfo",  S_IRUGO, proc_mountinfo_operations),
 	REG("mountstats", S_IRUSR, proc_mountstats_operations),
+	REG("reclaim", S_IWUSR, proc_reclaim_operations),
 #ifdef CONFIG_PROC_PAGE_MONITOR
 	REG("clear_refs", S_IWUSR, proc_clear_refs_operations),
 	REG("smaps",      S_IRUGO, proc_pid_smaps_operations),
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index aa2781095bd1..ef2b01533c97 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -209,6 +209,7 @@ struct pde_opener {
 extern const struct inode_operations proc_link_inode_operations;
 
 extern const struct inode_operations proc_pid_link_inode_operations;
+extern const struct file_operations proc_reclaim_operations;
 
 extern void proc_init_inodecache(void);
 extern struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *);
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 187d84ef9de9..31e4657f8fe9 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -11,6 +11,7 @@
 #include <linux/mempolicy.h>
 #include <linux/rmap.h>
 #include <linux/swap.h>
+#include <linux/mm_inline.h>
 #include <linux/swapops.h>
 #include <linux/mmu_notifier.h>
 #include <linux/page_idle.h>
@@ -1465,6 +1466,154 @@ const struct file_operations proc_pagemap_operations = {
 };
 #endif /* CONFIG_PROC_PAGE_MONITOR */
 
+static int reclaim_pte_range(pmd_t *pmd, unsigned long addr,
+				unsigned long end, struct mm_walk *walk)
+{
+	struct mm_struct *mm = walk->mm;
+	struct vm_area_struct *vma = walk->private;
+	pte_t *orig_pte, *pte, ptent;
+	spinlock_t *ptl;
+	struct page *page;
+	LIST_HEAD(page_list);
+	int isolated = 0;
+
+	split_huge_pmd(vma, pmd, addr);
+	if (pmd_trans_unstable(pmd))
+		return 0;
+
+	orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+	for (; addr != end; pte++, addr += PAGE_SIZE) {
+		ptent = *pte;
+
+		if (!pte_present(ptent))
+			continue;
+
+		page = vm_normal_page(vma, addr, ptent);
+		if (!page)
+			continue;
+
+		if (page_mapcount(page) != 1)
+			continue;
+
+		if (PageTransCompound(page)) {
+			get_page(page);
+			if (!trylock_page(page)) {
+				put_page(page);
+				goto out;
+			}
+			pte_unmap_unlock(orig_pte, ptl);
+
+			if (split_huge_page(page)) {
+				unlock_page(page);
+				put_page(page);
+				orig_pte = pte_offset_map_lock(mm, pmd,
+								addr, &ptl);
+				goto out;
+			}
+			put_page(page);
+			unlock_page(page);
+			pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+			pte--;
+			addr -= PAGE_SIZE;
+			continue;
+		}
+
+		VM_BUG_ON_PAGE(PageTransCompound(page), page);
+
+		if (isolate_lru_page(page))
+			continue;
+
+		list_add(&page->lru, &page_list);
+		inc_zone_page_state(page, NR_ISOLATED_ANON +
+					page_is_file_cache(page));
+		isolated++;
+		if (isolated >= SWAP_CLUSTER_MAX) {
+			pte_unmap_unlock(orig_pte, ptl);
+			reclaim_pages_from_list(&page_list);
+			isolated = 0;
+			cond_resched();
+			orig_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+		}
+	}
+
+out:
+	pte_unmap_unlock(orig_pte, ptl);
+	reclaim_pages_from_list(&page_list);
+
+	cond_resched();
+	return 0;
+}
+
+enum reclaim_type {
+	RECLAIM_FILE = 1,
+	RECLAIM_ANON,
+	RECLAIM_ALL,
+};
+
+static ssize_t reclaim_write(struct file *file, const char __user *buf,
+				size_t count, loff_t *ppos)
+{
+	struct task_struct *task;
+	char buffer[PROC_NUMBUF];
+	struct mm_struct *mm;
+	struct vm_area_struct *vma;
+	int itype;
+	int rv;
+	enum reclaim_type type;
+
+	memset(buffer, 0, sizeof(buffer));
+	if (count > sizeof(buffer) - 1)
+		count = sizeof(buffer) - 1;
+	if (copy_from_user(buffer, buf, count))
+		return -EFAULT;
+	rv = kstrtoint(strstrip(buffer), 10, &itype);
+	if (rv < 0)
+		return rv;
+	type = (enum reclaim_type)itype;
+	if (type < RECLAIM_FILE || type > RECLAIM_ALL)
+		return -EINVAL;
+
+	task = get_proc_task(file->f_path.dentry->d_inode);
+	if (!task)
+		return -ESRCH;
+
+	mm = get_task_mm(task);
+	if (mm) {
+		struct mm_walk reclaim_walk = {
+			.pmd_entry = reclaim_pte_range,
+			.mm = mm,
+		};
+
+		down_read(&mm->mmap_sem);
+		for (vma = mm->mmap; vma; vma = vma->vm_next) {
+			reclaim_walk.private = vma;
+
+			if (is_vm_hugetlb_page(vma))
+				continue;
+
+			if (!vma_is_anonymous(vma) && !(type & RECLAIM_FILE))
+				continue;
+
+			if (vma_is_anonymous(vma) && !(type & RECLAIM_ANON))
+				continue;
+
+			walk_page_range(vma->vm_start, vma->vm_end,
+					&reclaim_walk);
+		}
+		flush_tlb_mm(mm);
+		up_read(&mm->mmap_sem);
+		mmput(mm);
+	}
+	put_task_struct(task);
+
+	return count;
+}
+
+const struct file_operations proc_reclaim_operations = {
+	.write		= reclaim_write,
+	.llseek		= noop_llseek,
+};
+
 #ifdef CONFIG_NUMA
 
 struct numa_maps {
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 5704f101b52e..e90a21b78da3 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -10,6 +10,10 @@
 #include <linux/rwsem.h>
 #include <linux/memcontrol.h>
 
+extern int isolate_lru_page(struct page *page);
+extern void putback_lru_page(struct page *page);
+extern unsigned long reclaim_pages_from_list(struct list_head *page_list);
+
 /*
  * The anon_vma heads a list of private "related" vmas, to scan if
  * an anonymous page pointing to this anon_vma needs to be unmapped:
diff --git a/mm/vmscan.c b/mm/vmscan.c
index d20c9e863d35..442866f77251 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1212,6 +1212,13 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 		 * appear not as the counts should be low
 		 */
 		list_add(&page->lru, &free_pages);
+		/*
+		 * If pagelist are from multiple zones, we should decrease
+		 * NR_ISOLATED_ANON + x on freed pages in here.
+		 */
+		if (!zone)
+			dec_zone_page_state(page, NR_ISOLATED_ANON +
+					page_is_file_cache(page));
 		continue;
 
 cull_mlocked:
@@ -1280,6 +1287,39 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
 	return ret;
 }
 
+unsigned long reclaim_pages_from_list(struct list_head *page_list)
+{
+	struct scan_control sc = {
+		.gfp_mask = GFP_KERNEL,
+		.priority = DEF_PRIORITY,
+		.may_writepage = 1,
+		.may_unmap = 1,
+		.may_swap = 1,
+		.force_reclaim = 1,
+	};
+
+	unsigned long nr_reclaimed, dummy1, dummy2, dummy3, dummy4, dummy5;
+	struct page *page;
+
+	list_for_each_entry(page, page_list, lru)
+		ClearPageActive(page);
+
+	nr_reclaimed = shrink_page_list(page_list, &sc,
+					TTU_UNMAP|TTU_IGNORE_ACCESS,
+					&dummy1, &dummy2, &dummy3,
+					&dummy4, &dummy5);
+
+	while (!list_empty(page_list)) {
+		page = lru_to_page(page_list);
+		list_del(&page->lru);
+		dec_zone_page_state(page, NR_ISOLATED_ANON +
+				page_is_file_cache(page));
+		putback_lru_page(page);
+	}
+
+	return nr_reclaimed;
+}
+
 /*
  * Attempt to remove the specified page from its LRU.  Only take this page
  * if it is of the appropriate PageActive status.  Pages which are being
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2016-06-17  7:58 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <040501d1c55a$81d51910$857f4b30$@alibaba-inc.com>
2016-06-13 10:07 ` [PATCH v1 3/3] mm: per-process reclaim Hillf Danton
2016-06-15  0:46   ` Minchan Kim
2016-06-13  7:50 [PATCH v1 0/3] " Minchan Kim
2016-06-13  7:50 ` [PATCH v1 3/3] mm: " Minchan Kim
2016-06-13 15:06   ` Johannes Weiner
2016-06-15  0:40     ` Minchan Kim
2016-06-16 11:07       ` Michal Hocko
2016-06-16 14:41       ` Johannes Weiner
2016-06-17  6:43         ` Minchan Kim
2016-06-17  7:24     ` Balbir Singh
2016-06-17  7:57       ` Vinayak Menon
2016-06-13 17:06   ` Rik van Riel
2016-06-15  1:01     ` Minchan Kim

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).