All of lore.kernel.org
 help / color / mirror / Atom feed
From: Pavel Emelianov <xemul@sw.ru>
To: Andrew Morton <akpm@osdl.org>, Paul Menage <menage@google.com>,
	Srivatsa Vaddagiri <vatsa@in.ibm.com>,
	Balbir Singh <balbir@in.ibm.com>
Cc: devel@openvz.org,
	Linux Kernel Mailing List <linux-kernel@vger.kernel.org>,
	containers@lists.osdl.org, Kirill Korotaev <dev@sw.ru>
Subject: [RFC][PATCH 5/7] Per-container OOM killer and page reclamation
Date: Tue, 06 Mar 2007 18:03:21 +0300	[thread overview]
Message-ID: <45ED82B9.8090306@sw.ru> (raw)
In-Reply-To: <45ED7DEC.7010403@sw.ru>

[-- Attachment #1: Type: text/plain, Size: 410 bytes --]

* container_try_to_free_pages() walks containers
  page list and tries to shrink pages. This is based
  on try_to_free_pages() and Co code.
  Called from core code when no resource left at the
  moment of page touching.

* container_out_of_memory() selects a process to be
  killed which mm_struct belongs to container in question.
  Called from core code when no resources left and no
  pages were reclaimed.

[-- Attachment #2: diff-rss-container-oom-reclaim --]
[-- Type: text/plain, Size: 7897 bytes --]

diff -upr linux-2.6.20.orig/mm/oom_kill.c linux-2.6.20-0/mm/oom_kill.c
--- linux-2.6.20.orig/mm/oom_kill.c	2007-03-06 13:33:28.000000000 +0300
+++ linux-2.6.20-0/mm/oom_kill.c	2007-03-06 13:33:28.000000000 +0300
@@ -24,6 +24,7 @@
 #include <linux/cpuset.h>
 #include <linux/module.h>
 #include <linux/notifier.h>
+#include <linux/rss_container.h>
 
 int sysctl_panic_on_oom;
 /* #define DEBUG */
@@ -47,7 +48,8 @@ int sysctl_panic_on_oom;
  *    of least surprise ... (be careful when you change it)
  */
 
-unsigned long badness(struct task_struct *p, unsigned long uptime)
+unsigned long badness(struct task_struct *p, unsigned long uptime,
+		struct rss_container *rss)
 {
 	unsigned long points, cpu_time, run_time, s;
 	struct mm_struct *mm;
@@ -60,6 +62,13 @@ unsigned long badness(struct task_struct
 		return 0;
 	}
 
+#ifdef CONFIG_RSS_CONTAINER
+	if (rss != NULL && mm->rss_container != rss) {
+		task_unlock(p);
+		return 0;
+	}
+#endif
+
 	/*
 	 * The memory size of the process is the basis for the badness.
 	 */
@@ -200,7 +209,8 @@ static inline int constrained_alloc(stru
  *
  * (not docbooked, we don't want this one cluttering up the manual)
  */
-static struct task_struct *select_bad_process(unsigned long *ppoints)
+static struct task_struct *select_bad_process(unsigned long *ppoints,
+		struct rss_container *rss)
 {
 	struct task_struct *g, *p;
 	struct task_struct *chosen = NULL;
@@ -254,7 +264,7 @@ static struct task_struct *select_bad_pr
 		if (p->oomkilladj == OOM_DISABLE)
 			continue;
 
-		points = badness(p, uptime.tv_sec);
+		points = badness(p, uptime.tv_sec, rss);
 		if (points > *ppoints || !chosen) {
 			chosen = p;
 			*ppoints = points;
@@ -435,7 +445,7 @@ retry:
 		 * Rambo mode: Shoot down a process and hope it solves whatever
 		 * issues we may have.
 		 */
-		p = select_bad_process(&points);
+		p = select_bad_process(&points, NULL);
 
 		if (PTR_ERR(p) == -1UL)
 			goto out;
@@ -464,3 +474,27 @@ out:
 	if (!test_thread_flag(TIF_MEMDIE))
 		schedule_timeout_uninterruptible(1);
 }
+
+#ifdef CONFIG_RSS_CONTAINER
+void container_out_of_memory(struct rss_container *rss)
+{
+	unsigned long points = 0;
+	struct task_struct *p;
+
+	container_lock();
+	read_lock(&tasklist_lock);
+retry:
+	p = select_bad_process(&points, rss);
+	if (PTR_ERR(p) == -1UL)
+		goto out;
+
+	if (!p)
+		p = current;
+
+	if (oom_kill_process(p, points, "Container out of memory"))
+		goto retry;
+out:
+	read_unlock(&tasklist_lock);
+	container_unlock();
+}
+#endif
diff -upr linux-2.6.20.orig/mm/vmscan.c linux-2.6.20-0/mm/vmscan.c
--- linux-2.6.20.orig/mm/vmscan.c	2007-02-04 21:44:54.000000000 +0300
+++ linux-2.6.20-0/mm/vmscan.c	2007-03-06 13:33:28.000000000 +0300
@@ -45,6 +45,8 @@
 
 #include "internal.h"
 
+#include <linux/rss_container.h>
+
 struct scan_control {
 	/* Incremented by the number of inactive pages that were scanned */
 	unsigned long nr_scanned;
@@ -1097,6 +1099,194 @@ out:
 	return ret;
 }
 
+#ifdef CONFIG_RSS_CONTAINER
+/*
+ * These are containers' inactive and active pages shrinkers.
+ * Thes works like shrink_inactive_list() and shrink_active_list()
+ *
+ * Two main differences is that container_isolate_pages() is used to isolate
+ * pages, and that reclaim_mapped is considered to be 1 as hitting BC
+ * limit implies we have to shrink _mapped_ pages
+ */
+static unsigned long container_shrink_pages_inactive(unsigned long max_scan,
+		struct rss_container *rss, struct scan_control *sc)
+{
+	LIST_HEAD(page_list);
+	unsigned long nr_scanned = 0;
+	unsigned long nr_reclaimed = 0;
+
+	do {
+		struct page *page;
+		unsigned long nr_taken;
+		unsigned long nr_scan;
+		struct zone *z;
+
+		nr_taken = container_isolate_pages(sc->swap_cluster_max, rss,
+				&page_list, 0, &nr_scan);
+
+		nr_scanned += nr_scan;
+		nr_reclaimed += shrink_page_list(&page_list, sc);
+		if (nr_taken == 0)
+			goto done;
+
+		while (!list_empty(&page_list)) {
+			page = lru_to_page(&page_list);
+			z = page_zone(page);
+
+			spin_lock_irq(&z->lru_lock);
+			VM_BUG_ON(PageLRU(page));
+			SetPageLRU(page);
+			list_del(&page->lru);
+			if (PageActive(page))
+				add_page_to_active_list(z, page);
+			else
+				add_page_to_inactive_list(z, page);
+			spin_unlock_irq(&z->lru_lock);
+
+			put_page(page);
+		}
+  	} while (nr_scanned < max_scan);
+done:
+	return nr_reclaimed;
+}
+
+static void container_shrink_pages_active(unsigned long nr_pages,
+		struct rss_container *rss, struct scan_control *sc)
+{
+	LIST_HEAD(l_hold);
+	LIST_HEAD(l_inactive);
+	LIST_HEAD(l_active);
+	struct page *page;
+	unsigned long nr_scanned;
+	unsigned long nr_deactivated = 0;
+	struct zone *z;
+
+	container_isolate_pages(nr_pages, rss, &l_hold, 1, &nr_scanned);
+
+	while (!list_empty(&l_hold)) {
+		cond_resched();
+		page = lru_to_page(&l_hold);
+		list_del(&page->lru);
+		if (page_mapped(page)) {
+			if ((total_swap_pages == 0 && PageAnon(page)) ||
+			    page_referenced(page, 0)) {
+				list_add(&page->lru, &l_active);
+				continue;
+			}
+		}
+		nr_deactivated++;
+		list_add(&page->lru, &l_inactive);
+	}
+
+	while (!list_empty(&l_inactive)) {
+		page = lru_to_page(&l_inactive);
+		z = page_zone(page);
+
+		spin_lock_irq(&z->lru_lock);
+		VM_BUG_ON(PageLRU(page));
+		SetPageLRU(page);
+		VM_BUG_ON(!PageActive(page));
+		ClearPageActive(page);
+
+		list_move(&page->lru, &z->inactive_list);
+		z->nr_inactive++;
+		spin_unlock_irq(&z->lru_lock);
+
+		put_page(page);
+	}
+
+	while (!list_empty(&l_active)) {
+		page = lru_to_page(&l_active);
+		z = page_zone(page);
+
+		spin_lock_irq(&z->lru_lock);
+		VM_BUG_ON(PageLRU(page));
+		SetPageLRU(page);
+		VM_BUG_ON(!PageActive(page));
+		list_move(&page->lru, &z->active_list);
+		z->nr_active++;
+		spin_unlock_irq(&z->lru_lock);
+
+		put_page(page);
+	}
+}
+
+/*
+ * This is a reworked shrink_zone() routine - it scans active pages firts,
+ * then inactive and returns the number of pages reclaimed
+ */
+static unsigned long container_shrink_pages(int priority,
+		struct rss_container *rss, struct scan_control *sc)
+{
+	unsigned long nr_pages;
+	unsigned long nr_to_scan;
+	unsigned long nr_reclaimed = 0;
+
+	nr_pages = (container_nr_physpages(rss) >> priority) + 1;
+	if (nr_pages < sc->swap_cluster_max)
+		nr_pages = 0;
+
+	while (nr_pages) {
+		nr_to_scan = min(nr_pages, (unsigned long)sc->swap_cluster_max);
+		nr_pages -= nr_to_scan;
+		container_shrink_pages_active(nr_to_scan, rss, sc);
+	}
+
+	nr_pages = (container_nr_physpages(rss) >> priority) + 1;
+	if (nr_pages < sc->swap_cluster_max)
+		nr_pages = 0;
+
+	while (nr_pages) {
+		nr_to_scan = min(nr_pages, (unsigned long)sc->swap_cluster_max);
+		nr_pages -= nr_to_scan;
+		nr_reclaimed += container_shrink_pages_inactive(nr_to_scan, rss, sc);
+	}
+
+	throttle_vm_writeout();
+	return nr_reclaimed;
+}
+
+/*
+ * This functions works like try_to_free_pages() - it tries
+ * to shrink bc's pages with increasing priority
+ */
+unsigned long container_try_to_free_pages(struct rss_container *rss)
+{
+	int priority;
+	int ret = 0;
+	unsigned long total_scanned = 0;
+	unsigned long nr_reclaimed = 0;
+	struct scan_control sc = {
+		.gfp_mask = GFP_KERNEL,
+		.may_writepage = !laptop_mode,
+		.swap_cluster_max = SWAP_CLUSTER_MAX,
+		.may_swap = 1,
+		.swappiness = vm_swappiness,
+	};
+
+	for (priority = DEF_PRIORITY; priority >= 0; priority--) {
+		sc.nr_scanned = 0;
+		nr_reclaimed += container_shrink_pages(priority, rss, &sc);
+		total_scanned += sc.nr_scanned;
+		if (nr_reclaimed > 1) {
+			ret = 1;
+			goto out;
+		}
+
+		if (total_scanned > sc.swap_cluster_max +
+					sc.swap_cluster_max / 2) {
+			wakeup_pdflush(laptop_mode ? 0 : total_scanned);
+			sc.may_writepage = 1;
+		}
+
+		if (sc.nr_scanned && priority < DEF_PRIORITY - 2)
+			congestion_wait(WRITE, HZ/10);
+	}
+out:
+	return ret;
+}
+#endif
+
 /*
  * For kswapd, balance_pgdat() will work across all this node's zones until
  * they are all at pages_high.


  parent reply	other threads:[~2007-03-06 15:00 UTC|newest]

Thread overview: 129+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2007-03-06 14:42 [RFC][PATCH 0/7] Resource controllers based on process containers Pavel Emelianov
2007-03-06 14:49 ` [RFC][PATCH 1/7] Resource counters Pavel Emelianov
2007-03-07  4:03   ` Balbir Singh
2007-03-07  7:19     ` Pavel Emelianov
2007-03-09 16:37       ` Herbert Poetzl
2007-03-11  9:01         ` Pavel Emelianov
2007-03-11 19:00         ` Eric W. Biederman
2007-03-12  1:16           ` Herbert Poetzl
2007-03-13  9:09             ` Eric W. Biederman
2007-03-13  9:27               ` Pavel Emelianov
2007-03-13  9:49               ` [Devel] " Kirill Korotaev
2007-03-13 15:21               ` Herbert Poetzl
2007-03-13 15:41                 ` Pavel Emelianov
2007-03-13 16:07                   ` Srivatsa Vaddagiri
2007-03-14  7:12                     ` Pavel Emelianov
2007-03-15 16:51                       ` Eric W. Biederman
2007-03-13 16:32                   ` Herbert Poetzl
2007-03-06 14:55 ` [RFC][PATCH 2/7] RSS controller core Pavel Emelianov
2007-03-06 22:00   ` Andrew Morton
2007-03-09 16:48     ` Herbert Poetzl
2007-03-11  9:08       ` Pavel Emelianov
2007-03-11 14:32         ` Herbert Poetzl
2007-03-11 15:04           ` Pavel Emelianov
2007-03-12  0:41             ` Herbert Poetzl
2007-03-12  8:31               ` Pavel Emelianov
2007-03-12  9:55       ` Balbir Singh
2007-03-12 23:43         ` Herbert Poetzl
2007-03-13  1:57           ` Balbir Singh
2007-03-13  2:24             ` Srivatsa Vaddagiri
2007-03-13 16:06             ` Herbert Poetzl
2007-03-11 12:26     ` Kirill Korotaev
2007-03-11 12:51       ` Andrew Morton
2007-03-11 15:51         ` Balbir Singh
2007-03-11 19:34         ` Eric W. Biederman
2007-03-12  9:23           ` [Devel] " Kirill Korotaev
2007-03-13  9:26             ` Eric W. Biederman
2007-03-13 15:43               ` Kirill Korotaev
2007-03-12  1:00         ` Herbert Poetzl
2007-03-12  9:02           ` Pavel Emelianov
2007-03-12 21:11             ` Herbert Poetzl
2007-03-13  7:17               ` Pavel Emelianov
2007-03-13 15:05                 ` Herbert Poetzl
2007-03-13 15:32                   ` Pavel Emelianov
2007-03-13 15:10               ` Kirill Korotaev
2007-03-13 15:11                 ` Herbert Poetzl
2007-03-13 15:54                   ` Kirill Korotaev
2007-03-12 18:42           ` Dave Hansen
2007-03-12 22:41             ` Herbert Poetzl
2007-03-12 23:02               ` Dave Hansen
2007-03-18 16:58                 ` Eric W. Biederman
2007-03-13  6:04               ` Andrew Morton
2007-03-13 10:19                 ` [Devel] " Kirill Korotaev
2007-03-13 11:48                   ` Andrew Morton
2007-03-13 14:59                     ` Herbert Poetzl
2007-03-13 17:05                     ` Dave Hansen
2007-03-14 15:38                       ` Mel Gorman
2007-03-14 20:42                         ` Dave Hansen
2007-03-20 18:57                           ` Mel Gorman
2007-03-18 22:44                       ` [Devel] " Paul Menage
2007-03-19 17:41                         ` Eric W. Biederman
2007-03-13 17:26                 ` Dave Hansen
2007-03-13 19:09                   ` Alan Cox
2007-03-13 20:28                     ` Dave Hansen
2007-03-16  0:55                     ` Eric W. Biederman
2007-03-16 16:31                       ` Dave Hansen
2007-03-16 18:54                         ` Eric W. Biederman
2007-03-16 19:46                           ` Dave Hansen
2007-03-18 17:42                             ` Eric W. Biederman
2007-03-19 15:48                               ` Herbert Poetzl
2007-03-20 16:15                               ` controlling mmap()'d vs read/write() pages Dave Hansen
2007-03-20 21:19                                 ` Eric W. Biederman
2007-03-23  0:51                                   ` Herbert Poetzl
2007-03-23  5:57                                   ` Nick Piggin
2007-03-23 10:12                                     ` Eric W. Biederman
2007-03-23 10:47                                       ` Nick Piggin
2007-03-23 12:21                                         ` Eric W. Biederman
2007-03-28  7:33                                           ` Nick Piggin
2007-03-23 16:41                                       ` Dave Hansen
2007-03-23 18:16                                         ` Herbert Poetzl
2007-03-28  9:18                                           ` Balbir Singh
2007-03-14 16:47                   ` [RFC][PATCH 2/7] RSS controller core Mel Gorman
2007-03-07  5:37   ` Balbir Singh
2007-03-07  7:27     ` Pavel Emelianov
2007-03-06 14:58 ` [RFC][PATCH 3/7] Data structures changes for RSS accounting Pavel Emelianov
2007-03-11 19:13   ` Eric W. Biederman
2007-03-12 16:16     ` Kirill Korotaev
2007-03-12 16:48       ` Dave Hansen
2007-03-12 17:19         ` Pavel Emelianov
2007-03-12 17:27           ` Dave Hansen
2007-03-13  7:10             ` Pavel Emelianov
2007-03-12 17:21         ` Balbir Singh
2007-03-06 15:00 ` [RFC][PATCH 4/7] RSS accounting hooks over the code Pavel Emelianov
2007-03-11 19:14   ` Eric W. Biederman
2007-03-12 16:23     ` Kirill Korotaev
2007-03-12 16:50       ` Dave Hansen
2007-03-12 17:07         ` Kirill Korotaev
2007-03-12 17:33           ` Dave Hansen
2007-03-13  9:43             ` Eric W. Biederman
2007-03-12 23:54         ` Herbert Poetzl
2007-03-13  9:58           ` Eric W. Biederman
2007-03-13 10:25             ` Nick Piggin
2007-03-13 16:01               ` Eric W. Biederman
2007-03-14  3:51                 ` Nick Piggin
2007-03-14  6:42                   ` Balbir Singh
2007-03-14  6:57                     ` Nick Piggin
2007-03-14  7:48                       ` Balbir Singh
2007-03-14 13:25                         ` Vaidyanathan Srinivasan
2007-03-14 13:49                           ` Nick Piggin
2007-03-14 14:43                             ` Vaidyanathan Srinivasan
2007-03-14 16:16                             ` Kirill Korotaev
2007-03-15  5:01                               ` Nick Piggin
2007-03-15  5:44                                 ` Balbir Singh
2007-03-28 20:15               ` Ethan Solomita
2007-03-14 15:37   ` Cedric Le Goater
2007-03-14 15:45     ` Pavel Emelianov
2007-03-06 15:03 ` Pavel Emelianov [this message]
2007-03-09 21:21   ` [RFC][PATCH 5/7] Per-container OOM killer and page reclamation Balbir Singh
2007-03-11  8:41     ` Pavel Emelianov
2007-03-06 15:04 ` [RFC][PATCH 6/7] Account for the number of tasks within container Pavel Emelianov
2007-03-07  2:00   ` Paul Menage
2007-03-07  7:13     ` Pavel Emelianov
2007-03-08 13:49       ` Paul Menage
2007-03-11  8:36         ` Pavel Emelianov
2007-03-06 15:07 ` [RFC][PATCH 7/7] Account for the number of files opened " Pavel Emelianov
2007-03-07  2:02 ` [RFC][PATCH 0/7] Resource controllers based on process containers Paul Menage
2007-03-07  7:30   ` Pavel Emelianov
2007-03-07  6:52 ` Balbir Singh
2007-03-07  7:32   ` Pavel Emelianov
2007-03-07  9:43     ` Kirill Korotaev

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=45ED82B9.8090306@sw.ru \
    --to=xemul@sw.ru \
    --cc=akpm@osdl.org \
    --cc=balbir@in.ibm.com \
    --cc=containers@lists.osdl.org \
    --cc=dev@sw.ru \
    --cc=devel@openvz.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=menage@google.com \
    --cc=vatsa@in.ibm.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.