* OOM fixes 1/5
From: Andrea Arcangeli @ 2005-01-21  5:48 UTC
  To: Andrew Morton; +Cc: linux-kernel, Nick Piggin

I'm sending 5 patches, each incremental on the previous one, updated to
the latest bk snapshot I could find on kernel.org [the kernel cvs is
still unusable for me, is it my mistake?]

From: garloff@suse.de
Subject: protect-pids

This is protect-pids, a patch to allow the admin to tune the oom killer.
The tweak is inherited from parent to child, so it's easy to write a
wrapper for complex apps.
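
(For illustration only, not part of the patch: with these /proc files a
minimal wrapper can lower its own oom_adj before exec'ing the real app,
and everything it spawns inherits the value. A user-space sketch under
the interface added below; writing needs CAP_SYS_RESOURCE, and the
value is a bit shift in the -16..15 range applied to the badness
score.)

#include <stdio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	FILE *f;

	if (argc < 2)
		return 1;
	/* -16 shifts the badness score right by 16 bits: nearly immune */
	f = fopen("/proc/self/oom_adj", "w");
	if (f) {
		fprintf(f, "-16\n");
		fclose(f);
	}
	execvp(argv[1], argv + 1);	/* the setting survives the exec */
	perror("execvp");
	return 1;
}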

I made used_math a char in light of later patches. The current patch
breaks alpha, but future patches will fix it.

Signed-off-by: Andrea Arcangeli <andrea@suse.de>

--- mainline/fs/proc/base.c	2005-01-15 20:44:58.000000000 +0100
+++ mainline-1/fs/proc/base.c	2005-01-20 18:26:29.000000000 +0100
@@ -72,6 +72,8 @@ enum pid_directory_inos {
 	PROC_TGID_ATTR_FSCREATE,
 #endif
 	PROC_TGID_FD_DIR,
+	PROC_TGID_OOM_SCORE,
+	PROC_TGID_OOM_ADJUST,
 	PROC_TID_INO,
 	PROC_TID_STATUS,
 	PROC_TID_MEM,
@@ -98,6 +100,8 @@ enum pid_directory_inos {
 	PROC_TID_ATTR_FSCREATE,
 #endif
 	PROC_TID_FD_DIR = 0x8000,	/* 0x8000-0xffff */
+	PROC_TID_OOM_SCORE,
+	PROC_TID_OOM_ADJUST,
 };
 
 struct pid_entry {
@@ -133,6 +137,8 @@ static struct pid_entry tgid_base_stuff[
 #ifdef CONFIG_SCHEDSTATS
 	E(PROC_TGID_SCHEDSTAT, "schedstat", S_IFREG|S_IRUGO),
 #endif
+	E(PROC_TGID_OOM_SCORE, "oom_score",S_IFREG|S_IRUGO),
+	E(PROC_TGID_OOM_ADJUST,"oom_adj", S_IFREG|S_IRUGO|S_IWUSR),
 	{0,0,NULL,0}
 };
 static struct pid_entry tid_base_stuff[] = {
@@ -158,6 +164,8 @@ static struct pid_entry tid_base_stuff[]
 #ifdef CONFIG_SCHEDSTATS
 	E(PROC_TID_SCHEDSTAT, "schedstat",S_IFREG|S_IRUGO),
 #endif
+	E(PROC_TID_OOM_SCORE,  "oom_score",S_IFREG|S_IRUGO),
+	E(PROC_TID_OOM_ADJUST, "oom_adj", S_IFREG|S_IRUGO|S_IWUSR),
 	{0,0,NULL,0}
 };
 
@@ -384,6 +392,18 @@ static int proc_pid_schedstat(struct tas
 }
 #endif
 
+/* The badness from the OOM killer */
+unsigned long badness(struct task_struct *p, unsigned long uptime);
+static int proc_oom_score(struct task_struct *task, char *buffer)
+{
+	unsigned long points;
+	struct timespec uptime;
+
+	do_posix_clock_monotonic_gettime(&uptime);
+	points = badness(task, uptime.tv_sec);
+	return sprintf(buffer, "%lu\n", points);
+}
+
 /************************************************************************/
 /*                       Here the fs part begins                        */
 /************************************************************************/
@@ -657,6 +677,55 @@ static struct file_operations proc_mem_o
 	.open		= mem_open,
 };
 
+static ssize_t oom_adjust_read(struct file * file, char * buf,
+				size_t count, loff_t *ppos)
+{
+	struct task_struct *task = proc_task(file->f_dentry->d_inode);
+	char buffer[8];
+	size_t len;
+	int oom_adjust = task->oomkilladj;
+
+	len = sprintf(buffer, "%i\n", oom_adjust) + 1;
+	if (*ppos >= len)
+		return 0;
+	if (count > len-*ppos)
+		count = len-*ppos;
+	if (copy_to_user(buf, buffer + *ppos, count)) 
+		return -EFAULT;
+	*ppos += count;
+	return count;
+}
+
+static ssize_t oom_adjust_write(struct file * file, const char * buf,
+				size_t count, loff_t *ppos)
+{
+	struct task_struct *task = proc_task(file->f_dentry->d_inode);
+	char buffer[8], *end;
+	int oom_adjust;
+
+	if (!capable(CAP_SYS_RESOURCE))
+		return -EPERM;
+	memset(buffer, 0, 8);	
+	if (count > 6)
+		count = 6;
+	if (copy_from_user(buffer, buf, count)) 
+		return -EFAULT;
+	oom_adjust = simple_strtol(buffer, &end, 0);
+	if (oom_adjust < -16 || oom_adjust > 15)
+		return -EINVAL;
+	if (*end == '\n')
+		end++;
+	task->oomkilladj = oom_adjust;
+	if (end - buffer == 0) 
+		return -EIO;
+	return end - buffer;
+}
+
+static struct file_operations proc_oom_adjust_operations = {
+	read:		oom_adjust_read,
+	write:		oom_adjust_write,
+};
+
 static struct inode_operations proc_mem_inode_operations = {
 	.permission	= proc_permission,
 };
@@ -1336,6 +1405,15 @@ static struct dentry *proc_pident_lookup
 			ei->op.proc_read = proc_pid_schedstat;
 			break;
 #endif
+		case PROC_TID_OOM_SCORE:	
+		case PROC_TGID_OOM_SCORE:
+			inode->i_fop = &proc_info_file_operations;
+			ei->op.proc_read = proc_oom_score;
+			break;
+		case PROC_TID_OOM_ADJUST:
+		case PROC_TGID_OOM_ADJUST:
+			inode->i_fop = &proc_oom_adjust_operations;
+			break;
 		default:
 			printk("procfs: impossible type (%d)",p->type);
 			iput(inode);
--- mainline/include/linux/sched.h	2005-01-20 18:20:10.000000000 +0100
+++ mainline-1/include/linux/sched.h	2005-01-20 18:27:45.000000000 +0100
@@ -614,7 +614,19 @@ struct task_struct {
 	struct key *process_keyring;	/* keyring private to this process (CLONE_THREAD) */
 	struct key *thread_keyring;	/* keyring private to this thread */
 #endif
-	unsigned short used_math;
+/*
+ * Must be changed atomically, so it shouldn't
+ * be a shareable bitflag.
+ */
+	unsigned char used_math;
+/*
+ * OOM kill score adjustment (bit shift).
+ * Cannot live together with used_math since
+ * used_math and oomkilladj can be changed at the
+ * same time, so they would race if they're in the
+ * same atomic block.
+ */
+	short oomkilladj;
 	char comm[TASK_COMM_LEN];
 /* file system info */
 	int link_count, total_link_count;
--- mainline/mm/oom_kill.c	2005-01-15 20:45:00.000000000 +0100
+++ mainline-1/mm/oom_kill.c	2005-01-20 18:26:30.000000000 +0100
@@ -42,7 +42,7 @@
  *    of least surprise ... (be careful when you change it)
  */
 
-static unsigned long badness(struct task_struct *p, unsigned long uptime)
+unsigned long badness(struct task_struct *p, unsigned long uptime)
 {
 	unsigned long points, cpu_time, run_time, s;
 
@@ -99,6 +99,17 @@ static unsigned long badness(struct task
 	 */
 	if (cap_t(p->cap_effective) & CAP_TO_MASK(CAP_SYS_RAWIO))
 		points /= 4;
+
+	/* 
+	 * Adjust the score by oomkilladj.
+	 */
+	if (p->oomkilladj) {
+		if (p->oomkilladj > 0)
+			points <<= p->oomkilladj;
+		else
+			points >>= -(p->oomkilladj);
+	}
+		
 #ifdef DEBUG
 	printk(KERN_DEBUG "OOMkill: task %d (%s) got %d points\n",
 	p->pid, p->comm, points);


* OOM fixes 2/5
From: Andrea Arcangeli @ 2005-01-21  5:49 UTC
  To: Andrew Morton; +Cc: linux-kernel, Nick Piggin

From: Andrea Arcangeli <andrea@suse.de>
Subject: keep balance between different classzones

This is the forward port to 2.6 of the lowmem_reserve algorithm I
invented in 2.4.1* and already merged in 2.4.2x. It was needed to fix
workloads like google (especially without swap) on x86 with >1G of ram,
but it helps all sorts of workloads with lots of ram on x86, and it's
also needed on x86-64 for dma allocations. This brings 2.6 in sync with
the latest 2.4.2x.
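
(To make the math concrete, here is a standalone user-space sketch of
the loop in setup_per_zone_lowmem_reserve() below, with the default
{ 256, 32 } ratios and hypothetical zone sizes for a 1G box: 16M dma,
784M normal, 224M high, 4k pages. It reproduces the figures quoted in
the page_alloc.c comment; the sketch is mine, not kernel code.)

#include <stdio.h>

#define MAX_NR_ZONES 3	/* dma, normal, highmem */

int main(void)
{
	/* hypothetical present_pages: 16M, 784M, 224M at 4k per page */
	unsigned long present[MAX_NR_ZONES] = { 4096, 200704, 57344 };
	unsigned long reserve[MAX_NR_ZONES][MAX_NR_ZONES] = { { 0 } };
	int ratio[MAX_NR_ZONES - 1] = { 256, 32 };
	int j, idx;

	for (j = 0; j < MAX_NR_ZONES; j++) {
		unsigned long pages = present[j];

		for (idx = j - 1; idx >= 0; idx--) {
			/* zone idx keeps pages/ratio[idx] free for class j */
			reserve[idx][j] = pages / ratio[idx];
			pages += present[idx];
		}
	}

	for (j = 0; j < MAX_NR_ZONES; j++)
		printf("zone %d lowmem_reserve: %lu %lu %lu\n",
		       j, reserve[j][0], reserve[j][1], reserve[j][2]);
	return 0;
}

The dma zone ends up with { 0, 784, 1008 } pages reserved, i.e.
784M/256 against NORMAL allocations and (224M+784M)/256 against HIGHMEM
ones, and the normal zone keeps 224M/32 free against HIGHMEM
allocations.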

Signed-off-by: Andrea Arcangeli <andrea@suse.de>

--- mainline-2/include/linux/mmzone.h.orig	2005-01-15 20:45:00.000000000 +0100
+++ mainline-2/include/linux/mmzone.h	2005-01-21 05:55:28.644869648 +0100
@@ -112,18 +112,14 @@ struct zone {
 	unsigned long		free_pages;
 	unsigned long		pages_min, pages_low, pages_high;
 	/*
-	 * protection[] is a pre-calculated number of extra pages that must be
-	 * available in a zone in order for __alloc_pages() to allocate memory
-	 * from the zone. i.e., for a GFP_KERNEL alloc of "order" there must
-	 * be "(1<<order) + protection[ZONE_NORMAL]" free pages in the zone
-	 * for us to choose to allocate the page from that zone.
-	 *
-	 * It uses both min_free_kbytes and sysctl_lower_zone_protection.
-	 * The protection values are recalculated if either of these values
-	 * change.  The array elements are in zonelist order:
-	 *	[0] == GFP_DMA, [1] == GFP_KERNEL, [2] == GFP_HIGHMEM.
+	 * We don't know if the memory that we're going to allocate will be freeable
+	 * or whether it will eventually be released, so to avoid totally wasting
+	 * several GB of ram we must reserve some of the lower zone memory (otherwise
+	 * we risk running OOM on the lower zones even though there's tons of freeable
+	 * ram on the higher zones). This array is recalculated at runtime if the
+	 * sysctl_lowmem_reserve_ratio sysctl changes.
 	 */
-	unsigned long		protection[MAX_NR_ZONES];
+	unsigned long		lowmem_reserve[MAX_NR_ZONES];
 
 	struct per_cpu_pageset	pageset[NR_CPUS];
 
@@ -368,7 +364,8 @@ struct ctl_table;
 struct file;
 int min_free_kbytes_sysctl_handler(struct ctl_table *, int, struct file *, 
 					void __user *, size_t *, loff_t *);
-int lower_zone_protection_sysctl_handler(struct ctl_table *, int, struct file *,
+extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1];
+int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int, struct file *,
 					void __user *, size_t *, loff_t *);
 
 #include <linux/topology.h>
--- mainline-2/include/linux/sysctl.h.orig	2005-01-15 20:45:00.000000000 +0100
+++ mainline-2/include/linux/sysctl.h	2005-01-21 05:55:28.646869344 +0100
@@ -160,7 +160,7 @@ enum
 	VM_PAGEBUF=17,		/* struct: Control pagebuf parameters */
 	VM_HUGETLB_PAGES=18,	/* int: Number of available Huge Pages */
 	VM_SWAPPINESS=19,	/* Tendency to steal mapped memory */
-	VM_LOWER_ZONE_PROTECTION=20,/* Amount of protection of lower zones */
+	VM_LOWMEM_RESERVE_RATIO=20,/* reservation ratio for lower memory zones */
 	VM_MIN_FREE_KBYTES=21,	/* Minimum free kilobytes to maintain */
 	VM_MAX_MAP_COUNT=22,	/* int: Maximum number of mmaps/address-space */
 	VM_LAPTOP_MODE=23,	/* vm laptop mode */
--- mainline-2/kernel/sysctl.c.orig	2005-01-15 20:45:00.000000000 +0100
+++ mainline-2/kernel/sysctl.c	2005-01-21 05:55:28.648869040 +0100
@@ -61,7 +61,6 @@ extern int core_uses_pid;
 extern char core_pattern[];
 extern int cad_pid;
 extern int pid_max;
-extern int sysctl_lower_zone_protection;
 extern int min_free_kbytes;
 extern int printk_ratelimit_jiffies;
 extern int printk_ratelimit_burst;
@@ -745,14 +744,13 @@ static ctl_table vm_table[] = {
 	 },
 #endif
 	{
-		.ctl_name	= VM_LOWER_ZONE_PROTECTION,
-		.procname	= "lower_zone_protection",
-		.data		= &sysctl_lower_zone_protection,
-		.maxlen		= sizeof(sysctl_lower_zone_protection),
+		.ctl_name	= VM_LOWMEM_RESERVE_RATIO,
+		.procname	= "lowmem_reserve_ratio",
+		.data		= &sysctl_lowmem_reserve_ratio,
+		.maxlen		= sizeof(sysctl_lowmem_reserve_ratio),
 		.mode		= 0644,
-		.proc_handler	= &lower_zone_protection_sysctl_handler,
+		.proc_handler	= &lowmem_reserve_ratio_sysctl_handler,
 		.strategy	= &sysctl_intvec,
-		.extra1		= &zero,
 	},
 	{
 		.ctl_name	= VM_MIN_FREE_KBYTES,
--- mainline-2/mm/page_alloc.c.orig	2005-01-15 20:45:00.000000000 +0100
+++ mainline-2/mm/page_alloc.c	2005-01-21 05:58:53.338751448 +0100
@@ -44,7 +44,15 @@ struct pglist_data *pgdat_list;
 unsigned long totalram_pages;
 unsigned long totalhigh_pages;
 long nr_swap_pages;
-int sysctl_lower_zone_protection = 0;
+/*
+ * results with 256, 32 in the lowmem_reserve sysctl:
+ *	1G machine -> (16M dma, 800M-16M normal, 1G-800M high)
+ *	1G machine -> (16M dma, 784M normal, 224M high)
+ *	NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA
+ *	HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL
+	 *	HIGHMEM allocation will leave (224M+784M)/256 of ram reserved in ZONE_DMA
+ */
+int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { 256, 32 };
 
 EXPORT_SYMBOL(totalram_pages);
 EXPORT_SYMBOL(nr_swap_pages);
@@ -654,7 +662,7 @@ buffered_rmqueue(struct zone *zone, int 
  * of the allocation.
  */
 int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
-		int alloc_type, int can_try_harder, int gfp_high)
+		      int classzone_idx, int can_try_harder, int gfp_high)
 {
 	/* free_pages my go negative - that's OK */
 	long min = mark, free_pages = z->free_pages - (1 << order) + 1;
@@ -665,7 +673,7 @@ int zone_watermark_ok(struct zone *z, in
 	if (can_try_harder)
 		min -= min / 4;
 
-	if (free_pages <= min + z->protection[alloc_type])
+	if (free_pages <= min + z->lowmem_reserve[classzone_idx])
 		return 0;
 	for (o = 0; o < order; o++) {
 		/* At the next order, this order's pages become unavailable */
@@ -682,19 +690,6 @@ int zone_watermark_ok(struct zone *z, in
 
 /*
  * This is the 'heart' of the zoned buddy allocator.
- *
- * Herein lies the mysterious "incremental min".  That's the
- *
- *	local_low = z->pages_low;
- *	min += local_low;
- *
- * thing.  The intent here is to provide additional protection to low zones for
- * allocation requests which _could_ use higher zones.  So a GFP_HIGHMEM
- * request is not allowed to dip as deeply into the normal zone as a GFP_KERNEL
- * request.  This preserves additional space in those lower zones for requests
- * which really do need memory from those zones.  It means that on a decent
- * sized machine, GFP_HIGHMEM and GFP_KERNEL requests basically leave the DMA
- * zone untouched.
  */
 struct page * fastcall
 __alloc_pages(unsigned int gfp_mask, unsigned int order,
@@ -706,7 +701,7 @@ __alloc_pages(unsigned int gfp_mask, uns
 	struct reclaim_state reclaim_state;
 	struct task_struct *p = current;
 	int i;
-	int alloc_type;
+	int classzone_idx;
 	int do_retry;
 	int can_try_harder;
 
@@ -726,13 +721,13 @@ __alloc_pages(unsigned int gfp_mask, uns
 		return NULL;
 	}
 
-	alloc_type = zone_idx(zones[0]);
+	classzone_idx = zone_idx(zones[0]);
 
 	/* Go through the zonelist once, looking for a zone with enough free */
 	for (i = 0; (z = zones[i]) != NULL; i++) {
 
 		if (!zone_watermark_ok(z, order, z->pages_low,
-				alloc_type, 0, 0))
+				       classzone_idx, 0, 0))
 			continue;
 
 		page = buffered_rmqueue(z, order, gfp_mask);
@@ -749,8 +744,8 @@ __alloc_pages(unsigned int gfp_mask, uns
 	 */
 	for (i = 0; (z = zones[i]) != NULL; i++) {
 		if (!zone_watermark_ok(z, order, z->pages_min,
-				alloc_type, can_try_harder,
-				gfp_mask & __GFP_HIGH))
+				       classzone_idx, can_try_harder,
+				       gfp_mask & __GFP_HIGH))
 			continue;
 
 		page = buffered_rmqueue(z, order, gfp_mask);
@@ -787,8 +782,8 @@ rebalance:
 	/* go through the zonelist yet one more time */
 	for (i = 0; (z = zones[i]) != NULL; i++) {
 		if (!zone_watermark_ok(z, order, z->pages_min,
-				alloc_type, can_try_harder,
-				gfp_mask & __GFP_HIGH))
+				       classzone_idx, can_try_harder,
+				       gfp_mask & __GFP_HIGH))
 			continue;
 
 		page = buffered_rmqueue(z, order, gfp_mask);
@@ -1198,9 +1193,9 @@ void show_free_areas(void)
 			zone->pages_scanned,
 			(zone->all_unreclaimable ? "yes" : "no")
 			);
-		printk("protections[]:");
+		printk("lowmem_reserve[]:");
 		for (i = 0; i < MAX_NR_ZONES; i++)
-			printk(" %lu", zone->protection[i]);
+			printk(" %lu", zone->lowmem_reserve[i]);
 		printk("\n");
 	}
 
@@ -1872,87 +1867,29 @@ void __init page_alloc_init(void)
 	hotcpu_notifier(page_alloc_cpu_notify, 0);
 }
 
-static unsigned long higherzone_val(struct zone *z, int max_zone,
-					int alloc_type)
-{
-	int z_idx = zone_idx(z);
-	struct zone *higherzone;
-	unsigned long pages;
-
-	/* there is no higher zone to get a contribution from */
-	if (z_idx == MAX_NR_ZONES-1)
-		return 0;
-
-	higherzone = &z->zone_pgdat->node_zones[z_idx+1];
-
-	/* We always start with the higher zone's protection value */
-	pages = higherzone->protection[alloc_type];
-
-	/*
-	 * We get a lower-zone-protection contribution only if there are
-	 * pages in the higher zone and if we're not the highest zone
-	 * in the current zonelist.  e.g., never happens for GFP_DMA. Happens
-	 * only for ZONE_DMA in a GFP_KERNEL allocation and happens for ZONE_DMA
-	 * and ZONE_NORMAL for a GFP_HIGHMEM allocation.
-	 */
-	if (higherzone->present_pages && z_idx < alloc_type)
-		pages += higherzone->pages_low * sysctl_lower_zone_protection;
-
-	return pages;
-}
-
 /*
- * setup_per_zone_protection - called whenver min_free_kbytes or
- *	sysctl_lower_zone_protection changes.  Ensures that each zone
- *	has a correct pages_protected value, so an adequate number of
+ * setup_per_zone_lowmem_reserve - called whenever
+ *	sysctl_lowmem_reserve_ratio changes.  Ensures that each zone
+ *	has a correct pages reserved value, so an adequate number of
  *	pages are left in the zone after a successful __alloc_pages().
- *
- *	This algorithm is way confusing.  I tries to keep the same behavior
- *	as we had with the incremental min iterative algorithm.
  */
-static void setup_per_zone_protection(void)
+static void setup_per_zone_lowmem_reserve(void)
 {
 	struct pglist_data *pgdat;
-	struct zone *zones, *zone;
-	int max_zone;
-	int i, j;
+	int j, idx;
 
 	for_each_pgdat(pgdat) {
-		zones = pgdat->node_zones;
+		for (j = 0; j < MAX_NR_ZONES; j++) {
+			struct zone * zone = pgdat->node_zones + j;
+			unsigned long present_pages = zone->present_pages;
 
-		for (i = 0, max_zone = 0; i < MAX_NR_ZONES; i++)
-			if (zones[i].present_pages)
-				max_zone = i;
+			zone->lowmem_reserve[j] = 0;
 
-		/*
-		 * For each of the different allocation types:
-		 * GFP_DMA -> GFP_KERNEL -> GFP_HIGHMEM
-		 */
-		for (i = 0; i < GFP_ZONETYPES; i++) {
-			/*
-			 * For each of the zones:
-			 * ZONE_HIGHMEM -> ZONE_NORMAL -> ZONE_DMA
-			 */
-			for (j = MAX_NR_ZONES-1; j >= 0; j--) {
-				zone = &zones[j];
+			for (idx = j-1; idx >= 0; idx--) {
+				struct zone * lower_zone = pgdat->node_zones + idx;
 
-				/*
-				 * We never protect zones that don't have memory
-				 * in them (j>max_zone) or zones that aren't in
-				 * the zonelists for a certain type of
-				 * allocation (j>=i).  We have to assign these
-				 * to zero because the lower zones take
-				 * contributions from the higher zones.
-				 */
-				if (j > max_zone || j >= i) {
-					zone->protection[i] = 0;
-					continue;
-				}
-				/*
-				 * The contribution of the next higher zone
-				 */
-				zone->protection[i] = higherzone_val(zone,
-								max_zone, i);
+				lower_zone->lowmem_reserve[j] = present_pages / sysctl_lowmem_reserve_ratio[idx];
+				present_pages += lower_zone->present_pages;
 			}
 		}
 	}
@@ -2047,7 +1984,7 @@ static int __init init_per_zone_pages_mi
 	if (min_free_kbytes > 65536)
 		min_free_kbytes = 65536;
 	setup_per_zone_pages_min();
-	setup_per_zone_protection();
+	setup_per_zone_lowmem_reserve();
 	return 0;
 }
 module_init(init_per_zone_pages_min)
@@ -2062,20 +1999,23 @@ int min_free_kbytes_sysctl_handler(ctl_t
 {
 	proc_dointvec(table, write, file, buffer, length, ppos);
 	setup_per_zone_pages_min();
-	setup_per_zone_protection();
 	return 0;
 }
 
 /*
- * lower_zone_protection_sysctl_handler - just a wrapper around
- *	proc_dointvec() so that we can call setup_per_zone_protection()
- *	whenever sysctl_lower_zone_protection changes.
+ * lowmem_reserve_ratio_sysctl_handler - just a wrapper around
+ *	proc_dointvec() so that we can call setup_per_zone_lowmem_reserve()
+ *	whenever sysctl_lowmem_reserve_ratio changes.
+ *
+ * The reserve ratio obviously has absolutely no relation with the
+ * pages_min watermarks. The lowmem reserve ratio can only make sense
+ * as a function of the boot-time zone sizes.
  */
-int lower_zone_protection_sysctl_handler(ctl_table *table, int write,
+int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
 		 struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
 {
 	proc_dointvec_minmax(table, write, file, buffer, length, ppos);
-	setup_per_zone_protection();
+	setup_per_zone_lowmem_reserve();
 	return 0;
 }
 


* OOM fixes 3/5
From: Andrea Arcangeli @ 2005-01-21  5:49 UTC
  To: Andrew Morton; +Cc: linux-kernel, Nick Piggin

From: Andrea Arcangeli <andrea@suse.de>
Subject: fix several oom killer bugs, most importantly avoiding spurious oom
 kills; badness algorithm tweaked by Thomas Gleixner to deal with fork bombs

This is the core of the oom-killer fixes I developed, partly taking
from Thomas's patches the idea of getting feedback from the exit path;
I also moved the oom killer into page_alloc.c, where it belongs, so it
can check the watermarks before killing more stuff. This also tweaks
the badness to take thread bombs more into account (that change to
badness is from Thomas; for my part I'd rather rewrite badness from
scratch instead, but that's an orthogonal issue ;). With this applied
the oom killer is very sane: no more 5 sec waits and spurious oom
kills.
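
(One detail worth calling out: select_bad_process() below now returns
ERR_PTR(-1UL) to tell the caller to back off while some task is already
releasing memory. A minimal user-space sketch of that convention; the
names are mine, not from the patch.)

#include <stdio.h>

static inline void *ERR_PTR(long error) { return (void *)error; }
static inline long PTR_ERR(const void *ptr) { return (long)ptr; }

struct task { const char *comm; };

/* returns a victim, NULL (nothing killable) or ERR_PTR(-1UL) (back off) */
static void *select_victim(int someone_exiting, struct task *best)
{
	if (someone_exiting)
		return ERR_PTR(-1UL);
	return best;
}

int main(void)
{
	struct task hog = { "memhog" };
	void *p = select_victim(1, &hog);

	if (PTR_ERR(p) == -1UL)
		printf("a task is already exiting, wait instead of killing\n");
	else if (!p)
		printf("no killable processes\n");
	else
		printf("would kill %s\n", ((struct task *)p)->comm);
	return 0;
}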

Signed-off-by: Andrea Arcangeli <andrea@suse.de>

--- mainline-2/include/linux/sched.h	2005-01-20 18:27:45.000000000 +0100
+++ mainline-3/include/linux/sched.h	2005-01-21 06:01:08.585190864 +0100
@@ -615,6 +615,11 @@ struct task_struct {
 	struct key *thread_keyring;	/* keyring private to this thread */
 #endif
 /*
+ * All archs should support atomic ops with
+ * 1 byte granularity.
+ */
+	unsigned char memdie;
+/*
  * Must be changed atomically, so it shouldn't
  * be a shareable bitflag.
  */
@@ -736,8 +741,7 @@ do { if (atomic_dec_and_test(&(tsk)->usa
 #define PF_DUMPCORE	0x00000200	/* dumped core */
 #define PF_SIGNALED	0x00000400	/* killed by a signal */
 #define PF_MEMALLOC	0x00000800	/* Allocating memory */
-#define PF_MEMDIE	0x00001000	/* Killed for out-of-memory */
-#define PF_FLUSHER	0x00002000	/* responsible for disk writeback */
+#define PF_FLUSHER	0x00001000	/* responsible for disk writeback */
 
 #define PF_FREEZE	0x00004000	/* this task is being frozen for suspend now */
 #define PF_NOFREEZE	0x00008000	/* this thread should not be frozen */
--- mainline-2/mm/oom_kill.c	2005-01-20 18:26:30.000000000 +0100
+++ mainline-3/mm/oom_kill.c	2005-01-21 06:14:00.290873768 +0100
@@ -45,18 +45,30 @@
 unsigned long badness(struct task_struct *p, unsigned long uptime)
 {
 	unsigned long points, cpu_time, run_time, s;
+	struct list_head *tsk;
 
 	if (!p->mm)
 		return 0;
 
-	if (p->flags & PF_MEMDIE)
-		return 0;
 	/*
 	 * The memory size of the process is the basis for the badness.
 	 */
 	points = p->mm->total_vm;
 
 	/*
+	 * Processes which fork a lot of child processes are likely
+	 * a good choice. We add the vmsize of the children if they
+	 * have their own mm. This prevents forking servers from
+	 * flooding the machine with an endless amount of children.
+	 */
+	list_for_each(tsk, &p->children) {
+		struct task_struct *chld;
+		chld = list_entry(tsk, struct task_struct, sibling);
+		if (chld->mm != p->mm && chld->mm)
+			points += chld->mm->total_vm;
+	}
+
+	/*
 	 * CPU time is in tens of seconds and run time is in thousands
          * of seconds. There is no particular reason for this other than
          * that it turned out to work very well in practice.
@@ -132,14 +144,24 @@ static struct task_struct * select_bad_p
 
 	do_posix_clock_monotonic_gettime(&uptime);
 	do_each_thread(g, p)
-		if (p->pid) {
-			unsigned long points = badness(p, uptime.tv_sec);
-			if (points > maxpoints) {
+		/* skip the init task with pid == 1 */
+		if (p->pid > 1) {
+			unsigned long points;
+
+			/*
+			 * This task is in the process of releasing memory, so wait
+			 * for it to finish before killing some other task by mistake.
+			 */
+			if ((p->memdie || (p->flags & PF_EXITING)) && !(p->flags & PF_DEAD))
+				return ERR_PTR(-1UL);
+			if (p->flags & PF_SWAPOFF)
+				return p;
+
+			points = badness(p, uptime.tv_sec);
+			if (points > maxpoints || !chosen) {
 				chosen = p;
 				maxpoints = points;
 			}
-			if (p->flags & PF_SWAPOFF)
-				return p;
 		}
 	while_each_thread(g, p);
 	return chosen;
@@ -152,6 +174,12 @@ static struct task_struct * select_bad_p
  */
 static void __oom_kill_task(task_t *p)
 {
+	if (p->pid == 1) {
+		WARN_ON(1);
+		printk(KERN_WARNING "tried to kill init!\n");
+		return;
+	}
+
 	task_lock(p);
 	if (!p->mm || p->mm == &init_mm) {
 		WARN_ON(1);
@@ -168,7 +196,7 @@ static void __oom_kill_task(task_t *p)
 	 * exit() and clear out its resources quickly...
 	 */
 	p->time_slice = HZ;
-	p->flags |= PF_MEMALLOC | PF_MEMDIE;
+	p->memdie = 1;
 
 	/* This process has hardware access, be more careful. */
 	if (cap_t(p->cap_effective) & CAP_TO_MASK(CAP_SYS_RAWIO)) {
@@ -181,12 +209,45 @@ static void __oom_kill_task(task_t *p)
 static struct mm_struct *oom_kill_task(task_t *p)
 {
 	struct mm_struct *mm = get_task_mm(p);
-	if (!mm || mm == &init_mm)
+	task_t * g, * q;
+
+	if (!mm)
 		return NULL;
+	if (mm == &init_mm) {
+		mmput(mm);
+		return NULL;
+	}
+
 	__oom_kill_task(p);
+	/*
+	 * kill all processes that share the ->mm (i.e. all threads),
+	 * but are in a different thread group
+	 */
+	do_each_thread(g, q)
+		if (q->mm == mm && q->tgid != p->tgid)
+			__oom_kill_task(q);
+	while_each_thread(g, q);
+
 	return mm;
 }
 
+static struct mm_struct *oom_kill_process(task_t *p)
+{
+ 	struct mm_struct *mm;
+	struct task_struct *c;
+	struct list_head *tsk;
+
+	/* Try to kill a child first */
+	list_for_each(tsk, &p->children) {
+		c = list_entry(tsk, struct task_struct, sibling);
+		if (c->mm == p->mm)
+			continue;
+		mm = oom_kill_task(c);
+		if (mm)
+			return mm;
+	}
+	return oom_kill_task(p);
+}
 
 /**
  * oom_kill - kill the "best" process when we run out of memory
@@ -196,117 +257,40 @@ static struct mm_struct *oom_kill_task(t
  * OR try to be smart about which process to kill. Note that we
  * don't have to be perfect here, we just have to be good.
  */
-static void oom_kill(void)
+void out_of_memory(int gfp_mask)
 {
-	struct mm_struct *mm;
-	struct task_struct *g, *p, *q;
-	
+	struct mm_struct *mm = NULL;
+	task_t * p;
+
 	read_lock(&tasklist_lock);
 retry:
 	p = select_bad_process();
 
+	if (PTR_ERR(p) == -1UL)
+		goto out;
+
 	/* Found nothing?!?! Either we hang forever, or we panic. */
 	if (!p) {
+		read_unlock(&tasklist_lock);
 		show_free_areas();
 		panic("Out of memory and no killable processes...\n");
 	}
 
-	mm = oom_kill_task(p);
-	if (!mm)
-		goto retry;
-	/*
-	 * kill all processes that share the ->mm (i.e. all threads),
-	 * but are in a different thread group
-	 */
-	do_each_thread(g, q)
-		if (q->mm == mm && q->tgid != p->tgid)
-			__oom_kill_task(q);
-	while_each_thread(g, q);
-	if (!p->mm)
-		printk(KERN_INFO "Fixed up OOM kill of mm-less task\n");
-	read_unlock(&tasklist_lock);
-	mmput(mm);
-
-	/*
-	 * Make kswapd go out of the way, so "p" has a good chance of
-	 * killing itself before someone else gets the chance to ask
-	 * for more memory.
-	 */
-	yield();
-	return;
-}
-
-/**
- * out_of_memory - is the system out of memory?
- */
-void out_of_memory(int gfp_mask)
-{
-	/*
-	 * oom_lock protects out_of_memory()'s static variables.
-	 * It's a global lock; this is not performance-critical.
-	 */
-	static DEFINE_SPINLOCK(oom_lock);
-	static unsigned long first, last, count, lastkill;
-	unsigned long now, since;
-
-	spin_lock(&oom_lock);
-	now = jiffies;
-	since = now - last;
-	last = now;
-
-	/*
-	 * If it's been a long time since last failure,
-	 * we're not oom.
-	 */
-	if (since > 5*HZ)
-		goto reset;
-
-	/*
-	 * If we haven't tried for at least one second,
-	 * we're not really oom.
-	 */
-	since = now - first;
-	if (since < HZ)
-		goto out_unlock;
-
-	/*
-	 * If we have gotten only a few failures,
-	 * we're not really oom. 
-	 */
-	if (++count < 10)
-		goto out_unlock;
-
-	/*
-	 * If we just killed a process, wait a while
-	 * to give that task a chance to exit. This
-	 * avoids killing multiple processes needlessly.
-	 */
-	since = now - lastkill;
-	if (since < HZ*5)
-		goto out_unlock;
-
-	/*
-	 * Ok, really out of memory. Kill something.
-	 */
-	lastkill = now;
-
 	printk("oom-killer: gfp_mask=0x%x\n", gfp_mask);
 	show_free_areas();
+	mm = oom_kill_process(p);
+	if (!mm)
+		goto retry;
 
-	/* oom_kill() sleeps */
-	spin_unlock(&oom_lock);
-	oom_kill();
-	spin_lock(&oom_lock);
+ out:
+	read_unlock(&tasklist_lock);
+	if (mm)
+		mmput(mm);
 
-reset:
 	/*
-	 * We dropped the lock above, so check to be sure the variable
-	 * first only ever increases to prevent false OOM's.
+	 * Give "p" a good chance of killing itself before we
+	 * retry to allocate memory.
 	 */
-	if (time_after(now, first))
-		first = now;
-	count = 0;
-
-out_unlock:
-	spin_unlock(&oom_lock);
+	__set_current_state(TASK_INTERRUPTIBLE);
+	schedule_timeout(1);
 }
--- mainline-2/mm/page_alloc.c	2005-01-21 05:58:53.338751448 +0100
+++ mainline-3/mm/page_alloc.c	2005-01-21 06:09:43.068977440 +0100
@@ -704,6 +704,7 @@ __alloc_pages(unsigned int gfp_mask, uns
 	int classzone_idx;
 	int do_retry;
 	int can_try_harder;
+	int did_some_progress;
 
 	might_sleep_if(wait);
 
@@ -723,6 +724,7 @@ __alloc_pages(unsigned int gfp_mask, uns
 
 	classzone_idx = zone_idx(zones[0]);
 
+ restart:
 	/* Go through the zonelist once, looking for a zone with enough free */
 	for (i = 0; (z = zones[i]) != NULL; i++) {
 
@@ -754,7 +756,7 @@ __alloc_pages(unsigned int gfp_mask, uns
 	}
 
 	/* This allocation should allow future memory freeing. */
-	if ((p->flags & (PF_MEMALLOC | PF_MEMDIE)) && !in_interrupt()) {
+	if (((p->flags & PF_MEMALLOC) || p->memdie) && !in_interrupt()) {
 		/* go through the zonelist yet again, ignoring mins */
 		for (i = 0; (z = zones[i]) != NULL; i++) {
 			page = buffered_rmqueue(z, order, gfp_mask);
@@ -769,26 +771,56 @@ __alloc_pages(unsigned int gfp_mask, uns
 		goto nopage;
 
 rebalance:
+	cond_resched();
+
 	/* We now go into synchronous reclaim */
 	p->flags |= PF_MEMALLOC;
 	reclaim_state.reclaimed_slab = 0;
 	p->reclaim_state = &reclaim_state;
 
-	try_to_free_pages(zones, gfp_mask, order);
+	did_some_progress = try_to_free_pages(zones, gfp_mask, order);
 
 	p->reclaim_state = NULL;
 	p->flags &= ~PF_MEMALLOC;
 
-	/* go through the zonelist yet one more time */
-	for (i = 0; (z = zones[i]) != NULL; i++) {
-		if (!zone_watermark_ok(z, order, z->pages_min,
-				       classzone_idx, can_try_harder,
-				       gfp_mask & __GFP_HIGH))
-			continue;
+	cond_resched();
 
-		page = buffered_rmqueue(z, order, gfp_mask);
-		if (page)
-			goto got_pg;
+	if (likely(did_some_progress)) {
+		/*
+		 * Reclaim made some progress: go through the
+		 * zonelist yet one more time with the normal
+		 * pages_min watermark, before falling back to
+		 * the oom paths below.
+		 */
+		for (i = 0; (z = zones[i]) != NULL; i++) {
+			if (!zone_watermark_ok(z, order, z->pages_min,
+					       classzone_idx, can_try_harder,
+					       gfp_mask & __GFP_HIGH))
+				continue;
+
+			page = buffered_rmqueue(z, order, gfp_mask);
+			if (page)
+				goto got_pg;
+		}
+	} else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
+		/*
+		 * Go through the zonelist yet one more time, keep
+		 * very high watermark here, this is only to catch
+		 * a parallel oom killing, we must fail if we're still
+		 * under heavy pressure.
+		 */
+		for (i = 0; (z = zones[i]) != NULL; i++) {
+			if (!zone_watermark_ok(z, order, z->pages_high,
+					       classzone_idx, 0, 0))
+				continue;
+
+			page = buffered_rmqueue(z, order, gfp_mask);
+			if (page)
+				goto got_pg;
+		}
+
+		out_of_memory(gfp_mask);
+		goto restart;
 	}
 
 	/*
--- mainline-2/mm/swap_state.c	2005-01-04 01:13:30.000000000 +0100
+++ mainline-3/mm/swap_state.c	2005-01-21 06:01:11.181796120 +0100
@@ -59,6 +59,8 @@ void show_swap_cache_info(void)
 		swap_cache_info.add_total, swap_cache_info.del_total,
 		swap_cache_info.find_success, swap_cache_info.find_total,
 		swap_cache_info.noent_race, swap_cache_info.exist_race);
+	printk("Free swap  = %lukB\n", nr_swap_pages << (PAGE_SHIFT - 10));
+	printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10));
 }
 
 /*
--- mainline-2/mm/vmscan.c	2005-01-20 18:20:10.000000000 +0100
+++ mainline-3/mm/vmscan.c	2005-01-21 06:01:11.189794904 +0100
@@ -937,8 +937,6 @@ int try_to_free_pages(struct zone **zone
 		if (sc.nr_scanned && priority < DEF_PRIORITY - 2)
 			blk_congestion_wait(WRITE, HZ/10);
 	}
-	if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY))
-		out_of_memory(gfp_mask);
 out:
 	for (i = 0; zones[i] != 0; i++)
 		zones[i]->prev_priority = zones[i]->temp_priority;


* OOM fixes 4/5
From: Andrea Arcangeli @ 2005-01-21  5:50 UTC
  To: Andrew Morton; +Cc: linux-kernel, Nick Piggin

From: Andrea Arcangeli <andrea@suse.de>
Subject: convert memdie to an atomic thread bitflag

On Sat, Dec 25, 2004 at 03:27:21AM +0100, Andrea Arcangeli wrote:
> So my current plan is to make used_math a PF_USED_MATH, and memdie a
> TIF_MEMDIE. And of course oomtaskadj an int (that one requires more than

This makes memdie a TIF_MEMDIE.

memdie is not necessarily modified by the current task itself, so it
cannot be a non-atomic PF_MEMDIE process flag; it must be an atomic
TIF_MEMDIE thread flag.
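
(Why the distinction matters, in a standalone C11 sketch; the bit value
is illustrative only. p->flags is updated with plain non-atomic
read-modify-write, so only the task itself may safely touch it, while
thread_info flags are set with atomic bit ops, letting the oom killer
set TIF_MEMDIE on another task from any CPU.)

#include <stdatomic.h>
#include <stdio.h>

#define TIF_MEMDIE 17			/* illustrative bit number */

static unsigned long flags;		/* like task_struct.flags: plain RMW */
static atomic_ulong thread_flags;	/* like thread_info.flags: atomic */

static void set_tsk_thread_flag_memdie(void)
{
	/* safe from any CPU, like set_tsk_thread_flag(p, TIF_MEMDIE) */
	atomic_fetch_or(&thread_flags, 1UL << TIF_MEMDIE);
}

int main(void)
{
	flags |= 0x00000800;		/* PF_-style update: racy cross-CPU */
	set_tsk_thread_flag_memdie();
	printf("flags=%#lx thread_flags=%#lx\n",
	       flags, (unsigned long)atomic_load(&thread_flags));
	return 0;
}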

Signed-off-by: Andrea Arcangeli <andrea@suse.de>

--- mainline-4/include/asm-alpha/thread_info.h.orig	2004-12-04 08:55:03.000000000 +0100
+++ mainline-4/include/asm-alpha/thread_info.h	2005-01-21 06:17:24.780786576 +0100
@@ -77,6 +77,7 @@ register struct thread_info *__current_t
 #define TIF_UAC_NOPRINT		6	/* see sysinfo.h */
 #define TIF_UAC_NOFIX		7
 #define TIF_UAC_SIGBUS		8
+#define TIF_MEMDIE		9
 
 #define _TIF_SYSCALL_TRACE	(1<<TIF_SYSCALL_TRACE)
 #define _TIF_NOTIFY_RESUME	(1<<TIF_NOTIFY_RESUME)
--- mainline-4/include/asm-arm/thread_info.h.orig	2005-01-04 01:13:27.000000000 +0100
+++ mainline-4/include/asm-arm/thread_info.h	2005-01-21 06:17:24.792784752 +0100
@@ -128,6 +128,7 @@ extern void iwmmxt_task_release(struct t
 #define TIF_SYSCALL_TRACE	8
 #define TIF_POLLING_NRFLAG	16
 #define TIF_USING_IWMMXT	17
+#define TIF_MEMDIE		18
 
 #define _TIF_NOTIFY_RESUME	(1 << TIF_NOTIFY_RESUME)
 #define _TIF_SIGPENDING		(1 << TIF_SIGPENDING)
--- mainline-4/include/asm-arm26/thread_info.h.orig	2005-01-15 20:44:58.000000000 +0100
+++ mainline-4/include/asm-arm26/thread_info.h	2005-01-21 06:17:24.797783992 +0100
@@ -126,6 +126,7 @@ extern void free_thread_info(struct thre
 #define TIF_SYSCALL_TRACE	8
 #define TIF_USED_FPU		16
 #define TIF_POLLING_NRFLAG	17
+#define TIF_MEMDIE		18
 
 #define _TIF_NOTIFY_RESUME	(1 << TIF_NOTIFY_RESUME)
 #define _TIF_SIGPENDING		(1 << TIF_SIGPENDING)
--- mainline-4/include/asm-cris/thread_info.h.orig	2003-07-10 19:33:07.000000000 +0200
+++ mainline-4/include/asm-cris/thread_info.h	2005-01-21 06:17:24.803783080 +0100
@@ -85,6 +85,7 @@ struct thread_info {
 #define TIF_SIGPENDING		2	/* signal pending */
 #define TIF_NEED_RESCHED	3	/* rescheduling necessary */
 #define TIF_POLLING_NRFLAG	16	/* true if poll_idle() is polling TIF_NEED_RESCHED */
+#define TIF_MEMDIE		17
 
 #define _TIF_SYSCALL_TRACE	(1<<TIF_SYSCALL_TRACE)
 #define _TIF_NOTIFY_RESUME	(1<<TIF_NOTIFY_RESUME)
--- mainline-4/include/asm-h8300/thread_info.h.orig	2004-08-25 02:47:35.000000000 +0200
+++ mainline-4/include/asm-h8300/thread_info.h	2005-01-21 06:17:24.814781408 +0100
@@ -93,6 +93,7 @@ static inline struct thread_info *curren
 #define TIF_NEED_RESCHED	3	/* rescheduling necessary */
 #define TIF_POLLING_NRFLAG	4	/* true if poll_idle() is polling
 					   TIF_NEED_RESCHED */
+#define TIF_MEMDIE		5
 
 /* as above, but as bit values */
 #define _TIF_SYSCALL_TRACE	(1<<TIF_SYSCALL_TRACE)
--- mainline-4/include/asm-i386/thread_info.h.orig	2005-01-04 01:13:27.000000000 +0100
+++ mainline-4/include/asm-i386/thread_info.h	2005-01-21 06:17:24.828779280 +0100
@@ -141,6 +141,7 @@ register unsigned long current_stack_poi
 #define TIF_IRET		5	/* return with iret */
 #define TIF_SYSCALL_AUDIT	7	/* syscall auditing active */
 #define TIF_POLLING_NRFLAG	16	/* true if poll_idle() is polling TIF_NEED_RESCHED */
+#define TIF_MEMDIE		17
 
 #define _TIF_SYSCALL_TRACE	(1<<TIF_SYSCALL_TRACE)
 #define _TIF_NOTIFY_RESUME	(1<<TIF_NOTIFY_RESUME)
--- mainline-4/include/asm-ia64/thread_info.h.orig	2004-12-04 08:55:04.000000000 +0100
+++ mainline-4/include/asm-ia64/thread_info.h	2005-01-21 06:17:24.841777304 +0100
@@ -67,6 +67,7 @@ struct thread_info {
 #define TIF_SYSCALL_TRACE	3	/* syscall trace active */
 #define TIF_SYSCALL_AUDIT	4	/* syscall auditing active */
 #define TIF_POLLING_NRFLAG	16	/* true if poll_idle() is polling TIF_NEED_RESCHED */
+#define TIF_MEMDIE		17
 
 #define TIF_WORK_MASK		0x7	/* like TIF_ALLWORK_BITS but sans TIF_SYSCALL_TRACE */
 #define TIF_ALLWORK_MASK	0x1f	/* bits 0..4 are "work to do on user-return" bits */
--- mainline-4/include/asm-m68k/thread_info.h.orig	2004-08-25 02:47:35.000000000 +0200
+++ mainline-4/include/asm-m68k/thread_info.h	2005-01-21 06:17:24.851775784 +0100
@@ -48,6 +48,7 @@ struct thread_info {
 #define TIF_NOTIFY_RESUME	2	/* resumption notification requested */
 #define TIF_SIGPENDING		3	/* signal pending */
 #define TIF_NEED_RESCHED	4	/* rescheduling necessary */
+#define TIF_MEMDIE		5
 
 extern int thread_flag_fixme(void);
 
--- mainline-4/include/asm-m68knommu/thread_info.h.orig	2005-01-15 20:44:59.000000000 +0100
+++ mainline-4/include/asm-m68knommu/thread_info.h	2005-01-21 06:17:24.858774720 +0100
@@ -85,6 +85,7 @@ static inline struct thread_info *curren
 #define TIF_NEED_RESCHED	3	/* rescheduling necessary */
 #define TIF_POLLING_NRFLAG	4	/* true if poll_idle() is polling
 					   TIF_NEED_RESCHED */
+#define TIF_MEMDIE		5
 
 /* as above, but as bit values */
 #define _TIF_SYSCALL_TRACE	(1<<TIF_SYSCALL_TRACE)
--- mainline-4/include/asm-mips/thread_info.h.orig	2005-01-04 01:13:29.000000000 +0100
+++ mainline-4/include/asm-mips/thread_info.h	2005-01-21 06:17:24.871772744 +0100
@@ -116,6 +116,7 @@ register struct thread_info *__current_t
 #define TIF_SYSCALL_AUDIT	4	/* syscall auditing active */
 #define TIF_USEDFPU		16	/* FPU was used by this task this quantum (SMP) */
 #define TIF_POLLING_NRFLAG	17	/* true if poll_idle() is polling TIF_NEED_RESCHED */
+#define TIF_MEMDIE		18
 #define TIF_SYSCALL_TRACE	31	/* syscall trace active */
 
 #define _TIF_SYSCALL_TRACE	(1<<TIF_SYSCALL_TRACE)
--- mainline-4/include/asm-parisc/thread_info.h.orig	2005-01-04 01:13:29.000000000 +0100
+++ mainline-4/include/asm-parisc/thread_info.h	2005-01-21 06:17:24.885770616 +0100
@@ -63,6 +63,7 @@ struct thread_info {
 #define TIF_NEED_RESCHED	3	/* rescheduling necessary */
 #define TIF_POLLING_NRFLAG	4	/* true if poll_idle() is polling TIF_NEED_RESCHED */
 #define TIF_32BIT               5       /* 32 bit binary */
+#define TIF_MEMDIE		6
 
 #define _TIF_SYSCALL_TRACE	(1 << TIF_SYSCALL_TRACE)
 #define _TIF_NOTIFY_RESUME	(1 << TIF_NOTIFY_RESUME)
--- mainline-4/include/asm-ppc/thread_info.h.orig	2005-01-04 01:13:29.000000000 +0100
+++ mainline-4/include/asm-ppc/thread_info.h	2005-01-21 06:17:24.892769552 +0100
@@ -76,6 +76,7 @@ static inline struct thread_info *curren
 #define TIF_NEED_RESCHED	3	/* rescheduling necessary */
 #define TIF_POLLING_NRFLAG	4	/* true if poll_idle() is polling
 					   TIF_NEED_RESCHED */
+#define TIF_MEMDIE		5
 /* as above, but as bit values */
 #define _TIF_SYSCALL_TRACE	(1<<TIF_SYSCALL_TRACE)
 #define _TIF_NOTIFY_RESUME	(1<<TIF_NOTIFY_RESUME)
--- mainline-4/include/asm-ppc64/thread_info.h.orig	2005-01-20 18:20:10.000000000 +0100
+++ mainline-4/include/asm-ppc64/thread_info.h	2005-01-21 06:17:50.185924408 +0100
@@ -100,6 +100,7 @@ static inline struct thread_info *curren
 #define TIF_ABI_PENDING		7	/* 32/64 bit switch needed */
 #define TIF_SYSCALL_AUDIT	8	/* syscall auditing active */
 #define TIF_SINGLESTEP		9	/* singlestepping active */
+#define TIF_MEMDIE		10
 
 /* as above, but as bit values */
 #define _TIF_SYSCALL_TRACE	(1<<TIF_SYSCALL_TRACE)
--- mainline-4/include/asm-s390/thread_info.h.orig	2004-12-04 08:55:04.000000000 +0100
+++ mainline-4/include/asm-s390/thread_info.h	2005-01-21 06:17:24.911766664 +0100
@@ -100,6 +100,7 @@ static inline struct thread_info *curren
 #define TIF_POLLING_NRFLAG	17	/* true if poll_idle() is polling 
 					   TIF_NEED_RESCHED */
 #define TIF_31BIT		18	/* 32bit process */ 
+#define TIF_MEMDIE		19
 
 #define _TIF_SYSCALL_TRACE	(1<<TIF_SYSCALL_TRACE)
 #define _TIF_NOTIFY_RESUME	(1<<TIF_NOTIFY_RESUME)
--- mainline-4/include/asm-sh/thread_info.h.orig	2005-01-04 01:13:29.000000000 +0100
+++ mainline-4/include/asm-sh/thread_info.h	2005-01-21 06:17:24.919765448 +0100
@@ -83,6 +83,7 @@ static inline struct thread_info *curren
 #define TIF_NEED_RESCHED	3	/* rescheduling necessary */
 #define TIF_USEDFPU		16	/* FPU was used by this task this quantum (SMP) */
 #define TIF_POLLING_NRFLAG	17	/* true if poll_idle() is polling TIF_NEED_RESCHED */
+#define TIF_MEMDIE		18
 #define TIF_USERSPACE		31	/* true if FS sets userspace */
 
 #define _TIF_SYSCALL_TRACE	(1<<TIF_SYSCALL_TRACE)
--- mainline-4/include/asm-sh64/thread_info.h.orig	2004-08-25 02:47:51.000000000 +0200
+++ mainline-4/include/asm-sh64/thread_info.h	2005-01-21 06:17:24.925764536 +0100
@@ -74,6 +74,7 @@ static inline struct thread_info *curren
 #define TIF_SYSCALL_TRACE	0	/* syscall trace active */
 #define TIF_SIGPENDING		2	/* signal pending */
 #define TIF_NEED_RESCHED	3	/* rescheduling necessary */
+#define TIF_MEMDIE		4
 
 #define THREAD_SIZE	16384
 
--- mainline-4/include/asm-sparc/thread_info.h.orig	2004-08-25 02:47:35.000000000 +0200
+++ mainline-4/include/asm-sparc/thread_info.h	2005-01-21 06:17:24.930763776 +0100
@@ -138,6 +138,7 @@ BTFIXUPDEF_CALL(void, free_thread_info, 
 					 * this quantum (SMP) */
 #define TIF_POLLING_NRFLAG	9	/* true if poll_idle() is polling
 					 * TIF_NEED_RESCHED */
+#define TIF_MEMDIE		10
 
 /* as above, but as bit values */
 #define _TIF_SYSCALL_TRACE	(1<<TIF_SYSCALL_TRACE)
--- mainline-4/include/asm-sparc64/thread_info.h.orig	2004-08-25 02:47:57.000000000 +0200
+++ mainline-4/include/asm-sparc64/thread_info.h	2005-01-21 06:17:24.937762712 +0100
@@ -228,6 +228,7 @@ register struct thread_info *current_thr
  *       an immediate value in instructions such as andcc.
  */
 #define TIF_ABI_PENDING		12
+#define TIF_MEMDIE		13
 
 #define _TIF_SYSCALL_TRACE	(1<<TIF_SYSCALL_TRACE)
 #define _TIF_NOTIFY_RESUME	(1<<TIF_NOTIFY_RESUME)
--- mainline-4/include/asm-um/thread_info.h.orig	2005-01-15 20:45:00.000000000 +0100
+++ mainline-4/include/asm-um/thread_info.h	2005-01-21 06:17:24.943761800 +0100
@@ -71,6 +71,7 @@ static inline struct thread_info *curren
 					 * TIF_NEED_RESCHED 
 					 */
 #define TIF_RESTART_BLOCK 	4
+#define TIF_MEMDIE	 	5
 
 #define _TIF_SYSCALL_TRACE	(1 << TIF_SYSCALL_TRACE)
 #define _TIF_SIGPENDING		(1 << TIF_SIGPENDING)
--- mainline-4/include/asm-v850/thread_info.h.orig	2003-06-17 11:31:42.000000000 +0200
+++ mainline-4/include/asm-v850/thread_info.h	2005-01-21 06:17:24.954760128 +0100
@@ -83,6 +83,7 @@ struct thread_info {
 #define TIF_NEED_RESCHED	3	/* rescheduling necessary */
 #define TIF_POLLING_NRFLAG	4	/* true if poll_idle() is polling
 					   TIF_NEED_RESCHED */
+#define TIF_MEMDIE		5
 
 /* as above, but as bit values */
 #define _TIF_SYSCALL_TRACE	(1<<TIF_SYSCALL_TRACE)
--- mainline-4/include/asm-x86_64/thread_info.h.orig	2005-01-04 01:13:29.000000000 +0100
+++ mainline-4/include/asm-x86_64/thread_info.h	2005-01-21 06:17:24.965758456 +0100
@@ -106,6 +106,7 @@ static inline struct thread_info *stack_
 #define TIF_IA32		17	/* 32bit process */ 
 #define TIF_FORK		18	/* ret_from_fork */
 #define TIF_ABI_PENDING		19
+#define TIF_MEMDIE		20
 
 #define _TIF_SYSCALL_TRACE	(1<<TIF_SYSCALL_TRACE)
 #define _TIF_NOTIFY_RESUME	(1<<TIF_NOTIFY_RESUME)
--- mainline-4/include/linux/sched.h.orig	2005-01-21 06:01:08.585190864 +0100
+++ mainline-4/include/linux/sched.h	2005-01-21 06:17:24.967758152 +0100
@@ -615,11 +615,6 @@ struct task_struct {
 	struct key *thread_keyring;	/* keyring private to this thread */
 #endif
 /*
- * All archs should support atomic ops with
- * 1 byte granularity.
- */
-	unsigned char memdie;
-/*
  * Must be changed atomically, so it shouldn't
  * be a shareable bitflag.
  */
--- mainline-4/include/asm-m32r/thread_info.h.orig	2005-01-15 20:44:59.000000000 +0100
+++ mainline-4/include/asm-m32r/thread_info.h	2005-01-21 06:18:25.045624928 +0100
@@ -155,6 +155,7 @@ static inline unsigned int get_thread_fa
 #define TIF_IRET		5	/* return with iret */
 #define TIF_POLLING_NRFLAG	16	/* true if poll_idle() is polling TIF_NEED_RESCHED */
 					/* 31..28 fault code */
+#define TIF_MEMDIE		17
 
 #define _TIF_SYSCALL_TRACE	(1<<TIF_SYSCALL_TRACE)
 #define _TIF_NOTIFY_RESUME	(1<<TIF_NOTIFY_RESUME)
--- mainline-4/mm/oom_kill.c.orig	2005-01-21 06:14:00.290873768 +0100
+++ mainline-4/mm/oom_kill.c	2005-01-21 06:17:24.980756176 +0100
@@ -152,7 +152,8 @@ static struct task_struct * select_bad_p
 			 * This task is in the process of releasing memory, so wait
 			 * for it to finish before killing some other task by mistake.
 			 */
-			if ((p->memdie || (p->flags & PF_EXITING)) && !(p->flags & PF_DEAD))
+			if ((unlikely(test_tsk_thread_flag(p, TIF_MEMDIE)) || (p->flags & PF_EXITING)) &&
+			    !(p->flags & PF_DEAD))
 				return ERR_PTR(-1UL);
 			if (p->flags & PF_SWAPOFF)
 				return p;
@@ -196,7 +197,7 @@ static void __oom_kill_task(task_t *p)
 	 * exit() and clear out its resources quickly...
 	 */
 	p->time_slice = HZ;
-	p->memdie = 1;
+	set_tsk_thread_flag(p, TIF_MEMDIE);
 
 	/* This process has hardware access, be more careful. */
 	if (cap_t(p->cap_effective) & CAP_TO_MASK(CAP_SYS_RAWIO)) {
--- mainline-4/mm/page_alloc.c.orig	2005-01-21 06:09:43.068977440 +0100
+++ mainline-4/mm/page_alloc.c	2005-01-21 06:17:24.996753744 +0100
@@ -756,7 +756,7 @@ __alloc_pages(unsigned int gfp_mask, uns
 	}
 
 	/* This allocation should allow future memory freeing. */
-	if (((p->flags & PF_MEMALLOC) || p->memdie) && !in_interrupt()) {
+	if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE))) && !in_interrupt()) {
 		/* go through the zonelist yet again, ignoring mins */
 		for (i = 0; (z = zones[i]) != NULL; i++) {
 			page = buffered_rmqueue(z, order, gfp_mask);


* OOM fixes 5/5
From: Andrea Arcangeli @ 2005-01-21  5:50 UTC
  To: Andrew Morton; +Cc: linux-kernel, Nick Piggin

From: Andrea Arcangeli <andrea@suse.de>
Subject: Convert the unsafe signed (16bit) used_math to a safe and optimal PF_USED_MATH

On Sat, Dec 25, 2004 at 04:24:30AM +0100, Andrea Arcangeli wrote:
> Here it is the first part. This makes memdie a TIF_MEMDIE. It's

And here is the final incremental part converting ->used_math to
PF_USED_MATH.

I might have broken arm; see the very first change in the patch, to
asm-offsets.c. The rest looks ok at first glance.

If you want used_math() to return 0 or 1 (instead of 0 or
PF_USED_MATH), just s/!!// in the below patch and place the !! in
sched.h::*used_math() accordingly after applying the patch; it should
work just fine. Using !! only where necessary, as below, is optimal.
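
(The !! idiom in a standalone sketch, with a hypothetical flag value:
used_math() evaluates to 0 or the raw PF_USED_MATH mask, and !!
collapses that to 0/1 only at the places that store it into an int or
a one-bit field.)

#include <stdio.h>

#define PF_USED_MATH 0x00002000	/* hypothetical bit, for the sketch */

static unsigned long current_flags = PF_USED_MATH;

#define used_math() (current_flags & PF_USED_MATH)

int main(void)
{
	int fpvalid = !!used_math();	/* 1, not 0x2000 */

	printf("raw=%#lx fpvalid=%d\n", used_math(), fpvalid);
	return 0;
}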

Signed-off-by: Andrea Arcangeli <andrea@suse.de>

--- mainline-5/arch/arm26/kernel/asm-offsets.c.orig	2003-07-17 01:52:38.000000000 +0200
+++ mainline-5/arch/arm26/kernel/asm-offsets.c	2005-01-21 06:20:01.999885640 +0100
@@ -42,7 +42,6 @@
 
 int main(void)
 {
-  DEFINE(TSK_USED_MATH,		offsetof(struct task_struct, used_math));
   DEFINE(TSK_ACTIVE_MM,		offsetof(struct task_struct, active_mm));
   BLANK();
   DEFINE(VMA_VM_MM,		offsetof(struct vm_area_struct, vm_mm));
--- mainline-5/arch/arm26/kernel/process.c.orig	2005-01-15 20:44:48.000000000 +0100
+++ mainline-5/arch/arm26/kernel/process.c	2005-01-21 06:20:02.013883512 +0100
@@ -271,7 +271,7 @@ void flush_thread(void)
 	memset(&tsk->thread.debug, 0, sizeof(struct debug_info));
 	memset(&thread->fpstate, 0, sizeof(union fp_state));
 
-	current->used_math = 0;
+	clear_used_math();
 }
 
 void release_thread(struct task_struct *dead_task)
@@ -305,7 +305,7 @@ copy_thread(int nr, unsigned long clone_
 int dump_fpu (struct pt_regs *regs, struct user_fp *fp)
 {
 	struct thread_info *thread = current_thread_info();
-	int used_math = current->used_math;
+	int used_math = !!used_math();
 
 	if (used_math)
 		memcpy(fp, &thread->fpstate.soft, sizeof (*fp));
--- mainline-5/arch/arm26/kernel/ptrace.c.orig	2005-01-04 01:13:09.000000000 +0100
+++ mainline-5/arch/arm26/kernel/ptrace.c	2005-01-21 06:20:02.018882752 +0100
@@ -540,7 +540,7 @@ static int ptrace_getfpregs(struct task_
  */
 static int ptrace_setfpregs(struct task_struct *tsk, void *ufp)
 {
-	tsk->used_math = 1;
+	set_stopped_child_used_math(tsk);
 	return copy_from_user(&tsk->thread_info->fpstate, ufp,
 			      sizeof(struct user_fp)) ? -EFAULT : 0;
 }
--- mainline-5/arch/i386/kernel/cpu/common.c.orig	2005-01-15 20:44:49.000000000 +0100
+++ mainline-5/arch/i386/kernel/cpu/common.c	2005-01-21 06:20:02.027881384 +0100
@@ -629,6 +629,6 @@ void __init cpu_init (void)
 	 * Force FPU initialization:
 	 */
 	current_thread_info()->status = 0;
-	current->used_math = 0;
+	clear_used_math();
 	mxcsr_feature_mask_init();
 }
--- mainline-5/arch/i386/kernel/i387.c.orig	2005-01-20 18:20:09.000000000 +0100
+++ mainline-5/arch/i386/kernel/i387.c	2005-01-21 06:20:02.040879408 +0100
@@ -60,7 +60,8 @@ void init_fpu(struct task_struct *tsk)
 		tsk->thread.i387.fsave.twd = 0xffffffffu;
 		tsk->thread.i387.fsave.fos = 0xffff0000u;
 	}
-	tsk->used_math = 1;
+	/* only the device not available exception or ptrace can call init_fpu */
+	set_stopped_child_used_math(tsk);
 }
 
 /*
@@ -331,13 +332,13 @@ static int save_i387_fxsave( struct _fps
 
 int save_i387( struct _fpstate __user *buf )
 {
-	if ( !current->used_math )
+	if ( !used_math() )
 		return 0;
 
 	/* This will cause a "finit" to be triggered by the next
 	 * attempted FPU operation by the 'current' process.
 	 */
-	current->used_math = 0;
+	clear_used_math();
 
 	if ( HAVE_HWFP ) {
 		if ( cpu_has_fxsr ) {
@@ -383,7 +384,7 @@ int restore_i387( struct _fpstate __user
 	} else {
 		err = restore_i387_soft( &current->thread.i387.soft, buf );
 	}
-	current->used_math = 1;
+	set_used_math();
 	return err;
 }
 
@@ -507,7 +508,7 @@ int dump_fpu( struct pt_regs *regs, stru
 	int fpvalid;
 	struct task_struct *tsk = current;
 
-	fpvalid = tsk->used_math;
+	fpvalid = !!used_math();
 	if ( fpvalid ) {
 		unlazy_fpu( tsk );
 		if ( cpu_has_fxsr ) {
@@ -522,7 +523,7 @@ int dump_fpu( struct pt_regs *regs, stru
 
 int dump_task_fpu(struct task_struct *tsk, struct user_i387_struct *fpu)
 {
-	int fpvalid = tsk->used_math;
+	int fpvalid = !!tsk_used_math(tsk);
 
 	if (fpvalid) {
 		if (tsk == current)
@@ -537,7 +538,7 @@ int dump_task_fpu(struct task_struct *ts
 
 int dump_task_extended_fpu(struct task_struct *tsk, struct user_fxsr_struct *fpu)
 {
-	int fpvalid = tsk->used_math && cpu_has_fxsr;
+	int fpvalid = tsk_used_math(tsk) && cpu_has_fxsr;
 
 	if (fpvalid) {
 		if (tsk == current)
--- mainline-5/arch/i386/kernel/process.c.orig	2005-01-15 20:44:49.000000000 +0100
+++ mainline-5/arch/i386/kernel/process.c	2005-01-21 06:20:02.049878040 +0100
@@ -351,7 +351,7 @@ void flush_thread(void)
 	 * Forget coprocessor state..
 	 */
 	clear_fpu(tsk);
-	tsk->used_math = 0;
+	clear_used_math();
 }
 
 void release_thread(struct task_struct *dead_task)
--- mainline-5/arch/i386/kernel/ptrace.c.orig	2005-01-15 20:44:49.000000000 +0100
+++ mainline-5/arch/i386/kernel/ptrace.c	2005-01-21 06:20:02.052877584 +0100
@@ -592,7 +592,7 @@ asmlinkage int sys_ptrace(long request, 
 			break;
 		}
 		ret = 0;
-		if (!child->used_math)
+		if (!tsk_used_math(child))
 			init_fpu(child);
 		get_fpregs((struct user_i387_struct __user *)data, child);
 		break;
@@ -604,7 +604,7 @@ asmlinkage int sys_ptrace(long request, 
 			ret = -EIO;
 			break;
 		}
-		child->used_math = 1;
+		set_stopped_child_used_math(child);
 		set_fpregs(child, (struct user_i387_struct __user *)data);
 		ret = 0;
 		break;
@@ -616,7 +616,7 @@ asmlinkage int sys_ptrace(long request, 
 			ret = -EIO;
 			break;
 		}
-		if (!child->used_math)
+		if (!tsk_used_math(child))
 			init_fpu(child);
 		ret = get_fpxregs((struct user_fxsr_struct __user *)data, child);
 		break;
@@ -628,7 +628,7 @@ asmlinkage int sys_ptrace(long request, 
 			ret = -EIO;
 			break;
 		}
-		child->used_math = 1;
+		set_stopped_child_used_math(child);
 		ret = set_fpxregs(child, (struct user_fxsr_struct __user *)data);
 		break;
 	}
--- mainline-5/arch/i386/kernel/traps.c.orig	2005-01-15 20:44:49.000000000 +0100
+++ mainline-5/arch/i386/kernel/traps.c	2005-01-21 06:20:02.054877280 +0100
@@ -911,7 +911,7 @@ asmlinkage void math_state_restore(struc
 	struct task_struct *tsk = thread->task;
 
 	clts();		/* Allow maths ops (or we recurse) */
-	if (!tsk->used_math)
+	if (!tsk_used_math(tsk))
 		init_fpu(tsk);
 	restore_fpu(tsk);
 	thread->status |= TS_USEDFPU;	/* So we fnsave on switch_to() */
--- mainline-5/arch/i386/math-emu/fpu_entry.c.orig	2004-08-25 02:47:49.000000000 +0200
+++ mainline-5/arch/i386/math-emu/fpu_entry.c	2005-01-21 06:20:02.066875456 +0100
@@ -155,10 +155,10 @@ asmlinkage void math_emulate(long arg)
   RE_ENTRANT_CHECK_ON;
 #endif /* RE_ENTRANT_CHECKING */
 
-  if (!current->used_math)
+  if (!used_math())
     {
       finit();
-      current->used_math = 1;
+      set_used_math();
     }
 
   SETUP_DATA_AREA(arg);
--- mainline-5/arch/ia64/ia32/elfcore32.h.orig	2005-01-04 01:13:09.000000000 +0100
+++ mainline-5/arch/ia64/ia32/elfcore32.h	2005-01-21 06:20:02.077873784 +0100
@@ -106,7 +106,7 @@ elf_core_copy_task_fpregs(struct task_st
 	struct ia32_user_i387_struct *fpstate = (void*)fpu;
 	mm_segment_t old_fs;
 
-	if (!tsk->used_math)
+	if (!tsk_used_math(tsk))
 		return 0;
 	
 	old_fs = get_fs();
@@ -124,7 +124,7 @@ elf_core_copy_task_xfpregs(struct task_s
 	struct ia32_user_fxsr_struct *fpxstate = (void*) xfpu;
 	mm_segment_t old_fs;
 
-	if (!tsk->used_math)
+	if (!tsk_used_math(tsk))
 		return 0;
 
 	old_fs = get_fs();
--- mainline-5/arch/mips/kernel/irixsig.c.orig	2005-01-20 18:20:09.000000000 +0100
+++ mainline-5/arch/mips/kernel/irixsig.c	2005-01-21 06:20:02.085872568 +0100
@@ -99,7 +99,7 @@ static void setup_irix_frame(struct k_si
 	__put_user((u64) regs->hi, &ctx->hi);
 	__put_user((u64) regs->lo, &ctx->lo);
 	__put_user((u64) regs->cp0_epc, &ctx->pc);
-	__put_user(current->used_math, &ctx->usedfp);
+	__put_user(!!used_math(), &ctx->usedfp);
 	__put_user((u64) regs->cp0_cause, &ctx->cp0_cause);
 	__put_user((u64) regs->cp0_badvaddr, &ctx->cp0_badvaddr);
 
@@ -725,7 +725,7 @@ asmlinkage int irix_getcontext(struct pt
 	__put_user(regs->cp0_epc, &ctx->regs[35]);
 
 	flags = 0x0f;
-	if(!current->used_math) {
+	if(!used_math()) {
 		flags &= ~(0x08);
 	} else {
 		/* XXX wheee... */
--- mainline-5/arch/mips/kernel/process.c.orig	2005-01-04 01:13:10.000000000 +0100
+++ mainline-5/arch/mips/kernel/process.c	2005-01-21 06:20:02.093871352 +0100
@@ -76,7 +76,7 @@ void start_thread(struct pt_regs * regs,
 #endif
 	status |= KU_USER;
 	regs->cp0_status = status;
-	current->used_math = 0;
+	clear_used_math();
 	lose_fpu();
 	regs->cp0_epc = pc;
 	regs->regs[29] = sp;
--- mainline-5/arch/mips/kernel/ptrace.c.orig	2005-01-04 01:13:10.000000000 +0100
+++ mainline-5/arch/mips/kernel/ptrace.c	2005-01-21 06:20:02.094871200 +0100
@@ -119,7 +119,7 @@ asmlinkage int sys_ptrace(long request, 
 			tmp = regs->regs[addr];
 			break;
 		case FPR_BASE ... FPR_BASE + 31:
-			if (child->used_math) {
+			if (tsk_used_math(child)) {
 				fpureg_t *fregs = get_fpu_regs(child);
 
 #ifdef CONFIG_MIPS32
@@ -205,7 +205,7 @@ asmlinkage int sys_ptrace(long request, 
 		case FPR_BASE ... FPR_BASE + 31: {
 			fpureg_t *fregs = get_fpu_regs(child);
 
-			if (!child->used_math) {
+			if (!tsk_used_math(child)) {
 				/* FP not yet used  */
 				memset(&child->thread.fpu.hard, ~0,
 				       sizeof(child->thread.fpu.hard));
--- mainline-5/arch/mips/kernel/ptrace32.c.orig	2005-01-04 01:13:10.000000000 +0100
+++ mainline-5/arch/mips/kernel/ptrace32.c	2005-01-21 06:20:02.096870896 +0100
@@ -112,7 +112,7 @@ asmlinkage int sys32_ptrace(int request,
 			tmp = regs->regs[addr];
 			break;
 		case FPR_BASE ... FPR_BASE + 31:
-			if (child->used_math) {
+			if (tsk_used_math(child)) {
 				fpureg_t *fregs = get_fpu_regs(child);
 
 				/*
@@ -193,7 +193,7 @@ asmlinkage int sys32_ptrace(int request,
 		case FPR_BASE ... FPR_BASE + 31: {
 			fpureg_t *fregs = get_fpu_regs(child);
 
-			if (!child->used_math) {
+			if (!tsk_used_math(child)) {
 				/* FP not yet used  */
 				memset(&child->thread.fpu.hard, ~0,
 				       sizeof(child->thread.fpu.hard));
--- mainline-5/arch/mips/kernel/signal.c.orig	2005-01-20 18:20:09.000000000 +0100
+++ mainline-5/arch/mips/kernel/signal.c	2005-01-21 06:20:02.098870592 +0100
@@ -178,11 +178,11 @@ asmlinkage int restore_sigcontext(struct
 	restore_gp_reg(31);
 #undef restore_gp_reg
 
-	err |= __get_user(current->used_math, &sc->sc_used_math);
+	{ int um; err |= __get_user(um, &sc->sc_used_math); conditional_used_math(um); }
 
 	preempt_disable();
 
-	if (current->used_math) {
+	if (used_math()) {
 		/* restore fpu context if we have used it before */
 		own_fpu();
 		err |= restore_fp_context(sc);
@@ -323,9 +323,9 @@ inline int setup_sigcontext(struct pt_re
 	err |= __put_user(regs->cp0_cause, &sc->sc_cause);
 	err |= __put_user(regs->cp0_badvaddr, &sc->sc_badvaddr);
 
-	err |= __put_user(current->used_math, &sc->sc_used_math);
+	err |= __put_user(!!used_math(), &sc->sc_used_math);
 
-	if (!current->used_math)
+	if (!used_math())
 		goto out;
 
 	/*
--- mainline-5/arch/mips/kernel/signal32.c.orig	2005-01-20 18:20:09.000000000 +0100
+++ mainline-5/arch/mips/kernel/signal32.c	2005-01-21 06:20:02.099870440 +0100
@@ -361,11 +361,11 @@ static asmlinkage int restore_sigcontext
 	restore_gp_reg(31);
 #undef restore_gp_reg
 
-	err |= __get_user(current->used_math, &sc->sc_used_math);
+	{ int um; err |= __get_user(um, &sc->sc_used_math); conditional_used_math(um); }
 
 	preempt_disable();
 
-	if (current->used_math) {
+	if (used_math()) {
 		/* restore fpu context if we have used it before */
 		own_fpu();
 		err |= restore_fp_context32(sc);
@@ -552,9 +552,9 @@ static inline int setup_sigcontext32(str
 	err |= __put_user(regs->cp0_cause, &sc->sc_cause);
 	err |= __put_user(regs->cp0_badvaddr, &sc->sc_badvaddr);
 
-	err |= __put_user(current->used_math, &sc->sc_used_math);
+	err |= __put_user(!!used_math(), &sc->sc_used_math);
 
-	if (!current->used_math)
+	if (!used_math())
 		goto out;
 
 	/* 
--- mainline-5/arch/mips/kernel/traps.c.orig	2005-01-04 01:13:10.000000000 +0100
+++ mainline-5/arch/mips/kernel/traps.c	2005-01-21 06:20:02.105869528 +0100
@@ -655,11 +655,11 @@ asmlinkage void do_cpu(struct pt_regs *r
 		preempt_disable();
 
 		own_fpu();
-		if (current->used_math) {	/* Using the FPU again.  */
+		if (used_math()) {	/* Using the FPU again.  */
 			restore_fp(current);
 		} else {			/* First time FPU user.  */
 			init_fpu();
-			current->used_math = 1;
+			set_used_math();
 		}
 
 		if (!cpu_has_fpu) {
--- mainline-5/arch/s390/kernel/process.c.orig	2005-01-15 20:44:50.000000000 +0100
+++ mainline-5/arch/s390/kernel/process.c	2005-01-21 06:20:02.120867248 +0100
@@ -215,8 +215,7 @@ void exit_thread(void)
 
 void flush_thread(void)
 {
-
-        current->used_math = 0;
+	clear_used_math();
 	clear_tsk_thread_flag(current, TIF_USEDFPU);
 }
 
--- mainline-5/arch/s390/kernel/setup.c.orig	2005-01-20 18:20:09.000000000 +0100
+++ mainline-5/arch/s390/kernel/setup.c	2005-01-21 06:20:02.129865880 +0100
@@ -96,7 +96,7 @@ void __devinit cpu_init (void)
          * Force FPU initialization:
          */
         clear_thread_flag(TIF_USEDFPU);
-        current->used_math = 0;
+        clear_used_math();
 
 	atomic_inc(&init_mm.mm_count);
 	current->active_mm = &init_mm;
--- mainline-5/arch/sh/kernel/cpu/sh4/fpu.c.orig	2004-02-20 17:26:36.000000000 +0100
+++ mainline-5/arch/sh/kernel/cpu/sh4/fpu.c	2005-01-21 06:20:02.139864360 +0100
@@ -323,13 +323,13 @@ do_fpu_state_restore(unsigned long r4, u
 		return;
 	}
 
-	if (tsk->used_math) {
+	if (used_math()) {
 		/* Using the FPU again.  */
 		restore_fpu(tsk);
 	} else	{
 		/* First time FPU user.  */
 		fpu_init();
-		tsk->used_math = 1;
+		set_used_math();
 	}
 	set_tsk_thread_flag(tsk, TIF_USEDFPU);
 }
--- mainline-5/arch/sh/kernel/cpu/init.c.orig	2005-01-04 01:13:11.000000000 +0100
+++ mainline-5/arch/sh/kernel/cpu/init.c	2005-01-21 06:20:02.150862688 +0100
@@ -194,7 +194,7 @@ asmlinkage void __init sh_cpu_init(void)
 	/* FPU initialization */
 	if ((cpu_data->flags & CPU_HAS_FPU)) {
 		clear_thread_flag(TIF_USEDFPU);
-		current->used_math = 0;
+		clear_used_math();
 	}
 
 #ifdef CONFIG_SH_DSP
--- mainline-5/arch/sh/kernel/process.c.orig	2005-01-15 20:44:50.000000000 +0100
+++ mainline-5/arch/sh/kernel/process.c	2005-01-21 06:20:02.159861320 +0100
@@ -208,7 +208,7 @@ void flush_thread(void)
 
 	/* Forget lazy FPU state */
 	clear_fpu(tsk, regs);
-	tsk->used_math = 0;
+	clear_used_math();
 #endif
 }
 
@@ -225,7 +225,7 @@ int dump_fpu(struct pt_regs *regs, elf_f
 #if defined(CONFIG_SH_FPU)
 	struct task_struct *tsk = current;
 
-	fpvalid = tsk->used_math;
+	fpvalid = !!tsk_used_math(tsk);
 	if (fpvalid) {
 		unlazy_fpu(tsk, regs);
 		memcpy(fpu, &tsk->thread.fpu.hard, sizeof(*fpu));
@@ -260,7 +260,7 @@ dump_task_fpu (struct task_struct *tsk, 
 	int fpvalid = 0;
 
 #if defined(CONFIG_SH_FPU)
-	fpvalid = tsk->used_math;
+	fpvalid = !!tsk_used_math(tsk);
 	if (fpvalid) {
 		struct pt_regs *regs = (struct pt_regs *)
 					((unsigned long)tsk->thread_info
@@ -286,7 +286,7 @@ int copy_thread(int nr, unsigned long cl
 
 	unlazy_fpu(tsk, regs);
 	p->thread.fpu = tsk->thread.fpu;
-	p->used_math = tsk->used_math;
+	copy_to_stopped_child_used_math(p);
 #endif
 
 	childregs = ((struct pt_regs *)
--- mainline-5/arch/sh/kernel/ptrace.c.orig	2005-01-04 01:13:11.000000000 +0100
+++ mainline-5/arch/sh/kernel/ptrace.c	2005-01-21 06:20:02.168859952 +0100
@@ -150,7 +150,7 @@ asmlinkage int sys_ptrace(long request, 
 			tmp = get_stack_long(child, addr);
 		else if (addr >= (long) &dummy->fpu &&
 			 addr < (long) &dummy->u_fpvalid) {
-			if (!child->used_math) {
+			if (!tsk_used_math(child)) {
 				if (addr == (long)&dummy->fpu.fpscr)
 					tmp = FPSCR_INIT;
 				else
@@ -159,7 +159,7 @@ asmlinkage int sys_ptrace(long request, 
 				tmp = ((long *)&child->thread.fpu)
 					[(addr - (long)&dummy->fpu) >> 2];
 		} else if (addr == (long) &dummy->u_fpvalid)
-			tmp = child->used_math;
+			tmp = !!tsk_used_math(child);
 		else
 			tmp = 0;
 		ret = put_user(tmp, (unsigned long *)data);
@@ -185,12 +185,12 @@ asmlinkage int sys_ptrace(long request, 
 			ret = put_stack_long(child, addr, data);
 		else if (addr >= (long) &dummy->fpu &&
 			 addr < (long) &dummy->u_fpvalid) {
-			child->used_math = 1;
+			set_stopped_child_used_math(child);
 			((long *)&child->thread.fpu)
 				[(addr - (long)&dummy->fpu) >> 2] = data;
 			ret = 0;
 		} else if (addr == (long) &dummy->u_fpvalid) {
-			child->used_math = data?1:0;
+			conditional_stopped_child_used_math(data, child);
 			ret = 0;
 		}
 		break;
--- mainline-5/arch/sh/kernel/signal.c.orig	2005-01-20 18:20:09.000000000 +0100
+++ mainline-5/arch/sh/kernel/signal.c	2005-01-21 06:20:02.170859648 +0100
@@ -162,7 +162,7 @@ static inline int restore_sigcontext_fpu
 	if (!(cpu_data->flags & CPU_HAS_FPU))
 		return 0;
 
-	tsk->used_math = 1;
+	set_used_math();
 	return __copy_from_user(&tsk->thread.fpu.hard, &sc->sc_fpregs[0],
 				sizeof(long)*(16*2+2));
 }
@@ -175,7 +175,7 @@ static inline int save_sigcontext_fpu(st
 	if (!(cpu_data->flags & CPU_HAS_FPU))
 		return 0;
 
-	if (!tsk->used_math) {
+	if (!used_math()) {
 		__put_user(0, &sc->sc_ownedfp);
 		return 0;
 	}
@@ -185,7 +185,7 @@ static inline int save_sigcontext_fpu(st
 	/* This will cause a "finit" to be triggered by the next
 	   attempted FPU operation by the 'current' process.
 	   */
-	tsk->used_math = 0;
+	clear_used_math();
 
 	unlazy_fpu(tsk, regs);
 	return __copy_to_user(&sc->sc_fpregs[0], &tsk->thread.fpu.hard,
@@ -219,7 +219,7 @@ restore_sigcontext(struct pt_regs *regs,
 
 		regs->sr |= SR_FD; /* Release FPU */
 		clear_fpu(tsk, regs);
-		tsk->used_math = 0;
+		clear_used_math();
 		__get_user (owned_fp, &sc->sc_ownedfp);
 		if (owned_fp)
 			err |= restore_sigcontext_fpu(sc);
--- mainline-5/arch/sh64/kernel/fpu.c.orig	2004-08-25 02:47:49.000000000 +0200
+++ mainline-5/arch/sh64/kernel/fpu.c	2005-01-21 06:20:02.182857824 +0100
@@ -158,12 +158,12 @@ do_fpu_state_restore(unsigned long ex, s
 		fpsave(&last_task_used_math->thread.fpu.hard);
         }
         last_task_used_math = current;
-        if (current->used_math) {
+        if (used_math()) {
                 fpload(&current->thread.fpu.hard);
         } else {
 		/* First time FPU user.  */
 		fpload(&init_fpuregs.hard);
-                current->used_math = 1;
+                set_used_math();
         }
 	release_fpu();
 }
--- mainline-5/arch/sh64/kernel/process.c.orig	2005-01-15 20:44:50.000000000 +0100
+++ mainline-5/arch/sh64/kernel/process.c	2005-01-21 06:20:02.195855848 +0100
@@ -688,7 +688,7 @@ void flush_thread(void)
 		last_task_used_math = NULL;
 	}
 	/* Force FPU state to be reinitialised after exec */
-	current->used_math = 0;
+	clear_used_math();
 #endif
 
 	/* if we are a kernel thread, about to change to user thread,
@@ -713,7 +713,7 @@ int dump_fpu(struct pt_regs *regs, elf_f
 	int fpvalid;
 	struct task_struct *tsk = current;
 
-	fpvalid = tsk->used_math;
+	fpvalid = !!tsk_used_math(tsk);
 	if (fpvalid) {
 		if (current == last_task_used_math) {
 			grab_fpu();
--- mainline-5/arch/sh64/kernel/ptrace.c.orig	2005-01-04 01:13:11.000000000 +0100
+++ mainline-5/arch/sh64/kernel/ptrace.c	2005-01-21 06:20:02.202854784 +0100
@@ -63,7 +63,7 @@ get_fpu_long(struct task_struct *task, u
 	struct pt_regs *regs;
 	regs = (struct pt_regs*)((unsigned char *)task + THREAD_SIZE) - 1;
 
-	if (!task->used_math) {
+	if (!tsk_used_math(task)) {
 		if (addr == offsetof(struct user_fpu_struct, fpscr)) {
 			tmp = FPSCR_INIT;
 		} else {
@@ -105,9 +105,9 @@ put_fpu_long(struct task_struct *task, u
 
 	regs = (struct pt_regs*)((unsigned char *)task + THREAD_SIZE) - 1;
 
-	if (!task->used_math) {
+	if (!tsk_used_math(task)) {
 		fpinit(&task->thread.fpu.hard);
-		task->used_math = 1;
+		set_stopped_child_used_math(task);
 	} else if (last_task_used_math == task) {
 		grab_fpu();
 		fpsave(&task->thread.fpu.hard);
@@ -187,7 +187,7 @@ asmlinkage int sys_ptrace(long request, 
 			 (addr <  offsetof(struct user, u_fpvalid))) {
 			tmp = get_fpu_long(child, addr - offsetof(struct user, fpu));
 		} else if (addr == offsetof(struct user, u_fpvalid)) {
-			tmp = child->used_math;
+			tmp = !!tsk_used_math(child);
 		} else {
 			break;
 		}
--- mainline-5/arch/sh64/kernel/signal.c.orig	2005-01-20 18:20:09.000000000 +0100
+++ mainline-5/arch/sh64/kernel/signal.c	2005-01-21 06:20:02.204854480 +0100
@@ -186,7 +186,7 @@ restore_sigcontext_fpu(struct pt_regs *r
 	int fpvalid;
 
 	err |= __get_user (fpvalid, &sc->sc_fpvalid);
-	current->used_math = fpvalid;
+	conditional_used_math(fpvalid);
 	if (! fpvalid)
 		return err;
 
@@ -207,7 +207,7 @@ setup_sigcontext_fpu(struct pt_regs *reg
 	int err = 0;
 	int fpvalid;
 
-	fpvalid = current->used_math;
+	fpvalid = !!used_math();
 	err |= __put_user(fpvalid, &sc->sc_fpvalid);
 	if (! fpvalid)
 		return err;
@@ -222,7 +222,7 @@ setup_sigcontext_fpu(struct pt_regs *reg
 
 	err |= __copy_to_user(&sc->sc_fpregs[0], &current->thread.fpu.hard,
 			      (sizeof(long long) * 32) + (sizeof(int) * 1));
-	current->used_math = 0;
+	clear_used_math();
 
 	return err;
 }
--- mainline-5/arch/sparc/kernel/process.c.orig	2005-01-15 20:44:50.000000000 +0100
+++ mainline-5/arch/sparc/kernel/process.c	2005-01-21 06:20:02.219852200 +0100
@@ -599,7 +599,7 @@ void dump_thread(struct pt_regs * regs, 
  */
 int dump_fpu (struct pt_regs * regs, elf_fpregset_t * fpregs)
 {
-	if (current->used_math == 0) {
+	if (!used_math()) {
 		memset(fpregs, 0, sizeof(*fpregs));
 		fpregs->pr_q_entrysize = 8;
 		return 1;
--- mainline-5/arch/sparc/kernel/signal.c.orig	2005-01-15 20:44:50.000000000 +0100
+++ mainline-5/arch/sparc/kernel/signal.c	2005-01-21 06:20:02.225851288 +0100
@@ -202,7 +202,7 @@ restore_fpu_state(struct pt_regs *regs, 
 		regs->psr &= ~PSR_EF;
 	}
 #endif
-	current->used_math = 1;
+	set_used_math();
 	clear_tsk_thread_flag(current, TIF_USEDFPU);
 
 	if (verify_area(VERIFY_READ, fpu, sizeof(*fpu)))
@@ -584,7 +584,7 @@ save_fpu_state(struct pt_regs *regs, __s
 				      &current->thread.fpqueue[0],
 				      ((sizeof(unsigned long) +
 				      (sizeof(unsigned long *)))*16));
-	current->used_math = 0;
+	clear_used_math();
 	return err;
 }
 
@@ -599,7 +599,7 @@ new_setup_frame(struct k_sigaction *ka, 
 	synchronize_user_stack();
 
 	sigframe_size = NF_ALIGNEDSZ;
-	if (!current->used_math)
+	if (!used_math())
 		sigframe_size -= sizeof(__siginfo_fpu_t);
 
 	sf = (struct new_signal_frame __user *)
@@ -616,7 +616,7 @@ new_setup_frame(struct k_sigaction *ka, 
 	
 	err |= __put_user(0, &sf->extra_size);
 
-	if (current->used_math) {
+	if (used_math()) {
 		err |= save_fpu_state(regs, &sf->fpu_state);
 		err |= __put_user(&sf->fpu_state, &sf->fpu_save);
 	} else {
@@ -677,7 +677,7 @@ new_setup_rt_frame(struct k_sigaction *k
 
 	synchronize_user_stack();
 	sigframe_size = RT_ALIGNEDSZ;
-	if (!current->used_math)
+	if (!used_math())
 		sigframe_size -= sizeof(__siginfo_fpu_t);
 	sf = (struct rt_signal_frame __user *)
 		get_sigframe(&ka->sa, regs, sigframe_size);
@@ -690,7 +690,7 @@ new_setup_rt_frame(struct k_sigaction *k
 	err |= __put_user(regs->npc, &sf->regs.npc);
 	err |= __put_user(regs->y, &sf->regs.y);
 	psr = regs->psr;
-	if (current->used_math)
+	if (used_math())
 		psr |= PSR_EF;
 	err |= __put_user(psr, &sf->regs.psr);
 	err |= __copy_to_user(&sf->regs.u_regs, regs->u_regs, sizeof(regs->u_regs));
--- mainline-5/arch/sparc/kernel/traps.c.orig	2004-08-25 02:47:49.000000000 +0200
+++ mainline-5/arch/sparc/kernel/traps.c	2005-01-21 06:20:02.233850072 +0100
@@ -246,17 +246,17 @@ void do_fpd_trap(struct pt_regs *regs, u
 		       &fptask->thread.fpqueue[0], &fptask->thread.fpqdepth);
 	}
 	last_task_used_math = current;
-	if(current->used_math) {
+	if(used_math()) {
 		fpload(&current->thread.float_regs[0], &current->thread.fsr);
 	} else {
 		/* Set initial sane state. */
 		fpload(&init_fregs[0], &init_fsr);
-		current->used_math = 1;
+		set_used_math();
 	}
 #else
-	if(!current->used_math) {
+	if(!used_math()) {
 		fpload(&init_fregs[0], &init_fsr);
-		current->used_math = 1;
+		set_used_math();
 	} else {
 		fpload(&current->thread.float_regs[0], &current->thread.fsr);
 	}
--- mainline-5/arch/x86_64/ia32/fpu32.c.orig	2005-01-20 18:20:09.000000000 +0100
+++ mainline-5/arch/x86_64/ia32/fpu32.c	2005-01-21 06:20:02.246848096 +0100
@@ -157,7 +157,7 @@ int restore_i387_ia32(struct task_struct
 				     sizeof(struct i387_fxsave_struct)))
 			return -1;
 		tsk->thread.i387.fxsave.mxcsr &= mxcsr_feature_mask;
-		tsk->used_math = 1;
+		set_stopped_child_used_math(tsk);
 	} 
 	return convert_fxsr_from_user(&tsk->thread.i387.fxsave, buf);
 }  
--- mainline-5/arch/x86_64/ia32/ia32_binfmt.c.orig	2005-01-15 20:44:50.000000000 +0100
+++ mainline-5/arch/x86_64/ia32/ia32_binfmt.c	2005-01-21 06:20:02.255846728 +0100
@@ -214,7 +214,7 @@ elf_core_copy_task_fpregs(struct task_st
 	struct _fpstate_ia32 *fpstate = (void*)fpu; 
 	mm_segment_t oldfs = get_fs();
 
-	if (!tsk->used_math) 
+	if (!tsk_used_math(tsk)) 
 		return 0;
 	if (!regs)
 		regs = (struct pt_regs *)tsk->thread.rsp0;
@@ -235,7 +235,7 @@ static inline int 
 elf_core_copy_task_xfpregs(struct task_struct *t, elf_fpxregset_t *xfpu)
 {
 	struct pt_regs *regs = ((struct pt_regs *)(t->thread.rsp0))-1; 
-	if (!t->used_math) 
+	if (!tsk_used_math(t)) 
 		return 0;
 	if (t == current)
 		unlazy_fpu(t); 
--- mainline-5/arch/x86_64/ia32/ia32_signal.c.orig	2005-01-15 20:44:50.000000000 +0100
+++ mainline-5/arch/x86_64/ia32/ia32_signal.c	2005-01-21 06:20:02.256846576 +0100
@@ -389,7 +389,7 @@ ia32_setup_sigcontext(struct sigcontext_
 	if (tmp < 0)
 	  err = -EFAULT;
 	else { 
-		current->used_math = 0;
+		clear_used_math();
 		stts();
 	  err |= __put_user((u32)(u64)(tmp ? fpstate : NULL), &sc->fpstate);
 	}
--- mainline-5/arch/x86_64/ia32/ptrace32.c.orig	2005-01-04 01:13:11.000000000 +0100
+++ mainline-5/arch/x86_64/ia32/ptrace32.c	2005-01-21 06:20:02.272844144 +0100
@@ -358,7 +358,7 @@ asmlinkage long sys32_ptrace(long reques
 			break;
 		/* no checking to be bug-to-bug compatible with i386 */
 		__copy_from_user(&child->thread.i387.fxsave, u, sizeof(*u));
-		child->used_math = 1;
+		set_stopped_child_used_math(child);
 		child->thread.i387.fxsave.mxcsr &= mxcsr_feature_mask;
 		ret = 0; 
 		break;
--- mainline-5/arch/x86_64/kernel/i387.c.orig	2004-08-25 02:47:33.000000000 +0200
+++ mainline-5/arch/x86_64/kernel/i387.c	2005-01-21 06:20:02.282842624 +0100
@@ -57,12 +57,12 @@ void __init fpu_init(void)
 	mxcsr_feature_mask_init();
 	/* clean state in init */
 	current_thread_info()->status = 0;
-	current->used_math = 0;
+	clear_used_math();
 }
 
 void init_fpu(struct task_struct *child)
 {
-	if (child->used_math) { 
+	if (tsk_used_math(child)) { 
 		if (child == current)
 			unlazy_fpu(child);
 		return;
@@ -70,7 +70,8 @@ void init_fpu(struct task_struct *child)
 	memset(&child->thread.i387.fxsave, 0, sizeof(struct i387_fxsave_struct));
 	child->thread.i387.fxsave.cwd = 0x37f;
 	child->thread.i387.fxsave.mxcsr = 0x1f80;
-	child->used_math = 1;
+	/* only the device not available exception or ptrace can call init_fpu */
+	set_stopped_child_used_math(child);
 }
 
 /*
@@ -91,9 +92,9 @@ int save_i387(struct _fpstate __user *bu
 	if ((unsigned long)buf % 16) 
 		printk("save_i387: bad fpstate %p\n",buf); 
 
-	if (!tsk->used_math) 
+	if (!used_math()) 
 		return 0;
-	tsk->used_math = 0; /* trigger finit */ 
+	clear_used_math(); /* trigger finit */ 
 	if (tsk->thread_info->status & TS_USEDFPU) {
 		err = save_i387_checking((struct i387_fxsave_struct __user *)buf);
 		if (err) return err;
@@ -133,7 +134,7 @@ int dump_fpu( struct pt_regs *regs, stru
 {
 	struct task_struct *tsk = current;
 
-	if (!tsk->used_math) 
+	if (!used_math()) 
 		return 0;
 
 	unlazy_fpu(tsk);
@@ -143,7 +144,7 @@ int dump_fpu( struct pt_regs *regs, stru
 
 int dump_task_fpu(struct task_struct *tsk, struct user_i387_struct *fpu)
 {
-	int fpvalid = tsk->used_math;
+	int fpvalid = !!tsk_used_math(tsk);
 
 	if (fpvalid) {
 		if (tsk == current)
--- mainline-5/arch/x86_64/kernel/process.c.orig	2005-01-15 20:44:50.000000000 +0100
+++ mainline-5/arch/x86_64/kernel/process.c	2005-01-21 06:20:02.290841408 +0100
@@ -314,7 +314,7 @@ void flush_thread(void)
 	 * Forget coprocessor state..
 	 */
 	clear_fpu(tsk);
-	tsk->used_math = 0;
+	clear_used_math();
 }
 
 void release_thread(struct task_struct *dead_task)
--- mainline-5/arch/x86_64/kernel/ptrace.c.orig	2005-01-04 01:13:11.000000000 +0100
+++ mainline-5/arch/x86_64/kernel/ptrace.c	2005-01-21 06:20:02.302839584 +0100
@@ -480,7 +480,7 @@ asmlinkage long sys_ptrace(long request,
 			ret = -EIO;
 			break;
 		}
-		child->used_math = 1;
+		set_stopped_child_used_math(child);
 		ret = set_fpregs(child, (struct user_i387_struct __user *)data);
 		break;
 	}
--- mainline-5/arch/x86_64/kernel/signal.c.orig	2005-01-20 18:20:09.000000000 +0100
+++ mainline-5/arch/x86_64/kernel/signal.c	2005-01-21 06:20:02.304839280 +0100
@@ -251,7 +251,7 @@ static void setup_rt_frame(int sig, stru
 	int err = 0;
 	struct task_struct *me = current;
 
-	if (me->used_math) {
+	if (used_math()) {
 		fp = get_stack(ka, regs, sizeof(struct _fpstate)); 
 		frame = (void __user *)round_down((unsigned long)fp - sizeof(struct rt_sigframe), 16) - 8;
 
--- mainline-5/arch/x86_64/kernel/traps.c.orig	2005-01-15 20:44:50.000000000 +0100
+++ mainline-5/arch/x86_64/kernel/traps.c	2005-01-21 06:20:02.318837152 +0100
@@ -901,7 +901,7 @@ asmlinkage void math_state_restore(void)
 	struct task_struct *me = current;
 	clts();			/* Allow maths ops (or we recurse) */
 
-	if (!me->used_math)
+	if (!used_math())
 		init_fpu(me);
 	restore_fpu_checking(&me->thread.i387.fxsave);
 	me->thread_info->status |= TS_USEDFPU;
--- mainline-5/arch/m32r/kernel/ptrace.c.orig	2005-01-15 20:44:49.000000000 +0100
+++ mainline-5/arch/m32r/kernel/ptrace.c	2005-01-21 06:20:02.325836088 +0100
@@ -130,7 +130,7 @@ static int ptrace_read_user(struct task_
 #ifndef NO_FPU
 		else if (off >= (long)(&dummy->fpu >> 2) &&
 			 off < (long)(&dummy->u_fpvalid >> 2)) {
-			if (!tsk->used_math) {
+			if (!tsk_used_math(tsk)) {
 				if (off == (long)(&dummy->fpu.fpscr >> 2))
 					tmp = FPSCR_INIT;
 				else
@@ -139,7 +139,7 @@ static int ptrace_read_user(struct task_
 				tmp = ((long *)(&tsk->thread.fpu >> 2))
 					[off - (long)&dummy->fpu];
 		} else if (off == (long)(&dummy->u_fpvalid >> 2))
-			tmp = tsk->used_math;
+			tmp = !!tsk_used_math(tsk);
 #endif /* not NO_FPU */
 		else
 			tmp = 0;
@@ -187,12 +187,12 @@ static int ptrace_write_user(struct task
 #ifndef NO_FPU
 		else if (off >= (long)(&dummy->fpu >> 2) &&
 			 off < (long)(&dummy->u_fpvalid >> 2)) {
-			tsk->used_math = 1;
+			set_stopped_child_used_math(tsk);
 			((long *)&tsk->thread.fpu)
 				[off - (long)&dummy->fpu] = data;
 			ret = 0;
 		} else if (off == (long)(&dummy->u_fpvalid >> 2)) {
-			tsk->used_math = data ? 1 : 0;
+			conditional_stopped_child_used_math(data, tsk);
 			ret = 0;
 		}
 #endif /* not NO_FPU */
--- mainline-5/arch/m32r/kernel/setup.c.orig	2005-01-15 20:44:49.000000000 +0100
+++ mainline-5/arch/m32r/kernel/setup.c	2005-01-21 06:20:02.327835784 +0100
@@ -391,7 +391,7 @@ void __init cpu_init (void)
 
 	/* Force FPU initialization */
 	current_thread_info()->status = 0;
-	current->used_math = 0;
+	clear_used_math();
 
 #ifdef CONFIG_MMU
 	/* Set up MMU */
--- mainline-5/include/asm-arm26/constants.h.orig	2003-06-08 18:21:42.000000000 +0200
+++ mainline-5/include/asm-arm26/constants.h	2005-01-21 06:20:02.339833960 +0100
@@ -7,7 +7,6 @@
  *
  */
 
-#define TSK_USED_MATH 788 /* offsetof(struct task_struct, used_math) */
 #define TSK_ACTIVE_MM 96 /* offsetof(struct task_struct, active_mm) */
 
 #define VMA_VM_MM 0 /* offsetof(struct vm_area_struct, vm_mm) */
--- mainline-5/include/asm-x86_64/i387.h.orig	2004-12-04 08:55:04.000000000 +0100
+++ mainline-5/include/asm-x86_64/i387.h	2005-01-21 06:20:02.349832440 +0100
@@ -25,16 +25,6 @@ extern void mxcsr_feature_mask_init(void
 extern void init_fpu(struct task_struct *child);
 extern int save_i387(struct _fpstate __user *buf);
 
-static inline int need_signal_i387(struct task_struct *me) 
-{ 
-	if (!me->used_math)
-		return 0;
-	me->used_math = 0; 
-	if (me->thread_info->status & TS_USEDFPU)
-		return 0;
-	return 1;
-} 
-
 /*
  * FPU lazy state save handling...
  */
--- mainline-5/include/linux/sched.h.orig	2005-01-21 06:17:24.967758152 +0100
+++ mainline-5/include/linux/sched.h	2005-01-21 06:21:42.854553400 +0100
@@ -614,19 +614,7 @@ struct task_struct {
 	struct key *process_keyring;	/* keyring private to this process (CLONE_THREAD) */
 	struct key *thread_keyring;	/* keyring private to this thread */
 #endif
-/*
- * Must be changed atomically so it shouldn't be
- * be a shareable bitflag.
- */
-	unsigned char used_math;
-/*
- * OOM kill score adjustment (bit shift).
- * Cannot live together with used_math since
- * used_math and oomkilladj can be changed at the
- * same time, so they would race if they're in the
- * same atomic block.
- */
-	short oomkilladj;
+	int oomkilladj; /* OOM kill score adjustment (bit shift). */
 	char comm[TASK_COMM_LEN];
 /* file system info */
 	int link_count, total_link_count;
@@ -695,7 +683,7 @@ struct task_struct {
 #endif
 #ifdef CONFIG_NUMA
   	struct mempolicy *mempolicy;
-  	short il_next;		/* could be shared with used_math */
+	short il_next;
 #endif
 };
 
@@ -737,7 +725,7 @@ do { if (atomic_dec_and_test(&(tsk)->usa
 #define PF_SIGNALED	0x00000400	/* killed by a signal */
 #define PF_MEMALLOC	0x00000800	/* Allocating memory */
 #define PF_FLUSHER	0x00001000	/* responsible for disk writeback */
-
+#define PF_USED_MATH	0x00002000	/* if unset the fpu must be initialized before use */
 #define PF_FREEZE	0x00004000	/* this task is being frozen for suspend now */
 #define PF_NOFREEZE	0x00008000	/* this thread should not be frozen */
 #define PF_FROZEN	0x00010000	/* frozen for system suspend */
@@ -748,6 +736,31 @@ do { if (atomic_dec_and_test(&(tsk)->usa
 #define PF_SYNCWRITE	0x00200000	/* I am doing a sync write */
 #define PF_BORROWED_MM	0x00400000	/* I am a kthread doing use_mm */
 
+/*
+ * Only the _current_ task can read/write to tsk->flags, but other
+ * tasks can access tsk->flags in readonly mode for example
+ * with tsk_used_math (like during threaded core dumping).
+ * There is however an exception to this rule during ptrace
+ * or during fork: the ptracer task is allowed to write to the
+ * child->flags of its traced child (same goes for fork, the parent
+ * can write to the child->flags), because we're guaranteed the
+ * child is not running and in turn not changing child->flags
+ * at the same time the parent does it.
+ */
+#define clear_stopped_child_used_math(child) do { (child)->flags &= ~PF_USED_MATH; } while (0)
+#define set_stopped_child_used_math(child) do { (child)->flags |= PF_USED_MATH; } while (0)
+#define clear_used_math() clear_stopped_child_used_math(current)
+#define set_used_math() set_stopped_child_used_math(current)
+#define conditional_stopped_child_used_math(condition, child) \
+	do { (child)->flags &= ~PF_USED_MATH, (child)->flags |= (condition) ? PF_USED_MATH : 0; } while (0)
+#define conditional_used_math(condition) \
+	conditional_stopped_child_used_math(condition, current)
+#define copy_to_stopped_child_used_math(child) \
+	do { (child)->flags &= ~PF_USED_MATH, (child)->flags |= current->flags & PF_USED_MATH; } while (0)
+/* NOTE: this will return 0 or PF_USED_MATH, it will never return 1 */
+#define tsk_used_math(p) ((p)->flags & PF_USED_MATH)
+#define used_math() tsk_used_math(current)
+
 #ifdef CONFIG_SMP
 extern int set_cpus_allowed(task_t *p, cpumask_t new_mask);
 #else
--- mainline-oom/arch/i386/kernel/signal.c.~1~	2005-01-15 20:44:49.000000000 +0100
+++ mainline-oom/arch/i386/kernel/signal.c	2005-01-21 06:27:20.433233640 +0100
@@ -192,9 +192,9 @@ restore_sigcontext(struct pt_regs *regs,
 			err |= restore_i387(buf);
 		} else {
 			struct task_struct *me = current;
-			if (me->used_math) {
+			if (used_math()) {
 				clear_fpu(me);
-				me->used_math = 0;
+				clear_used_math();
 			}
 		}
 	}
--- mainline-oom/arch/x86_64/ia32/ia32_signal.c.~1~	2005-01-21 06:24:51.970803360 +0100
+++ mainline-oom/arch/x86_64/ia32/ia32_signal.c	2005-01-21 06:29:47.208920344 +0100
@@ -263,9 +263,9 @@ ia32_restore_sigcontext(struct pt_regs *
 			err |= restore_i387_ia32(current, buf, 0);
 		} else {
 			struct task_struct *me = current;
-			if (me->used_math) {
+			if (used_math()) {
 				clear_fpu(me);
-				me->used_math = 0;
+				clear_used_math();
 			}
 		}
 	}
--- mainline-oom/arch/x86_64/kernel/signal.c.~1~	2005-01-21 06:24:51.975802600 +0100
+++ mainline-oom/arch/x86_64/kernel/signal.c	2005-01-21 06:29:41.988713936 +0100
@@ -126,9 +126,9 @@ restore_sigcontext(struct pt_regs *regs,
 			err |= restore_i387(buf);
 		} else {
 			struct task_struct *me = current;
-			if (me->used_math) {
+			if (used_math()) {
 				clear_fpu(me);
-				me->used_math = 0;
+				clear_used_math();
 			}
 		}
 	}
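
The behaviour of the new accessors is easy to check outside the kernel.
Below is a minimal standalone sketch (plain userspace C, not kernel code;
only PF_USED_MATH and the macro bodies are taken from the sched.h hunk
above) showing why call sites that copy the state into an int write
!!tsk_used_math(p): the macro yields 0 or the raw bit value, never 1.

#include <assert.h>
#include <stdio.h>

#define PF_USED_MATH	0x00002000	/* same bit as the sched.h hunk */

struct task { unsigned long flags; };

#define tsk_used_math(p)			((p)->flags & PF_USED_MATH)
#define set_stopped_child_used_math(p)		do { (p)->flags |= PF_USED_MATH; } while (0)
#define clear_stopped_child_used_math(p)	do { (p)->flags &= ~PF_USED_MATH; } while (0)

int main(void)
{
	struct task t = { .flags = 0 };

	assert(tsk_used_math(&t) == 0);			/* FPU never touched */
	set_stopped_child_used_math(&t);
	assert(tsk_used_math(&t) == PF_USED_MATH);	/* 0x2000, not 1 */
	printf("fpvalid = %d\n", !!tsk_used_math(&t));	/* normalised to 1 */
	clear_stopped_child_used_math(&t);		/* as in flush_thread() */
	return 0;
}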

^ permalink raw reply	[flat|nested] 22+ messages in thread

* writeback-highmem
  2005-01-21  5:50       ` OOM fixes 5/5 Andrea Arcangeli
@ 2005-01-21  6:01         ` Andrea Arcangeli
  2005-01-21  6:26           ` writeback-highmem Andrew Morton
  0 siblings, 1 reply; 22+ messages in thread
From: Andrea Arcangeli @ 2005-01-21  6:01 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-kernel, Nick Piggin, Rik van Riel

This needed highmem fix from Rik is still missing too, so please apply
it along with the other 5 (it's orthogonal, so you can apply this one in
any order you want).

From: Rik van Riel <riel@redhat.com>
Subject: [PATCH][1/2] adjust dirty threshold for lowmem-only mappings

Simply running "dd if=/dev/zero of=/dev/hd<one you can miss>" will
result in OOM kills, with the dirty pagecache completely filling up
lowmem.  This patch is part 1 of fixing that problem.

This patch effectively lowers the dirty limit for mappings which cannot
be cached in highmem, counting the dirty limit as a percentage of lowmem
instead.  This should prevent heavy block device writers from pushing
the VM over the edge and triggering OOM kills.
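
To illustrate with made-up numbers (a sketch only, ignoring the
unmapped-ratio clamp in get_dirty_limits): on a 4G x86 box with ~3G of
highmem and vm_dirty_ratio at 40, a blkdev mapping that cannot use
highmem would previously have been allowed ~1.6G of dirty pages (40% of
all memory), far more than lowmem can hold; with this patch its limit
becomes 40% of the ~1G of lowmem, roughly 400M, so the writer is
throttled long before lowmem fills up with dirty pagecache.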

Signed-off-by: Rik van Riel <riel@redhat.com>
Acked-by: Andrea Arcangeli <andrea@suse.de>

--- x/mm/page-writeback.c.orig	2005-01-04 01:13:30.000000000 +0100
+++ x/mm/page-writeback.c	2005-01-04 02:41:29.573177184 +0100
@@ -133,7 +133,8 @@ static void get_writeback_state(struct w
  * clamping level.
  */
 static void
-get_dirty_limits(struct writeback_state *wbs, long *pbackground, long *pdirty)
+get_dirty_limits(struct writeback_state *wbs, long *pbackground, long *pdirty,
+		 struct address_space *mapping)
 {
 	int background_ratio;		/* Percentages */
 	int dirty_ratio;
@@ -141,10 +142,20 @@ get_dirty_limits(struct writeback_state 
 	long background;
 	long dirty;
 	struct task_struct *tsk;
+	unsigned long available_memory = total_pages;
 
 	get_writeback_state(wbs);
 
-	unmapped_ratio = 100 - (wbs->nr_mapped * 100) / total_pages;
+#ifdef CONFIG_HIGHMEM
+	/*
+	 * In some cases we can only allocate from low memory,
+	 * so we exclude high memory from our count.
+	 */
+	if (mapping && !(mapping_gfp_mask(mapping) & __GFP_HIGHMEM))
+		available_memory -= totalhigh_pages;
+#endif
+
+	unmapped_ratio = 100 - (wbs->nr_mapped * 100) / available_memory;
 
 	dirty_ratio = vm_dirty_ratio;
 	if (dirty_ratio > unmapped_ratio / 2)
@@ -194,7 +205,7 @@ static void balance_dirty_pages(struct a
 			.nr_to_write	= write_chunk,
 		};
 
-		get_dirty_limits(&wbs, &background_thresh, &dirty_thresh);
+		get_dirty_limits(&wbs, &background_thresh, &dirty_thresh, mapping);
 		nr_reclaimable = wbs.nr_dirty + wbs.nr_unstable;
 		if (nr_reclaimable + wbs.nr_writeback <= dirty_thresh)
 			break;
@@ -210,7 +221,7 @@ static void balance_dirty_pages(struct a
 		if (nr_reclaimable) {
 			writeback_inodes(&wbc);
 			get_dirty_limits(&wbs, &background_thresh,
-					&dirty_thresh);
+					&dirty_thresh, mapping);
 			nr_reclaimable = wbs.nr_dirty + wbs.nr_unstable;
 			if (nr_reclaimable + wbs.nr_writeback <= dirty_thresh)
 				break;
@@ -296,7 +307,7 @@ static void background_writeout(unsigned
 		long background_thresh;
 		long dirty_thresh;
 
-		get_dirty_limits(&wbs, &background_thresh, &dirty_thresh);
+		get_dirty_limits(&wbs, &background_thresh, &dirty_thresh, NULL);
 		if (wbs.nr_dirty + wbs.nr_unstable < background_thresh
 				&& min_pages <= 0)
 			break;

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: OOM fixes 2/5
  2005-01-21  5:49 ` OOM fixes 2/5 Andrea Arcangeli
  2005-01-21  5:49   ` OOM fixes 3/5 Andrea Arcangeli
@ 2005-01-21  6:20   ` Andrew Morton
  2005-01-21  6:35     ` Andrea Arcangeli
  2005-01-21  6:36     ` Nick Piggin
  1 sibling, 2 replies; 22+ messages in thread
From: Andrew Morton @ 2005-01-21  6:20 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: linux-kernel, npiggin

Andrea Arcangeli <andrea@suse.de> wrote:
>
>  This is the forward port to 2.6 of the lowmem_reserved algorithm I
>  invented in 2.4.1*, merged in 2.4.2x already and needed to fix workloads
>  like google (especially without swap) on x86 with >1G of ram, but it's
>  needed in all sort of workloads with lots of ram on x86, it's also
>  needed on x86-64 for dma allocations. This brings 2.6 in sync with
>  latest 2.4.2x.

But this patch doesn't change anything at all in the page allocation path
apart from renaming lots of things, does it?

AFAICT all it does is to change the default values in the protection map. 
It does it via a simplification, which is nice, but I can't see how it
fixes anything.

Confused.

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: writeback-highmem
  2005-01-21  6:01         ` writeback-highmem Andrea Arcangeli
@ 2005-01-21  6:26           ` Andrew Morton
  2005-01-21  6:41             ` writeback-highmem Andrea Arcangeli
  2005-01-21 13:46             ` writeback-highmem Rik van Riel
  0 siblings, 2 replies; 22+ messages in thread
From: Andrew Morton @ 2005-01-21  6:26 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: linux-kernel, npiggin, Rik van Riel

Andrea Arcangeli <andrea@suse.de> wrote:
>
> This needed highmem fix from Rik is still missing too, so please apply
>  along the other 5 (it's orthogonal so you can apply this one in any
>  order you want).
> 
>  From: Rik van Riel <riel@redhat.com>
>  Subject: [PATCH][1/2] adjust dirty threshold for lowmem-only mappings

I've held off on this one because the recent throttling fix should have
helped this problem.  Has anyone confirmed that this patch still actually
fixes something?  If so, what was the scenario?

Thanks.

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: OOM fixes 2/5
  2005-01-21  6:20   ` OOM fixes 2/5 Andrew Morton
@ 2005-01-21  6:35     ` Andrea Arcangeli
  2005-01-21  6:36     ` Nick Piggin
  1 sibling, 0 replies; 22+ messages in thread
From: Andrea Arcangeli @ 2005-01-21  6:35 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-kernel, npiggin

On Thu, Jan 20, 2005 at 10:20:56PM -0800, Andrew Morton wrote:
> Andrea Arcangeli <andrea@suse.de> wrote:
> >
> >  This is the forward port to 2.6 of the lowmem_reserved algorithm I
> >  invented in 2.4.1*, merged in 2.4.2x already and needed to fix workloads
> >  like google (especially without swap) on x86 with >1G of ram, but it's
> >  needed in all sort of workloads with lots of ram on x86, it's also
> >  needed on x86-64 for dma allocations. This brings 2.6 in sync with
> >  latest 2.4.2x.
> 
> But this patch doesn't change anything at all in the page allocation path
> apart from renaming lots of things, does it?

Not in the allocation path, no, but it rewrites the setting algorithm, so
to somebody watching it from userspace it's a completely different
thing, usable for the first time ever in 2.6. Otherwise userspace would
be required to have knowledge of the kernel internals to be able to
set it to a sane value. Plus the new init code is much cleaner too.

> AFAICT all it does is to change the default values in the protection map. 
> It does it via a simplification, which is nice, but I can't see how it
> fixes anything.

Having this patch applied is a major fix. See again the google fix
thread in 2.4.1x; 2.6 is vulnerable to it again. This patch makes the
feature usable and enables it as well, which is definitely a
fix as far as an end user is concerned (google was the user in this case).

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: OOM fixes 2/5
  2005-01-21  6:20   ` OOM fixes 2/5 Andrew Morton
  2005-01-21  6:35     ` Andrea Arcangeli
@ 2005-01-21  6:36     ` Nick Piggin
  2005-01-21  6:46       ` Andrew Morton
  2005-01-21  6:52       ` Andrea Arcangeli
  1 sibling, 2 replies; 22+ messages in thread
From: Nick Piggin @ 2005-01-21  6:36 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Andrea Arcangeli, linux-kernel, npiggin

On Thu, 2005-01-20 at 22:20 -0800, Andrew Morton wrote:
> Andrea Arcangeli <andrea@suse.de> wrote:
> >
> >  This is the forward port to 2.6 of the lowmem_reserved algorithm I
> >  invented in 2.4.1*, merged in 2.4.2x already and needed to fix workloads
> >  like google (especially without swap) on x86 with >1G of ram, but it's
> >  needed in all sort of workloads with lots of ram on x86, it's also
> >  needed on x86-64 for dma allocations. This brings 2.6 in sync with
> >  latest 2.4.2x.
> 
> But this patch doesn't change anything at all in the page allocation path
> apart from renaming lots of things, does it?
> 
> AFAICT all it does is to change the default values in the protection map. 
> It does it via a simplification, which is nice, but I can't see how it
> fixes anything.
> 
> Confused.


It does turn on lowmem protection by default. We never reached
an agreement about doing this, though Andrea has shown that
it fixes trivial OOM cases.

I think it should be turned on by default. I can't recall what
your reservations were...?




^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: writeback-highmem
  2005-01-21  6:26           ` writeback-highmem Andrew Morton
@ 2005-01-21  6:41             ` Andrea Arcangeli
  2005-01-21 13:46             ` writeback-highmem Rik van Riel
  1 sibling, 0 replies; 22+ messages in thread
From: Andrea Arcangeli @ 2005-01-21  6:41 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-kernel, npiggin, Rik van Riel

On Thu, Jan 20, 2005 at 10:26:30PM -0800, Andrew Morton wrote:
> Andrea Arcangeli <andrea@suse.de> wrote:
> >
> > This needed highmem fix from Rik is still missing too, so please apply
> >  along the other 5 (it's orthogonal so you can apply this one in any
> >  order you want).
> > 
> >  From: Rik van Riel <riel@redhat.com>
> >  Subject: [PATCH][1/2] adjust dirty threshold for lowmem-only mappings
> 
> I've held off on this one because the recent throttling fix should have
> helped this problem.  Has anyone confirmed that this patch still actually
> fixes something?  If so, what was the scenario?

Without this fix, write throttling is completely broken for a blkdev:
it won't start _at_all_, and writers just keep hanging in the allocation
routines. I agree it won't explain oom (with the other fixes the VM
should write back synchronously instead of going oom), but it can make
the box completely unusable under a cp /dev/zero /dev/somedevice.

There is a reason why we start write throttling before 100% of ram is
being locked by dirty pages in the pagecache path.

The beauty of this fix is that Rik allowed the pagecache not to have the
limit (in 2.4 pagecache had the limit too). Probably async writeback
won't start but at least the write throttling will and that's all we
need to keep the box running other apps at the same time of the write.

If the system goes unresponsive for 10 minutes and swaps during backups
or workloads working on the blkdev, users will file bugreports, and
they'd be correct.

In short I agree this shouldn't be applied for oom, but it's still
definitely a correct and needed fix (and I rate it a bit more than just
an optimization).

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: OOM fixes 2/5
  2005-01-21  6:36     ` Nick Piggin
@ 2005-01-21  6:46       ` Andrew Morton
  2005-01-21  7:04         ` Nick Piggin
                           ` (2 more replies)
  2005-01-21  6:52       ` Andrea Arcangeli
  1 sibling, 3 replies; 22+ messages in thread
From: Andrew Morton @ 2005-01-21  6:46 UTC (permalink / raw)
  To: Nick Piggin; +Cc: andrea, linux-kernel

Nick Piggin <nickpiggin@yahoo.com.au> wrote:
>
> On Thu, 2005-01-20 at 22:20 -0800, Andrew Morton wrote:
> > Andrea Arcangeli <andrea@suse.de> wrote:
> > >
> > >  This is the forward port to 2.6 of the lowmem_reserved algorithm I
> > >  invented in 2.4.1*, merged in 2.4.2x already and needed to fix workloads
> > >  like google (especially without swap) on x86 with >1G of ram, but it's
> > >  needed in all sort of workloads with lots of ram on x86, it's also
> > >  needed on x86-64 for dma allocations. This brings 2.6 in sync with
> > >  latest 2.4.2x.
> > 
> > But this patch doesn't change anything at all in the page allocation path
> > apart from renaming lots of things, does it?
> > 
> > AFAICT all it does is to change the default values in the protection map. 
> > It does it via a simplification, which is nice, but I can't see how it
> > fixes anything.
> > 
> > Confused.
> 
> 
> It does turn on lowmem protection by default. We never reached
> an agreement about doing this though, but Andrea has shown that
> it fixes trivial OOM cases.
> 
> I think it should be turned on by default. I can't recall what
> your reservations were...?
> 

Just that it throws away a bunch of potentially usable memory.  In three
years I've seen zero reports of any problems which would have been solved
by increasing the protection ratio.

Thus empirically, it appears that the number of machines which need a
non-zero protection ratio is exceedingly small.  Why change the setting on
all machines for the benefit of the tiny few?  Seems weird.  Especially
when this problem could be solved with a few-line initscript.  Ho hum.

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: OOM fixes 2/5
  2005-01-21  6:36     ` Nick Piggin
  2005-01-21  6:46       ` Andrew Morton
@ 2005-01-21  6:52       ` Andrea Arcangeli
  2005-01-21  7:00         ` Andrew Morton
  1 sibling, 1 reply; 22+ messages in thread
From: Andrea Arcangeli @ 2005-01-21  6:52 UTC (permalink / raw)
  To: Nick Piggin; +Cc: Andrew Morton, linux-kernel

On Fri, Jan 21, 2005 at 05:36:14PM +1100, Nick Piggin wrote:
> I think it should be turned on by default. I can't recall what

I think so too, since the number of people that can be bitten by this is
certainly higher than the number of people who know the VM internals
and for what kind of workloads they need to enable this by hand to avoid
risking lockups (notably with boxes without swap, or with heavy pagetable
allocations all the time, which is not uncommon with db usage).

This is needed on x86-64 too to avoid pagetables to lockup the dma zone.
Or anyways it's needed also on x86 for the dma zone on <1G boxes too.

Anyway, if you leave it off by default I don't mind: with my new code,
forward ported straight from 2.4 mainline, it's possible for the first
time to set it from userspace without having to embed knowledge of the
kernel min_kbytes settings at boot time. So if you want it off by
default, it simply means we'll guarantee it on our distro with userland.
Setting a sysctl at boot time is no big deal for us (of course leaving
it enabled by default in kernel space helps older distros where userland
isn't yet aware of it). So it's pretty much up to you; as long as we
can easily fix it up in userland it's fine with me, and I've already
tried a dozen times to push mainline in what I believe to be the right
direction (like I did in 2.4 mainline, since that same code is enabled
by default in 2.4).

The sysctl name had to change to lowmem_reserve_ratio because its
semantics are completely different now.
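
To make the new semantics concrete, here is a standalone sketch of the
ratio-based computation (illustrative only: the zone sizes are invented,
the { 256, 32 } ratios are the 2.4-style values used for illustration,
and the kernel does this per-node against its own zone structures):

#include <stdio.h>

#define NR_ZONES 3				/* DMA, Normal, HighMem */

int main(void)
{
	const char *name[NR_ZONES] = { "DMA", "Normal", "HighMem" };
	/* in 4K pages: 16M DMA, 880M Normal, 128M HighMem (invented) */
	unsigned long pages[NR_ZONES] = { 4096, 225280, 32768 };
	/* one ratio per lower zone */
	unsigned long ratio[NR_ZONES - 1] = { 256, 32 };
	unsigned long reserve[NR_ZONES][NR_ZONES] = { { 0 } };
	unsigned long higher;
	int j, idx;

	for (j = 0; j < NR_ZONES; j++) {
		higher = pages[j];
		for (idx = j - 1; idx >= 0; idx--) {
			/* the lower zone protects pages-above / ratio */
			reserve[idx][j] = higher / ratio[idx];
			higher += pages[idx];
		}
	}

	for (idx = 0; idx < NR_ZONES; idx++)
		for (j = idx + 1; j < NR_ZONES; j++)
			printf("%s keeps %lu pages out of reach of %s-capable allocations\n",
			       name[idx], reserve[idx][j], name[j]);
	return 0;
}

So the reserve scales with the amount of memory above each zone instead
of being an absolute kbytes figure userspace has to guess.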

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: OOM fixes 2/5
  2005-01-21  6:52       ` Andrea Arcangeli
@ 2005-01-21  7:00         ` Andrew Morton
  2005-01-21  7:10           ` Andrea Arcangeli
  0 siblings, 1 reply; 22+ messages in thread
From: Andrew Morton @ 2005-01-21  7:00 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: nickpiggin, linux-kernel

Andrea Arcangeli <andrea@suse.de> wrote:
>
> Anyway if you leave it off by default I don't mind, with my new code
>  forward ported stright from 2.4 mainline, it's possible for the first
>  time to set it from userspace without having to embed knowledge on the
>  kernel min_kbytes settings at boot time.

Last time we discussed this you pointed out that reserving more lowmem from
highmem-capable allocations may actually *help* things.  (Tries to remember
why) By reducing inode/dentry eviction rates?  I asked Martin Bligh if he
could test that on a big NUMA box but iirc the results were inconclusive.

Maybe it just won't make much difference.  Hard to say.

>  The sysctl name had to change to lowmem_reserve_ratio because its
>  semantics are completely different now.

That reminds me.  Documentation/filesystems/proc.txt ;)

I'll cook something up for that.

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: OOM fixes 2/5
  2005-01-21  6:46       ` Andrew Morton
@ 2005-01-21  7:04         ` Nick Piggin
  2005-01-21  7:17           ` Andrea Arcangeli
  2005-01-21  7:04         ` Andrea Arcangeli
  2005-01-21  7:08         ` Andi Kleen
  2 siblings, 1 reply; 22+ messages in thread
From: Nick Piggin @ 2005-01-21  7:04 UTC (permalink / raw)
  To: Andrew Morton; +Cc: andrea, linux-kernel

On Thu, 2005-01-20 at 22:46 -0800, Andrew Morton wrote:
> Nick Piggin <nickpiggin@yahoo.com.au> wrote:

> > It does turn on lowmem protection by default. We never reached
> > an agreement about doing this though, but Andrea has shown that
> > it fixes trivial OOM cases.
> > 
> > I think it should be turned on by default. I can't recall what
> > your reservations were...?
> > 
> 
> Just that it throws away a bunch of potentially usable memory.  In three
> years I've seen zero reports of any problems which would have been solved
> by increasing the protection ratio.
> 
> Thus empirically, it appears that the number of machines which need a
> non-zero protection ratio is exceedingly small.  Why change the setting on
> all machines for the benefit of the tiny few?  Seems weird.  Especially
> when this problem could be solved with a few-line initscript.  Ho hum.


That is true, but it should not reserve a great deal of memory on
small memory machines. ZONE_NORMAL reservation may not even be too
noticeable as you'll usually have ZONE_NORMAL allocations during
the course of normal running.

Although it is true that there haven't been many problems attributed
to this, one example I can remember is when we fixed the __alloc_pages
watermark code, we fixed a bug that was reserving much more ZONE_DMA
than it was supposed to. This cased all those page allocation failure
problems. So we raised the atomic reserve, but that didn't bring
ZONE_DMA reservation back to its previous levels.

"So the buffer between GFP_KERNEL and GFP_ATOMIC allocations is:

2.6.8      | 465 dma, 117 norm, 582 tot = 2328K
2.6.10-rc  |   2 dma, 146 norm, 148 tot =  592K
patch      |  12 dma, 500 norm, 512 tot = 2048K"

So we were still seeing GFP_DMA allocation failures in the sound code.
You recently had to make that NOWARN to shut it up.

OK this is a fairly lame example... but the current code is more or
less just lucky that ZONE_DMA doesn't usually fill up with pinned mem
on machines that need explicit ZONE_DMA allocations.




^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: OOM fixes 2/5
  2005-01-21  6:46       ` Andrew Morton
  2005-01-21  7:04         ` Nick Piggin
@ 2005-01-21  7:04         ` Andrea Arcangeli
  2005-01-21  7:08         ` Andi Kleen
  2 siblings, 0 replies; 22+ messages in thread
From: Andrea Arcangeli @ 2005-01-21  7:04 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Nick Piggin, linux-kernel, Hugh Dickins

On Thu, Jan 20, 2005 at 10:46:45PM -0800, Andrew Morton wrote:
> Thus empirically, it appears that the number of machines which need a
> non-zero protection ratio is exceedingly small.  Why change the setting on
> all machines for the benefit of the tiny few?  Seems weird.  Especially
> when this problem could be solved with a few-line initscript.  Ho hum.

It's up to you; IMHO you're making a mistake, but I don't mind as long as
our customers aren't at risk of early oom kills (or worse, kernel crashes)
with some db load (especially without swap the risk is huge for all
users, since all anonymous memory will be pinned like ptes, but with ~3G
of pagetables they're at risk even with swap).  At least you *must*
admit that without my patch applied as I posted, there's a >0 probability
of running out of the normal zone, which will lead to an oom-kill or a
deadlock even though 10G of highmem might still be freeable (like with
clean cache). My patch obviously cannot make it impossible to run
out of the normal zone, since there's only 800m of it and one can
open more files than fit there, but at least it gives the
user the security that a certain workload can run reliably. Without this
patch there's no guarantee at all that any workload will run when >1G of
ptes is allocated.

The fix below is needed as well, even though you won't find reports of
people reproducing this race condition. Please apply. CC'ed Hugh. Sorry
Hugh, I know you were working on it (you said not on the weekend IIRC),
but I've upgraded to the latest bk, so I had to fix it up quickly or I
would have had to run the racy code on my smp systems to test new kernels.

From: Andrea Arcangeli <andrea@suse.de>
Subject: fixup smp race introduced in 2.6.11-rc1

Signed-off-by: Andrea Arcangeli <andrea@suse.de>

--- x/mm/memory.c.~1~	2005-01-21 06:58:14.747335048 +0100
+++ x/mm/memory.c	2005-01-21 07:16:15.318063328 +0100
@@ -1555,8 +1555,17 @@ void unmap_mapping_range(struct address_
 
 	spin_lock(&mapping->i_mmap_lock);
 
+	/* serialize i_size write against truncate_count write */
+	smp_wmb(); 
 	/* Protect against page faults, and endless unmapping loops */
 	mapping->truncate_count++;
+	/*
+	 * For archs where spin_lock has inclusive semantics like ia64
+	 * this smp_mb() prevents the pagetable contents from being read
+	 * before the truncate_count increment is visible to
+	 * other cpus.
+	 */
+	smp_mb();
 	if (unlikely(is_restart_addr(mapping->truncate_count))) {
 		if (mapping->truncate_count == 0)
 			reset_vma_truncate_counts(mapping);
@@ -1864,10 +1873,18 @@ do_no_page(struct mm_struct *mm, struct 
 	if (vma->vm_file) {
 		mapping = vma->vm_file->f_mapping;
 		sequence = mapping->truncate_count;
+		smp_rmb(); /* serializes i_size against truncate_count */
 	}
 retry:
 	cond_resched();
 	new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, &ret);
+	/*
+	 * No smp_rmb is needed here as long as there's a full
+	 * spin_lock/unlock sequence inside the ->nopage callback
+	 * (for the pagecache lookup) that acts as an implicit
+	 * smp_mb() and prevents the i_size read from happening
+	 * after the next truncate_count read.
+	 */
 
 	/* no page was available -- either SIGBUS or OOM */
 	if (new_page == NOPAGE_SIGBUS)
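
The ordering the two barriers enforce can be sketched in userspace with
C11 fences standing in for smp_wmb()/smp_rmb() (names and structure are
invented for illustration; the kernel side relies on i_mmap_lock and the
->nopage callback's own locking instead of a second explicit fence):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static _Atomic unsigned long i_size;		/* stand-in for inode size */
static _Atomic unsigned int truncate_count;

static void *truncater(void *arg)
{
	unsigned long n;
	for (n = 1; n <= 100000; n++) {
		atomic_store_explicit(&i_size, n, memory_order_relaxed);
		/* like smp_wmb(): publish i_size before bumping the count */
		atomic_thread_fence(memory_order_release);
		atomic_fetch_add_explicit(&truncate_count, 1,
					  memory_order_relaxed);
	}
	return arg;
}

static void *faulter(void *arg)
{
	int i;
	for (i = 0; i < 100000; i++) {
		unsigned int seq, again;
		unsigned long size;
		do {
			/* like smp_rmb(): count read before i_size read */
			seq = atomic_load_explicit(&truncate_count,
						   memory_order_acquire);
			size = atomic_load_explicit(&i_size,
						    memory_order_relaxed);
			/* order the i_size read before the re-check */
			atomic_thread_fence(memory_order_acquire);
			again = atomic_load_explicit(&truncate_count,
						     memory_order_relaxed);
		} while (seq != again);	/* count moved: retry, as do_no_page does */
		(void)size;
	}
	return arg;
}

int main(void)
{
	pthread_t a, b;
	pthread_create(&a, NULL, truncater, NULL);
	pthread_create(&b, NULL, faulter, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	puts("done");
	return 0;
}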


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: OOM fixes 2/5
  2005-01-21  6:46       ` Andrew Morton
  2005-01-21  7:04         ` Nick Piggin
  2005-01-21  7:04         ` Andrea Arcangeli
@ 2005-01-21  7:08         ` Andi Kleen
  2005-01-21  7:21           ` Andrea Arcangeli
  2 siblings, 1 reply; 22+ messages in thread
From: Andi Kleen @ 2005-01-21  7:08 UTC (permalink / raw)
  To: Andrew Morton; +Cc: andrea, linux-kernel

Andrew Morton <akpm@osdl.org> writes:

> Just that it throws away a bunch of potentially usable memory.  In three
> years I've seen zero reports of any problems which would have been solved
> by increasing the protection ratio.

We ran into a big problem with this on x86-64. The SUSE installer
would load the floppy driver during installation. The floppy driver would
try to allocate some pages with GFP_DMA, and on a small-memory x86-64
system (256-512MB) the OOM killer would always start killing things
trying to free some DMA pages. This was quite a show stopper
because you effectively couldn't install.

So at least for GFP_DMA it seems to be definitely needed.

-Andi

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: OOM fixes 2/5
  2005-01-21  7:00         ` Andrew Morton
@ 2005-01-21  7:10           ` Andrea Arcangeli
  0 siblings, 0 replies; 22+ messages in thread
From: Andrea Arcangeli @ 2005-01-21  7:10 UTC (permalink / raw)
  To: Andrew Morton; +Cc: nickpiggin, linux-kernel

On Thu, Jan 20, 2005 at 11:00:16PM -0800, Andrew Morton wrote:
> Last time we dicsussed this you pointed out that reserving more lowmem from
> highmem-capable allocations may actually *help* things.  (Tries to remember
> why) By reducing inode/dentry eviction rates?  I asked Martin Bligh if he
> could test that on a big NUMA box but iirc the results were inconclusive.

This is correct: by guaranteeing more memory to be freeable in lowmem (ptes
aren't freeable without a sigkill, for example), the icache/dcache will at
least have a margin where they can grow independently of highmem
allocations.

> Maybe it just won't make much difference.  Hard to say.

I don't know myself if it makes a performance difference; all old
benchmarks have been run with this applied. This was applied for
correctness (i.e. to avoid sigkills or lockups), not for performance.
But I don't see how it could hurt performance (especially given the
current code already does the check at runtime, which is practically
the only fast-path cost ;).

> >  The sysctl name had to change to lowmem_reserve_ratio because its
> >  semantics are completely different now.
> 
> That reminds me.  Documentation/filesystems/proc.txt ;)

Woops, forgotten about it ;)

> I'll cook something up for that.

Thanks. If you prefer I can write it too to relieve you from this load,
it's up to you. If you want to fix it yourself go ahead of course ;)

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: OOM fixes 2/5
  2005-01-21  7:04         ` Nick Piggin
@ 2005-01-21  7:17           ` Andrea Arcangeli
  0 siblings, 0 replies; 22+ messages in thread
From: Andrea Arcangeli @ 2005-01-21  7:17 UTC (permalink / raw)
  To: Nick Piggin; +Cc: Andrew Morton, linux-kernel

On Fri, Jan 21, 2005 at 06:04:25PM +1100, Nick Piggin wrote:
> OK this is a fairly lame example... but the current code is more or
> less just lucky that ZONE_DMA doesn't usually fill up with pinned mem
> on machines that need explicit ZONE_DMA allocations.

Yep. For the DMA zone all slab cache will be a memory pin (like ptes for
highmem, but not that many people run with 3G of ram in ptes, and I
guess the ones doing it aren't normally using a mainline kernel in the
first place, so they're likely not running into it either). Slab cache
pinning the normal zone, on the other hand, has a better chance of being
reproduced on l-k in random usage.

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: OOM fixes 2/5
  2005-01-21  7:08         ` Andi Kleen
@ 2005-01-21  7:21           ` Andrea Arcangeli
  0 siblings, 0 replies; 22+ messages in thread
From: Andrea Arcangeli @ 2005-01-21  7:21 UTC (permalink / raw)
  To: Andi Kleen; +Cc: Andrew Morton, linux-kernel

On Fri, Jan 21, 2005 at 08:08:21AM +0100, Andi Kleen wrote:
> So at least for GFP_DMA it seems to be definitely needed.

Indeed. Plus if you add a pci32 zone, it'll be needed for that too on
x86-64, like for the normal zone on x86, since ptes will go in highmem
while pci32 allocations will not. So while floppy might be fixed, the
issue would remain for a brand new pci32 zone needed by some device
(e.g. nvidia, so not such an unlikely corner case).

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: writeback-highmem
  2005-01-21  6:26           ` writeback-highmem Andrew Morton
  2005-01-21  6:41             ` writeback-highmem Andrea Arcangeli
@ 2005-01-21 13:46             ` Rik van Riel
  1 sibling, 0 replies; 22+ messages in thread
From: Rik van Riel @ 2005-01-21 13:46 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Andrea Arcangeli, linux-kernel, npiggin

On Thu, 20 Jan 2005, Andrew Morton wrote:

> I've held off on this one because the recent throttling fix should have
> helped this problem.  Has anyone confirmed that this patch still actually
> fixes something?  If so, what was the scenario?

The throttling fix is not quite enough; a big mkfs can still
completely paralyse the system.  Note that the previously
posted patch wasn't quite complete: Larry Woodman spotted an
additional 2 lines that needed changing.

The full patch is:

This patch effectively lowers the dirty limit for mappings which cannot
be cached in highmem, counting the dirty limit as a percentage of lowmem
instead.  This should prevent heavy block device writers from pushing
the VM over the edge and triggering OOM kills.

Signed-off-by: Rik van Riel <riel@redhat.com>

===== mm/page-writeback.c 1.95 vs edited =====
--- 1.95/mm/page-writeback.c	Thu Oct 21 04:39:27 2004
+++ edited/mm/page-writeback.c	Fri Jan 21 08:45:24 2005
@@ -133,17 +133,27 @@
   * clamping level.
   */
  static void
-get_dirty_limits(struct writeback_state *wbs, long *pbackground, long *pdirty)
+get_dirty_limits(struct writeback_state *wbs, long *pbackground, long *pdirty, struct address_space *mapping)
  {
  	int background_ratio;		/* Percentages */
  	int dirty_ratio;
  	int unmapped_ratio;
  	long background;
  	long dirty;
+	unsigned long available_memory = total_pages;
  	struct task_struct *tsk;

  	get_writeback_state(wbs);

+#ifdef CONFIG_HIGHMEM
+	/*
+	 * If this mapping can only allocate from low memory,
+	 * we exclude high memory from our count.
+	 */
+	if (mapping && !(mapping_gfp_mask(mapping) & __GFP_HIGHMEM))
+		available_memory -= totalhigh_pages;
+#endif
+
  	unmapped_ratio = 100 - (wbs->nr_mapped * 100) / total_pages;

  	dirty_ratio = vm_dirty_ratio;
@@ -157,8 +167,8 @@
  	if (background_ratio >= dirty_ratio)
  		background_ratio = dirty_ratio / 2;

-	background = (background_ratio * total_pages) / 100;
-	dirty = (dirty_ratio * total_pages) / 100;
+	background = (background_ratio * available_memory) / 100;
+	dirty = (dirty_ratio * available_memory) / 100;
  	tsk = current;
  	if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) {
  		background += background / 4;
@@ -194,7 +204,8 @@
  			.nr_to_write	= write_chunk,
  		};

-		get_dirty_limits(&wbs, &background_thresh, &dirty_thresh);
+		get_dirty_limits(&wbs, &background_thresh,
+					&dirty_thresh, mapping);
  		nr_reclaimable = wbs.nr_dirty + wbs.nr_unstable;
  		if (nr_reclaimable + wbs.nr_writeback <= dirty_thresh)
  			break;
@@ -210,7 +221,7 @@
  		if (nr_reclaimable) {
  			writeback_inodes(&wbc);
  			get_dirty_limits(&wbs, &background_thresh,
-					&dirty_thresh);
+					&dirty_thresh, mapping);
  			nr_reclaimable = wbs.nr_dirty + wbs.nr_unstable;
  			if (nr_reclaimable + wbs.nr_writeback <= dirty_thresh)
  				break;
@@ -296,7 +307,7 @@
  		long background_thresh;
  		long dirty_thresh;

-		get_dirty_limits(&wbs, &background_thresh, &dirty_thresh);
+		get_dirty_limits(&wbs, &background_thresh, &dirty_thresh, NULL);
  		if (wbs.nr_dirty + wbs.nr_unstable < background_thresh
  				&& min_pages <= 0)
  			break;
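
(To make the effect concrete, a standalone sketch of the limit
arithmetic, using made-up machine numbers -- a 4GB box with ~896MB of
lowmem and vm_dirty_ratio = 40; nothing here is taken from the patch
itself:)

#include <stdio.h>

int main(void)
{
	unsigned long total_pages = 1048576;	/* 4GB in 4k pages */
	unsigned long totalhigh_pages = 819200;	/* ~3.1GB of highmem */
	int dirty_ratio = 40;

	unsigned long old_dirty = dirty_ratio * total_pages / 100;
	unsigned long new_dirty =
		dirty_ratio * (total_pages - totalhigh_pages) / 100;

	/*
	 * old: ~1.6GB of dirty lowmem-only pages allowed against only
	 * ~896MB of lowmem; new: ~358MB, which lowmem can actually hold.
	 */
	printf("old limit %lu pages, new limit %lu pages\n",
	       old_dirty, new_dirty);
	return 0;
}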

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: OOM fixes 1/5
  2005-01-21  5:48 OOM fixes 1/5 Andrea Arcangeli
  2005-01-21  5:49 ` OOM fixes 2/5 Andrea Arcangeli
@ 2005-01-22  6:35 ` Andrea Arcangeli
  1 sibling, 0 replies; 22+ messages in thread
From: Andrea Arcangeli @ 2005-01-22  6:35 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-kernel, Nick Piggin

I noticed 1/5 had a glitch; this is an update. It won't alter the
ordering, and the other patches will still apply cleanly.

Thanks.

From: garloff@suse.de
Subject: protect-pids

This is protect-pids, a patch to allow the admin to tune the oom killer.
The tweak is inherited between parent and child so it's easy to write a
wrapper for complex apps.
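
(A minimal sketch of such a wrapper; the "oomwrap" name and argument
handling are invented for illustration, while the /proc/<pid>/oom_adj
file and the -16..15 range come from the patch below. Writing the
file needs CAP_SYS_RESOURCE:)

/* oomwrap.c: set our own oom_adj, then exec the real command.
 * The value survives exec and is inherited by any children. */
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	FILE *f;

	if (argc < 3) {
		fprintf(stderr, "usage: %s <adj -16..15> cmd [args]\n",
			argv[0]);
		return 1;
	}
	f = fopen("/proc/self/oom_adj", "w");
	if (!f || fprintf(f, "%d\n", atoi(argv[1])) < 0) {
		perror("oom_adj");
		return 1;
	}
	fclose(f);
	execvp(argv[2], argv + 2);
	perror("execvp");
	return 1;
}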

I made used_math a char in light of later patches. The current patch
breaks alpha, but future patches will fix it.

Signed-off-by: Andrea Arcangeli <andrea@suse.de>

--- x/fs/proc/base.c	2005-01-15 20:44:58.000000000 +0100
+++ xx/fs/proc/base.c	2005-01-22 07:02:50.000000000 +0100
@@ -72,6 +72,8 @@ enum pid_directory_inos {
 	PROC_TGID_ATTR_FSCREATE,
 #endif
 	PROC_TGID_FD_DIR,
+	PROC_TGID_OOM_SCORE,
+	PROC_TGID_OOM_ADJUST,
 	PROC_TID_INO,
 	PROC_TID_STATUS,
 	PROC_TID_MEM,
@@ -98,6 +100,8 @@ enum pid_directory_inos {
 	PROC_TID_ATTR_FSCREATE,
 #endif
 	PROC_TID_FD_DIR = 0x8000,	/* 0x8000-0xffff */
+	PROC_TID_OOM_SCORE,
+	PROC_TID_OOM_ADJUST,
 };
 
 struct pid_entry {
@@ -133,6 +137,8 @@ static struct pid_entry tgid_base_stuff[
 #ifdef CONFIG_SCHEDSTATS
 	E(PROC_TGID_SCHEDSTAT, "schedstat", S_IFREG|S_IRUGO),
 #endif
+	E(PROC_TGID_OOM_SCORE, "oom_score",S_IFREG|S_IRUGO),
+	E(PROC_TGID_OOM_ADJUST,"oom_adj", S_IFREG|S_IRUGO|S_IWUSR),
 	{0,0,NULL,0}
 };
 static struct pid_entry tid_base_stuff[] = {
@@ -158,6 +164,8 @@ static struct pid_entry tid_base_stuff[]
 #ifdef CONFIG_SCHEDSTATS
 	E(PROC_TID_SCHEDSTAT, "schedstat",S_IFREG|S_IRUGO),
 #endif
+	E(PROC_TID_OOM_SCORE,  "oom_score",S_IFREG|S_IRUGO),
+	E(PROC_TID_OOM_ADJUST, "oom_adj", S_IFREG|S_IRUGO|S_IWUSR),
 	{0,0,NULL,0}
 };
 
@@ -384,6 +392,18 @@ static int proc_pid_schedstat(struct tas
 }
 #endif
 
+/* The badness from the OOM killer */
+unsigned long badness(struct task_struct *p, unsigned long uptime);
+static int proc_oom_score(struct task_struct *task, char *buffer)
+{
+	unsigned long points;
+	struct timespec uptime;
+
+	do_posix_clock_monotonic_gettime(&uptime);
+	points = badness(task, uptime.tv_sec);
+	return sprintf(buffer, "%lu\n", points);
+}
+
 /************************************************************************/
 /*                       Here the fs part begins                        */
 /************************************************************************/
@@ -657,6 +677,56 @@ static struct file_operations proc_mem_o
 	.open		= mem_open,
 };
 
+static ssize_t oom_adjust_read(struct file * file, char * buf,
+				size_t count, loff_t *ppos)
+{
+	struct task_struct *task = proc_task(file->f_dentry->d_inode);
+	char buffer[8];
+	size_t len;
+	int oom_adjust = task->oomkilladj;
+	loff_t __ppos = *ppos;
+
+	len = sprintf(buffer, "%i\n", oom_adjust);
+	if (__ppos >= len)
+		return 0;
+	if (count > len-__ppos)
+		count = len-__ppos;
+	if (copy_to_user(buf, buffer + __ppos, count))
+		return -EFAULT;
+	*ppos = __ppos + count;
+	return count;
+}
+
+static ssize_t oom_adjust_write(struct file * file, const char * buf,
+				size_t count, loff_t *ppos)
+{
+	struct task_struct *task = proc_task(file->f_dentry->d_inode);
+	char buffer[8], *end;
+	int oom_adjust;
+
+	if (!capable(CAP_SYS_RESOURCE))
+		return -EPERM;
+	memset(buffer, 0, 8);
+	if (count > 6)
+		count = 6;
+	if (copy_from_user(buffer, buf, count))
+		return -EFAULT;
+	oom_adjust = simple_strtol(buffer, &end, 0);
+	if (oom_adjust < -16 || oom_adjust > 15)
+		return -EINVAL;
+	if (*end == '\n')
+		end++;
+	if (end - buffer == 0)
+		return -EIO;
+	task->oomkilladj = oom_adjust;
+	return end - buffer;
+}
+
+static struct file_operations proc_oom_adjust_operations = {
+	.read		= oom_adjust_read,
+	.write		= oom_adjust_write,
+};
+
 static struct inode_operations proc_mem_inode_operations = {
 	.permission	= proc_permission,
 };
@@ -1336,6 +1406,15 @@ static struct dentry *proc_pident_lookup
 			ei->op.proc_read = proc_pid_schedstat;
 			break;
 #endif
+		case PROC_TID_OOM_SCORE:	
+		case PROC_TGID_OOM_SCORE:
+			inode->i_fop = &proc_info_file_operations;
+			ei->op.proc_read = proc_oom_score;
+			break;
+		case PROC_TID_OOM_ADJUST:
+		case PROC_TGID_OOM_ADJUST:
+			inode->i_fop = &proc_oom_adjust_operations;
+			break;
 		default:
 			printk("procfs: impossible type (%d)",p->type);
 			iput(inode);
--- x/include/linux/sched.h	2005-01-22 07:02:29.000000000 +0100
+++ xx/include/linux/sched.h	2005-01-22 07:02:40.000000000 +0100
@@ -614,7 +614,19 @@ struct task_struct {
 	struct key *process_keyring;	/* keyring private to this process (CLONE_THREAD) */
 	struct key *thread_keyring;	/* keyring private to this thread */
 #endif
-	unsigned short used_math;
+/*
+ * Must be changed atomically, so it shouldn't
+ * be a shareable bitflag.
+ */
+	unsigned char used_math;
+/*
+ * OOM kill score adjustment (bit shift).
+ * Cannot live together with used_math since
+ * used_math and oomkilladj can be changed at the
+ * same time, so they would race if they're in the
+ * same atomic block.
+ */
+	short oomkilladj;
 	char comm[TASK_COMM_LEN];
 /* file system info */
 	int link_count, total_link_count;
--- x/mm/oom_kill.c	2005-01-15 20:45:00.000000000 +0100
+++ xx/mm/oom_kill.c	2005-01-22 07:02:40.000000000 +0100
@@ -42,7 +42,7 @@
  *    of least surprise ... (be careful when you change it)
  */
 
-static unsigned long badness(struct task_struct *p, unsigned long uptime)
+unsigned long badness(struct task_struct *p, unsigned long uptime)
 {
 	unsigned long points, cpu_time, run_time, s;
 
@@ -99,6 +99,17 @@ static unsigned long badness(struct task
 	 */
 	if (cap_t(p->cap_effective) & CAP_TO_MASK(CAP_SYS_RAWIO))
 		points /= 4;
+
+	/* 
+	 * Adjust the score by oomkilladj.
+	 */
+	if (p->oomkilladj) {
+		if (p->oomkilladj > 0)
+			points <<= p->oomkilladj;
+		else
+			points >>= -(p->oomkilladj);
+	}
+
 #ifdef DEBUG
 	printk(KERN_DEBUG "OOMkill: task %d (%s) got %d points\n",
 	p->pid, p->comm, points);
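
(For intuition, the oomkilladj adjustment above is a pure bit shift
of the computed badness score; a standalone sketch, with a made-up
starting score of 1000:)

#include <stdio.h>

/* Same shift logic as the hunk above, lifted out of the kernel. */
static unsigned long adjust(unsigned long points, short oomkilladj)
{
	if (oomkilladj > 0)
		points <<= oomkilladj;
	else if (oomkilladj < 0)
		points >>= -oomkilladj;
	return points;
}

int main(void)
{
	/* +3 multiplies the score by 8; -16 shifts 1000 down to 0,
	 * effectively protecting the task from the OOM killer. */
	printf("adj +3: %lu, adj -16: %lu\n",
	       adjust(1000, 3), adjust(1000, -16));
	return 0;
}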


^ permalink raw reply	[flat|nested] 22+ messages in thread

end of thread, other threads:[~2005-01-22  6:36 UTC | newest]

Thread overview: 22+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2005-01-21  5:48 OOM fixes 1/5 Andrea Arcangeli
2005-01-21  5:49 ` OOM fixes 2/5 Andrea Arcangeli
2005-01-21  5:49   ` OOM fixes 3/5 Andrea Arcangeli
2005-01-21  5:50     ` OOM fixes 4/5 Andrea Arcangeli
2005-01-21  5:50       ` OOM fixes 5/5 Andrea Arcangeli
2005-01-21  6:01         ` writeback-highmem Andrea Arcangeli
2005-01-21  6:26           ` writeback-highmem Andrew Morton
2005-01-21  6:41             ` writeback-highmem Andrea Arcangeli
2005-01-21 13:46             ` writeback-highmem Rik van Riel
2005-01-21  6:20   ` OOM fixes 2/5 Andrew Morton
2005-01-21  6:35     ` Andrea Arcangeli
2005-01-21  6:36     ` Nick Piggin
2005-01-21  6:46       ` Andrew Morton
2005-01-21  7:04         ` Nick Piggin
2005-01-21  7:17           ` Andrea Arcangeli
2005-01-21  7:04         ` Andrea Arcangeli
2005-01-21  7:08         ` Andi Kleen
2005-01-21  7:21           ` Andrea Arcangeli
2005-01-21  6:52       ` Andrea Arcangeli
2005-01-21  7:00         ` Andrew Morton
2005-01-21  7:10           ` Andrea Arcangeli
2005-01-22  6:35 ` OOM fixes 1/5 Andrea Arcangeli

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).