* [PATCH -mm 2/2] cgroup: use multibuf for tasks file
@ 2008-09-12 11:55 Lai Jiangshan
  2008-09-15 20:28 ` Paul Menage
From: Lai Jiangshan @ 2008-09-12 11:55 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Paul Menage, Linux Kernel Mailing List


When we open a really large cgroup's tasks file for reading, the open may
fail, because kmalloc() is not reliable for allocating a big buffer.

This patch uses a multibuf for the tasks file: each buffer is a single page,
except when only a small buffer is needed.

We use obj_sort() to sort the pids, so we don't need to map the pages into a
contiguous memory region.
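
For reference, obj_sort() is introduced in patch 1/2 of this series. The
sketch below is only an assumed interface, inferred from the call sites in
this patch: it sorts elements identified purely by index through
caller-supplied compare and swap callbacks, so the backing pages never have
to be contiguous.

	/*
	 * Assumed prototype (from patch 1/2); the body here is an
	 * illustrative insertion sort -- the real implementation may differ.
	 */
	void obj_sort(void *base, size_t begin, size_t end,
		      int (*cmp)(const void *base, size_t l, size_t r),
		      void (*swap)(void *base, size_t l, size_t r))
	{
		size_t i, j;

		for (i = begin + 1; i < end; i++)
			for (j = i; j > begin && cmp(base, j - 1, j) > 0; j--)
				swap(base, j - 1, j);
	}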

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
---
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index bb298de..3d3c3bb 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -141,8 +141,8 @@ struct cgroup {
 
 	/* pids_mutex protects the fields below */
 	struct rw_semaphore pids_mutex;
-	/* Array of process ids in the cgroup */
-	pid_t *tasks_pids;
+	/* Multi-array of process ids in the cgroup */
+	const pid_t *const *tasks_pids;
 	/* How many files are using the current tasks_pids array */
 	int pids_use_count;
 	/* Length of the current tasks_pids array */
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 996865a..f61b152 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2004,6 +2004,8 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
  *
  */
 
+static const int pid_per_page = PAGE_SIZE / sizeof(pid_t);
+
 /*
  * Load into 'pidarray' up to 'npids' of the tasks using cgroup
  * 'cgrp'.  Return actual number of pids loaded.  No need to
@@ -2011,16 +2013,22 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
  * read section, so the css_set can't go away, and is
  * immutable after creation.
  */
-static int pid_array_load(pid_t *pidarray, int npids, struct cgroup *cgrp)
+static int pid_array_load(pid_t **pidarray, int npids, struct cgroup *cgrp)
 {
-	int n = 0;
+	int n = 0, i = 0, j = 0;
 	struct cgroup_iter it;
 	struct task_struct *tsk;
 	cgroup_iter_start(cgrp, &it);
 	while ((tsk = cgroup_iter_next(cgrp, &it))) {
 		if (unlikely(n == npids))
 			break;
-		pidarray[n++] = task_pid_vnr(tsk);
+		pidarray[i][j] = task_pid_vnr(tsk);
+		n++;
+		j++;
+		if (j == pid_per_page) {
+			i++;
+			j = 0;
+		}
 	}
 	cgroup_iter_end(cgrp, &it);
 	return n;
@@ -2079,11 +2087,27 @@ err:
 	return ret;
 }
 
-static int cmppid(const void *a, const void *b)
+static inline pid_t getpidofmbuf(const pid_t *const *multibuf, int index)
+{
+	return multibuf[index / pid_per_page][index % pid_per_page];
+}
+
+static int cmppid(const void *c, size_t left, size_t right)
 {
-	return *(pid_t *)a - *(pid_t *)b;
+	return getpidofmbuf(c, left) - getpidofmbuf(c, right);
 }
 
+static inline pid_t *getpidptr(pid_t *const *multibuf, int index)
+{
+	return &multibuf[index / pid_per_page][index % pid_per_page];
+}
+
+static void swappid(void *c, size_t left, size_t right)
+{
+	pid_t rpid = getpidofmbuf(c, right);
+	*getpidptr(c, right) = getpidofmbuf(c, left);
+	*getpidptr(c, left) = rpid;
+}
 
 /*
  * seq_file methods for the "tasks" file. The seq_file position is the
@@ -2100,19 +2124,19 @@ static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos)
 	 * next pid to display, if any
 	 */
 	struct cgroup *cgrp = s->private;
-	int index = 0, pid = *pos;
-	int *iter;
+	int index = 0;
+	pid_t pid = *pos;
 
 	down_read(&cgrp->pids_mutex);
 	if (pid) {
 		int end = cgrp->pids_length;
-		int i;
 		while (index < end) {
 			int mid = (index + end) / 2;
-			if (cgrp->tasks_pids[mid] == pid) {
+			pid_t mpid = getpidofmbuf(cgrp->tasks_pids, mid);
+			if (mpid == pid) {
 				index = mid;
 				break;
-			} else if (cgrp->tasks_pids[mid] <= pid)
+			} else if (mpid <= pid)
 				index = mid + 1;
 			else
 				end = mid;
@@ -2122,9 +2146,8 @@ static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos)
 	if (index >= cgrp->pids_length)
 		return NULL;
 	/* Update the abstract position to be the actual pid that we found */
-	iter = cgrp->tasks_pids + index;
-	*pos = *iter;
-	return iter;
+	*pos = getpidofmbuf(cgrp->tasks_pids, index);
+	return (void *)(index ^ -0x10000); /* NULL means EOF; keep cookie non-zero */
 }
 
 static void cgroup_tasks_stop(struct seq_file *s, void *v)
@@ -2136,25 +2159,26 @@ static void cgroup_tasks_stop(struct seq_file *s, void *v)
 static void *cgroup_tasks_next(struct seq_file *s, void *v, loff_t *pos)
 {
 	struct cgroup *cgrp = s->private;
-	int *p = v;
-	int *end = cgrp->tasks_pids + cgrp->pids_length;
+	int index = (int)v ^ -0x10000; /* decode cookie back to an index */
 
 	/*
 	 * Advance to the next pid in the array. If this goes off the
 	 * end, we're done
 	 */
-	p++;
-	if (p >= end) {
+	index++;
+	if (index >= cgrp->pids_length) {
 		return NULL;
 	} else {
-		*pos = *p;
-		return p;
+		*pos = getpidofmbuf(cgrp->tasks_pids, index);
+		return (void *)(index ^ -0x10000); /* NULL means EOF; keep cookie non-zero */
 	}
 }
 
 static int cgroup_tasks_show(struct seq_file *s, void *v)
 {
-	return seq_printf(s, "%d\n", *(int *)v);
+	struct cgroup *cgrp = s->private;
+	int index = (int)v ^ -0x10000; /* decode cookie back to an index */
+	return seq_printf(s, "%d\n", getpidofmbuf(cgrp->tasks_pids, index));
 }
 
 static struct seq_operations cgroup_tasks_seq_operations = {
@@ -2164,12 +2188,60 @@ static struct seq_operations cgroup_tasks_seq_operations = {
 	.show = cgroup_tasks_show,
 };
 
+static void *alloc_multibufs(size_t npids)
+{
+	int i, j, npages = (npids + pid_per_page - 1) / pid_per_page;
+	unsigned long *pages;
+
+	if (npids <= pid_per_page - sizeof(pid_t *) / sizeof(pid_t)) {
+		void *pids = kmalloc(sizeof(pid_t *) + sizeof(pid_t) * npids,
+				GFP_KERNEL);
+		if (!pids)
+			return NULL;
+		/* make single buf fake multi-buf */
+		*(void **)pids = pids + sizeof(pid_t *);
+		return pids;
+	}
+
+	pages = kmalloc(sizeof(*pages) * npages, GFP_KERNEL);
+	if (!pages)
+		return NULL;
+
+	for (i = 0; i < npages; i++) {
+		pages[i] = __get_free_page(GFP_KERNEL);
+		if (unlikely(!pages[i]))
+			goto depopulate;
+	}
+	return pages;
+
+depopulate:
+	for (j = 0; j < i; j++)
+		free_page(pages[j]);
+	kfree(pages);
+	return NULL;
+}
+
+static void free_multibufs(void *ptr, size_t npids)
+{
+	if (!ptr)
+		return;
+
+	if (npids > pid_per_page - sizeof(pid_t *) / sizeof(pid_t)) {
+		int i, npages = (npids + pid_per_page - 1) / pid_per_page;
+		unsigned long *pages = ptr;
+		for (i = 0; i < npages; i++)
+			free_page(pages[i]);
+	}
+
+	kfree(ptr);
+}
+
 static void release_cgroup_pid_array(struct cgroup *cgrp)
 {
 	down_write(&cgrp->pids_mutex);
 	BUG_ON(!cgrp->pids_use_count);
 	if (!--cgrp->pids_use_count) {
-		kfree(cgrp->tasks_pids);
+		free_multibufs((void *)cgrp->tasks_pids, cgrp->pids_length);
 		cgrp->tasks_pids = NULL;
 		cgrp->pids_length = 0;
 	}
@@ -2202,7 +2274,7 @@ static struct file_operations cgroup_tasks_operations = {
 static int cgroup_tasks_open(struct inode *unused, struct file *file)
 {
 	struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
-	pid_t *pidarray;
+	pid_t **pidarray;
 	int npids;
 	int retval;
 
@@ -2217,19 +2289,19 @@ static int cgroup_tasks_open(struct inode *unused, struct file *file)
 	 * show up until sometime later on.
 	 */
 	npids = cgroup_task_count(cgrp);
-	pidarray = kmalloc(npids * sizeof(pid_t), GFP_KERNEL);
+	pidarray = alloc_multibufs(npids);
 	if (!pidarray)
 		return -ENOMEM;
 	npids = pid_array_load(pidarray, npids, cgrp);
-	sort(pidarray, npids, sizeof(pid_t), cmppid, NULL);
+	obj_sort(pidarray, 0, npids, cmppid, swappid);
 
 	/*
 	 * Store the array in the cgroup, freeing the old
 	 * array if necessary
 	 */
 	down_write(&cgrp->pids_mutex);
-	kfree(cgrp->tasks_pids);
-	cgrp->tasks_pids = pidarray;
+	free_multibufs((void *)cgrp->tasks_pids, cgrp->pids_length);
+	cgrp->tasks_pids = (const pid_t *const *)pidarray;
 	cgrp->pids_length = npids;
 	cgrp->pids_use_count++;
 	up_write(&cgrp->pids_mutex);




* Re: [PATCH -mm 2/2] cgroup: use multibuf for tasks file
  2008-09-12 11:55 [PATCH -mm 2/2] cgroup: use multibuf for tasks file Lai Jiangshan
@ 2008-09-15 20:28 ` Paul Menage
  2008-09-16  1:37   ` Lai Jiangshan
From: Paul Menage @ 2008-09-15 20:28 UTC (permalink / raw)
  To: Lai Jiangshan; +Cc: Andrew Morton, Linux Kernel Mailing List

On Fri, Sep 12, 2008 at 4:55 AM, Lai Jiangshan <laijs@cn.fujitsu.com> wrote:
>
> When we open a really large cgroup's tasks file for reading, the open may
> fail, because kmalloc() is not reliable for allocating a big buffer.

This still depends on an answer to whether using plain vmalloc is too
much overhead.

Balbir pointed out to me that most cgroups are likely to be pretty
small - so the solution of just doing a kmalloc() for 8K or less, and
a vmalloc() for more than 8K (which is >2000 threads) will avoid the
vmalloc overhead in almost all cases; the question is whether
eliminating the remaining overhead is worth the extra complexity.
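
As a rough sketch of that hybrid approach (the helper names here are made
up, and the 8K cutoff assumes 4K pages):

	#include <linux/slab.h>
	#include <linux/vmalloc.h>

	/* hypothetical helpers: kmalloc() small pid arrays, vmalloc() big ones */
	static void *pid_array_alloc(size_t bytes)
	{
		if (bytes <= 2 * PAGE_SIZE)	/* <= 8K, i.e. <= ~2000 pids */
			return kmalloc(bytes, GFP_KERNEL);
		return vmalloc(bytes);
	}

	static void pid_array_free(void *ptr, size_t bytes)
	{
		if (bytes <= 2 * PAGE_SIZE)
			kfree(ptr);
		else
			vfree(ptr);
	}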

Paul


* Re: [PATCH -mm 2/2] cgroup: use multibuf for tasks file
  2008-09-15 20:28 ` Paul Menage
@ 2008-09-16  1:37   ` Lai Jiangshan
  2008-09-16  2:16     ` Li Zefan
From: Lai Jiangshan @ 2008-09-16  1:37 UTC (permalink / raw)
  To: Paul Menage; +Cc: Andrew Morton, Linux Kernel Mailing List

Paul Menage wrote:
> On Fri, Sep 12, 2008 at 4:55 AM, Lai Jiangshan <laijs@cn.fujitsu.com> wrote:
>> When we open a really large cgroup's tasks file for reading, the open may
>> fail, because kmalloc() is not reliable for allocating a big buffer.
> 
> This still depends on an answer to whether using plain vmalloc is too
> much overhead.
> 
> Balbir pointed out to me that most cgroups are likely to be pretty
> small - so the solution of just doing a kmalloc() for 8K or less, and
> a vmalloc() for more than 8K (which is >2000 threads) will avoid the
> vmalloc overhead in almost all cases; the question is whether
> eliminating the remaining overhead is worth the extra complexity.
> 

I think opening the cgroup tasks file for reading is not a critical path,
so using plain vmalloc (or even higher-overhead functions) is worthwhile.


* Re: [PATCH -mm 2/2] cgroup: use multibuf for tasks file
  2008-09-16  1:37   ` Lai Jiangshan
@ 2008-09-16  2:16     ` Li Zefan
  2008-09-16  3:30       ` Lai Jiangshan
From: Li Zefan @ 2008-09-16  2:16 UTC (permalink / raw)
  To: Lai Jiangshan; +Cc: Paul Menage, Andrew Morton, Linux Kernel Mailing List

Lai Jiangshan wrote:
> Paul Menage wrote:
>> On Fri, Sep 12, 2008 at 4:55 AM, Lai Jiangshan <laijs@cn.fujitsu.com> wrote:
>>> When we open a really large cgroup's tasks file for reading, the open may
>>> fail, because kmalloc() is not reliable for allocating a big buffer.
>> This still depends on an answer to whether using plain vmalloc is too
>> much overhead.
>>
>> Balbir pointed out to me that most cgroups are likely to be pretty
>> small - so the solution of just doing a kmalloc() for 8K or less, and
>> a vmalloc() for more than 8K (which is >2000 threads) will avoid the
>> vmalloc overhead in almost all cases; the question is whether
>> eliminating the remaining overhead is worth the extra complexity.
>>
> 
> I think opening the cgroup tasks file for reading is not a critical path,
> so using plain vmalloc (or even higher-overhead functions) is worthwhile.
> 

This patch not only adds runtime overhead, it also makes the code much more
complex, so the code is harder to read and harder to maintain; the object
size also increases, which means a bigger memory footprint.

And is there any reason not to use plain vmalloc? Don't bloat the kernel
without good reason, IMO...


* Re: [PATCH -mm 2/2] cgroup: use multibuf for tasks file
  2008-09-16  2:16     ` Li Zefan
@ 2008-09-16  3:30       ` Lai Jiangshan
  2008-09-18 19:52         ` KOSAKI Motohiro
From: Lai Jiangshan @ 2008-09-16  3:30 UTC (permalink / raw)
  To: Li Zefan; +Cc: Paul Menage, Andrew Morton, Linux Kernel Mailing List

Li Zefan wrote:
> Lai Jiangshan wrote:
>> Paul Menage wrote:
>>> On Fri, Sep 12, 2008 at 4:55 AM, Lai Jiangshan <laijs@cn.fujitsu.com> wrote:
>>>> When we open a really large cgroup's tasks file for reading, the open may
>>>> fail, because kmalloc() is not reliable for allocating a big buffer.
>>> This still depends on an answer to whether using plain vmalloc is too
>>> much overhead.
>>>
>>> Balbir pointed out to me that most cgroups are likely to be pretty
>>> small - so the solution of just doing a kmalloc() for 8K or less, and
>>> a vmalloc() for more than 8K (which is >2000 threads) will avoid the
>>> vmalloc overhead in almost all cases; the question is whether
>>> eliminating the remaining overhead is worth the extra complexity.
>>>
>> I think opening the cgroup tasks file for reading is not a critical path,
>> so using plain vmalloc (or even higher-overhead functions) is worthwhile.
>>
> 
> This patch not only adds runtime overhead, it also makes the code much more
> complex, so the code is harder to read and harder to maintain; the object
> size also increases, which means a bigger memory footprint.
> 
> And is there any reason not to use plain vmalloc? Don't bloat the kernel
> without good reason, IMO...
> 

I said that vmalloc is worthwhile.
vmalloc was the first choice in my opinion. ^_^





* Re: [PATCH -mm 2/2] cgroup: use multibuf for tasks file
  2008-09-16  3:30       ` Lai Jiangshan
@ 2008-09-18 19:52         ` KOSAKI Motohiro
From: KOSAKI Motohiro @ 2008-09-18 19:52 UTC (permalink / raw)
  To: Lai Jiangshan
  Cc: kosaki.motohiro, Li Zefan, Paul Menage, Andrew Morton,
	Linux Kernel Mailing List

> Li Zefan wrote:
> > Lai Jiangshan wrote:
> >> Paul Menage wrote:
> >>> On Fri, Sep 12, 2008 at 4:55 AM, Lai Jiangshan <laijs@cn.fujitsu.com> wrote:
> >>>> When we open a really large cgroup's tasks file for reading, the open may
> >>>> fail, because kmalloc() is not reliable for allocating a big buffer.
> >>> This still depends on an answer to whether using plain vmalloc is too
> >>> much overhead.
> >>>
> >>> Balbir pointed out to me that most cgroups are likely to be pretty
> >>> small - so the solution of just doing a kmalloc() for 8K or less, and
> >>> a vmalloc() for more than 8K (which is >2000 threads) will avoid the
> >>> vmalloc overhead in almost all cases; the question is whether
> >>> eliminating the remaining overhead is worth the extra complexity.
> >>>
> >> I think opening the cgroup tasks file for reading is not a critical path,
> >> so using plain vmalloc (or even higher-overhead functions) is worthwhile.
> >>
> > 
> > This patch not only adds runtime overhead, it also makes the code much more
> > complex, so the code is harder to read and harder to maintain; the object
> > size also increases, which means a bigger memory footprint.
> > 
> > And is there any reason not to use plain vmalloc? Don't bloat the kernel
> > without good reason, IMO...
> > 
> 
> I said that vmalloc is worthwhile.
> vmalloc was the first choice in my opinion. ^_^

I agree with Paul Menage's opinion because:

 - plain vmalloc causes unnecessary overhead.
 - vmalloc shouldn't be used for small allocations,
   because virtual address space is a scarce resource on 32-bit machines.




