linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [RFC] simple_lmk: Introduce Simple Low Memory Killer for Android
@ 2019-03-10 20:34 Sultan Alsawaf
  2019-03-10 21:03 ` Greg Kroah-Hartman
                   ` (2 more replies)
  0 siblings, 3 replies; 113+ messages in thread
From: Sultan Alsawaf @ 2019-03-10 20:34 UTC (permalink / raw)
  Cc: Greg Kroah-Hartman, Arve Hjønnevåg, Todd Kjos,
	Martijn Coenen, Joel Fernandes, Christian Brauner, Ingo Molnar,
	Peter Zijlstra, linux-kernel, devel, linux-mm,
	Suren Baghdasaryan, Tim Murray, Sultan Alsawaf

From: Sultan Alsawaf <sultan@kerneltoast.com>

This is a complete low memory killer solution for Android that is small
and simple. It kills the largest, least-important processes it can find
whenever a page allocation has completely failed (right after direct
reclaim). Processes are killed according to the priorities that Android
gives them, so that the least important processes are always killed
first. Killing larger processes is preferred in order to free the most
memory possible in one go.

Simple LMK is integrated deeply into the page allocator in order to
catch exactly when a page allocation fails and exactly when a page is
freed. Failed page allocations that have invoked Simple LMK are placed
on a queue and wait for Simple LMK to satisfy them. When a page is about
to be freed, the failed page allocations are given priority over normal
page allocations by Simple LMK to see if they can immediately use the
freed page.

Additionally, processes are continuously killed by failed small-order
page allocations until they are satisfied.

Signed-off-by: Sultan Alsawaf <sultan@kerneltoast.com>
---
 drivers/android/Kconfig      |  28 ++++
 drivers/android/Makefile     |   1 +
 drivers/android/simple_lmk.c | 301 +++++++++++++++++++++++++++++++++++
 include/linux/sched.h        |   3 +
 include/linux/simple_lmk.h   |  11 ++
 kernel/fork.c                |   3 +
 mm/page_alloc.c              |  13 ++
 7 files changed, 360 insertions(+)
 create mode 100644 drivers/android/simple_lmk.c
 create mode 100644 include/linux/simple_lmk.h

diff --git a/drivers/android/Kconfig b/drivers/android/Kconfig
index 6fdf2abe4..7469d049d 100644
--- a/drivers/android/Kconfig
+++ b/drivers/android/Kconfig
@@ -54,6 +54,34 @@ config ANDROID_BINDER_IPC_SELFTEST
 	  exhaustively with combinations of various buffer sizes and
 	  alignments.
 
+config ANDROID_SIMPLE_LMK
+	bool "Simple Android Low Memory Killer"
+	depends on !MEMCG
+	help
+	  This is a complete low memory killer solution for Android that is
+	  small and simple. It is integrated deeply into the page allocator to
+	  know exactly when a page allocation hits OOM and exactly when a page
+	  is freed. Processes are killed according to the priorities that
+	  Android gives them, so that the least important processes are always
+	  killed first.
+
+if ANDROID_SIMPLE_LMK
+
+config ANDROID_SIMPLE_LMK_MINFREE
+	int "Minimum MiB of memory to free per reclaim"
+	default "64"
+	help
+	  Simple LMK will free at least this many MiB of memory per reclaim.
+
+config ANDROID_SIMPLE_LMK_KILL_TIMEOUT
+	int "Kill timeout in milliseconds"
+	default "50"
+	help
+	  Simple LMK will only perform memory reclaim at most once per this
+	  amount of time.
+
+endif # if ANDROID_SIMPLE_LMK
+
 endif # if ANDROID
 
 endmenu
diff --git a/drivers/android/Makefile b/drivers/android/Makefile
index c7856e320..7c91293b6 100644
--- a/drivers/android/Makefile
+++ b/drivers/android/Makefile
@@ -3,3 +3,4 @@ ccflags-y += -I$(src)			# needed for trace events
 obj-$(CONFIG_ANDROID_BINDERFS)		+= binderfs.o
 obj-$(CONFIG_ANDROID_BINDER_IPC)	+= binder.o binder_alloc.o
 obj-$(CONFIG_ANDROID_BINDER_IPC_SELFTEST) += binder_alloc_selftest.o
+obj-$(CONFIG_ANDROID_SIMPLE_LMK)	+= simple_lmk.o
diff --git a/drivers/android/simple_lmk.c b/drivers/android/simple_lmk.c
new file mode 100644
index 000000000..8a441650a
--- /dev/null
+++ b/drivers/android/simple_lmk.c
@@ -0,0 +1,406 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2019 Sultan Alsawaf <sultan@kerneltoast.com>.
+ */
+
+#define pr_fmt(fmt) "simple_lmk: " fmt
+
+#include <linux/atomic.h>
+#include <linux/completion.h>
+#include <linux/jiffies.h>
+#include <linux/list.h>
+#include <linux/mm.h>
+#include <linux/moduleparam.h>
+#include <linux/oom.h>
+#include <linux/sched.h>
+#include <linux/simple_lmk.h>
+#include <linux/sizes.h>
+#include <linux/sort.h>
+#include <linux/spinlock.h>
+
+/* Pages to free per reclaim pass; the Kconfig value is given in MiB */
+#define MIN_FREE_PAGES (CONFIG_ANDROID_SIMPLE_LMK_MINFREE * SZ_1M / PAGE_SIZE)
+
+/*
+ * A failed page allocation waiting on oom_alloc_queue. It lives on the
+ * waiter's stack; simple_lmk_page_in() fills in @page and completes @done
+ * when a suitable page is freed.
+ */
+struct oom_alloc_req {
+	struct page *page;
+	struct completion done;
+	struct list_head lh;
+	unsigned int order;
+	int migratetype;
+};
+
+/* A kill candidate: a referenced task and its RSS in pages at scan time */
+struct victim_info {
+	struct task_struct *tsk;
+	unsigned long size;
+};
+
+/* Values of simple_lmk_state */
+enum {
+	DISABLED,	/* minfree parameter not written yet */
+	STARTING,	/* simple_lmk_init_set() is running */
+	READY,		/* idle, a reclaim pass may be started */
+	KILLING		/* a reclaim pass is in progress */
+};
+
+/*
+ * Pulled from the Android framework. Listed from least to most important;
+ * each adjacent pair forms one oom_score_adj range scanned by kill_processes().
+ */
+static const short int adj_prio[] = {
+	906, /* CACHED_APP_MAX_ADJ */
+	905, /* Cached app */
+	904, /* Cached app */
+	903, /* Cached app */
+	902, /* Cached app */
+	901, /* Cached app */
+	900, /* CACHED_APP_MIN_ADJ */
+	800, /* SERVICE_B_ADJ */
+	700, /* PREVIOUS_APP_ADJ */
+	600, /* HOME_APP_ADJ */
+	500, /* SERVICE_ADJ */
+	400, /* HEAVY_WEIGHT_APP_ADJ */
+	300, /* BACKUP_APP_ADJ */
+	200, /* PERCEPTIBLE_APP_ADJ */
+	100, /* VISIBLE_APP_ADJ */
+	0    /* FOREGROUND_APP_ADJ */
+};
+
+/* Make sure that PID_MAX_DEFAULT isn't too big, or these arrays will be huge */
+static struct victim_info victim_array[PID_MAX_DEFAULT];
+static struct victim_info *victim_ptr_array[ARRAY_SIZE(victim_array)];
+static atomic_t simple_lmk_state = ATOMIC_INIT(DISABLED);
+/* Number of allocators currently waiting in simple_lmk_oom_alloc() */
+static atomic_t oom_alloc_count = ATOMIC_INIT(0);
+/* Earliest jiffies value at which the next kill pass may run */
+static unsigned long last_kill_expires;
+/* CONFIG_ANDROID_SIMPLE_LMK_KILL_TIMEOUT converted to jiffies */
+static unsigned long kill_expires;
+/* Protects oom_alloc_queue and the page field of queued requests */
+static DEFINE_SPINLOCK(oom_queue_lock);
+static LIST_HEAD(oom_alloc_queue);
+
+/*
+ * sort() comparator for victim_ptr_array: orders entries by descending size so
+ * that the largest victims come first.
+ */
+static int victim_info_cmp(const void *lhs, const void *rhs)
+{
+	const struct victim_info **lhs_ptr = (typeof(lhs_ptr))lhs;
+	const struct victim_info **rhs_ptr = (typeof(rhs_ptr))rhs;
+
+	if ((*lhs_ptr)->size > (*rhs_ptr)->size)
+		return -1;
+
+	if ((*lhs_ptr)->size < (*rhs_ptr)->size)
+		return 1;
+
+	return 0;
+}
+
+/*
+ * Kill processes whose oom_score_adj lies in [@min_adj, @max_adj], largest RSS
+ * first, until at least @pages_needed pages have been targeted.
+ *
+ * Uses the static victim arrays, so callers must not run concurrently; the
+ * KILLING state taken in do_memory_reclaim() provides that exclusion.
+ *
+ * Returns the summed RSS (in pages) of the processes that were sent SIGKILL.
+ */
+static unsigned long scan_and_kill(int min_adj, int max_adj,
+				   unsigned long pages_needed)
+{
+	unsigned long pages_freed = 0;
+	unsigned int i, vcount = 0;
+	struct task_struct *tsk;
+
+	rcu_read_lock();
+	for_each_process(tsk) {
+		struct task_struct *vtsk;
+		unsigned long tasksize;
+		short oom_score_adj;
+
+		/* Don't commit suicide or kill kthreads */
+		if (same_thread_group(tsk, current) || tsk->flags & PF_KTHREAD)
+			continue;
+
+		vtsk = find_lock_task_mm(tsk);
+		if (!vtsk)
+			continue;
+
+		/* Don't kill tasks that have been killed or lack memory */
+		if (vtsk->slmk_sigkill_sent ||
+		    test_tsk_thread_flag(vtsk, TIF_MEMDIE)) {
+			task_unlock(vtsk);
+			continue;
+		}
+
+		oom_score_adj = vtsk->signal->oom_score_adj;
+		if (oom_score_adj < min_adj || oom_score_adj > max_adj) {
+			task_unlock(vtsk);
+			continue;
+		}
+
+		tasksize = get_mm_rss(vtsk->mm);
+		task_unlock(vtsk);
+		if (!tasksize)
+			continue;
+
+		/* Store this potential victim away for later */
+		get_task_struct(vtsk);
+		victim_array[vcount].tsk = vtsk;
+		victim_array[vcount].size = tasksize;
+		victim_ptr_array[vcount] = &victim_array[vcount];
+		vcount++;
+
+		/* The victim array is so big that this should never happen */
+		if (unlikely(vcount == ARRAY_SIZE(victim_array)))
+			break;
+	}
+	rcu_read_unlock();
+
+	/* No potential victims for this adj range means no pages freed */
+	if (!vcount)
+		return 0;
+
+	/*
+	 * Sort the victims in descending order of size in order to target the
+	 * largest ones first.
+	 */
+	sort(victim_ptr_array, vcount, sizeof(victim_ptr_array[0]),
+	     victim_info_cmp, NULL);
+
+	for (i = 0; i < vcount; i++) {
+		struct victim_info *victim = victim_ptr_array[i];
+		struct task_struct *vtsk = victim->tsk;
+
+		/* Quota met: just drop the references on the remaining tasks */
+		if (pages_freed >= pages_needed) {
+			put_task_struct(vtsk);
+			continue;
+		}
+
+		pr_info("killing %s with adj %d to free %lu MiB\n",
+			vtsk->comm, vtsk->signal->oom_score_adj,
+			victim->size * PAGE_SIZE / SZ_1M);
+
+		if (!do_send_sig_info(SIGKILL, SEND_SIG_PRIV, vtsk, true))
+			pages_freed += victim->size;
+
+		/* Unconditionally mark task as killed so it isn't reused */
+		vtsk->slmk_sigkill_sent = true;
+		put_task_struct(vtsk);
+	}
+
+	return pages_freed;
+}
+
+/*
+ * Walk the adj_prio ranges from least to most important, killing in each until
+ * @pages_needed pages have been targeted. Only tasks with an oom_score_adj
+ * between adj_prio's last and first entries are ever considered.
+ */
+static void kill_processes(unsigned long pages_needed)
+{
+	unsigned long pages_freed = 0;
+	int i;
+
+	for (i = 1; i < ARRAY_SIZE(adj_prio); i++) {
+		pages_freed += scan_and_kill(adj_prio[i], adj_prio[i - 1],
+					     pages_needed - pages_freed);
+		if (pages_freed >= pages_needed)
+			break;
+	}
+}
+
+/*
+ * Run one kill pass unless another one is in progress or the previous one
+ * finished less than kill_expires jiffies ago.
+ */
+static void do_memory_reclaim(void)
+{
+	/* Only one reclaim can occur at a time */
+	if (atomic_cmpxchg(&simple_lmk_state, READY, KILLING) != READY)
+		return;
+
+	if (time_after_eq(jiffies, last_kill_expires)) {
+		kill_processes(MIN_FREE_PAGES);
+		last_kill_expires = jiffies + kill_expires;
+	}
+
+	atomic_set(&simple_lmk_state, READY);
+}
+
+/*
+ * Trigger reclaim and sleep until @done is completed or a fatal signal arrives.
+ * Costly orders trigger reclaim once and then wait without a timeout; smaller
+ * orders re-trigger reclaim every kill_expires jiffies until satisfied.
+ *
+ * Returns -ERESTARTSYS if interrupted by a fatal signal; otherwise 0 (costly
+ * orders) or a positive value (other orders) once @done has been completed.
+ */
+static long reclaim_once_or_more(struct completion *done, unsigned int order)
+{
+	long ret;
+
+	/* Don't allow costly allocations to do memory reclaim more than once */
+	if (order > PAGE_ALLOC_COSTLY_ORDER) {
+		do_memory_reclaim();
+		return wait_for_completion_killable(done);
+	}
+
+	do {
+		do_memory_reclaim();
+		ret = wait_for_completion_killable_timeout(done, kill_expires);
+	} while (!ret);
+
+	return ret;
+}
+
+/**
+ * simple_lmk_oom_alloc - wait for a freed page after an allocation failure
+ * @order: order of the failed allocation
+ * @migratetype: migratetype of the failed allocation
+ *
+ * Queues a request, kills processes via reclaim_once_or_more() and sleeps
+ * until simple_lmk_page_in() hands over a matching page or the caller
+ * receives a fatal signal.
+ *
+ * Return: the page handed over, or NULL if Simple LMK is not enabled yet or
+ * the wait was interrupted before a page arrived.
+ */
+struct page *simple_lmk_oom_alloc(unsigned int order, int migratetype)
+{
+	struct oom_alloc_req page_req = {
+		.done = COMPLETION_INITIALIZER_ONSTACK(page_req.done),
+		.order = order,
+		.migratetype = migratetype
+	};
+	unsigned long flags;
+	long ret;
+
+	if (atomic_read(&simple_lmk_state) <= STARTING)
+		return NULL;
+
+	/*
+	 * simple_lmk_page_in() takes this lock from the page-freeing path,
+	 * which can run in interrupt context, so interrupts must be disabled
+	 * while it is held here to avoid deadlocking against ourselves.
+	 */
+	spin_lock_irqsave(&oom_queue_lock, flags);
+	list_add_tail(&page_req.lh, &oom_alloc_queue);
+	spin_unlock_irqrestore(&oom_queue_lock, flags);
+
+	atomic_inc(&oom_alloc_count);
+
+	/* Do memory reclaim and wait */
+	ret = reclaim_once_or_more(&page_req.done, order);
+	if (ret == -ERESTARTSYS) {
+		/*
+		 * Give up since this process is dying. If a page raced in
+		 * before we got the lock, the request is already off the queue
+		 * and that page is returned below.
+		 */
+		spin_lock_irqsave(&oom_queue_lock, flags);
+		if (!page_req.page)
+			list_del(&page_req.lh);
+		spin_unlock_irqrestore(&oom_queue_lock, flags);
+	}
+
+	atomic_dec(&oom_alloc_count);
+
+	return page_req.page;
+}
+
+/**
+ * simple_lmk_page_in - offer a page that is being freed to a queued request
+ * @page: the page being freed
+ * @order: order of @page
+ * @migratetype: migratetype of @page
+ *
+ * Looks for a queued request with the same migratetype, preferring the
+ * largest order that is not above @order and, within one order, the oldest
+ * request. On a match the page is handed to the waiter, which is woken up.
+ *
+ * Return: true if a waiter took the page, in which case the caller must not
+ * put it on a free list; false otherwise.
+ */
+bool simple_lmk_page_in(struct page *page, unsigned int order, int migratetype)
+{
+	struct oom_alloc_req *page_req;
+	unsigned long flags;
+	bool matched = false;
+	int try_order;
+
+	if (atomic_read(&simple_lmk_state) <= STARTING ||
+	    !atomic_read(&oom_alloc_count))
+		return false;
+
+	/*
+	 * Try to match this free page with an OOM allocation request.
+	 *
+	 * NOTE(review): when try_order < order the whole order-@order block is
+	 * given to a smaller request; nothing here returns the unused tail
+	 * pages to the allocator - confirm that this cannot leak them.
+	 */
+	spin_lock_irqsave(&oom_queue_lock, flags);
+	for (try_order = order; try_order >= 0; try_order--) {
+		list_for_each_entry(page_req, &oom_alloc_queue, lh) {
+			if (page_req->order == try_order &&
+			    page_req->migratetype == migratetype) {
+				matched = true;
+				break;
+			}
+		}
+
+		if (matched)
+			break;
+	}
+
+	if (matched) {
+		__ClearPageBuddy(page);
+		page_req->page = page;
+		list_del(&page_req->lh);
+		complete(&page_req->done);
+	}
+	spin_unlock_irqrestore(&oom_queue_lock, flags);
+
+	return matched;
+}
+
+/*
+ * Enable Simple LMK when LMKD in Android writes to the minfree parameter. The
+ * written value is ignored and only the first write has any effect.
+ */
+static int simple_lmk_init_set(const char *val, const struct kernel_param *kp)
+{
+	if (atomic_cmpxchg(&simple_lmk_state, DISABLED, STARTING) != DISABLED)
+		return 0;
+
+	/* Store the calculated kill timeout jiffies for frequent reuse */
+	kill_expires = msecs_to_jiffies(CONFIG_ANDROID_SIMPLE_LMK_KILL_TIMEOUT);
+
+	/*
+	 * Allow the first kill pass immediately. Leaving the deadline at zero
+	 * would make time_after_eq() fail whenever jiffies is in the upper
+	 * half of its range, e.g. during the first minutes after boot on
+	 * 32-bit, where jiffies starts just below the wrap-around point.
+	 */
+	last_kill_expires = jiffies;
+	atomic_set(&simple_lmk_state, READY);
+	return 0;
+}
+
+static const struct kernel_param_ops simple_lmk_init_ops = {
+	.set = simple_lmk_init_set
+};
+
+/* Needed to prevent Android from thinking there's no LMK and thus rebooting */
+#undef MODULE_PARAM_PREFIX
+#define MODULE_PARAM_PREFIX "lowmemorykiller."
+module_param_cb(minfree, &simple_lmk_init_ops, NULL, 0200);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 1549584a1..d290f9ece 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1199,6 +1199,9 @@ struct task_struct {
 	unsigned long			lowest_stack;
 	unsigned long			prev_lowest_stack;
 #endif
+#ifdef CONFIG_ANDROID_SIMPLE_LMK
+	bool slmk_sigkill_sent;
+#endif
 
 	/*
 	 * New fields for task_struct should be added above here, so that
diff --git a/include/linux/simple_lmk.h b/include/linux/simple_lmk.h
new file mode 100644
index 000000000..64c26368a
--- /dev/null
+++ b/include/linux/simple_lmk.h
@@ -0,0 +1,24 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright (C) 2019 Sultan Alsawaf <sultan@kerneltoast.com>.
+ */
+#ifndef _SIMPLE_LMK_H_
+#define _SIMPLE_LMK_H_
+
+#include <linux/types.h>
+
+struct page;
+
+/*
+ * Called when a page allocation has failed: kills processes and waits for a
+ * freed page. Returns the page obtained, or NULL if none was obtained.
+ */
+struct page *simple_lmk_oom_alloc(unsigned int order, int migratetype);
+
+/*
+ * Called when a page is about to be freed: returns true if the page was handed
+ * to a waiting simple_lmk_oom_alloc() caller instead.
+ */
+bool simple_lmk_page_in(struct page *page, unsigned int order, int migratetype);
+
+#endif /* _SIMPLE_LMK_H_ */
diff --git a/kernel/fork.c b/kernel/fork.c
index 9dcd18aa2..162c45392 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1881,6 +1881,9 @@ static __latent_entropy struct task_struct *copy_process(
 	p->sequential_io	= 0;
 	p->sequential_io_avg	= 0;
 #endif
+#ifdef CONFIG_ANDROID_SIMPLE_LMK
+	p->slmk_sigkill_sent = false;
+#endif
 
 	/* Perform scheduler related setup. Assign this task to a CPU. */
 	retval = sched_fork(clone_flags, p);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 3eb01dedf..fd0d697c6 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -67,6 +67,7 @@
 #include <linux/lockdep.h>
 #include <linux/nmi.h>
 #include <linux/psi.h>
+#include <linux/simple_lmk.h>
 
 #include <asm/sections.h>
 #include <asm/tlbflush.h>
@@ -967,6 +968,11 @@ static inline void __free_one_page(struct page *page,
 		}
 	}
 
+#ifdef CONFIG_ANDROID_SIMPLE_LMK
+	if (simple_lmk_page_in(page, order, migratetype))
+		return;
+#endif
+
 	list_add(&page->lru, &zone->free_area[order].free_list[migratetype]);
 out:
 	zone->free_area[order].nr_free++;
@@ -4427,6 +4433,13 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 	if (costly_order && !(gfp_mask & __GFP_RETRY_MAYFAIL))
 		goto nopage;
 
+#ifdef CONFIG_ANDROID_SIMPLE_LMK
+	page = simple_lmk_oom_alloc(order, ac->migratetype);
+	if (page)
+		prep_new_page(page, order, gfp_mask, alloc_flags);
+	goto got_pg;
+#endif
+
 	if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags,
 				 did_some_progress > 0, &no_progress_loops))
 		goto retry;
-- 
2.21.0


^ permalink raw reply related	[flat|nested] 113+ messages in thread

* Re: [RFC] simple_lmk: Introduce Simple Low Memory Killer for Android
  2019-03-10 20:34 [RFC] simple_lmk: Introduce Simple Low Memory Killer for Android Sultan Alsawaf
@ 2019-03-10 21:03 ` Greg Kroah-Hartman
  2019-03-10 21:26   ` Sultan Alsawaf
  2019-03-11 16:32 ` Joel Fernandes
  2019-03-11 17:43 ` Michal Hocko
  2 siblings, 1 reply; 113+ messages in thread
From: Greg Kroah-Hartman @ 2019-03-10 21:03 UTC (permalink / raw)
  To: Sultan Alsawaf
  Cc: Arve Hjønnevåg, Todd Kjos, Martijn Coenen,
	Joel Fernandes, Christian Brauner, Ingo Molnar, Peter Zijlstra,
	linux-kernel, devel, linux-mm, Suren Baghdasaryan, Tim Murray

On Sun, Mar 10, 2019 at 01:34:03PM -0700, Sultan Alsawaf wrote:
> From: Sultan Alsawaf <sultan@kerneltoast.com>
> 
> This is a complete low memory killer solution for Android that is small
> and simple. It kills the largest, least-important processes it can find
> whenever a page allocation has completely failed (right after direct
> reclaim). Processes are killed according to the priorities that Android
> gives them, so that the least important processes are always killed
> first. Killing larger processes is preferred in order to free the most
> memory possible in one go.
> 
> Simple LMK is integrated deeply into the page allocator in order to
> catch exactly when a page allocation fails and exactly when a page is
> freed. Failed page allocations that have invoked Simple LMK are placed
> on a queue and wait for Simple LMK to satisfy them. When a page is about
> to be freed, the failed page allocations are given priority over normal
> page allocations by Simple LMK to see if they can immediately use the
> freed page.
> 
> Additionally, processes are continuously killed by failed small-order
> page allocations until they are satisfied.
> 
> Signed-off-by: Sultan Alsawaf <sultan@kerneltoast.com>

Wait, why?  We just removed the in-kernel android memory killer, we
don't want to add another one back again, right?  Android Go devices
work just fine with the userspace memory killer code, and those are "low
memory" by design.

Why do we need kernel code here at all?

thanks,

greg k-h

^ permalink raw reply	[flat|nested] 113+ messages in thread

* Re: [RFC] simple_lmk: Introduce Simple Low Memory Killer for Android
  2019-03-10 21:03 ` Greg Kroah-Hartman
@ 2019-03-10 21:26   ` Sultan Alsawaf
  0 siblings, 0 replies; 113+ messages in thread
From: Sultan Alsawaf @ 2019-03-10 21:26 UTC (permalink / raw)
  To: Greg Kroah-Hartman
  Cc: Arve Hjønnevåg, Todd Kjos, Martijn Coenen,
	Joel Fernandes, Christian Brauner, Ingo Molnar, Peter Zijlstra,
	linux-kernel, devel, linux-mm, Suren Baghdasaryan, Tim Murray

On Sun, Mar 10, 2019 at 10:03:35PM +0100, Greg Kroah-Hartman wrote:
> On Sun, Mar 10, 2019 at 01:34:03PM -0700, Sultan Alsawaf wrote:
> > From: Sultan Alsawaf <sultan@kerneltoast.com>
> > 
> > This is a complete low memory killer solution for Android that is small
> > and simple. It kills the largest, least-important processes it can find
> > whenever a page allocation has completely failed (right after direct
> > reclaim). Processes are killed according to the priorities that Android
> > gives them, so that the least important processes are always killed
> > first. Killing larger processes is preferred in order to free the most
> > memory possible in one go.
> > 
> > Simple LMK is integrated deeply into the page allocator in order to
> > catch exactly when a page allocation fails and exactly when a page is
> > freed. Failed page allocations that have invoked Simple LMK are placed
> > on a queue and wait for Simple LMK to satisfy them. When a page is about
> > to be freed, the failed page allocations are given priority over normal
> > page allocations by Simple LMK to see if they can immediately use the
> > freed page.
> > 
> > Additionally, processes are continuously killed by failed small-order
> > page allocations until they are satisfied.
> > 
> > Signed-off-by: Sultan Alsawaf <sultan@kerneltoast.com>
> 
> Wait, why?  We just removed the in-kernel android memory killer, we
> don't want to add another one back again, right?  Android Go devices
> work just fine with the userspace memory killer code, and those are "low
> memory" by design.
> 
> Why do we need kernel code here at all?
> 
> thanks,
> 
> greg k-h

Hi Greg,

Thanks for replying. It has not been my experience, nor the experience of many
others, that Android's userspace low memory killer works "just fine." On
my Pixel 3 XL with a meager 4GB of memory, the userspace killer has had issues
with killing too many processes, which has resulted in a noticeably poor user
experience for all Pixel owners. From the looks of lmkd on the master branch,
there still isn't really any definitive solution for this, aside from a 100ms
delay in between process kills.

I think that the userspace low memory killer is more complex than necessary,
especially since in the kernel we can detect exactly when we run out of memory
and react far more quickly than any userspace daemon.

The original reasoning behind why the old kernel low memory killer was removed
is also a bit vague to me. It just seemed to be abandonware, and all of a sudden
a userspace daemon was touted as the solution.

This driver is like an Android-flavored version of the kernel's oom killer, and
has proven quite effective for me on my Pixel. Processes are killed exactly when
a page allocation fails, so memory use is maximized. There is no complexity to
try and estimate how full memory is either.

Thanks,
Sultan

^ permalink raw reply	[flat|nested] 113+ messages in thread

* Re: [RFC] simple_lmk: Introduce Simple Low Memory Killer for Android
  2019-03-10 20:34 [RFC] simple_lmk: Introduce Simple Low Memory Killer for Android Sultan Alsawaf
  2019-03-10 21:03 ` Greg Kroah-Hartman
@ 2019-03-11 16:32 ` Joel Fernandes
  2019-03-11 16:37   ` Joel Fernandes
  2019-03-11 17:43 ` Michal Hocko
  2 siblings, 1 reply; 113+ messages in thread
From: Joel Fernandes @ 2019-03-11 16:32 UTC (permalink / raw)
  To: Sultan Alsawaf
  Cc: Greg Kroah-Hartman, Arve Hjønnevåg, Todd Kjos,
	Martijn Coenen, Christian Brauner, Ingo Molnar, Peter Zijlstra,
	linux-kernel, devel, linux-mm, Suren Baghdasaryan, Tim Murray,
	mhocko, vbabka, hannes

On Sun, Mar 10, 2019 at 01:34:03PM -0700, Sultan Alsawaf wrote:
[...]
>  
>  	/* Perform scheduler related setup. Assign this task to a CPU. */
>  	retval = sched_fork(clone_flags, p);
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 3eb01dedf..fd0d697c6 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -67,6 +67,7 @@
>  #include <linux/lockdep.h>
>  #include <linux/nmi.h>
>  #include <linux/psi.h>
> +#include <linux/simple_lmk.h>
>  
>  #include <asm/sections.h>
>  #include <asm/tlbflush.h>
> @@ -967,6 +968,11 @@ static inline void __free_one_page(struct page *page,
>  		}
>  	}
>  
> +#ifdef CONFIG_ANDROID_SIMPLE_LMK
> +	if (simple_lmk_page_in(page, order, migratetype))
> +		return;
> +#endif
> +
>  	list_add(&page->lru, &zone->free_area[order].free_list[migratetype]);
>  out:
>  	zone->free_area[order].nr_free++;
> @@ -4427,6 +4433,13 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
>  	if (costly_order && !(gfp_mask & __GFP_RETRY_MAYFAIL))
>  		goto nopage;
>  
> +#ifdef CONFIG_ANDROID_SIMPLE_LMK
> +	page = simple_lmk_oom_alloc(order, ac->migratetype);
> +	if (page)
> +		prep_new_page(page, order, gfp_mask, alloc_flags);
> +	goto got_pg;
> +#endif
> +

Hacking generic MM code with Android-specific callback is probably a major
issue with your patch. Also I CC'd -mm maintainers and lists since your patch
touches page_alloc.c. Always run get_maintainer.pl before sending a patch. I
added them this time.

Have you looked at the recent PSI work that Suren and Johannes have been
doing [1]?  As I understand, userspace lmkd may be migrated to use that at some
point.  Suren can provide more details. I am sure AOSP contributions to make
LMKd better by using the PSI backend would be appreciated. Please consider
collaborating on that and help out, thanks. Check the cover-letter of that
patch [1] where LMKd is mentioned.

thanks,

 - Joel

[1] https://www.mail-archive.com/linux-kernel@vger.kernel.org/msg1951257.html


^ permalink raw reply	[flat|nested] 113+ messages in thread

* Re: [RFC] simple_lmk: Introduce Simple Low Memory Killer for Android
  2019-03-11 16:32 ` Joel Fernandes
@ 2019-03-11 16:37   ` Joel Fernandes
  0 siblings, 0 replies; 113+ messages in thread
From: Joel Fernandes @ 2019-03-11 16:37 UTC (permalink / raw)
  To: Sultan Alsawaf
  Cc: Greg Kroah-Hartman, Arve Hjønnevåg, Todd Kjos,
	Martijn Coenen, Christian Brauner, Ingo Molnar, Peter Zijlstra,
	linux-kernel, devel, linux-mm, Suren Baghdasaryan, Tim Murray,
	mhocko, vbabka, hannes

On Mon, Mar 11, 2019 at 12:32:33PM -0400, Joel Fernandes wrote:
> On Sun, Mar 10, 2019 at 01:34:03PM -0700, Sultan Alsawaf wrote:
> [...]
> >  
> >  	/* Perform scheduler related setup. Assign this task to a CPU. */
> >  	retval = sched_fork(clone_flags, p);
> > diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> > index 3eb01dedf..fd0d697c6 100644
> > --- a/mm/page_alloc.c
> > +++ b/mm/page_alloc.c
> > @@ -67,6 +67,7 @@
> >  #include <linux/lockdep.h>
> >  #include <linux/nmi.h>
> >  #include <linux/psi.h>
> > +#include <linux/simple_lmk.h>
> >  
> >  #include <asm/sections.h>
> >  #include <asm/tlbflush.h>
> > @@ -967,6 +968,11 @@ static inline void __free_one_page(struct page *page,
> >  		}
> >  	}
> >  
> > +#ifdef CONFIG_ANDROID_SIMPLE_LMK
> > +	if (simple_lmk_page_in(page, order, migratetype))
> > +		return;
> > +#endif
> > +
> >  	list_add(&page->lru, &zone->free_area[order].free_list[migratetype]);
> >  out:
> >  	zone->free_area[order].nr_free++;
> > @@ -4427,6 +4433,13 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
> >  	if (costly_order && !(gfp_mask & __GFP_RETRY_MAYFAIL))
> >  		goto nopage;
> >  
> > +#ifdef CONFIG_ANDROID_SIMPLE_LMK
> > +	page = simple_lmk_oom_alloc(order, ac->migratetype);
> > +	if (page)
> > +		prep_new_page(page, order, gfp_mask, alloc_flags);
> > +	goto got_pg;
> > +#endif
> > +
> 
> Hacking generic MM code with Android-specific callback is probably a major
> issue with your patch.
>
> Also I CC'd -mm maintainers and lists since your patch
> touches page_alloc.c. Always run get_maintainer.pl before sending a patch. I
> added them this time.

I see you CC'd linux-mm on your initial patch, so I apologize. Ignore this
part of my reply. Thanks.



> Have you looked at the recent PSI work that Suren and Johannes have been
> doing [1]?  As I understand, userspace lmkd may be migrated to use that at some
> point.  Suren can provide more details. I am sure AOSP contributions to make
> LMKd better by using the PSI backend would be appreciated. Please consider
> collaborating on that and help out, thanks. Check the cover-letter of that
> patch [1] where LMKd is mentioned.
 

^ permalink raw reply	[flat|nested] 113+ messages in thread

* Re: [RFC] simple_lmk: Introduce Simple Low Memory Killer for Android
  2019-03-10 20:34 [RFC] simple_lmk: Introduce Simple Low Memory Killer for Android Sultan Alsawaf
  2019-03-10 21:03 ` Greg Kroah-Hartman
  2019-03-11 16:32 ` Joel Fernandes
@ 2019-03-11 17:43 ` Michal Hocko
  2019-03-11 17:58   ` Sultan Alsawaf
  2 siblings, 1 reply; 113+ messages in thread
From: Michal Hocko @ 2019-03-11 17:43 UTC (permalink / raw)
  To: Sultan Alsawaf
  Cc: Greg Kroah-Hartman, Arve Hjønnevåg, Todd Kjos,
	Martijn Coenen, Joel Fernandes, Christian Brauner, Ingo Molnar,
	Peter Zijlstra, linux-kernel, devel, linux-mm,
	Suren Baghdasaryan, Tim Murray

On Sun 10-03-19 13:34:03, Sultan Alsawaf wrote:
> From: Sultan Alsawaf <sultan@kerneltoast.com>
> 
> This is a complete low memory killer solution for Android that is small
> and simple. It kills the largest, least-important processes it can find
> whenever a page allocation has completely failed (right after direct
> reclaim). Processes are killed according to the priorities that Android
> gives them, so that the least important processes are always killed
> first. Killing larger processes is preferred in order to free the most
> memory possible in one go.
> 
> Simple LMK is integrated deeply into the page allocator in order to
> catch exactly when a page allocation fails and exactly when a page is
> freed. Failed page allocations that have invoked Simple LMK are placed
> on a queue and wait for Simple LMK to satisfy them. When a page is about
> to be freed, the failed page allocations are given priority over normal
> page allocations by Simple LMK to see if they can immediately use the
> freed page.
> 
> Additionally, processes are continuously killed by failed small-order
> page allocations until they are satisfied.

I am sorry but we are not going to maintain two different OOM
implementations in the kernel. From a quick look the implementation is
quite a hack which is not really suitable for anything but a very
specific usecase. E.g. reusing a freed page for a waiting allocation
sounds like an interesting idea but it doesn't really work for many
reasons. E.g. any NUMA affinity is broken, zone protection doesn't work
either. Not to mention how the code hooks into the allocator hot paths.
This is simply a no-no.

Last but not least people have worked really hard to provide means (PSI)
to do what you need in the userspace.
 
> Signed-off-by: Sultan Alsawaf <sultan@kerneltoast.com>
> ---
>  drivers/android/Kconfig      |  28 ++++
>  drivers/android/Makefile     |   1 +
>  drivers/android/simple_lmk.c | 301 +++++++++++++++++++++++++++++++++++
>  include/linux/sched.h        |   3 +
>  include/linux/simple_lmk.h   |  11 ++
>  kernel/fork.c                |   3 +
>  mm/page_alloc.c              |  13 ++
>  7 files changed, 360 insertions(+)
>  create mode 100644 drivers/android/simple_lmk.c
>  create mode 100644 include/linux/simple_lmk.h
> 
> diff --git a/drivers/android/Kconfig b/drivers/android/Kconfig
> index 6fdf2abe4..7469d049d 100644
> --- a/drivers/android/Kconfig
> +++ b/drivers/android/Kconfig
> @@ -54,6 +54,34 @@ config ANDROID_BINDER_IPC_SELFTEST
>  	  exhaustively with combinations of various buffer sizes and
>  	  alignments.
>  
> +config ANDROID_SIMPLE_LMK
> +	bool "Simple Android Low Memory Killer"
> +	depends on !MEMCG
> +	---help---
> +	  This is a complete low memory killer solution for Android that is
> +	  small and simple. It is integrated deeply into the page allocator to
> +	  know exactly when a page allocation hits OOM and exactly when a page
> +	  is freed. Processes are killed according to the priorities that
> +	  Android gives them, so that the least important processes are always
> +	  killed first.
> +
> +if ANDROID_SIMPLE_LMK
> +
> +config ANDROID_SIMPLE_LMK_MINFREE
> +	int "Minimum MiB of memory to free per reclaim"
> +	default "64"
> +	help
> +	  Simple LMK will free at least this many MiB of memory per reclaim.
> +
> +config ANDROID_SIMPLE_LMK_KILL_TIMEOUT
> +	int "Kill timeout in milliseconds"
> +	default "50"
> +	help
> +	  Simple LMK will only perform memory reclaim at most once per this
> +	  amount of time.
> +
> +endif # if ANDROID_SIMPLE_LMK
> +
>  endif # if ANDROID
>  
>  endmenu
> diff --git a/drivers/android/Makefile b/drivers/android/Makefile
> index c7856e320..7c91293b6 100644
> --- a/drivers/android/Makefile
> +++ b/drivers/android/Makefile
> @@ -3,3 +3,4 @@ ccflags-y += -I$(src)			# needed for trace events
>  obj-$(CONFIG_ANDROID_BINDERFS)		+= binderfs.o
>  obj-$(CONFIG_ANDROID_BINDER_IPC)	+= binder.o binder_alloc.o
>  obj-$(CONFIG_ANDROID_BINDER_IPC_SELFTEST) += binder_alloc_selftest.o
> +obj-$(CONFIG_ANDROID_SIMPLE_LMK)	+= simple_lmk.o
> diff --git a/drivers/android/simple_lmk.c b/drivers/android/simple_lmk.c
> new file mode 100644
> index 000000000..8a441650a
> --- /dev/null
> +++ b/drivers/android/simple_lmk.c
> @@ -0,0 +1,301 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Copyright (C) 2019 Sultan Alsawaf <sultan@kerneltoast.com>.
> + */
> +
> +#define pr_fmt(fmt) "simple_lmk: " fmt
> +
> +#include <linux/mm.h>
> +#include <linux/moduleparam.h>
> +#include <linux/oom.h>
> +#include <linux/sched.h>
> +#include <linux/sizes.h>
> +#include <linux/sort.h>
> +
> +#define MIN_FREE_PAGES (CONFIG_ANDROID_SIMPLE_LMK_MINFREE * SZ_1M / PAGE_SIZE)
> +
> +struct oom_alloc_req {
> +	struct page *page;
> +	struct completion done;
> +	struct list_head lh;
> +	unsigned int order;
> +	int migratetype;
> +};
> +
> +struct victim_info {
> +	struct task_struct *tsk;
> +	unsigned long size;
> +};
> +
> +enum {
> +	DISABLED,
> +	STARTING,
> +	READY,
> +	KILLING
> +};
> +
> +/* Pulled from the Android framework */
> +static const short int adj_prio[] = {
> +	906, /* CACHED_APP_MAX_ADJ */
> +	905, /* Cached app */
> +	904, /* Cached app */
> +	903, /* Cached app */
> +	902, /* Cached app */
> +	901, /* Cached app */
> +	900, /* CACHED_APP_MIN_ADJ */
> +	800, /* SERVICE_B_ADJ */
> +	700, /* PREVIOUS_APP_ADJ */
> +	600, /* HOME_APP_ADJ */
> +	500, /* SERVICE_ADJ */
> +	400, /* HEAVY_WEIGHT_APP_ADJ */
> +	300, /* BACKUP_APP_ADJ */
> +	200, /* PERCEPTIBLE_APP_ADJ */
> +	100, /* VISIBLE_APP_ADJ */
> +	0    /* FOREGROUND_APP_ADJ */
> +};
> +
> +/* Make sure that PID_MAX_DEFAULT isn't too big, or these arrays will be huge */
> +static struct victim_info victim_array[PID_MAX_DEFAULT];
> +static struct victim_info *victim_ptr_array[ARRAY_SIZE(victim_array)];
> +static atomic_t simple_lmk_state = ATOMIC_INIT(DISABLED);
> +static atomic_t oom_alloc_count = ATOMIC_INIT(0);
> +static unsigned long last_kill_expires;
> +static unsigned long kill_expires;
> +static DEFINE_SPINLOCK(oom_queue_lock);
> +static LIST_HEAD(oom_alloc_queue);
> +
> +static int victim_info_cmp(const void *lhs, const void *rhs)
> +{
> +	const struct victim_info **lhs_ptr = (typeof(lhs_ptr))lhs;
> +	const struct victim_info **rhs_ptr = (typeof(rhs_ptr))rhs;
> +
> +	if ((*lhs_ptr)->size > (*rhs_ptr)->size)
> +		return -1;
> +
> +	if ((*lhs_ptr)->size < (*rhs_ptr)->size)
> +		return 1;
> +
> +	return 0;
> +}
> +
> +static unsigned long scan_and_kill(int min_adj, int max_adj,
> +				   unsigned long pages_needed)
> +{
> +	unsigned long pages_freed = 0;
> +	unsigned int i, vcount = 0;
> +	struct task_struct *tsk;
> +
> +	rcu_read_lock();
> +	for_each_process(tsk) {
> +		struct task_struct *vtsk;
> +		unsigned long tasksize;
> +		short oom_score_adj;
> +
> +		/* Don't commit suicide or kill kthreads */
> +		if (same_thread_group(tsk, current) || tsk->flags & PF_KTHREAD)
> +			continue;
> +
> +		vtsk = find_lock_task_mm(tsk);
> +		if (!vtsk)
> +			continue;
> +
> +		/* Don't kill tasks that have been killed or lack memory */
> +		if (vtsk->slmk_sigkill_sent ||
> +		    test_tsk_thread_flag(vtsk, TIF_MEMDIE)) {
> +			task_unlock(vtsk);
> +			continue;
> +		}
> +
> +		oom_score_adj = vtsk->signal->oom_score_adj;
> +		if (oom_score_adj < min_adj || oom_score_adj > max_adj) {
> +			task_unlock(vtsk);
> +			continue;
> +		}
> +
> +		tasksize = get_mm_rss(vtsk->mm);
> +		task_unlock(vtsk);
> +		if (!tasksize)
> +			continue;
> +
> +		/* Store this potential victim away for later */
> +		get_task_struct(vtsk);
> +		victim_array[vcount].tsk = vtsk;
> +		victim_array[vcount].size = tasksize;
> +		victim_ptr_array[vcount] = &victim_array[vcount];
> +		vcount++;
> +
> +		/* The victim array is so big that this should never happen */
> +		if (unlikely(vcount == ARRAY_SIZE(victim_array)))
> +			break;
> +	}
> +	rcu_read_unlock();
> +
> +	/* No potential victims for this adj range means no pages freed */
> +	if (!vcount)
> +		return 0;
> +
> +	/*
> +	 * Sort the victims in descending order of size in order to target the
> +	 * largest ones first.
> +	 */
> +	sort(victim_ptr_array, vcount, sizeof(victim_ptr_array[0]),
> +	     victim_info_cmp, NULL);
> +
> +	for (i = 0; i < vcount; i++) {
> +		struct victim_info *victim = victim_ptr_array[i];
> +		struct task_struct *vtsk = victim->tsk;
> +
> +		if (pages_freed >= pages_needed) {
> +			put_task_struct(vtsk);
> +			continue;
> +		}
> +
> +		pr_info("killing %s with adj %d to free %lu MiB\n",
> +			vtsk->comm, vtsk->signal->oom_score_adj,
> +			victim->size * PAGE_SIZE / SZ_1M);
> +
> +		if (!do_send_sig_info(SIGKILL, SEND_SIG_PRIV, vtsk, true))
> +			pages_freed += victim->size;
> +
> +		/* Unconditionally mark task as killed so it isn't reused */
> +		vtsk->slmk_sigkill_sent = true;
> +		put_task_struct(vtsk);
> +	}
> +
> +	return pages_freed;
> +}
> +
> +static void kill_processes(unsigned long pages_needed)
> +{
> +	unsigned long pages_freed = 0;
> +	int i;
> +
> +	for (i = 1; i < ARRAY_SIZE(adj_prio); i++) {
> +		pages_freed += scan_and_kill(adj_prio[i], adj_prio[i - 1],
> +					     pages_needed - pages_freed);
> +		if (pages_freed >= pages_needed)
> +			break;
> +	}
> +}
> +
> +static void do_memory_reclaim(void)
> +{
> +	/* Only one reclaim can occur at a time */
> +	if (atomic_cmpxchg(&simple_lmk_state, READY, KILLING) != READY)
> +		return;
> +
> +	if (time_after_eq(jiffies, last_kill_expires)) {
> +		kill_processes(MIN_FREE_PAGES);
> +		last_kill_expires = jiffies + kill_expires;
> +	}
> +
> +	atomic_set(&simple_lmk_state, READY);
> +}
> +
> +static long reclaim_once_or_more(struct completion *done, unsigned int order)
> +{
> +	long ret;
> +
> +	/* Don't allow costly allocations to do memory reclaim more than once */
> +	if (order > PAGE_ALLOC_COSTLY_ORDER) {
> +		do_memory_reclaim();
> +		return wait_for_completion_killable(done);
> +	}
> +
> +	do {
> +		do_memory_reclaim();
> +		ret = wait_for_completion_killable_timeout(done, kill_expires);
> +	} while (!ret);
> +
> +	return ret;
> +}
> +
> +struct page *simple_lmk_oom_alloc(unsigned int order, int migratetype)
> +{
> +	struct oom_alloc_req page_req = {
> +		.done = COMPLETION_INITIALIZER_ONSTACK(page_req.done),
> +		.order = order,
> +		.migratetype = migratetype
> +	};
> +	long ret;
> +
> +	if (atomic_read(&simple_lmk_state) <= STARTING)
> +		return NULL;
> +
> +	spin_lock(&oom_queue_lock);
> +	list_add_tail(&page_req.lh, &oom_alloc_queue);
> +	spin_unlock(&oom_queue_lock);
> +
> +	atomic_inc(&oom_alloc_count);
> +
> +	/* Do memory reclaim and wait */
> +	ret = reclaim_once_or_more(&page_req.done, order);
> +	if (ret == -ERESTARTSYS) {
> +		/* Give up since this process is dying */
> +		spin_lock(&oom_queue_lock);
> +		if (!page_req.page)
> +			list_del(&page_req.lh);
> +		spin_unlock(&oom_queue_lock);
> +	}
> +
> +	atomic_dec(&oom_alloc_count);
> +
> +	return page_req.page;
> +}
> +
> +bool simple_lmk_page_in(struct page *page, unsigned int order, int migratetype)
> +{
> +	struct oom_alloc_req *page_req;
> +	bool matched = false;
> +	int try_order;
> +
> +	if (atomic_read(&simple_lmk_state) <= STARTING ||
> +	    !atomic_read(&oom_alloc_count))
> +		return false;
> +
> +	/* Try to match this free page with an OOM allocation request */
> +	spin_lock(&oom_queue_lock);
> +	for (try_order = order; try_order >= 0; try_order--) {
> +		list_for_each_entry(page_req, &oom_alloc_queue, lh) {
> +			if (page_req->order == try_order &&
> +			    page_req->migratetype == migratetype) {
> +				matched = true;
> +				break;
> +			}
> +		}
> +
> +		if (matched)
> +			break;
> +	}
> +
> +	if (matched) {
> +		__ClearPageBuddy(page);
> +		page_req->page = page;
> +		list_del(&page_req->lh);
> +		complete(&page_req->done);
> +	}
> +	spin_unlock(&oom_queue_lock);
> +
> +	return matched;
> +}
> +
> +/* Enable Simple LMK when LMKD in Android writes to the minfree parameter */
> +static int simple_lmk_init_set(const char *val, const struct kernel_param *kp)
> +{
> +	if (atomic_cmpxchg(&simple_lmk_state, DISABLED, STARTING) != DISABLED)
> +		return 0;
> +
> +	/* Store the calculated kill timeout jiffies for frequent reuse */
> +	kill_expires = msecs_to_jiffies(CONFIG_ANDROID_SIMPLE_LMK_KILL_TIMEOUT);
> +	atomic_set(&simple_lmk_state, READY);
> +	return 0;
> +}
> +
> +static const struct kernel_param_ops simple_lmk_init_ops = {
> +	.set = simple_lmk_init_set
> +};
> +
> +/* Needed to prevent Android from thinking there's no LMK and thus rebooting */
> +#undef MODULE_PARAM_PREFIX
> +#define MODULE_PARAM_PREFIX "lowmemorykiller."
> +module_param_cb(minfree, &simple_lmk_init_ops, NULL, 0200);
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 1549584a1..d290f9ece 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1199,6 +1199,9 @@ struct task_struct {
>  	unsigned long			lowest_stack;
>  	unsigned long			prev_lowest_stack;
>  #endif
> +#ifdef CONFIG_ANDROID_SIMPLE_LMK
> +	bool slmk_sigkill_sent;
> +#endif
>  
>  	/*
>  	 * New fields for task_struct should be added above here, so that
> diff --git a/include/linux/simple_lmk.h b/include/linux/simple_lmk.h
> new file mode 100644
> index 000000000..64c26368a
> --- /dev/null
> +++ b/include/linux/simple_lmk.h
> @@ -0,0 +1,11 @@
> +/* SPDX-License-Identifier: GPL-2.0
> + *
> + * Copyright (C) 2019 Sultan Alsawaf <sultan@kerneltoast.com>.
> + */
> +#ifndef _SIMPLE_LMK_H_
> +#define _SIMPLE_LMK_H_
> +
> +struct page *simple_lmk_oom_alloc(unsigned int order, int migratetype);
> +bool simple_lmk_page_in(struct page *page, unsigned int order, int migratetype);
> +
> +#endif /* _SIMPLE_LMK_H_ */
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 9dcd18aa2..162c45392 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -1881,6 +1881,9 @@ static __latent_entropy struct task_struct *copy_process(
>  	p->sequential_io	= 0;
>  	p->sequential_io_avg	= 0;
>  #endif
> +#ifdef CONFIG_ANDROID_SIMPLE_LMK
> +	p->slmk_sigkill_sent = false;
> +#endif
>  
>  	/* Perform scheduler related setup. Assign this task to a CPU. */
>  	retval = sched_fork(clone_flags, p);
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 3eb01dedf..fd0d697c6 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -67,6 +67,7 @@
>  #include <linux/lockdep.h>
>  #include <linux/nmi.h>
>  #include <linux/psi.h>
> +#include <linux/simple_lmk.h>
>  
>  #include <asm/sections.h>
>  #include <asm/tlbflush.h>
> @@ -967,6 +968,11 @@ static inline void __free_one_page(struct page *page,
>  		}
>  	}
>  
> +#ifdef CONFIG_ANDROID_SIMPLE_LMK
> +	if (simple_lmk_page_in(page, order, migratetype))
> +		return;
> +#endif
> +
>  	list_add(&page->lru, &zone->free_area[order].free_list[migratetype]);
>  out:
>  	zone->free_area[order].nr_free++;
> @@ -4427,6 +4433,13 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
>  	if (costly_order && !(gfp_mask & __GFP_RETRY_MAYFAIL))
>  		goto nopage;
>  
> +#ifdef CONFIG_ANDROID_SIMPLE_LMK
> +	page = simple_lmk_oom_alloc(order, ac->migratetype);
> +	if (page)
> +		prep_new_page(page, order, gfp_mask, alloc_flags);
> +	goto got_pg;
> +#endif
> +
>  	if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags,
>  				 did_some_progress > 0, &no_progress_loops))
>  		goto retry;
> -- 
> 2.21.0

-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 113+ messages in thread

* Re: [RFC] simple_lmk: Introduce Simple Low Memory Killer for Android
  2019-03-11 17:43 ` Michal Hocko
@ 2019-03-11 17:58   ` Sultan Alsawaf
  2019-03-11 20:10     ` Suren Baghdasaryan
  0 siblings, 1 reply; 113+ messages in thread
From: Sultan Alsawaf @ 2019-03-11 17:58 UTC (permalink / raw)
  To: Michal Hocko
  Cc: Greg Kroah-Hartman, Arve Hjønnevåg, Todd Kjos,
	Martijn Coenen, Joel Fernandes, Christian Brauner, Ingo Molnar,
	Peter Zijlstra, linux-kernel, devel, linux-mm,
	Suren Baghdasaryan, Tim Murray

On Mon, Mar 11, 2019 at 06:43:20PM +0100, Michal Hocko wrote:
> I am sorry but we are not going to maintain two different OOM
> implementations in the kernel. From a quick look the implementation is
> quite a hack which is not really suitable for anything but a very
> specific usecase. E.g. reusing a freed page for a waiting allocation
> sounds like an interesting idea but it doesn't really work for many
> reasons. E.g. any NUMA affinity is broken, zone protection doesn't work
> either. Not to mention how the code hooks into the allocator hot paths.
> This is simply no no.
> 
> Last but not least people have worked really hard to provide means (PSI)
> to do what you need in the userspace.

Hi Michal,

Thanks for the feedback. I had no doubt that this would be vehemently rejected
on the mailing list, but I wanted feedback/opinions on it and thus sent it as an RFC. At best I thought perhaps the mechanisms I've employed might serve as
inspiration for LMKD improvements in Android, since this hacky OOM killer I've
devised does work quite well for the very specific usecase it is set out to
address. The NUMA affinity and zone protection bits are helpful insights too.

I'll take a look at PSI which Joel mentioned as well.

Thanks,
Sultan Alsawaf

^ permalink raw reply	[flat|nested] 113+ messages in thread

* Re: [RFC] simple_lmk: Introduce Simple Low Memory Killer for Android
  2019-03-11 17:58   ` Sultan Alsawaf
@ 2019-03-11 20:10     ` Suren Baghdasaryan
  2019-03-11 20:46       ` Sultan Alsawaf
  0 siblings, 1 reply; 113+ messages in thread
From: Suren Baghdasaryan @ 2019-03-11 20:10 UTC (permalink / raw)
  To: Sultan Alsawaf
  Cc: Michal Hocko, Greg Kroah-Hartman, Arve Hjønnevåg,
	Todd Kjos, Martijn Coenen, Joel Fernandes, Christian Brauner,
	Ingo Molnar, Peter Zijlstra, LKML, devel, linux-mm, Tim Murray

Hi Sultan,

On Mon, Mar 11, 2019 at 10:58 AM Sultan Alsawaf <sultan@kerneltoast.com> wrote:
>
> On Mon, Mar 11, 2019 at 06:43:20PM +0100, Michal Hocko wrote:
> > I am sorry but we are not going to maintain two different OOM
> > implementations in the kernel. From a quick look the implementation is
> > quite a hack which is not really suitable for anything but a very
> > specific usecase. E.g. reusing a freed page for a waiting allocation
> > sounds like an interesting idea but it doesn't really work for many
> > reasons. E.g. any NUMA affinity is broken, zone protection doesn't work
> > either. Not to mention how the code hooks into the allocator hot paths.
> > This is simply no no.
> >
> > Last but not least people have worked really hard to provide means (PSI)
> > to do what you need in the userspace.
>
> Hi Michal,
>
> Thanks for the feedback. I had no doubt that this would be vehemently rejected
> on the mailing list, but I wanted feedback/opinions on it and thus sent it as an RFC.

Thanks for the proposal. I think Michal and Joel already answered why
in-kernel LMK will not be accepted and that was one of the reasons the
lowmemorykiller driver was removed in 4.12.

> At best I thought perhaps the mechanisms I've employed might serve as
> inspiration for LMKD improvements in Android, since this hacky OOM killer I've
> devised does work quite well for the very specific usecase it is set out to
> address. The NUMA affinity and zone protection bits are helpful insights too.

The idea seems interesting although I need to think about this a bit
more. Killing processes based on failed page allocation might backfire
during transient spikes in memory usage.
AFAICT the biggest issue with using this approach in userspace is that
it's not practically implementable without heavy in-kernel support.
How to implement such interaction between kernel and userspace would
be an interesting discussion which I would be happy to participate in.

> I'll take a look at PSI which Joel mentioned as well.
>
> Thanks,
> Sultan Alsawaf

Thanks,
Suren.

^ permalink raw reply	[flat|nested] 113+ messages in thread

* Re: [RFC] simple_lmk: Introduce Simple Low Memory Killer for Android
  2019-03-11 20:10     ` Suren Baghdasaryan
@ 2019-03-11 20:46       ` Sultan Alsawaf
  2019-03-11 21:11         ` Joel Fernandes
  2019-03-11 22:15         ` Suren Baghdasaryan
  0 siblings, 2 replies; 113+ messages in thread
From: Sultan Alsawaf @ 2019-03-11 20:46 UTC (permalink / raw)
  To: Suren Baghdasaryan
  Cc: Michal Hocko, Greg Kroah-Hartman, Arve Hjønnevåg,
	Todd Kjos, Martijn Coenen, Joel Fernandes, Christian Brauner,
	Ingo Molnar, Peter Zijlstra, LKML, devel, linux-mm, Tim Murray

On Mon, Mar 11, 2019 at 01:10:36PM -0700, Suren Baghdasaryan wrote:
> The idea seems interesting although I need to think about this a bit
> more. Killing processes based on failed page allocation might backfire
> during transient spikes in memory usage.

This issue could be alleviated if tasks could be killed and have their pages
reaped faster. Currently, Linux takes a _very_ long time to free a task's memory
after an initial privileged SIGKILL is sent to a task, even with the task's
priority being set to the highest possible (so unwanted scheduler preemption
starving dying tasks of CPU time is not the issue at play here). I've
frequently measured the difference in time between when a SIGKILL is sent for a
task and when free_task() is called for that task to be hundreds of
milliseconds, which is incredibly long. AFAIK, this is a problem that LMKD
suffers from as well, and perhaps any OOM killer implementation in Linux, since
you cannot evaluate the effect you've had on memory pressure by killing a process
for at least several tens of milliseconds.

> AFAIKT the biggest issue with using this approach in userspace is that
> it's not practically implementable without heavy in-kernel support.
> How to implement such interaction between kernel and userspace would
> be an interesting discussion which I would be happy to participate in.

You could signal a lightweight userspace process that has maximum scheduler
priority and have it kill the tasks it'd like.

Thanks,
Sultan

^ permalink raw reply	[flat|nested] 113+ messages in thread

* Re: [RFC] simple_lmk: Introduce Simple Low Memory Killer for Android
  2019-03-11 20:46       ` Sultan Alsawaf
@ 2019-03-11 21:11         ` Joel Fernandes
  2019-03-11 21:46           ` Sultan Alsawaf
  2019-03-11 22:15         ` Suren Baghdasaryan
  1 sibling, 1 reply; 113+ messages in thread
From: Joel Fernandes @ 2019-03-11 21:11 UTC (permalink / raw)
  To: Sultan Alsawaf
  Cc: Suren Baghdasaryan, Michal Hocko, Greg Kroah-Hartman,
	Arve Hjønnevåg, Todd Kjos, Martijn Coenen,
	Christian Brauner, Ingo Molnar, Peter Zijlstra, LKML, devel,
	linux-mm, Tim Murray

On Mon, Mar 11, 2019 at 01:46:26PM -0700, Sultan Alsawaf wrote:
> On Mon, Mar 11, 2019 at 01:10:36PM -0700, Suren Baghdasaryan wrote:
> > The idea seems interesting although I need to think about this a bit
> > more. Killing processes based on failed page allocation might backfire
> > during transient spikes in memory usage.
> 
> This issue could be alleviated if tasks could be killed and have their pages
> reaped faster.

But the point is that a transient temporary memory spike should not be a
signal to kill _any_ process.  The reaction to kill shouldn't be so
spontaneous that unwanted tasks are killed because the system went into
panic mode. It should be averaged out which I believe is what PSI does.

thanks,

- Joel


^ permalink raw reply	[flat|nested] 113+ messages in thread

* Re: [RFC] simple_lmk: Introduce Simple Low Memory Killer for Android
  2019-03-11 21:11         ` Joel Fernandes
@ 2019-03-11 21:46           ` Sultan Alsawaf
  0 siblings, 0 replies; 113+ messages in thread
From: Sultan Alsawaf @ 2019-03-11 21:46 UTC (permalink / raw)
  To: Joel Fernandes
  Cc: Suren Baghdasaryan, Michal Hocko, Greg Kroah-Hartman,
	Arve Hjønnevåg, Todd Kjos, Martijn Coenen,
	Christian Brauner, Ingo Molnar, Peter Zijlstra, LKML, devel,
	linux-mm, Tim Murray

On Mon, Mar 11, 2019 at 05:11:25PM -0400, Joel Fernandes wrote:
> But the point is that a transient temporary memory spike should not be a
> signal to kill _any_ process.  The reaction to kill shouldn't be so
> spontaneous that unwanted tasks are killed because the system went into
> panic mode. It should be averaged out which I believe is what PSI does.

In my patch from the first email, I implemented the decision to kill a process
at the same time that the existing kernel OOM killer decides to kill a process.
If the kernel's OOM killer were susceptible to killing processes due to
transient memory spikes, then I think there would have been several complaints
about this behavior regardless of which userspace or architecture is in use.
I think the existing OOM killer has this done right.

The decision to kill a process occurs after the page allocator has tried _very_
hard to satisfy a page allocation via alternative means, such as utilizing
compaction, flushing file-backed pages to disk via kswapd, and direct reclaim.
Once all of those means have failed, it is quite reasonable to kill a process to
free memory. Trying to wait out the memory starvation at this point would be
futile.

Thanks,
Sultan

^ permalink raw reply	[flat|nested] 113+ messages in thread

* Re: [RFC] simple_lmk: Introduce Simple Low Memory Killer for Android
  2019-03-11 20:46       ` Sultan Alsawaf
  2019-03-11 21:11         ` Joel Fernandes
@ 2019-03-11 22:15         ` Suren Baghdasaryan
  2019-03-11 22:36           ` Sultan Alsawaf
  2019-03-12  8:05           ` Michal Hocko
  1 sibling, 2 replies; 113+ messages in thread
From: Suren Baghdasaryan @ 2019-03-11 22:15 UTC (permalink / raw)
  To: Sultan Alsawaf
  Cc: Michal Hocko, Greg Kroah-Hartman, Arve Hjønnevåg,
	Todd Kjos, Martijn Coenen, Joel Fernandes, Christian Brauner,
	Ingo Molnar, Peter Zijlstra, LKML, devel, linux-mm, Tim Murray

On Mon, Mar 11, 2019 at 1:46 PM Sultan Alsawaf <sultan@kerneltoast.com> wrote:
>
> On Mon, Mar 11, 2019 at 01:10:36PM -0700, Suren Baghdasaryan wrote:
> > The idea seems interesting although I need to think about this a bit
> > more. Killing processes based on failed page allocation might backfire
> > during transient spikes in memory usage.
>
> This issue could be alleviated if tasks could be killed and have their pages
> reaped faster. Currently, Linux takes a _very_ long time to free a task's memory
> after an initial privileged SIGKILL is sent to a task, even with the task's
> priority being set to the highest possible (so unwanted scheduler preemption
> starving dying tasks of CPU time is not the issue at play here). I've
> frequently measured the difference in time between when a SIGKILL is sent for a
> task and when free_task() is called for that task to be hundreds of
> milliseconds, which is incredibly long. AFAIK, this is a problem that LMKD
> suffers from as well, and perhaps any OOM killer implementation in Linux, since
> you cannot evaluate effect you've had on memory pressure by killing a process
> for at least several tens of milliseconds.

Yeah, killing speed is a well-known problem which we are considering
in LMKD. For example the recent LMKD change to assign process being
killed to a cpuset cgroup containing big cores cuts the kill time
considerably. This is not ideal and we are thinking about better ways
to expedite the cleanup process.

> > AFAIKT the biggest issue with using this approach in userspace is that
> > it's not practically implementable without heavy in-kernel support.
> > How to implement such interaction between kernel and userspace would
> > be an interesting discussion which I would be happy to participate in.
>
> You could signal a lightweight userspace process that has maximum scheduler
> priority and have it kill the tasks it'd like.

This is what LMKD currently is - a userspace RT process.
My point was that this page allocation queue that you implemented
can't be implemented in userspace, at least not without extensive
communication with kernel.

> Thanks,
> Sultan

Thanks,
Suren.

^ permalink raw reply	[flat|nested] 113+ messages in thread

* Re: [RFC] simple_lmk: Introduce Simple Low Memory Killer for Android
  2019-03-11 22:15         ` Suren Baghdasaryan
@ 2019-03-11 22:36           ` Sultan Alsawaf
  2019-03-12  8:05           ` Michal Hocko
  1 sibling, 0 replies; 113+ messages in thread
From: Sultan Alsawaf @ 2019-03-11 22:36 UTC (permalink / raw)
  To: Suren Baghdasaryan
  Cc: Michal Hocko, Greg Kroah-Hartman, Arve Hjønnevåg,
	Todd Kjos, Martijn Coenen, Joel Fernandes, Christian Brauner,
	Ingo Molnar, Peter Zijlstra, LKML, devel, linux-mm, Tim Murray

On Mon, Mar 11, 2019 at 03:15:35PM -0700, Suren Baghdasaryan wrote:
> This what LMKD currently is - a userspace RT process.
> My point was that this page allocation queue that you implemented
> can't be implemented in userspace, at least not without extensive
> communication with kernel.

Oh, that's easy to address. My page allocation queue and the decision on when to
kill a process are orthogonal. In fact, the page allocation queue could be
touched up a bit to factor in the issues Michal mentioned, and it can be
implemented as an improvement to the existing OOM killer. The point of it is
just to ensure that page allocation requests that have gone OOM are given
priority over other allocation requests when free pages start to trickle in.

Userspace doesn't need to know about the page allocation queue, and the queue is
not necessary to implement the method of determining when to kill processes that
I've proposed. It's an optimization, not a necessity.

Thanks,
Sultan

^ permalink raw reply	[flat|nested] 113+ messages in thread

* Re: [RFC] simple_lmk: Introduce Simple Low Memory Killer for Android
  2019-03-11 22:15         ` Suren Baghdasaryan
  2019-03-11 22:36           ` Sultan Alsawaf
@ 2019-03-12  8:05           ` Michal Hocko
  2019-03-12 14:36             ` Suren Baghdasaryan
                               ` (2 more replies)
  1 sibling, 3 replies; 113+ messages in thread
From: Michal Hocko @ 2019-03-12  8:05 UTC (permalink / raw)
  To: Suren Baghdasaryan
  Cc: Sultan Alsawaf, Greg Kroah-Hartman, Arve Hjønnevåg,
	Todd Kjos, Martijn Coenen, Joel Fernandes, Christian Brauner,
	Ingo Molnar, Peter Zijlstra, LKML, devel, linux-mm, Tim Murray

On Mon 11-03-19 15:15:35, Suren Baghdasaryan wrote:
> On Mon, Mar 11, 2019 at 1:46 PM Sultan Alsawaf <sultan@kerneltoast.com> wrote:
> >
> > On Mon, Mar 11, 2019 at 01:10:36PM -0700, Suren Baghdasaryan wrote:
> > > The idea seems interesting although I need to think about this a bit
> > > more. Killing processes based on failed page allocation might backfire
> > > during transient spikes in memory usage.
> >
> > This issue could be alleviated if tasks could be killed and have their pages
> > reaped faster. Currently, Linux takes a _very_ long time to free a task's memory
> > after an initial privileged SIGKILL is sent to a task, even with the task's
> > priority being set to the highest possible (so unwanted scheduler preemption
> > starving dying tasks of CPU time is not the issue at play here). I've
> > frequently measured the difference in time between when a SIGKILL is sent for a
> > task and when free_task() is called for that task to be hundreds of
> > milliseconds, which is incredibly long. AFAIK, this is a problem that LMKD
> > suffers from as well, and perhaps any OOM killer implementation in Linux, since
> > you cannot evaluate effect you've had on memory pressure by killing a process
> > for at least several tens of milliseconds.
> 
> Yeah, killing speed is a well-known problem which we are considering
> in LMKD. For example the recent LMKD change to assign process being
> killed to a cpuset cgroup containing big cores cuts the kill time
> considerably. This is not ideal and we are thinking about better ways
> to expedite the cleanup process.

If your design relies on the speed of killing then it is fundamentally
flawed AFAICT. You cannot assume anything about how quickly a task dies.
It might be blocked in an uninterruptible sleep or performing an
operation which takes some time. Sure, oom_reaper might help here but
still.

The only way to control the OOM behavior pro-actively is to throttle
allocation speed. We have memcg high limit for that purpose. Along with
PSI, I can imagine a reasonably working user space early oom
notifications and reasonable acting upon that.
-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 113+ messages in thread

* Re: [RFC] simple_lmk: Introduce Simple Low Memory Killer for Android
  2019-03-12  8:05           ` Michal Hocko
@ 2019-03-12 14:36             ` Suren Baghdasaryan
  2019-03-12 15:25             ` Matthew Wilcox
  2019-03-12 16:37             ` Sultan Alsawaf
  2 siblings, 0 replies; 113+ messages in thread
From: Suren Baghdasaryan @ 2019-03-12 14:36 UTC (permalink / raw)
  To: Michal Hocko
  Cc: Sultan Alsawaf, Greg Kroah-Hartman, Arve Hjønnevåg,
	Todd Kjos, Martijn Coenen, Joel Fernandes, Christian Brauner,
	Ingo Molnar, Peter Zijlstra, LKML, devel, linux-mm, Tim Murray

On Tue, Mar 12, 2019 at 1:05 AM Michal Hocko <mhocko@kernel.org> wrote:
>
> On Mon 11-03-19 15:15:35, Suren Baghdasaryan wrote:
> > On Mon, Mar 11, 2019 at 1:46 PM Sultan Alsawaf <sultan@kerneltoast.com> wrote:
> > >
> > > On Mon, Mar 11, 2019 at 01:10:36PM -0700, Suren Baghdasaryan wrote:
> > > > The idea seems interesting although I need to think about this a bit
> > > > more. Killing processes based on failed page allocation might backfire
> > > > during transient spikes in memory usage.
> > >
> > > This issue could be alleviated if tasks could be killed and have their pages
> > > reaped faster. Currently, Linux takes a _very_ long time to free a task's memory
> > > after an initial privileged SIGKILL is sent to a task, even with the task's
> > > priority being set to the highest possible (so unwanted scheduler preemption
> > > starving dying tasks of CPU time is not the issue at play here). I've
> > > frequently measured the difference in time between when a SIGKILL is sent for a
> > > task and when free_task() is called for that task to be hundreds of
> > > milliseconds, which is incredibly long. AFAIK, this is a problem that LMKD
> > > suffers from as well, and perhaps any OOM killer implementation in Linux, since
> > > you cannot evaluate effect you've had on memory pressure by killing a process
> > > for at least several tens of milliseconds.
> >
> > Yeah, killing speed is a well-known problem which we are considering
> > in LMKD. For example the recent LMKD change to assign process being
> > killed to a cpuset cgroup containing big cores cuts the kill time
> > considerably. This is not ideal and we are thinking about better ways
> > to expedite the cleanup process.
>
> If you design is relies on the speed of killing then it is fundamentally
> flawed AFAICT. You cannot assume anything about how quickly a task dies.
> It might be blocked in an uninterruptible sleep or performin an
> operation which takes some time. Sure, oom_reaper might help here but
> still.

That's what I was considering. This is not a silver bullet but
increased speed would not hurt.

> The only way to control the OOM behavior pro-actively is to throttle
> allocation speed. We have memcg high limit for that purpose. Along with
> PSI, I can imagine a reasonably working user space early oom
> notifications and reasonable acting upon that.

That makes sense and we are working in this direction.

> --
> Michal Hocko
> SUSE Labs

Thanks,
Suren.

^ permalink raw reply	[flat|nested] 113+ messages in thread

* Re: [RFC] simple_lmk: Introduce Simple Low Memory Killer for Android
  2019-03-12  8:05           ` Michal Hocko
  2019-03-12 14:36             ` Suren Baghdasaryan
@ 2019-03-12 15:25             ` Matthew Wilcox
  2019-03-12 15:33               ` Michal Hocko
  2019-03-12 16:37             ` Sultan Alsawaf
  2 siblings, 1 reply; 113+ messages in thread
From: Matthew Wilcox @ 2019-03-12 15:25 UTC (permalink / raw)
  To: Michal Hocko
  Cc: Suren Baghdasaryan, Sultan Alsawaf, Greg Kroah-Hartman,
	Arve Hjønnevåg, Todd Kjos, Martijn Coenen,
	Joel Fernandes, Christian Brauner, Ingo Molnar, Peter Zijlstra,
	LKML, devel, linux-mm, Tim Murray

On Tue, Mar 12, 2019 at 09:05:32AM +0100, Michal Hocko wrote:
> On Mon 11-03-19 15:15:35, Suren Baghdasaryan wrote:
> > Yeah, killing speed is a well-known problem which we are considering
> > in LMKD. For example the recent LMKD change to assign process being
> > killed to a cpuset cgroup containing big cores cuts the kill time
> > considerably. This is not ideal and we are thinking about better ways
> > to expedite the cleanup process.
> 
> If you design is relies on the speed of killing then it is fundamentally
> flawed AFAICT. You cannot assume anything about how quickly a task dies.
> It might be blocked in an uninterruptible sleep or performin an
> operation which takes some time. Sure, oom_reaper might help here but
> still.

Many UNINTERRUPTIBLE sleeps can be converted to KILLABLE sleeps.  It just
needs someone to do the work.

^ permalink raw reply	[flat|nested] 113+ messages in thread

* Re: [RFC] simple_lmk: Introduce Simple Low Memory Killer for Android
  2019-03-12 15:25             ` Matthew Wilcox
@ 2019-03-12 15:33               ` Michal Hocko
  2019-03-12 15:39                 ` Michal Hocko
  0 siblings, 1 reply; 113+ messages in thread
From: Michal Hocko @ 2019-03-12 15:33 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: Suren Baghdasaryan, Sultan Alsawaf, Greg Kroah-Hartman,
	Arve Hjønnevåg, Todd Kjos, Martijn Coenen,
	Joel Fernandes, Christian Brauner, Ingo Molnar, Peter Zijlstra,
	LKML, devel, linux-mm, Tim Murray

On Tue 12-03-19 08:25:41, Matthew Wilcox wrote:
> On Tue, Mar 12, 2019 at 09:05:32AM +0100, Michal Hocko wrote:
> > On Mon 11-03-19 15:15:35, Suren Baghdasaryan wrote:
> > > Yeah, killing speed is a well-known problem which we are considering
> > > in LMKD. For example the recent LMKD change to assign process being
> > > killed to a cpuset cgroup containing big cores cuts the kill time
> > > considerably. This is not ideal and we are thinking about better ways
> > > to expedite the cleanup process.
> > 
> > If you design is relies on the speed of killing then it is fundamentally
> > flawed AFAICT. You cannot assume anything about how quickly a task dies.
> > It might be blocked in an uninterruptible sleep or performin an
> > operation which takes some time. Sure, oom_reaper might help here but
> > still.
> 
> Many UNINTERRUPTIBLE sleeps can be converted to KILLABLE sleeps.  It just
> needs someone to do the work.

They can and should as much as possible. No question about that. But not
all of them can and that is why nobody should be relying on that. That
is the whole point of having the oom_reaper and async oom victim tear
down.

-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 113+ messages in thread

* Re: [RFC] simple_lmk: Introduce Simple Low Memory Killer for Android
  2019-03-12 15:33               ` Michal Hocko
@ 2019-03-12 15:39                 ` Michal Hocko
  0 siblings, 0 replies; 113+ messages in thread
From: Michal Hocko @ 2019-03-12 15:39 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: Suren Baghdasaryan, Sultan Alsawaf, Greg Kroah-Hartman,
	Arve Hjønnevåg, Todd Kjos, Martijn Coenen,
	Joel Fernandes, Christian Brauner, Ingo Molnar, Peter Zijlstra,
	LKML, devel, linux-mm, Tim Murray

On Tue 12-03-19 16:33:15, Michal Hocko wrote:
> On Tue 12-03-19 08:25:41, Matthew Wilcox wrote:
> > On Tue, Mar 12, 2019 at 09:05:32AM +0100, Michal Hocko wrote:
> > > On Mon 11-03-19 15:15:35, Suren Baghdasaryan wrote:
> > > > Yeah, killing speed is a well-known problem which we are considering
> > > > in LMKD. For example the recent LMKD change to assign process being
> > > > killed to a cpuset cgroup containing big cores cuts the kill time
> > > > considerably. This is not ideal and we are thinking about better ways
> > > > to expedite the cleanup process.
> > > 
> > > If you design is relies on the speed of killing then it is fundamentally
> > > flawed AFAICT. You cannot assume anything about how quickly a task dies.
> > > It might be blocked in an uninterruptible sleep or performin an
> > > operation which takes some time. Sure, oom_reaper might help here but
> > > still.
> > 
> > Many UNINTERRUPTIBLE sleeps can be converted to KILLABLE sleeps.  It just
> > needs someone to do the work.
> 
> They can and should as much as possible. No question about that. But not
> all of them can and that is why nobody should be relying on that. That
> is the whole point of having the oom_reaper and async oom victim tear
> down.

Let me clarify a bit. LMK obviously doesn't need any guarantee like the
core oom killer because it is more of a pro-active measure than the last
resort. I merely wanted to say that relying on a design which assumes
anything about the time a victim needs to exit is flawed and it will fail
under different workloads. On the other hand this might work well enough
on very specific workloads to be usable. I am not questioning that. The
point is that this is not generic enough to be accepted into the upstream
kernel.
-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 113+ messages in thread

* Re: [RFC] simple_lmk: Introduce Simple Low Memory Killer for Android
  2019-03-12  8:05           ` Michal Hocko
  2019-03-12 14:36             ` Suren Baghdasaryan
  2019-03-12 15:25             ` Matthew Wilcox
@ 2019-03-12 16:37             ` Sultan Alsawaf
  2019-03-12 16:48               ` Michal Hocko
                                 ` (2 more replies)
  2 siblings, 3 replies; 113+ messages in thread
From: Sultan Alsawaf @ 2019-03-12 16:37 UTC (permalink / raw)
  To: Michal Hocko
  Cc: Suren Baghdasaryan, Greg Kroah-Hartman, Arve Hjønnevåg,
	Todd Kjos, Martijn Coenen, Joel Fernandes, Christian Brauner,
	Ingo Molnar, Peter Zijlstra, LKML, devel, linux-mm, Tim Murray

On Tue, Mar 12, 2019 at 09:05:32AM +0100, Michal Hocko wrote:
> The only way to control the OOM behavior pro-actively is to throttle
> allocation speed. We have memcg high limit for that purpose. Along with
> PSI, I can imagine a reasonably working user space early oom
> notifications and reasonable acting upon that.

The issue with pro-active memory management that prompted me to create this was
poor memory utilization. All of the alternative means of reclaiming pages in the
page allocator's slow path turn out to be very useful for maximizing memory
utilization, which is something that we would have to forgo by relying on a
purely pro-active solution. I have not had a chance to look at PSI yet, but
unless a PSI-enabled solution allows allocations to reach the same point as when
the OOM killer is invoked (which is contradictory to what it sets out to do),
then it cannot take advantage of all of the alternative memory-reclaim means
employed in the slowpath, and will result in killing a process before it is
_really_ necessary.

> If you design is relies on the speed of killing then it is fundamentally
> flawed AFAICT. You cannot assume anything about how quickly a task dies.
> It might be blocked in an uninterruptible sleep or performin an
> operation which takes some time. Sure, oom_reaper might help here but
> still.

In theory we could instantly zap any process that is not trapped in the kernel
at the time that the OOM killer is invoked without any consequences though, no?

Thanks,
Sultan

^ permalink raw reply	[flat|nested] 113+ messages in thread

* Re: [RFC] simple_lmk: Introduce Simple Low Memory Killer for Android
  2019-03-12 16:37             ` Sultan Alsawaf
@ 2019-03-12 16:48               ` Michal Hocko
  2019-03-12 16:58               ` Michal Hocko
  2019-03-12 17:17               ` Tim Murray
  2 siblings, 0 replies; 113+ messages in thread
From: Michal Hocko @ 2019-03-12 16:48 UTC (permalink / raw)
  To: Sultan Alsawaf
  Cc: Suren Baghdasaryan, Greg Kroah-Hartman, Arve Hjønnevåg,
	Todd Kjos, Martijn Coenen, Joel Fernandes, Christian Brauner,
	Ingo Molnar, Peter Zijlstra, LKML, devel, linux-mm, Tim Murray

On Tue 12-03-19 09:37:41, Sultan Alsawaf wrote:
> On Tue, Mar 12, 2019 at 09:05:32AM +0100, Michal Hocko wrote:
> > The only way to control the OOM behavior pro-actively is to throttle
> > allocation speed. We have memcg high limit for that purpose. Along with
> > PSI, I can imagine a reasonably working user space early oom
> > notifications and reasonable acting upon that.
> 
> The issue with pro-active memory management that prompted me to create this was
> poor memory utilization. All of the alternative means of reclaiming pages in the
> page allocator's slow path turn out to be very useful for maximizing memory
> utilization, which is something that we would have to forgo by relying on a
> purely pro-active solution. I have not had a chance to look at PSI yet, but
> unless a PSI-enabled solution allows allocations to reach the same point as when
> the OOM killer is invoked (which is contradictory to what it sets out to do),
> then it cannot take advantage of all of the alternative memory-reclaim means
> employed in the slowpath, and will result in killing a process before it is
> _really_ necessary.

If you really want to reach the real OOM situation then you can very
well rely on the in-kernel OOM killer. The only reason you want a
customized oom killer is the task classification. And that is a
different story. User space hints on the victim selection have been a
topic for quite a while. It never got to any conclusion as interested
parties have always lost interest because it got hairy quickly.

> > If you design is relies on the speed of killing then it is fundamentally
> > flawed AFAICT. You cannot assume anything about how quickly a task dies.
> > It might be blocked in an uninterruptible sleep or performin an
> > operation which takes some time. Sure, oom_reaper might help here but
> > still.
> 
> In theory we could instantly zap any process that is not trapped in the kernel
> at the time that the OOM killer is invoked without any consequences though, no?

No, this is not so simple. Have a look at the oom_reaper and the hoops it
has to jump through.
-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 113+ messages in thread

* Re: [RFC] simple_lmk: Introduce Simple Low Memory Killer for Android
  2019-03-12 16:37             ` Sultan Alsawaf
  2019-03-12 16:48               ` Michal Hocko
@ 2019-03-12 16:58               ` Michal Hocko
  2019-03-12 17:15                 ` Suren Baghdasaryan
  2019-03-12 17:17               ` Tim Murray
  2 siblings, 1 reply; 113+ messages in thread
From: Michal Hocko @ 2019-03-12 16:58 UTC (permalink / raw)
  To: Sultan Alsawaf
  Cc: Suren Baghdasaryan, Greg Kroah-Hartman, Arve Hjønnevåg,
	Todd Kjos, Martijn Coenen, Joel Fernandes, Christian Brauner,
	Ingo Molnar, Peter Zijlstra, LKML, devel, linux-mm, Tim Murray

On Tue 12-03-19 09:37:41, Sultan Alsawaf wrote:
> I have not had a chance to look at PSI yet, but
> unless a PSI-enabled solution allows allocations to reach the same point as when
> the OOM killer is invoked (which is contradictory to what it sets out to do),
> then it cannot take advantage of all of the alternative memory-reclaim means
> employed in the slowpath, and will result in killing a process before it is
> _really_ necessary.

One more note. The above is true, but you can also hit one of the
thrashing reclaim behaviors and reclaim last few pages again and again
with the whole system really sluggish. That is what PSI is trying to
help with.
-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 113+ messages in thread

* Re: [RFC] simple_lmk: Introduce Simple Low Memory Killer for Android
  2019-03-12 16:58               ` Michal Hocko
@ 2019-03-12 17:15                 ` Suren Baghdasaryan
  0 siblings, 0 replies; 113+ messages in thread
From: Suren Baghdasaryan @ 2019-03-12 17:15 UTC (permalink / raw)
  To: Michal Hocko
  Cc: Sultan Alsawaf, Greg Kroah-Hartman, Arve Hjønnevåg,
	Todd Kjos, Martijn Coenen, Joel Fernandes, Christian Brauner,
	Ingo Molnar, Peter Zijlstra, LKML, devel, linux-mm, Tim Murray

On Tue, Mar 12, 2019 at 9:58 AM Michal Hocko <mhocko@kernel.org> wrote:
>
> On Tue 12-03-19 09:37:41, Sultan Alsawaf wrote:
> > I have not had a chance to look at PSI yet, but
> > unless a PSI-enabled solution allows allocations to reach the same point as when
> > the OOM killer is invoked (which is contradictory to what it sets out to do),

LMK's job is to relieve memory pressure before we reach the boiling
point at which OOM killer has to be invoked. If we wait that long it
will definitely affect user experience. There might be usecases when
you might not care about this but on interactive systems like Android
it is important.

> > then it cannot take advantage of all of the alternative memory-reclaim means
> > employed in the slowpath, and will result in killing a process before it is
> > _really_ necessary.

I guess it's a matter of defining when is it _really_ necessary to
kill. In Android's case that should be when the user starts suffering
from the delays caused by memory contention and that delay is exactly
what PSI is measuring.

> One more note. The above is true, but you can also hit one of the
> thrashing reclaim behaviors and reclaim last few pages again and again
> with the whole system really sluggish. That is what PSI is trying to
> help with.
> --
> Michal Hocko
> SUSE Labs

^ permalink raw reply	[flat|nested] 113+ messages in thread

* Re: [RFC] simple_lmk: Introduce Simple Low Memory Killer for Android
  2019-03-12 16:37             ` Sultan Alsawaf
  2019-03-12 16:48               ` Michal Hocko
  2019-03-12 16:58               ` Michal Hocko
@ 2019-03-12 17:17               ` Tim Murray
  2019-03-12 17:45                 ` Sultan Alsawaf
  2019-03-14 17:47                 ` Joel Fernandes
  2 siblings, 2 replies; 113+ messages in thread
From: Tim Murray @ 2019-03-12 17:17 UTC (permalink / raw)
  To: Sultan Alsawaf
  Cc: Michal Hocko, Suren Baghdasaryan, Greg Kroah-Hartman,
	Arve Hjønnevåg, Todd Kjos, Martijn Coenen,
	Joel Fernandes, Christian Brauner, Ingo Molnar, Peter Zijlstra,
	LKML, devel, linux-mm

On Tue, Mar 12, 2019 at 9:37 AM Sultan Alsawaf <sultan@kerneltoast.com> wrote:
>
> On Tue, Mar 12, 2019 at 09:05:32AM +0100, Michal Hocko wrote:
> > The only way to control the OOM behavior pro-actively is to throttle
> > allocation speed. We have memcg high limit for that purpose. Along with
> > PSI, I can imagine a reasonably working user space early oom
> > notifications and reasonable acting upon that.
>
> The issue with pro-active memory management that prompted me to create this was
> poor memory utilization. All of the alternative means of reclaiming pages in the
> page allocator's slow path turn out to be very useful for maximizing memory
> utilization, which is something that we would have to forgo by relying on a
> purely pro-active solution. I have not had a chance to look at PSI yet, but
> unless a PSI-enabled solution allows allocations to reach the same point as when
> the OOM killer is invoked (which is contradictory to what it sets out to do),
> then it cannot take advantage of all of the alternative memory-reclaim means
> employed in the slowpath, and will result in killing a process before it is
> _really_ necessary.

There are two essential parts of a lowmemorykiller implementation:
when to kill and how to kill.

There are a million possible approaches to decide when to kill an
unimportant process. They usually trade off between the same two
failure modes depending on the workload.

If you kill too aggressively, a transient spike that could be
imperceptibly absorbed by evicting some file pages or moving some
pages to ZRAM will result in killing processes, which then get started
up later and have a performance/battery cost.

If you don't kill aggressively enough, you will encounter a workload
that thrashes the page cache, constantly evicting and reloading file
pages and moving things in and out of ZRAM, which makes the system
unusable when a process should have been killed instead.

As far as I've seen, any methodology that uses single points in time
to decide when to kill without completely biasing toward one or the
other is susceptible to both. The minfree approach used by
lowmemorykiller/lmkd certainly is; it is both too aggressive for some
workloads and not aggressive enough for other workloads. My guess is
that simple LMK won't kill on transient spikes but will be extremely
susceptible to page cache thrashing. This is not an improvement; page
cache thrashing manifests as the entire system running very slowly.

What you actually want from lowmemorykiller/lmkd on Android is to only
kill once it becomes clear that the system will continue to try to
reclaim memory to the extent that it could impact what the user
actually cares about. That means tracking how much time is spent in
reclaim/paging operations and the like, and that's exactly what PSI
does. lmkd has had support for using PSI as a replacement for
vmpressure for use as a wakeup trigger (to check current memory levels
against the minfree thresholds) since early February. It works fine;
unsurprisingly it's better than vmpressure at avoiding false wakeups.

Longer term, there's a lot of work to be done in lmkd to turn PSI into
a kill trigger and remove minfree entirely. It's tricky (mainly
because of the "when to kill another process" problem discussed
later), but I believe it's feasible.

How to kill is similarly messy. The latency of reclaiming memory post
SIGKILL can be severe (usually tens of milliseconds, occasionally
>100ms). The latency we see on Android usually isn't because those
threads are blocked in uninterruptible sleep, it's because times of
memory pressure are also usually times of significant CPU contention
and these are overwhelmingly CFS threads, some of which may be
assigned a very low priority. lmkd now sets priorities and resets
cpusets upon killing a process, and we have seen improved reclaim
latency because of this. oom reaper might be a good approach to avoid
this latency (I think some in-kernel lowmemorykiller implementations
rely on it), but we can't use it from userspace. Something for future
consideration.

A non-obvious consequence of both of these concerns is that when to
kill a second process is a distinct and more difficult problem than
when to kill the first. A second process should be killed if reclaim
from the first process has finished and there has been insufficient
memory reclaimed to avoid perceptible impact. Identifying whether
memory pressure continues at the same level can probably be handled
through multiple PSI monitors with different thresholds and window
lengths, but this is an area of future work.

Knowing whether a SIGKILL'd process has finished reclaiming is as far
as I know not possible without something like procfds. That's where
the 100ms timeout in lmkd comes in. lowmemorykiller and lmkd both
attempt to wait up to 100ms for reclaim to finish by checking for the
continued existence of the thread that received the SIGKILL, but this
really means that they wait up to 100ms for the _thread_ to finish,
which doesn't tell you anything about the memory used by that process.
If those threads terminate early and lowmemorykiller/lmkd get a signal
to kill again, then there may be two processes competing for CPU time
to reclaim memory. That doesn't reclaim any faster and may be an
unnecessary kill.

So, in summary, the impactful LMK improvements seem like

- get lmkd and PSI to the point that lmkd can use PSI signals as a
kill trigger and remove all static memory thresholds from lmkd
completely. I think this is mostly on the lmkd side, but there may be
some PSI or PSI monitor changes that would help
- give userspace some path to start reclaiming memory without waiting
for every thread in a process to be scheduled--could be oom reaper,
could be something else
- offer a way to wait for process termination so lmkd can tell when
reclaim has finished and know when killing another process is
appropriate

^ permalink raw reply	[flat|nested] 113+ messages in thread

* Re: [RFC] simple_lmk: Introduce Simple Low Memory Killer for Android
  2019-03-12 17:17               ` Tim Murray
@ 2019-03-12 17:45                 ` Sultan Alsawaf
  2019-03-12 18:43                   ` Tim Murray
  2019-03-14 17:47                 ` Joel Fernandes
  1 sibling, 1 reply; 113+ messages in thread
From: Sultan Alsawaf @ 2019-03-12 17:45 UTC (permalink / raw)
  To: Tim Murray
  Cc: Michal Hocko, Suren Baghdasaryan, Greg Kroah-Hartman,
	Arve Hjønnevåg, Todd Kjos, Martijn Coenen,
	Joel Fernandes, Christian Brauner, Ingo Molnar, Peter Zijlstra,
	LKML, devel, linux-mm

On Tue, Mar 12, 2019 at 10:17:43AM -0700, Tim Murray wrote:
> Knowing whether a SIGKILL'd process has finished reclaiming is as far
> as I know not possible without something like procfds. That's where
> the 100ms timeout in lmkd comes in. lowmemorykiller and lmkd both
> attempt to wait up to 100ms for reclaim to finish by checking for the
> continued existence of the thread that received the SIGKILL, but this
> really means that they wait up to 100ms for the _thread_ to finish,
> which doesn't tell you anything about the memory used by that process.
> If those threads terminate early and lowmemorykiller/lmkd get a signal
> to kill again, then there may be two processes competing for CPU time
> to reclaim memory. That doesn't reclaim any faster and may be an
> unnecessary kill.
> ...
> - offer a way to wait for process termination so lmkd can tell when
> reclaim has finished and know when killing another process is
> appropriate

Should be pretty easy with something like this:
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 1549584a1..6ac478af2 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1199,6 +1199,7 @@ struct task_struct {
 	unsigned long			lowest_stack;
 	unsigned long			prev_lowest_stack;
 #endif
+	ktime_t sigkill_time;
 
 	/*
 	 * New fields for task_struct should be added above here, so that
diff --git a/kernel/fork.c b/kernel/fork.c
index 9dcd18aa2..0ae182777 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -435,6 +435,8 @@ void put_task_stack(struct task_struct *tsk)
 
 void free_task(struct task_struct *tsk)
 {
+	ktime_t sigkill_time = tsk->sigkill_time;
+	pid_t pid = tsk->pid;
 #ifndef CONFIG_THREAD_INFO_IN_TASK
 	/*
 	 * The task is finally done with both the stack and thread_info,
@@ -455,6 +457,9 @@ void free_task(struct task_struct *tsk)
 	if (tsk->flags & PF_KTHREAD)
 		free_kthread_struct(tsk);
 	free_task_struct(tsk);
+	if (sigkill_time)
+		printk("%d killed after %lld us\n", pid,
+		       ktime_us_delta(ktime_get(), sigkill_time));
 }
 EXPORT_SYMBOL(free_task);
 
@@ -1881,6 +1886,7 @@ static __latent_entropy struct task_struct *copy_process(
 	p->sequential_io	= 0;
 	p->sequential_io_avg	= 0;
 #endif
+	p->sigkill_time = 0;
 
 	/* Perform scheduler related setup. Assign this task to a CPU. */
 	retval = sched_fork(clone_flags, p);
diff --git a/kernel/signal.c b/kernel/signal.c
index 5d53183e2..1142c8811 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1168,6 +1168,8 @@ static int __send_signal(int sig, struct kernel_siginfo *info, struct task_struc
 	}
 
 out_set:
+	if (sig == SIGKILL)
+		t->sigkill_time = ktime_get();
 	signalfd_notify(t, sig);
 	sigaddset(&pending->signal, sig);

^ permalink raw reply related	[flat|nested] 113+ messages in thread

* Re: [RFC] simple_lmk: Introduce Simple Low Memory Killer for Android
  2019-03-12 17:45                 ` Sultan Alsawaf
@ 2019-03-12 18:43                   ` Tim Murray
  2019-03-12 18:50                     ` Christian Brauner
  0 siblings, 1 reply; 113+ messages in thread
From: Tim Murray @ 2019-03-12 18:43 UTC (permalink / raw)
  To: Sultan Alsawaf
  Cc: Michal Hocko, Suren Baghdasaryan, Greg Kroah-Hartman,
	Arve Hjønnevåg, Todd Kjos, Martijn Coenen,
	Joel Fernandes, Christian Brauner, Ingo Molnar, Peter Zijlstra,
	LKML, devel, linux-mm

On Tue, Mar 12, 2019 at 10:45 AM Sultan Alsawaf <sultan@kerneltoast.com> wrote:
>
> On Tue, Mar 12, 2019 at 10:17:43AM -0700, Tim Murray wrote:
> > Knowing whether a SIGKILL'd process has finished reclaiming is as far
> > as I know not possible without something like procfds. That's where
> > the 100ms timeout in lmkd comes in. lowmemorykiller and lmkd both
> > attempt to wait up to 100ms for reclaim to finish by checking for the
> > continued existence of the thread that received the SIGKILL, but this
> > really means that they wait up to 100ms for the _thread_ to finish,
> > which doesn't tell you anything about the memory used by that process.
> > If those threads terminate early and lowmemorykiller/lmkd get a signal
> > to kill again, then there may be two processes competing for CPU time
> > to reclaim memory. That doesn't reclaim any faster and may be an
> > unnecessary kill.
> > ...
> > - offer a way to wait for process termination so lmkd can tell when
> > reclaim has finished and know when killing another process is
> > appropriate
>
> Should be pretty easy with something like this:

Yeah, that's in the spirit of what I was suggesting, but there are a lot
of edge cases around how to get that data out efficiently and PID
reuse (it's a real issue--often the Android apps that are causing
memory pressure are also constantly creating/destroying threads).

I believe procfds or a similar mechanism will be a good solution to this.

^ permalink raw reply	[flat|nested] 113+ messages in thread

* Re: [RFC] simple_lmk: Introduce Simple Low Memory Killer for Android
  2019-03-12 18:43                   ` Tim Murray
@ 2019-03-12 18:50                     ` Christian Brauner
  0 siblings, 0 replies; 113+ messages in thread
From: Christian Brauner @ 2019-03-12 18:50 UTC (permalink / raw)
  To: Tim Murray
  Cc: Sultan Alsawaf, Michal Hocko, Suren Baghdasaryan,
	Greg Kroah-Hartman, Arve Hjønnevåg, Todd Kjos,
	Martijn Coenen, Joel Fernandes, Ingo Molnar, Peter Zijlstra,
	LKML, linux-drivers, linux-mm

On Tue, Mar 12, 2019 at 7:43 PM Tim Murray <timmurray@google.com> wrote:
>
> On Tue, Mar 12, 2019 at 10:45 AM Sultan Alsawaf <sultan@kerneltoast.com> wrote:
> >
> > On Tue, Mar 12, 2019 at 10:17:43AM -0700, Tim Murray wrote:
> > > Knowing whether a SIGKILL'd process has finished reclaiming is as far
> > > as I know not possible without something like procfds. That's where
> > > the 100ms timeout in lmkd comes in. lowmemorykiller and lmkd both
> > > attempt to wait up to 100ms for reclaim to finish by checking for the
> > > continued existence of the thread that received the SIGKILL, but this
> > > really means that they wait up to 100ms for the _thread_ to finish,
> > > which doesn't tell you anything about the memory used by that process.
> > > If those threads terminate early and lowmemorykiller/lmkd get a signal
> > > to kill again, then there may be two processes competing for CPU time
> > > to reclaim memory. That doesn't reclaim any faster and may be an
> > > unnecessary kill.
> > > ...
> > > - offer a way to wait for process termination so lmkd can tell when
> > > reclaim has finished and know when killing another process is
> > > appropriate
> >
> > Should be pretty easy with something like this:
>
> Yeah, that's in the spirit of what I was suggesting, but there are lot
> of edge cases around how to get that data out efficiently and PID
> reuse (it's a real issue--often the Android apps that are causing
> memory pressure are also constantly creating/destroying threads).
>
> I believe procfds or a similar mechanism will be a good solution to this.

Fwiw, I am working on this and have sent a PR for inclusion in 5.1:
https://lore.kernel.org/lkml/20190312135245.27591-1-christian@brauner.io/
There's also a tree to track this work.

Christian

^ permalink raw reply	[flat|nested] 113+ messages in thread

* Re: [RFC] simple_lmk: Introduce Simple Low Memory Killer for Android
  2019-03-12 17:17               ` Tim Murray
  2019-03-12 17:45                 ` Sultan Alsawaf
@ 2019-03-14 17:47                 ` Joel Fernandes
  2019-03-14 20:49                   ` Sultan Alsawaf
  1 sibling, 1 reply; 113+ messages in thread
From: Joel Fernandes @ 2019-03-14 17:47 UTC (permalink / raw)
  To: Tim Murray
  Cc: Sultan Alsawaf, Michal Hocko, Suren Baghdasaryan,
	Greg Kroah-Hartman, Arve Hjønnevåg, Todd Kjos,
	Martijn Coenen, Christian Brauner, Ingo Molnar, Peter Zijlstra,
	LKML, open list:ANDROID DRIVERS, linux-mm, kernel-team,
	Steven Rostedt

Hi Tim,
Thanks for the detailed and excellent write-up. It will serve as a
good future reference for low memory killer requirements. I made some
comments below on the "how to kill" part.

On Tue, Mar 12, 2019 at 10:17 AM Tim Murray <timmurray@google.com> wrote:
>
> On Tue, Mar 12, 2019 at 9:37 AM Sultan Alsawaf <sultan@kerneltoast.com> wrote:
> >
> > On Tue, Mar 12, 2019 at 09:05:32AM +0100, Michal Hocko wrote:
> > > The only way to control the OOM behavior pro-actively is to throttle
> > > allocation speed. We have memcg high limit for that purpose. Along with
> > > PSI, I can imagine a reasonably working user space early oom
> > > notifications and reasonable acting upon that.
> >
> > The issue with pro-active memory management that prompted me to create this was
> > poor memory utilization. All of the alternative means of reclaiming pages in the
> > page allocator's slow path turn out to be very useful for maximizing memory
> > utilization, which is something that we would have to forgo by relying on a
> > purely pro-active solution. I have not had a chance to look at PSI yet, but
> > unless a PSI-enabled solution allows allocations to reach the same point as when
> > the OOM killer is invoked (which is contradictory to what it sets out to do),
> > then it cannot take advantage of all of the alternative memory-reclaim means
> > employed in the slowpath, and will result in killing a process before it is
> > _really_ necessary.
>
> There are two essential parts of a lowmemorykiller implementation:
> when to kill and how to kill.
>
> There are a million possible approaches to decide when to kill an
> unimportant process. They usually trade off between the same two
> failure modes depending on the workload.
>
> If you kill too aggressively, a transient spike that could be
> imperceptibly absorbed by evicting some file pages or moving some
> pages to ZRAM will result in killing processes, which then get started
> up later and have a performance/battery cost.
>
> If you don't kill aggressively enough, you will encounter a workload
> that thrashes the page cache, constantly evicting and reloading file
> pages and moving things in and out of ZRAM, which makes the system
> unusable when a process should have been killed instead.
>
> As far as I've seen, any methodology that uses single points in time
> to decide when to kill without completely biasing toward one or the
> other is susceptible to both. The minfree approach used by
> lowmemorykiller/lmkd certainly is; it is both too aggressive for some
> workloads and not aggressive enough for other workloads. My guess is
> that simple LMK won't kill on transient spikes but will be extremely
> susceptible to page cache thrashing. This is not an improvement; page
> cache thrashing manifests as the entire system running very slowly.
>
> What you actually want from lowmemorykiller/lmkd on Android is to only
> kill once it becomes clear that the system will continue to try to
> reclaim memory to the extent that it could impact what the user
> actually cares about. That means tracking how much time is spent in
> reclaim/paging operations and the like, and that's exactly what PSI
> does. lmkd has had support for using PSI as a replacement for
> vmpressure for use as a wakeup trigger (to check current memory levels
> against the minfree thresholds) since early February. It works fine;
> unsurprisingly it's better than vmpressure at avoiding false wakeups.
>
> Longer term, there's a lot of work to be done in lmkd to turn PSI into
> a kill trigger and remove minfree entirely. It's tricky (mainly
> because of the "when to kill another process" problem discussed
> later), but I believe it's feasible.
>
> How to kill is similarly messy. The latency of reclaiming memory post
> SIGKILL can be severe (usually tens of milliseconds, occasionally
> >100ms). The latency we see on Android usually isn't because those
> threads are blocked in uninterruptible sleep, it's because times of
> memory pressure are also usually times of significant CPU contention
> and these are overwhelmingly CFS threads, some of which may be
> assigned a very low priority. lmkd now sets priorities and resets
> cpusets upon killing a process, and we have seen improved reclaim
> latency because of this. oom reaper might be a good approach to avoid
> this latency (I think some in-kernel lowmemorykiller implementations
> rely on it), but we can't use it from userspace. Something for future
> consideration.
>

This makes sense. If the process receiving the SIGKILL does not get CPU
time, then the kernel may not be able to execute the unconditional
signal handling paths in the context of the victim process to free the memory.

I don't see how proc-fds approach will solve this though. Say you have
process L (which is LMKd) which sends a SIGKILL to process V (which is
a victim). Now L sends SIGKILL to V. Unless V executes the
signal-handling code in kernel context and is scheduled at high enough
priority to get CPU time, I don't think the SIGKILL will be processed.

The exact path that the process being killed executes to free its
memory is: do_signal-> get_signal-> do_group_exit-> do_exit-> mmput.
And this needs to execute in the context of V which needs to get
CPU-time to do such execution.

So my point is to be notified of process death, you still need SIGKILL
to be processed. So you may still need to make sure the task is at a
high enough priority and the scheduler puts it on the CPU. Only *after
that* can the proc-fds notification mechanism (or whichever
notification mechanism is used) kick in.

Speaking of which I wonder if the scheduler should special case
SIGKILLed threads as higher priority automatically so that they get
CPU time, but don't know if this can cause denial of service kind of
attacks. I don't know if it does something like this already. Peter
should know this right off the bat and he is on CC so he can comment
more.

About the 100ms latency, I wonder whether it is that high because of
the way Android's lmkd is observing that a process has died. There is
a gap between when a process memory is freed and when it disappears
from the process-table.  Once a process is SIGKILLed, it becomes a
zombie. Its memory is freed instantly during the SIGKILL delivery (I
traced this so that's how I know), but until it is reaped by its
parent thread, it will still exist in /proc/<pid> . So if testing the
existence of /proc/<pid> is how Android is observing that the process
died, then there can be a large latency where it takes a very long
time for the parent to actually reap the child way after its memory
was long freed. A quicker way to know if a process's memory is freed
before it is reaped could be to read back /proc/<pid>/maps in
userspace of the victim <pid>, and that file will be empty for zombie
processes. So then one does not need to wait for the parent to reap it. I
wonder how much of that 100ms you mentioned is actually the "Waiting
while Parent is reaping the child" time, rather than "memory freeing time". So
yeah for this second problem, the procfds work will help.

By the way another approach that can provide a quick and asynchronous
notification of when the process memory is freed, is to monitor
sched_process_exit trace event using eBPF. You can tell eBPF the PID
that you want to monitor before the SIGKILL. As soon as the process
dies and its memory is freed, the eBPF program can send a notification
to user space (using the perf_events polling infra). The
sched_process_exit fires just after the mmput() happens so it is quite
close to when the memory is reclaimed. This also doesn't need any
kernel changes. I could come up with a prototype for this and
benchmark it on Android, if you want. Just let me know.

thanks,

 - Joel










> A non-obvious consequence of both of these concerns is that when to
> kill a second process is a distinct and more difficult problem than
> when to kill the first. A second process should be killed if reclaim
> from the first process has finished and there has been insufficient
> memory reclaimed to avoid perceptible impact. Identifying whether
> memory pressure continues at the same level can probably be handled
> through multiple PSI monitors with different thresholds and window
> lengths, but this is an area of future work.
>
> Knowing whether a SIGKILL'd process has finished reclaiming is as far
> as I know not possible without something like procfds. That's where
> the 100ms timeout in lmkd comes in. lowmemorykiller and lmkd both
> attempt to wait up to 100ms for reclaim to finish by checking for the
> continued existence of the thread that received the SIGKILL, but this
> really means that they wait up to 100ms for the _thread_ to finish,
> which doesn't tell you anything about the memory used by that process.
> If those threads terminate early and lowmemorykiller/lmkd get a signal
> to kill again, then there may be two processes competing for CPU time
> to reclaim memory. That doesn't reclaim any faster and may be an
> unnecessary kill.
>
> So, in summary, the impactful LMK improvements seem like
>
> - get lmkd and PSI to the point that lmkd can use PSI signals as a
> kill trigger and remove all static memory thresholds from lmkd
> completely. I think this is mostly on the lmkd side, but there may be
> some PSI or PSI monitor changes that would help
> - give userspace some path to start reclaiming memory without waiting
> for every thread in a process to be scheduled--could be oom reaper,
> could be something else
> - offer a way to wait for process termination so lmkd can tell when
> reclaim has finished and know when killing another process is
> appropriate

^ permalink raw reply	[flat|nested] 113+ messages in thread

* Re: [RFC] simple_lmk: Introduce Simple Low Memory Killer for Android
  2019-03-14 17:47                 ` Joel Fernandes
@ 2019-03-14 20:49                   ` Sultan Alsawaf
  2019-03-15  2:54                     ` Joel Fernandes
  2019-03-15  3:16                     ` Steven Rostedt
  0 siblings, 2 replies; 113+ messages in thread
From: Sultan Alsawaf @ 2019-03-14 20:49 UTC (permalink / raw)
  To: Joel Fernandes
  Cc: Tim Murray, Michal Hocko, Suren Baghdasaryan, Greg Kroah-Hartman,
	Arve Hjønnevåg, Todd Kjos, Martijn Coenen,
	Christian Brauner, Ingo Molnar, Peter Zijlstra, LKML,
	open list:ANDROID DRIVERS, linux-mm, kernel-team, Steven Rostedt

On Thu, Mar 14, 2019 at 10:47:17AM -0700, Joel Fernandes wrote:
> About the 100ms latency, I wonder whether it is that high because of
> the way Android's lmkd is observing that a process has died. There is
> a gap between when a process memory is freed and when it disappears
> from the process-table.  Once a process is SIGKILLed, it becomes a
> zombie. Its memory is freed instantly during the SIGKILL delivery (I
> traced this so that's how I know), but until it is reaped by its
> parent thread, it will still exist in /proc/<pid> . So if testing the
> existence of /proc/<pid> is how Android is observing that the process
> died, then there can be a large latency where it takes a very long
> time for the parent to actually reap the child way after its memory
> was long freed. A quicker way to know if a process's memory is freed
> before it is reaped could be to read back /proc/<pid>/maps in
> userspace of the victim <pid>, and that file will be empty for zombie
> processes. So then one does not need wait for the parent to reap it. I
> wonder how much of that 100ms you mentioned is actually the "Waiting
> while Parent is reaping the child", than "memory freeing time". So
> yeah for this second problem, the procfds work will help.
>
> By the way another approach that can provide a quick and asynchronous
> notification of when the process memory is freed, is to monitor
> sched_process_exit trace event using eBPF. You can tell eBPF the PID
> that you want to monitor before the SIGKILL. As soon as the process
> dies and its memory is freed, the eBPF program can send a notification
> to user space (using the perf_events polling infra). The
> sched_process_exit fires just after the mmput() happens so it is quite
> close to when the memory is reclaimed. This also doesn't need any
> kernel changes. I could come up with a prototype for this and
> benchmark it on Android, if you want. Just let me know.

Perhaps I'm missing something, but if you want to know when a process has died
after sending a SIGKILL to it, then why not just make the SIGKILL optionally
block until the process has died completely? It'd be rather trivial to just
store a pointer to an onstack completion inside the victim process' task_struct,
and then complete it in free_task().

Thanks,
Sultan

^ permalink raw reply	[flat|nested] 113+ messages in thread

* Re: [RFC] simple_lmk: Introduce Simple Low Memory Killer for Android
  2019-03-14 20:49                   ` Sultan Alsawaf
@ 2019-03-15  2:54                     ` Joel Fernandes
  2019-03-15  3:43                       ` Sultan Alsawaf
  2019-03-15  3:16                     ` Steven Rostedt
  1 sibling, 1 reply; 113+ messages in thread
From: Joel Fernandes @ 2019-03-15  2:54 UTC (permalink / raw)
  To: Sultan Alsawaf
  Cc: Tim Murray, Michal Hocko, Suren Baghdasaryan, Greg Kroah-Hartman,
	Arve Hjønnevåg, Todd Kjos, Martijn Coenen,
	Christian Brauner, Ingo Molnar, Peter Zijlstra, LKML,
	open list:ANDROID DRIVERS, linux-mm, kernel-team, Steven Rostedt

On Thu, Mar 14, 2019 at 01:49:11PM -0700, Sultan Alsawaf wrote:
> On Thu, Mar 14, 2019 at 10:47:17AM -0700, Joel Fernandes wrote:
> > About the 100ms latency, I wonder whether it is that high because of
> > the way Android's lmkd is observing that a process has died. There is
> > a gap between when a process memory is freed and when it disappears
> > from the process-table.  Once a process is SIGKILLed, it becomes a
> > zombie. Its memory is freed instantly during the SIGKILL delivery (I
> > traced this so that's how I know), but until it is reaped by its
> > parent thread, it will still exist in /proc/<pid> . So if testing the
> > existence of /proc/<pid> is how Android is observing that the process
> > died, then there can be a large latency where it takes a very long
> > time for the parent to actually reap the child way after its memory
> > was long freed. A quicker way to know if a process's memory is freed
> > before it is reaped could be to read back /proc/<pid>/maps in
> > userspace of the victim <pid>, and that file will be empty for zombie
> > processes. So then one does not need wait for the parent to reap it. I
> > wonder how much of that 100ms you mentioned is actually the "Waiting
> > while Parent is reaping the child", than "memory freeing time". So
> > yeah for this second problem, the procfds work will help.
> >
> > By the way another approach that can provide a quick and asynchronous
> > notification of when the process memory is freed, is to monitor
> > sched_process_exit trace event using eBPF. You can tell eBPF the PID
> > that you want to monitor before the SIGKILL. As soon as the process
> > dies and its memory is freed, the eBPF program can send a notification
> > to user space (using the perf_events polling infra). The
> > sched_process_exit fires just after the mmput() happens so it is quite
> > close to when the memory is reclaimed. This also doesn't need any
> > kernel changes. I could come up with a prototype for this and
> > benchmark it on Android, if you want. Just let me know.
> 
> Perhaps I'm missing something, but if you want to know when a process has died
> after sending a SIGKILL to it, then why not just make the SIGKILL optionally
> block until the process has died completely? It'd be rather trivial to just
> store a pointer to an onstack completion inside the victim process' task_struct,
> and then complete it in free_task().

I'm not sure if that makes much semantic sense for how the signal handling is
supposed to work. Imagine a parent sends SIGKILL to its child, and then does
a wait(2). Because the SIGKILL blocks in your idea, the wait cannot
execute, and because the wait cannot execute, the zombie task will not get
reaped and so the SIGKILL sender never gets unblocked and the whole thing
just gets locked up. No? I don't know, it just feels incorrect.

Further, in your idea adding stuff to task_struct will simply bloat it - when
this task can easily be handled using eBPF without making any kernel changes,
either by probing the sched_process_free or sched_process_exit tracepoints.
Scheduler maintainers generally frown on adding stuff to task_struct
pointlessly, and for good reason, since bloating it affects performance
etc., and something like this would probably never be ifdef'd out behind a
CONFIG.

thanks,

 - Joel


^ permalink raw reply	[flat|nested] 113+ messages in thread

* Re: [RFC] simple_lmk: Introduce Simple Low Memory Killer for Android
  2019-03-14 20:49                   ` Sultan Alsawaf
  2019-03-15  2:54                     ` Joel Fernandes
@ 2019-03-15  3:16                     ` Steven Rostedt
  2019-03-15  3:45                       ` Sultan Alsawaf
  2019-03-15  4:36                       ` Daniel Colascione
  1 sibling, 2 replies; 113+ messages in thread
From: Steven Rostedt @ 2019-03-15  3:16 UTC (permalink / raw)
  To: Sultan Alsawaf
  Cc: Joel Fernandes, Tim Murray, Michal Hocko, Suren Baghdasaryan,
	Greg Kroah-Hartman, Arve Hjønnevåg, Todd Kjos,
	Martijn Coenen, Christian Brauner, Ingo Molnar, Peter Zijlstra,
	LKML, open list:ANDROID DRIVERS, linux-mm, kernel-team

On Thu, 14 Mar 2019 13:49:11 -0700
Sultan Alsawaf <sultan@kerneltoast.com> wrote:

> Perhaps I'm missing something, but if you want to know when a process has died
> after sending a SIGKILL to it, then why not just make the SIGKILL optionally
> block until the process has died completely? It'd be rather trivial to just
> store a pointer to an onstack completion inside the victim process' task_struct,
> and then complete it in free_task().

How would you implement such a method in userspace? kill() doesn't take
any parameters but the pid of the process you want to send a signal to,
and the signal to send. This would require a new system call, and be
quite a bit of work. If you can solve this with an ebpf program, I
strongly suggest you do that instead.

-- Steve

^ permalink raw reply	[flat|nested] 113+ messages in thread

* Re: [RFC] simple_lmk: Introduce Simple Low Memory Killer for Android
  2019-03-15  2:54                     ` Joel Fernandes
@ 2019-03-15  3:43                       ` Sultan Alsawaf
  0 siblings, 0 replies; 113+ messages in thread
From: Sultan Alsawaf @ 2019-03-15  3:43 UTC (permalink / raw)
  To: Joel Fernandes
  Cc: Tim Murray, Michal Hocko, Suren Baghdasaryan, Greg Kroah-Hartman,
	Arve Hjønnevåg, Todd Kjos, Martijn Coenen,
	Christian Brauner, Ingo Molnar, Peter Zijlstra, LKML,
	open list:ANDROID DRIVERS, linux-mm, kernel-team, Steven Rostedt

On Thu, Mar 14, 2019 at 10:54:48PM -0400, Joel Fernandes wrote:
> I'm not sure if that makes much semantic sense for how the signal handling is
> supposed to work. Imagine a parent sends SIGKILL to its child, and then does
> a wait(2). Because the SIGKILL blocks in your idea, then the wait cannot
> execute, and because the wait cannot execute, the zombie task will not get
> reaped and so the SIGKILL senders never gets unblocked and the whole thing
> just gets locked up. No? I don't know it just feels incorrect.

Block until the victim becomes a zombie instead.

> Further, in your idea adding stuff to task_struct will simply bloat it - when
> this task can easily be handled using eBPF without making any kernel changes.
> Either by probing sched_process_free or sched_process_exit tracepoints.
> Scheduler maintainers generally frown on adding stuff to task_struct
> pointlessly there's a good reason since bloating it effects the performance
> etc, and something like this would probably never be ifdef'd out behind a
> CONFIG.

Adding something to task_struct is just the easiest way to test things for
experimentation. This can be avoided in my suggestion by passing the pointer to
a completion via the relevant functions, and then completing it at the time the
victim transitions to a zombie state. I understand it's possible to use eBPF for
this, but it seems kind of messy since this functionality is something that I
think others would want provided by the kernel (i.e., anyone using PSI to
implement their own OOM killer daemon similar to LMKD).

Thanks,
Sultan

^ permalink raw reply	[flat|nested] 113+ messages in thread

* Re: [RFC] simple_lmk: Introduce Simple Low Memory Killer for Android
  2019-03-15  3:16                     ` Steven Rostedt
@ 2019-03-15  3:45                       ` Sultan Alsawaf
  2019-03-15  4:36                       ` Daniel Colascione
  1 sibling, 0 replies; 113+ messages in thread
From: Sultan Alsawaf @ 2019-03-15  3:45 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Joel Fernandes, Tim Murray, Michal Hocko, Suren Baghdasaryan,
	Greg Kroah-Hartman, Arve Hjønnevåg, Todd Kjos,
	Martijn Coenen, Christian Brauner, Ingo Molnar, Peter Zijlstra,
	LKML, open list:ANDROID DRIVERS, linux-mm, kernel-team

On Thu, Mar 14, 2019 at 11:16:41PM -0400, Steven Rostedt wrote:
> How would you implement such a method in userspace? kill() doesn't take
> any parameters but the pid of the process you want to send a signal to,
> and the signal to send. This would require a new system call, and be
> quite a bit of work. If you can solve this with an ebpf program, I
> strongly suggest you do that instead.

This can be done by introducing a new signal number that provides SIGKILL
functionality while blocking (maybe SIGKILLBLOCK?).

Thanks,
Sultan

^ permalink raw reply	[flat|nested] 113+ messages in thread

* Re: [RFC] simple_lmk: Introduce Simple Low Memory Killer for Android
  2019-03-15  3:16                     ` Steven Rostedt
  2019-03-15  3:45                       ` Sultan Alsawaf
@ 2019-03-15  4:36                       ` Daniel Colascione
  2019-03-15 13:36                         ` Joel Fernandes
                                           ` (3 more replies)
  1 sibling, 4 replies; 113+ messages in thread
From: Daniel Colascione @ 2019-03-15  4:36 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Sultan Alsawaf, Joel Fernandes, Tim Murray, Michal Hocko,
	Suren Baghdasaryan, Greg Kroah-Hartman, Arve Hjønnevåg,
	Todd Kjos, Martijn Coenen, Christian Brauner, Ingo Molnar,
	Peter Zijlstra, LKML, open list:ANDROID DRIVERS, linux-mm,
	kernel-team

On Thu, Mar 14, 2019 at 8:16 PM Steven Rostedt <rostedt@goodmis.org> wrote:
>
> On Thu, 14 Mar 2019 13:49:11 -0700
> Sultan Alsawaf <sultan@kerneltoast.com> wrote:
>
> > Perhaps I'm missing something, but if you want to know when a process has died
> > after sending a SIGKILL to it, then why not just make the SIGKILL optionally
> > block until the process has died completely? It'd be rather trivial to just
> > store a pointer to an onstack completion inside the victim process' task_struct,
> > and then complete it in free_task().
>
> How would you implement such a method in userspace? kill() doesn't take
> any parameters but the pid of the process you want to send a signal to,
> and the signal to send. This would require a new system call, and be
> quite a bit of work.

That's what the pidfd work is for. Please read the original threads
about the motivation and design of that facility.

> If you can solve this with an ebpf program, I
> strongly suggest you do that instead.

Regarding process death notification: I will absolutely not support
putting eBPF and perf trace events on the critical path of core system
memory management functionality. Tracing and monitoring facilities are
great for learning about the system, but they were never intended to
be load-bearing. The proposed eBPF process-monitoring approach is just
a variant of the netlink proposal we discussed previously on the pidfd
threads; it has all of its drawbacks. We really need a core system
call  --- really, we've needed robust process management since the
creation of unix --- and I'm glad that we're finally getting it.
Adding new system calls is not expensive; going to great lengths to
avoid adding one is like calling a helicopter to avoid crossing the
street. I don't think we should present an abuse of the debugging and
performance monitoring infrastructure as an alternative to a robust
and desperately-needed bit of core functionality that's neither hard
to add nor complex to implement nor expensive to use.

Regarding the proposal for a new kernel-side lmkd: when possible, the
kernel should provide mechanism, not policy. Putting the low memory
killer back into the kernel after we've spent significant effort
making it possible for userspace to do that job is a step backward.
Compared to kernel code, userspace code is more easily understood, more
easily debuggable, more easily updated, and much safer. If we *can* move
something out of the kernel,
we should. This patch moves us in exactly the wrong direction. Yes, we
need *something* that sits synchronously astride the page allocation
path and does *something* to stop a busy beaver allocator that eats
all the available memory before lmkd, even mlocked and realtime, can
respond. The OOM killer is adequate for this very rare case.

With respect to kill timing: Tim is right about the need for two
levels of policy: first, a high-level process prioritization and
memory-demand balancing scheme (which is what OOM score adjustment
code in ActivityManager amounts to); and second, a low-level
process-killing methodology that maximizes sustainable memory reclaim
and minimizes unwanted side effects while killing those processes that
should be dead. Both of these policies belong in userspace --- because
they *can* be in userspace --- and userspace needs only a few tools,
most of which already exist, to do a perfectly adequate job.

We do want killed processes to die promptly. That's why I support
boosting a process's priority somehow when lmkd is about to kill it.
The precise way in which we do that --- involving not only actual
priority, but scheduler knobs, cgroup assignment, core affinity, and
so on --- is a complex topic best left to userspace. lmkd already has
all the knobs it needs to implement whatever priority boosting policy
it wants.

Hell, once we add a pidfd_wait --- which I plan to work on, assuming
nobody beats me to it, after pidfd_send_signal lands --- you can
imagine a general-purpose priority inheritance mechanism expediting
process death when a high-priority process waits on a pidfd_wait
handle for a condemned process. You know you're on the right track
design-wise when you start seeing this kind of elegant constructive
interference between seemingly-unrelated features. What we don't need
is some kind of blocking SIGKILL alternative or backdoor event
delivery system.

We definitely don't want to have to wait for a process's parent to
reap it. Instead, we want to wait for it to become a zombie. That's
why I designed my original exithand patch to fire death notification
upon transition to the zombie state, not upon process table removal,
and I expect pidfd_wait (or whatever we call it) to act the same way.

In any case, there's a clear path forward here --- general-purpose,
cheap, and elegant --- and we should just focus on doing that instead
of more complex proposals with few advantages.

^ permalink raw reply	[flat|nested] 113+ messages in thread

* Re: [RFC] simple_lmk: Introduce Simple Low Memory Killer for Android
  2019-03-15  4:36                       ` Daniel Colascione
@ 2019-03-15 13:36                         ` Joel Fernandes
  2019-03-15 15:56                         ` Suren Baghdasaryan
                                           ` (2 subsequent siblings)
  3 siblings, 0 replies; 113+ messages in thread
From: Joel Fernandes @ 2019-03-15 13:36 UTC (permalink / raw)
  To: Daniel Colascione
  Cc: Steven Rostedt, Sultan Alsawaf, Tim Murray, Michal Hocko,
	Suren Baghdasaryan, Greg Kroah-Hartman, Arve Hjønnevåg,
	Todd Kjos, Martijn Coenen, Christian Brauner, Ingo Molnar,
	Peter Zijlstra, LKML, open list:ANDROID DRIVERS, linux-mm,
	kernel-team

On Thu, Mar 14, 2019 at 09:36:43PM -0700, Daniel Colascione wrote:
[snip] 
> > If you can solve this with an ebpf program, I
> > strongly suggest you do that instead.
> 
> Regarding process death notification: I will absolutely not support
> putting eBPF and perf trace events on the critical path of core system
> memory management functionality. Tracing and monitoring facilities are
> great for learning about the system, but they were never intended to
> be load-bearing. The proposed eBPF process-monitoring approach is just
> a variant of the netlink proposal we discussed previously on the pidfd
> threads; it has all of its drawbacks. We really need a core system
> call  --- really, we've needed robust process management since the
> creation of unix --- and I'm glad that we're finally getting it.
> Adding new system calls is not expensive; going to great lengths to
> avoid adding one is like calling a helicopter to avoid crossing the
> street. I don't think we should present an abuse of the debugging and
> performance monitoring infrastructure as an alternative to a robust
> and desperately-needed bit of core functionality that's neither hard
> to add nor complex to implement nor expensive to use.

The eBPF-based solution to this would be just as simple while avoiding any
kernel changes. I don't know why you think it is not load-bearing. However, I
agree the proc/pidfd approach is better and can be simpler for some users who
don't want to deal with eBPF - especially since something like this has many
usecases. I was just suggesting the eBPF solution as a better alternative to
the task_struct surgery idea from Sultan since that sounded to me quite
hackish (that could just be my opinion).

> Regarding the proposal for a new kernel-side lmkd: when possible, the
> kernel should provide mechanism, not policy. Putting the low memory
> killer back into the kernel after we've spent significant effort
> making it possible for userspace to do that job. Compared to kernel
> code, more easily understood, more easily debuggable, more easily
> updated, and much safer. If we *can* move something out of the kernel,
> we should. This patch moves us in exactly the wrong direction. Yes, we
> need *something* that sits synchronously astride the page allocation
> path and does *something* to stop a busy beaver allocator that eats
> all the available memory before lmkd, even mlocked and realtime, can
> respond. The OOM killer is adequate for this very rare case.
> 
> With respect to kill timing: Tim is right about the need for two
> levels of policy: first, a high-level process prioritization and
> memory-demand balancing scheme (which is what OOM score adjustment
> code in ActivityManager amounts to); and second, a low-level
> process-killing methodology that maximizes sustainable memory reclaim
> and minimizes unwanted side effects while killing those processes that
> should be dead. Both of these policies belong in userspace --- because
> they *can* be in userspace --- and userspace needs only a few tools,
> most of which already exist, to do a perfectly adequate job.
> 
> We do want killed processes to die promptly. That's why I support
> boosting a process's priority somehow when lmkd is about to kill it.
> The precise way in which we do that --- involving not only actual
> priority, but scheduler knobs, cgroup assignment, core affinity, and
> so on --- is a complex topic best left to userspace. lmkd already has
> all the knobs it needs to implement whatever priority boosting policy
> it wants.
> 
> Hell, once we add a pidfd_wait --- which I plan to work on, assuming
> nobody beats me to it, after pidfd_send_signal lands --- you can
> imagine a general-purpose priority inheritance mechanism expediting
> process death when a high-priority process waits on a pidfd_wait
> handle for a condemned process. You know you're on the right track
> design-wise when you start seeing this kind of elegant constructive
> interference between seemingly-unrelated features. What we don't need
> is some kind of blocking SIGKILL alternative or backdoor event
> delivery system.
> 
> We definitely don't want to have to wait for a process's parent to
> reap it. Instead, we want to wait for it to become a zombie. That's
> why I designed my original exithand patch to fire death notification
> upon transition to the zombie state, not upon process table removal,
> and I expect pidfd_wait (or whatever we call it) to act the same way.

Agreed. Looking forward to the patches. :)

thanks,

 - Joel


^ permalink raw reply	[flat|nested] 113+ messages in thread

* Re: [RFC] simple_lmk: Introduce Simple Low Memory Killer for Android
  2019-03-15  4:36                       ` Daniel Colascione
  2019-03-15 13:36                         ` Joel Fernandes
@ 2019-03-15 15:56                         ` Suren Baghdasaryan
  2019-03-15 16:12                           ` Daniel Colascione
  2019-03-15 16:43                         ` Steven Rostedt
  2019-03-15 18:03                         ` Christian Brauner
  3 siblings, 1 reply; 113+ messages in thread
From: Suren Baghdasaryan @ 2019-03-15 15:56 UTC (permalink / raw)
  To: Daniel Colascione
  Cc: Steven Rostedt, Sultan Alsawaf, Joel Fernandes, Tim Murray,
	Michal Hocko, Greg Kroah-Hartman, Arve Hjønnevåg,
	Todd Kjos, Martijn Coenen, Christian Brauner, Ingo Molnar,
	Peter Zijlstra, LKML, open list:ANDROID DRIVERS, linux-mm,
	kernel-team

On Thu, Mar 14, 2019 at 9:37 PM Daniel Colascione <dancol@google.com> wrote:
>
> On Thu, Mar 14, 2019 at 8:16 PM Steven Rostedt <rostedt@goodmis.org> wrote:
> >
> > On Thu, 14 Mar 2019 13:49:11 -0700
> > Sultan Alsawaf <sultan@kerneltoast.com> wrote:
> >
> > > Perhaps I'm missing something, but if you want to know when a process has died
> > > after sending a SIGKILL to it, then why not just make the SIGKILL optionally
> > > block until the process has died completely? It'd be rather trivial to just
> > > store a pointer to an onstack completion inside the victim process' task_struct,
> > > and then complete it in free_task().
> >
> > How would you implement such a method in userspace? kill() doesn't take
> > any parameters but the pid of the process you want to send a signal to,
> > and the signal to send. This would require a new system call, and be
> > quite a bit of work.
>
> That's what the pidfd work is for. Please read the original threads
> about the motivation and design of that facility.
>
> > If you can solve this with an ebpf program, I
> > strongly suggest you do that instead.
>
> Regarding process death notification: I will absolutely not support
> putting eBPF and perf trace events on the critical path of core system
> memory management functionality. Tracing and monitoring facilities are
> great for learning about the system, but they were never intended to
> be load-bearing. The proposed eBPF process-monitoring approach is just
> a variant of the netlink proposal we discussed previously on the pidfd
> threads; it has all of its drawbacks. We really need a core system
> call  --- really, we've needed robust process management since the
> creation of unix --- and I'm glad that we're finally getting it.
> Adding new system calls is not expensive; going to great lengths to
> avoid adding one is like calling a helicopter to avoid crossing the
> street. I don't think we should present an abuse of the debugging and
> performance monitoring infrastructure as an alternative to a robust
> and desperately-needed bit of core functionality that's neither hard
> to add nor complex to implement nor expensive to use.
>
> Regarding the proposal for a new kernel-side lmkd: when possible, the
> kernel should provide mechanism, not policy. Putting the low memory
> killer back into the kernel after we've spent significant effort
> making it possible for userspace to do that job. Compared to kernel
> code, more easily understood, more easily debuggable, more easily
> updated, and much safer. If we *can* move something out of the kernel,
> we should. This patch moves us in exactly the wrong direction. Yes, we
> need *something* that sits synchronously astride the page allocation
> path and does *something* to stop a busy beaver allocator that eats
> all the available memory before lmkd, even mlocked and realtime, can
> respond. The OOM killer is adequate for this very rare case.
>
> With respect to kill timing: Tim is right about the need for two
> levels of policy: first, a high-level process prioritization and
> memory-demand balancing scheme (which is what OOM score adjustment
> code in ActivityManager amounts to); and second, a low-level
> process-killing methodology that maximizes sustainable memory reclaim
> and minimizes unwanted side effects while killing those processes that
> should be dead. Both of these policies belong in userspace --- because
> they *can* be in userspace --- and userspace needs only a few tools,
> most of which already exist, to do a perfectly adequate job.
>
> We do want killed processes to die promptly. That's why I support
> boosting a process's priority somehow when lmkd is about to kill it.
> The precise way in which we do that --- involving not only actual
> priority, but scheduler knobs, cgroup assignment, core affinity, and
> so on --- is a complex topic best left to userspace. lmkd already has
> all the knobs it needs to implement whatever priority boosting policy
> it wants.
>
> Hell, once we add a pidfd_wait --- which I plan to work on, assuming
> nobody beats me to it, after pidfd_send_signal lands --- you can
> imagine a general-purpose priority inheritance mechanism expediting
> process death when a high-priority process waits on a pidfd_wait
> handle for a condemned process. You know you're on the right track
> design-wise when you start seeing this kind of elegant constructive
> interference between seemingly-unrelated features. What we don't need
> is some kind of blocking SIGKILL alternative or backdoor event
> delivery system.

When talking about pidfd_wait functionality do you mean something like
this: https://lore.kernel.org/patchwork/patch/345098/ ? I missed the
discussion about it, could you please point me to it?

> We definitely don't want to have to wait for a process's parent to
> reap it. Instead, we want to wait for it to become a zombie. That's
> why I designed my original exithand patch to fire death notification
> upon transition to the zombie state, not upon process table removal,
> and I expect pidfd_wait (or whatever we call it) to act the same way.
>
> In any case, there's a clear path forward here --- general-purpose,
> cheap, and elegant --- and we should just focus on doing that instead
> of more complex proposals with few advantages.

^ permalink raw reply	[flat|nested] 113+ messages in thread

* Re: [RFC] simple_lmk: Introduce Simple Low Memory Killer for Android
  2019-03-15 15:56                         ` Suren Baghdasaryan
@ 2019-03-15 16:12                           ` Daniel Colascione
  0 siblings, 0 replies; 113+ messages in thread
From: Daniel Colascione @ 2019-03-15 16:12 UTC (permalink / raw)
  To: Suren Baghdasaryan
  Cc: Steven Rostedt, Sultan Alsawaf, Joel Fernandes, Tim Murray,
	Michal Hocko, Greg Kroah-Hartman, Arve Hjønnevåg,
	Todd Kjos, Martijn Coenen, Christian Brauner, Ingo Molnar,
	Peter Zijlstra, LKML, open list:ANDROID DRIVERS, linux-mm,
	kernel-team

On Fri, Mar 15, 2019 at 8:56 AM Suren Baghdasaryan <surenb@google.com> wrote:
>
> On Thu, Mar 14, 2019 at 9:37 PM Daniel Colascione <dancol@google.com> wrote:
> >
> > On Thu, Mar 14, 2019 at 8:16 PM Steven Rostedt <rostedt@goodmis.org> wrote:
> > >
> > > On Thu, 14 Mar 2019 13:49:11 -0700
> > > Sultan Alsawaf <sultan@kerneltoast.com> wrote:
> > >
> > > > Perhaps I'm missing something, but if you want to know when a process has died
> > > > after sending a SIGKILL to it, then why not just make the SIGKILL optionally
> > > > block until the process has died completely? It'd be rather trivial to just
> > > > store a pointer to an onstack completion inside the victim process' task_struct,
> > > > and then complete it in free_task().
> > >
> > > How would you implement such a method in userspace? kill() doesn't take
> > > any parameters but the pid of the process you want to send a signal to,
> > > and the signal to send. This would require a new system call, and be
> > > quite a bit of work.
> >
> > That's what the pidfd work is for. Please read the original threads
> > about the motivation and design of that facility.
> >
> > > If you can solve this with an ebpf program, I
> > > strongly suggest you do that instead.
> >
> > Regarding process death notification: I will absolutely not support
> > putting eBPF and perf trace events on the critical path of core system
> > memory management functionality. Tracing and monitoring facilities are
> > great for learning about the system, but they were never intended to
> > be load-bearing. The proposed eBPF process-monitoring approach is just
> > a variant of the netlink proposal we discussed previously on the pidfd
> > threads; it has all of its drawbacks. We really need a core system
> > call  --- really, we've needed robust process management since the
> > creation of unix --- and I'm glad that we're finally getting it.
> > Adding new system calls is not expensive; going to great lengths to
> > avoid adding one is like calling a helicopter to avoid crossing the
> > street. I don't think we should present an abuse of the debugging and
> > performance monitoring infrastructure as an alternative to a robust
> > and desperately-needed bit of core functionality that's neither hard
> > to add nor complex to implement nor expensive to use.
> >
> > Regarding the proposal for a new kernel-side lmkd: when possible, the
> > kernel should provide mechanism, not policy. Putting the low memory
> > killer back into the kernel after we've spent significant effort
> > making it possible for userspace to do that job would be a step
> > backward. Compared to kernel code, userspace code is more easily
> > understood, more easily debuggable, more easily updated, and much
> > safer. If we *can* move something out of the kernel,
> > we should. This patch moves us in exactly the wrong direction. Yes, we
> > need *something* that sits synchronously astride the page allocation
> > path and does *something* to stop a busy beaver allocator that eats
> > all the available memory before lmkd, even mlocked and realtime, can
> > respond. The OOM killer is adequate for this very rare case.
> >
> > With respect to kill timing: Tim is right about the need for two
> > levels of policy: first, a high-level process prioritization and
> > memory-demand balancing scheme (which is what OOM score adjustment
> > code in ActivityManager amounts to); and second, a low-level
> > process-killing methodology that maximizes sustainable memory reclaim
> > and minimizes unwanted side effects while killing those processes that
> > should be dead. Both of these policies belong in userspace --- because
> > they *can* be in userspace --- and userspace needs only a few tools,
> > most of which already exist, to do a perfectly adequate job.
> >
> > We do want killed processes to die promptly. That's why I support
> > boosting a process's priority somehow when lmkd is about to kill it.
> > The precise way in which we do that --- involving not only actual
> > priority, but scheduler knobs, cgroup assignment, core affinity, and
> > so on --- is a complex topic best left to userspace. lmkd already has
> > all the knobs it needs to implement whatever priority boosting policy
> > it wants.
> >
> > Hell, once we add a pidfd_wait --- which I plan to work on, assuming
> > nobody beats me to it, after pidfd_send_signal lands --- you can
> > imagine a general-purpose priority inheritance mechanism expediting
> > process death when a high-priority process waits on a pidfd_wait
> > handle for a condemned process. You know you're on the right track
> > design-wise when you start seeing this kind of elegant constructive
> > interference between seemingly-unrelated features. What we don't need
> > is some kind of blocking SIGKILL alternative or backdoor event
> > delivery system.
>
> When talking about pidfd_wait functionality do you mean something like
> this: https://lore.kernel.org/patchwork/patch/345098/ ? I missed the
> discussion about it, could you please point me to it?

That directory-polling approach came up in the discussion. It's a bad
idea, mostly for API reasons. I'm talking about something more like
https://lore.kernel.org/lkml/20181029175322.189042-1-dancol@google.com/,
albeit in system call form instead of in the form of a new per-task
proc file.

^ permalink raw reply	[flat|nested] 113+ messages in thread

* Re: [RFC] simple_lmk: Introduce Simple Low Memory Killer for Android
  2019-03-15  4:36                       ` Daniel Colascione
  2019-03-15 13:36                         ` Joel Fernandes
  2019-03-15 15:56                         ` Suren Baghdasaryan
@ 2019-03-15 16:43                         ` Steven Rostedt
  2019-03-15 17:17                           ` Daniel Colascione
  2019-03-15 18:03                         ` Christian Brauner
  3 siblings, 1 reply; 113+ messages in thread
From: Steven Rostedt @ 2019-03-15 16:43 UTC (permalink / raw)
  To: Daniel Colascione
  Cc: Sultan Alsawaf, Joel Fernandes, Tim Murray, Michal Hocko,
	Suren Baghdasaryan, Greg Kroah-Hartman, Arve Hjønnevåg,
	Todd Kjos, Martijn Coenen, Christian Brauner, Ingo Molnar,
	Peter Zijlstra, LKML, open list:ANDROID DRIVERS, linux-mm,
	kernel-team

On Thu, 14 Mar 2019 21:36:43 -0700
Daniel Colascione <dancol@google.com> wrote:

> On Thu, Mar 14, 2019 at 8:16 PM Steven Rostedt <rostedt@goodmis.org> wrote:
> >
> > On Thu, 14 Mar 2019 13:49:11 -0700
> > Sultan Alsawaf <sultan@kerneltoast.com> wrote:
> >  
> > > Perhaps I'm missing something, but if you want to know when a process has died
> > > after sending a SIGKILL to it, then why not just make the SIGKILL optionally
> > > block until the process has died completely? It'd be rather trivial to just
> > > store a pointer to an onstack completion inside the victim process' task_struct,
> > > and then complete it in free_task().  
> >
> > How would you implement such a method in userspace? kill() doesn't take
> > any parameters but the pid of the process you want to send a signal to,
> > and the signal to send. This would require a new system call, and be
> > quite a bit of work.  
> 
> That's what the pidfd work is for. Please read the original threads
> about the motivation and design of that facility.

I wasn't Cc'd on the original work, so I haven't read them.

> 
> > If you can solve this with an ebpf program, I
> > strongly suggest you do that instead.  
> 



> We do want killed processes to die promptly. That's why I support
> boosting a process's priority somehow when lmkd is about to kill it.
> The precise way in which we do that --- involving not only actual
> priority, but scheduler knobs, cgroup assignment, core affinity, and
> so on --- is a complex topic best left to userspace. lmkd already has
> all the knobs it needs to implement whatever priority boosting policy
> it wants.
> 
> Hell, once we add a pidfd_wait --- which I plan to work on, assuming
> nobody beats me to it, after pidfd_send_signal lands --- you can
> imagine a general-purpose priority inheritance mechanism expediting
> process death when a high-priority process waits on a pidfd_wait
> handle for a condemned process. You know you're on the right track
> design-wise when you start seeing this kind of elegant constructive
> interference between seemingly-unrelated features. What we don't need
> is some kind of blocking SIGKILL alternative or backdoor event
> delivery system.
> 
> We definitely don't want to have to wait for a process's parent to
> reap it. Instead, we want to wait for it to become a zombie. That's
> why I designed my original exithand patch to fire death notification
> upon transition to the zombie state, not upon process table removal,
> and I expect pidfd_wait (or whatever we call it) to act the same way.
> 
> In any case, there's a clear path forward here --- general-purpose,
> cheap, and elegant --- and we should just focus on doing that instead
> of more complex proposals with few advantages.

If you add new pidfd system calls, then making a new way to send a signal
and block till it does die or whatever is more acceptable than adding a
new signal that changes the semantics of sending signals, which is what
I was against.

I do agree with Joel about bloating task_struct too. If anything, have
a wait queue you add, where you can allocate a descriptor with the task
dying and task killing, and just search this queue on dying. We could
add a TIF flag to the task as well to let the exiting of this task know
it should do such an operation.

-- Steve

^ permalink raw reply	[flat|nested] 113+ messages in thread

* Re: [RFC] simple_lmk: Introduce Simple Low Memory Killer for Android
  2019-03-15 16:43                         ` Steven Rostedt
@ 2019-03-15 17:17                           ` Daniel Colascione
  0 siblings, 0 replies; 113+ messages in thread
From: Daniel Colascione @ 2019-03-15 17:17 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Sultan Alsawaf, Joel Fernandes, Tim Murray, Michal Hocko,
	Suren Baghdasaryan, Greg Kroah-Hartman, Arve Hjønnevåg,
	Todd Kjos, Martijn Coenen, Christian Brauner, Ingo Molnar,
	Peter Zijlstra, LKML, open list:ANDROID DRIVERS, linux-mm,
	kernel-team

On Fri, Mar 15, 2019 at 9:43 AM Steven Rostedt <rostedt@goodmis.org> wrote:
>
> On Thu, 14 Mar 2019 21:36:43 -0700
> Daniel Colascione <dancol@google.com> wrote:
>
> > On Thu, Mar 14, 2019 at 8:16 PM Steven Rostedt <rostedt@goodmis.org> wrote:
> > >
> > > On Thu, 14 Mar 2019 13:49:11 -0700
> > > Sultan Alsawaf <sultan@kerneltoast.com> wrote:
> > >
> > > > Perhaps I'm missing something, but if you want to know when a process has died
> > > > after sending a SIGKILL to it, then why not just make the SIGKILL optionally
> > > > block until the process has died completely? It'd be rather trivial to just
> > > > store a pointer to an onstack completion inside the victim process' task_struct,
> > > > and then complete it in free_task().
> > >
> > > How would you implement such a method in userspace? kill() doesn't take
> > > any parameters but the pid of the process you want to send a signal to,
> > > and the signal to send. This would require a new system call, and be
> > > quite a bit of work.
> >
> > That's what the pidfd work is for. Please read the original threads
> > about the motivation and design of that facility.
>
> I wasn't Cc'd on the original work, so I haven't read them.
>
> >
> > > If you can solve this with an ebpf program, I
> > > strongly suggest you do that instead.
> >
>
>
>
> > We do want killed processes to die promptly. That's why I support
> > boosting a process's priority somehow when lmkd is about to kill it.
> > The precise way in which we do that --- involving not only actual
> > priority, but scheduler knobs, cgroup assignment, core affinity, and
> > so on --- is a complex topic best left to userspace. lmkd already has
> > all the knobs it needs to implement whatever priority boosting policy
> > it wants.
> >
> > Hell, once we add a pidfd_wait --- which I plan to work on, assuming
> > nobody beats me to it, after pidfd_send_signal lands --- you can
> > imagine a general-purpose priority inheritance mechanism expediting
> > process death when a high-priority process waits on a pidfd_wait
> > handle for a condemned process. You know you're on the right track
> > design-wise when you start seeing this kind of elegant constructive
> > interference between seemingly-unrelated features. What we don't need
> > is some kind of blocking SIGKILL alternative or backdoor event
> > delivery system.
> >
> > We definitely don't want to have to wait for a process's parent to
> > reap it. Instead, we want to wait for it to become a zombie. That's
> > why I designed my original exithand patch to fire death notification
> > upon transition to the zombie state, not upon process table removal,
> > and I expect pidfd_wait (or whatever we call it) to act the same way.
> >
> > In any case, there's a clear path forward here --- general-purpose,
> > cheap, and elegant --- and we should just focus on doing that instead
> > of more complex proposals with few advantages.
>
> If you add new pidfd system calls, then making a new way to send a signal
> and block till it does die or whatever is

Right. And we shouldn't couple the killing and the waiting: while we
now have a good race-free way to kill processes using
pidfd_send_signal, we still have no good facility for waiting for
the death of a process that isn't a child of the waiter. Any kind of
unified "kill and wait for death" primitive precludes the killing
thread waiting for things other than death at the same time! Instead,
if we allow waiting for an arbitrary process's death using
general-purpose wait primitives like select/poll/epoll/io_submit/etc.,
then synchronous killing becomes just another sleep that composes in
useful and predictable ways.

> more acceptable than adding a
> new signal that changes the semantics of sending signals, which is what
> I was against.

Agreed. Even if it were possible to easily add signals without
breaking everyone, a special kind of signal with delivery semantics
different from those of existing signals is a bad idea, and not really
a signal at all, but just a new system call in disguise.

> I do agree with Joel about bloating task_struct too. If anything, have
> a wait queue you add, where you can allocate a descriptor with the task
> dying and task killing, and just search this queue on dying. We could
> add a TIF flag to the task as well to let the exiting of this task know
> it should do such an operation.

That's my basic plan. I think we need one link from struct signal or
something so we don't end up doing some kind of *global* search on
process death, but let's see how it goes.

^ permalink raw reply	[flat|nested] 113+ messages in thread

* Re: [RFC] simple_lmk: Introduce Simple Low Memory Killer for Android
  2019-03-15  4:36                       ` Daniel Colascione
                                           ` (2 preceding siblings ...)
  2019-03-15 16:43                         ` Steven Rostedt
@ 2019-03-15 18:03                         ` Christian Brauner
  2019-03-15 18:13                           ` Joel Fernandes
  3 siblings, 1 reply; 113+ messages in thread
From: Christian Brauner @ 2019-03-15 18:03 UTC (permalink / raw)
  To: Daniel Colascione
  Cc: Steven Rostedt, Sultan Alsawaf, Joel Fernandes, Tim Murray,
	Michal Hocko, Suren Baghdasaryan, Greg Kroah-Hartman,
	Arve Hjønnevåg, Todd Kjos, Martijn Coenen, Ingo Molnar,
	Peter Zijlstra, LKML, open list:ANDROID DRIVERS, linux-mm,
	kernel-team

On Thu, Mar 14, 2019 at 09:36:43PM -0700, Daniel Colascione wrote:
> On Thu, Mar 14, 2019 at 8:16 PM Steven Rostedt <rostedt@goodmis.org> wrote:
> >
> > On Thu, 14 Mar 2019 13:49:11 -0700
> > Sultan Alsawaf <sultan@kerneltoast.com> wrote:
> >
> > > Perhaps I'm missing something, but if you want to know when a process has died
> > > after sending a SIGKILL to it, then why not just make the SIGKILL optionally
> > > block until the process has died completely? It'd be rather trivial to just
> > > store a pointer to an onstack completion inside the victim process' task_struct,
> > > and then complete it in free_task().
> >
> > How would you implement such a method in userspace? kill() doesn't take
> > any parameters but the pid of the process you want to send a signal to,
> > and the signal to send. This would require a new system call, and be
> > quite a bit of work.
> 
> That's what the pidfd work is for. Please read the original threads
> about the motivation and design of that facility.
> 
> > If you can solve this with an ebpf program, I
> > strongly suggest you do that instead.
> 
> Regarding process death notification: I will absolutely not support
> putting eBPF and perf trace events on the critical path of core system
> memory management functionality. Tracing and monitoring facilities are
> great for learning about the system, but they were never intended to
> be load-bearing. The proposed eBPF process-monitoring approach is just
> a variant of the netlink proposal we discussed previously on the pidfd
> threads; it has all of its drawbacks. We really need a core system
> call  --- really, we've needed robust process management since the
> creation of unix --- and I'm glad that we're finally getting it.
> Adding new system calls is not expensive; going to great lengths to
> avoid adding one is like calling a helicopter to avoid crossing the
> street. I don't think we should present an abuse of the debugging and
> performance monitoring infrastructure as an alternative to a robust
> and desperately-needed bit of core functionality that's neither hard
> to add nor complex to implement nor expensive to use.
> 
> Regarding the proposal for a new kernel-side lmkd: when possible, the
> kernel should provide mechanism, not policy. Putting the low memory
> killer back into the kernel after we've spent significant effort
> making it possible for userspace to do that job would be a step
> backward. Compared to kernel code, userspace code is more easily
> understood, more easily debuggable, more easily updated, and much
> safer. If we *can* move something out of the kernel,
> we should. This patch moves us in exactly the wrong direction. Yes, we
> need *something* that sits synchronously astride the page allocation
> path and does *something* to stop a busy beaver allocator that eats
> all the available memory before lmkd, even mlocked and realtime, can
> respond. The OOM killer is adequate for this very rare case.
> 
> With respect to kill timing: Tim is right about the need for two
> levels of policy: first, a high-level process prioritization and
> memory-demand balancing scheme (which is what OOM score adjustment
> code in ActivityManager amounts to); and second, a low-level
> process-killing methodology that maximizes sustainable memory reclaim
> and minimizes unwanted side effects while killing those processes that
> should be dead. Both of these policies belong in userspace --- because
> they *can* be in userspace --- and userspace needs only a few tools,
> most of which already exist, to do a perfectly adequate job.
> 
> We do want killed processes to die promptly. That's why I support
> boosting a process's priority somehow when lmkd is about to kill it.
> The precise way in which we do that --- involving not only actual
> priority, but scheduler knobs, cgroup assignment, core affinity, and
> so on --- is a complex topic best left to userspace. lmkd already has
> all the knobs it needs to implement whatever priority boosting policy
> it wants.
> 
> Hell, once we add a pidfd_wait --- which I plan to work on, assuming
> nobody beats me to it, after pidfd_send_signal lands --- you can

Daniel,

I've just been talking to Joel.
I actually "expected" you to work on pidfd_wait() after prior
conversations we had on the pidfd_send_signal() patchsets. :) That's why
I got a separate git tree on kernel.org since I expect a lot more work
to come. I hope that Linus still decides to pull pidfd_send_signal()
before Sunday. (For those who missed the link in a prior response
of mine:
https://lkml.org/lkml/2019/3/12/439 )

This is the first merge window I sent this PR.

The pidfd tree has a branch for-next that is already tracked by Stephen
in linux-next since the 5.0 merge window. The patches for
pidfd_send_signal() sit in the pidfd branch.
I'd be happy to share the tree with you and Joel (We can rename it if
you prefer I don't care).
I would really like to centralize this work so that we sort of have a
"united front" and end up with a coherent api and can send PRs from a
centralized place:
https://git.kernel.org/pub/scm/linux/kernel/git/brauner/linux.git/

Christian

> imagine a general-purpose priority inheritance mechanism expediting
> process death when a high-priority process waits on a pidfd_wait
> handle for a condemned process. You know you're on the right track
> design-wise when you start seeing this kind of elegant constructive
> interference between seemingly-unrelated features. What we don't need
> is some kind of blocking SIGKILL alternative or backdoor event
> delivery system.
> 
> We definitely don't want to have to wait for a process's parent to
> reap it. Instead, we want to wait for it to become a zombie. That's
> why I designed my original exithand patch to fire death notification
> upon transition to the zombie state, not upon process table removal,
> and I expect pidfd_wait (or whatever we call it) to act the same way.
> 
> In any case, there's a clear path forward here --- general-purpose,
> cheap, and elegant --- and we should just focus on doing that instead
> of more complex proposals with few advantages.

^ permalink raw reply	[flat|nested] 113+ messages in thread

* Re: [RFC] simple_lmk: Introduce Simple Low Memory Killer for Android
  2019-03-15 18:03                         ` Christian Brauner
@ 2019-03-15 18:13                           ` Joel Fernandes
  2019-03-15 18:24                             ` Christian Brauner
  0 siblings, 1 reply; 113+ messages in thread
From: Joel Fernandes @ 2019-03-15 18:13 UTC (permalink / raw)
  To: Christian Brauner
  Cc: Daniel Colascione, Steven Rostedt, Sultan Alsawaf, Tim Murray,
	Michal Hocko, Suren Baghdasaryan, Greg Kroah-Hartman,
	Arve Hjønnevåg, Todd Kjos, Martijn Coenen, Ingo Molnar,
	Peter Zijlstra, LKML, open list:ANDROID DRIVERS, linux-mm,
	kernel-team

On Fri, Mar 15, 2019 at 07:03:07PM +0100, Christian Brauner wrote:
> On Thu, Mar 14, 2019 at 09:36:43PM -0700, Daniel Colascione wrote:
> > On Thu, Mar 14, 2019 at 8:16 PM Steven Rostedt <rostedt@goodmis.org> wrote:
> > >
> > > On Thu, 14 Mar 2019 13:49:11 -0700
> > > Sultan Alsawaf <sultan@kerneltoast.com> wrote:
> > >
> > > > Perhaps I'm missing something, but if you want to know when a process has died
> > > > after sending a SIGKILL to it, then why not just make the SIGKILL optionally
> > > > block until the process has died completely? It'd be rather trivial to just
> > > > store a pointer to an onstack completion inside the victim process' task_struct,
> > > > and then complete it in free_task().
> > >
> > > How would you implement such a method in userspace? kill() doesn't take
> > > any parameters but the pid of the process you want to send a signal to,
> > > and the signal to send. This would require a new system call, and be
> > > quite a bit of work.
> > 
> > That's what the pidfd work is for. Please read the original threads
> > about the motivation and design of that facility.
> > 
> > > If you can solve this with an ebpf program, I
> > > strongly suggest you do that instead.
> > 
> > Regarding process death notification: I will absolutely not support
> > putting eBPF and perf trace events on the critical path of core system
> > memory management functionality. Tracing and monitoring facilities are
> > great for learning about the system, but they were never intended to
> > be load-bearing. The proposed eBPF process-monitoring approach is just
> > a variant of the netlink proposal we discussed previously on the pidfd
> > threads; it has all of its drawbacks. We really need a core system
> > call  --- really, we've needed robust process management since the
> > creation of unix --- and I'm glad that we're finally getting it.
> > Adding new system calls is not expensive; going to great lengths to
> > avoid adding one is like calling a helicopter to avoid crossing the
> > street. I don't think we should present an abuse of the debugging and
> > performance monitoring infrastructure as an alternative to a robust
> > and desperately-needed bit of core functionality that's neither hard
> > to add nor complex to implement nor expensive to use.
> > 
> > Regarding the proposal for a new kernel-side lmkd: when possible, the
> > kernel should provide mechanism, not policy. Putting the low memory
> > killer back into the kernel after we've spent significant effort
> > making it possible for userspace to do that job would be a step
> > backward. Compared to kernel code, userspace code is more easily
> > understood, more easily debuggable, more easily updated, and much
> > safer. If we *can* move something out of the kernel,
> > we should. This patch moves us in exactly the wrong direction. Yes, we
> > need *something* that sits synchronously astride the page allocation
> > path and does *something* to stop a busy beaver allocator that eats
> > all the available memory before lmkd, even mlocked and realtime, can
> > respond. The OOM killer is adequate for this very rare case.
> > 
> > With respect to kill timing: Tim is right about the need for two
> > levels of policy: first, a high-level process prioritization and
> > memory-demand balancing scheme (which is what OOM score adjustment
> > code in ActivityManager amounts to); and second, a low-level
> > process-killing methodology that maximizes sustainable memory reclaim
> > and minimizes unwanted side effects while killing those processes that
> > should be dead. Both of these policies belong in userspace --- because
> > they *can* be in userspace --- and userspace needs only a few tools,
> > most of which already exist, to do a perfectly adequate job.
> > 
> > We do want killed processes to die promptly. That's why I support
> > boosting a process's priority somehow when lmkd is about to kill it.
> > The precise way in which we do that --- involving not only actual
> > priority, but scheduler knobs, cgroup assignment, core affinity, and
> > so on --- is a complex topic best left to userspace. lmkd already has
> > all the knobs it needs to implement whatever priority boosting policy
> > it wants.
> > 
> > Hell, once we add a pidfd_wait --- which I plan to work on, assuming
> > nobody beats me to it, after pidfd_send_signal lands --- you can
> 
> Daniel,
> 
> I've just been talking to Joel.
> I actually "expected" you to work on pidfd_wait() after prior
> conversations we had on the pidfd_send_signal() patchsets. :) That's why
> I got a separate git tree on kernel.org since I expect a lot more work
> to come. I hope that Linus still decides to pull pidfd_send_signal()
> before Sunday. (For those who missed the link in a prior response
> of mine:
> https://lkml.org/lkml/2019/3/12/439 )
> 
> This is the first merge window I sent this PR.
> 
> The pidfd tree has a branch for-next that is already tracked by Stephen
> in linux-next since the 5.0 merge window. The patches for
> pidfd_send_signal() sit in the pidfd branch.
> I'd be happy to share the tree with you and Joel (We can rename it if
> you prefer I don't care).
> I would really like to centralize this work so that we sort of have a
> "united front" and end up with a coherent api and can send PRs from a
> centralized place:
> https://git.kernel.org/pub/scm/linux/kernel/git/brauner/linux.git/

I am totally onboard with working together / reviewing this work with you all
on a common tree somewhere (Christian's pidfd tree is fine). I was curious,
why do we want to add a new syscall (pidfd_wait) though? Why not just use
the standard poll/epoll interface on the proc fd, like Daniel was suggesting?
AFAIK, once the proc file is opened, the struct pid is essentially pinned
even though the proc number may be reused. Then the caller can just poll.
We can add a waitqueue to struct pid, and wake up any waiters on process
death (A quick look shows task_struct can be mapped to its struct pid) and
also possibly optimize it using Steve's TIF flag idea. No new syscall is
needed then. Let me know if I missed something.

thanks,

 - Joel

^ permalink raw reply	[flat|nested] 113+ messages in thread

* Re: [RFC] simple_lmk: Introduce Simple Low Memory Killer for Android
  2019-03-15 18:13                           ` Joel Fernandes
@ 2019-03-15 18:24                             ` Christian Brauner
  2019-03-15 18:49                               ` Joel Fernandes
  0 siblings, 1 reply; 113+ messages in thread
From: Christian Brauner @ 2019-03-15 18:24 UTC (permalink / raw)
  To: Joel Fernandes
  Cc: Daniel Colascione, Steven Rostedt, Sultan Alsawaf, Tim Murray,
	Michal Hocko, Suren Baghdasaryan, Greg Kroah-Hartman,
	Arve Hjønnevåg, Todd Kjos, Martijn Coenen, Ingo Molnar,
	Peter Zijlstra, LKML, open list:ANDROID DRIVERS, linux-mm,
	kernel-team

On Fri, Mar 15, 2019 at 02:13:24PM -0400, Joel Fernandes wrote:
> On Fri, Mar 15, 2019 at 07:03:07PM +0100, Christian Brauner wrote:
> > On Thu, Mar 14, 2019 at 09:36:43PM -0700, Daniel Colascione wrote:
> > > On Thu, Mar 14, 2019 at 8:16 PM Steven Rostedt <rostedt@goodmis.org> wrote:
> > > >
> > > > On Thu, 14 Mar 2019 13:49:11 -0700
> > > > Sultan Alsawaf <sultan@kerneltoast.com> wrote:
> > > >
> > > > > Perhaps I'm missing something, but if you want to know when a process has died
> > > > > after sending a SIGKILL to it, then why not just make the SIGKILL optionally
> > > > > block until the process has died completely? It'd be rather trivial to just
> > > > > store a pointer to an onstack completion inside the victim process' task_struct,
> > > > > and then complete it in free_task().
> > > >
> > > > How would you implement such a method in userspace? kill() doesn't take
> > > > any parameters but the pid of the process you want to send a signal to,
> > > > and the signal to send. This would require a new system call, and be
> > > > quite a bit of work.
> > > 
> > > That's what the pidfd work is for. Please read the original threads
> > > about the motivation and design of that facility.
> > > 
> > > > If you can solve this with an ebpf program, I
> > > > strongly suggest you do that instead.
> > > 
> > > Regarding process death notification: I will absolutely not support
> > > putting aBPF and perf trace events on the critical path of core system
> > > memory management functionality. Tracing and monitoring facilities are
> > > great for learning about the system, but they were never intended to
> > > be load-bearing. The proposed eBPF process-monitoring approach is just
> > > a variant of the netlink proposal we discussed previously on the pidfd
> > > threads; it has all of its drawbacks. We really need a core system
> > > call  --- really, we've needed robust process management since the
> > > creation of unix --- and I'm glad that we're finally getting it.
> > > Adding new system calls is not expensive; going to great lengths to
> > > avoid adding one is like calling a helicopter to avoid crossing the
> > > street. I don't think we should present an abuse of the debugging and
> > > performance monitoring infrastructure as an alternative to a robust
> > > and desperately-needed bit of core functionality that's neither hard
> > > to add nor complex to implement nor expensive to use.
> > > 
> > > Regarding the proposal for a new kernel-side lmkd: when possible, the
> > > kernel should provide mechanism, not policy. Putting the low memory
> > > killer back into the kernel after we've spent significant effort
> > > making it possible for userspace to do that job. Compared to kernel
> > > code, more easily understood, more easily debuggable, more easily
> > > updated, and much safer. If we *can* move something out of the kernel,
> > > we should. This patch moves us in exactly the wrong direction. Yes, we
> > > need *something* that sits synchronously astride the page allocation
> > > path and does *something* to stop a busy beaver allocator that eats
> > > all the available memory before lmkd, even mlocked and realtime, can
> > > respond. The OOM killer is adequate for this very rare case.
> > > 
> > > With respect to kill timing: Tim is right about the need for two
> > > levels of policy: first, a high-level process prioritization and
> > > memory-demand balancing scheme (which is what OOM score adjustment
> > > code in ActivityManager amounts to); and second, a low-level
> > > process-killing methodology that maximizes sustainable memory reclaim
> > > and minimizes unwanted side effects while killing those processes that
> > > should be dead. Both of these policies belong in userspace --- because
> > > they *can* be in userspace --- and userspace needs only a few tools,
> > > most of which already exist, to do a perfectly adequate job.
> > > 
> > > We do want killed processes to die promptly. That's why I support
> > > boosting a process's priority somehow when lmkd is about to kill it.
> > > The precise way in which we do that --- involving not only actual
> > > priority, but scheduler knobs, cgroup assignment, core affinity, and
> > > so on --- is a complex topic best left to userspace. lmkd already has
> > > all the knobs it needs to implement whatever priority boosting policy
> > > it wants.
> > > 
> > > Hell, once we add a pidfd_wait --- which I plan to work on, assuming
> > > nobody beats me to it, after pidfd_send_signal lands --- you can
> > 
> > Daniel,
> > 
> > I've just been talking to Joel.
> > I actually "expected" you to work pidfd_wait() after prior
> > conversations we had on the pidfd_send_signal() patchsets. :) That's why
> > I got a separate git tree on kernel.org since I expect a lot more work
> > to come. I hope that Linus still decides to pull pidfd_send_signal()
> > before Sunday (For the ones who have missed the link in a prior response
> > of mine:
> > https://lkml.org/lkml/2019/3/12/439
> > 
> > This is the first merge window I sent this PR.
> > 
> > The pidfd tree has a branch for-next that is already tracked by Stephen
> > in linux-next since the 5.0 merge window. The patches for
> > pidfd_send_signal() sit in the pidfd branch.
> > I'd be happy to share the tree with you and Joel (We can rename it if
> > you prefer I don't care).
> > I would really like to centralize this work so that we sort of have a
> > "united front" and end up with a coherent api and can send PRs from a
> > centralized place:
> > https://git.kernel.org/pub/scm/linux/kernel/git/brauner/linux.git/
> 
> I am totally onboard with working together / reviewing this work with you all
> on a common tree somewhere (Christian's pidfd tree is fine). I was curious,

Excellent.

> why do we want to add a new syscall (pidfd_wait) though? Why not just use
> standard poll/epoll interface on the proc fd like Daniel was suggesting.
> AFAIK, once the proc file is opened, the struct pid is essentially pinned
> even though the proc number may be reused. Then the caller can just poll.
> We can add a waitqueue to struct pid, and wake up any waiters on process
> death (A quick look shows task_struct can be mapped to its struct pid) and
> also possibly optimize it using Steve's TIF flag idea. No new syscall is
> needed then, let me know if I missed something?

Huh, I thought that Daniel was against the poll/epoll solution?
I have no clear opinion on what is better at the moment since I have
been mostly concerned with getting pidfd_send_signal() into shape and
was reluctant to put more ideas/work into this if it gets shut down.
Once we have pidfd_send_signal() the wait discussion makes sense.

Thanks!
Christian

^ permalink raw reply	[flat|nested] 113+ messages in thread

* Re: [RFC] simple_lmk: Introduce Simple Low Memory Killer for Android
  2019-03-15 18:24                             ` Christian Brauner
@ 2019-03-15 18:49                               ` Joel Fernandes
  2019-03-16 17:31                                 ` Suren Baghdasaryan
  0 siblings, 1 reply; 113+ messages in thread
From: Joel Fernandes @ 2019-03-15 18:49 UTC (permalink / raw)
  To: Christian Brauner
  Cc: Daniel Colascione, Steven Rostedt, Sultan Alsawaf, Tim Murray,
	Michal Hocko, Suren Baghdasaryan, Greg Kroah-Hartman,
	Arve Hjønnevåg, Todd Kjos, Martijn Coenen, Ingo Molnar,
	Peter Zijlstra, LKML, open list:ANDROID DRIVERS, linux-mm,
	kernel-team

On Fri, Mar 15, 2019 at 07:24:28PM +0100, Christian Brauner wrote:
[..]
> > why do we want to add a new syscall (pidfd_wait) though? Why not just use
> > standard poll/epoll interface on the proc fd like Daniel was suggesting.
> > AFAIK, once the proc file is opened, the struct pid is essentially pinned
> > even though the proc number may be reused. Then the caller can just poll.
> > We can add a waitqueue to struct pid, and wake up any waiters on process
> > death (A quick look shows task_struct can be mapped to its struct pid) and
> > also possibly optimize it using Steve's TIF flag idea. No new syscall is
> > needed then, let me know if I missed something?
> 
> Huh, I thought that Daniel was against the poll/epoll solution?

Hmm, going through earlier threads, I believe so now. Here was Daniel's
reasoning about avoiding a notification about process death through proc
directory fd: http://lkml.iu.edu/hypermail/linux/kernel/1811.0/00232.html

Maybe a dedicated syscall for this would be cleaner after all.

> I have no clear opinion on what is better at the moment since I have
> been mostly concerned with getting pidfd_send_signal() into shape and
> was reluctant to put more ideas/work into this if it gets shutdown.
> Once we have pidfd_send_signal() the wait discussion makes sense.

Ok. It looks like that is almost in though (fingers crossed :)).

thanks,

 - Joel


^ permalink raw reply	[flat|nested] 113+ messages in thread

* Re: [RFC] simple_lmk: Introduce Simple Low Memory Killer for Android
  2019-03-15 18:49                               ` Joel Fernandes
@ 2019-03-16 17:31                                 ` Suren Baghdasaryan
  2019-03-16 18:00                                   ` Daniel Colascione
  0 siblings, 1 reply; 113+ messages in thread
From: Suren Baghdasaryan @ 2019-03-16 17:31 UTC (permalink / raw)
  To: Joel Fernandes
  Cc: Christian Brauner, Daniel Colascione, Steven Rostedt,
	Sultan Alsawaf, Tim Murray, Michal Hocko, Greg Kroah-Hartman,
	Arve Hjønnevåg, Todd Kjos, Martijn Coenen, Ingo Molnar,
	Peter Zijlstra, LKML, open list:ANDROID DRIVERS, linux-mm,
	kernel-team

On Fri, Mar 15, 2019 at 11:49 AM Joel Fernandes <joel@joelfernandes.org> wrote:
>
> On Fri, Mar 15, 2019 at 07:24:28PM +0100, Christian Brauner wrote:
> [..]
> > > why do we want to add a new syscall (pidfd_wait) though? Why not just use
> > > standard poll/epoll interface on the proc fd like Daniel was suggesting.
> > > AFAIK, once the proc file is opened, the struct pid is essentially pinned
> > > even though the proc number may be reused. Then the caller can just poll.
> > > We can add a waitqueue to struct pid, and wake up any waiters on process
> > > death (A quick look shows task_struct can be mapped to its struct pid) and
> > > also possibly optimize it using Steve's TIF flag idea. No new syscall is
> > > needed then, let me know if I missed something?
> >
> > Huh, I thought that Daniel was against the poll/epoll solution?
>
> Hmm, going through earlier threads, I believe so now. Here was Daniel's
> reasoning about avoiding a notification about process death through proc
> directory fd: http://lkml.iu.edu/hypermail/linux/kernel/1811.0/00232.html
>
> May be a dedicated syscall for this would be cleaner after all.

Ah, I wish I'd seen that discussion before...
syscall makes sense and it can be non-blocking and we can use
select/poll/epoll if we use eventfd. I would strongly advocate for
non-blocking version or at least to have a non-blocking option.
Something like this:

evfd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
// register eventfd to receive death notification
pidfd_wait(pid_to_kill, evfd);
// kill the process
pidfd_send_signal(pid_to_kill, ...)
// tend to other things
...
// wait for the process to die
poll_wait(evfd, ...);

This simplifies userspace, allows it to wait for multiple events using
epoll and I think kernel implementation will be also quite simple
because it already implements eventfd_signal() that takes care of
waitqueue handling.

If pidfd_send_signal could be extended to have an optional eventfd
parameter then we would not even have to add a new syscall.

> > I have no clear opinion on what is better at the moment since I have
> > been mostly concerned with getting pidfd_send_signal() into shape and
> > was reluctant to put more ideas/work into this if it gets shutdown.
> > Once we have pidfd_send_signal() the wait discussion makes sense.
>
> Ok. It looks like that is almost in though (fingers crossed :)).
>
> thanks,
>
>  - Joel
>

^ permalink raw reply	[flat|nested] 113+ messages in thread

* Re: [RFC] simple_lmk: Introduce Simple Low Memory Killer for Android
  2019-03-16 17:31                                 ` Suren Baghdasaryan
@ 2019-03-16 18:00                                   ` Daniel Colascione
  2019-03-16 18:57                                     ` Christian Brauner
  0 siblings, 1 reply; 113+ messages in thread
From: Daniel Colascione @ 2019-03-16 18:00 UTC (permalink / raw)
  To: Suren Baghdasaryan
  Cc: Joel Fernandes, Christian Brauner, Steven Rostedt,
	Sultan Alsawaf, Tim Murray, Michal Hocko, Greg Kroah-Hartman,
	Arve Hjønnevåg, Todd Kjos, Martijn Coenen, Ingo Molnar,
	Peter Zijlstra, LKML, open list:ANDROID DRIVERS, linux-mm,
	kernel-team

On Sat, Mar 16, 2019 at 10:31 AM Suren Baghdasaryan <surenb@google.com> wrote:
>
> On Fri, Mar 15, 2019 at 11:49 AM Joel Fernandes <joel@joelfernandes.org> wrote:
> >
> > On Fri, Mar 15, 2019 at 07:24:28PM +0100, Christian Brauner wrote:
> > [..]
> > > > why do we want to add a new syscall (pidfd_wait) though? Why not just use
> > > > standard poll/epoll interface on the proc fd like Daniel was suggesting.
> > > > AFAIK, once the proc file is opened, the struct pid is essentially pinned
> > > > even though the proc number may be reused. Then the caller can just poll.
> > > > We can add a waitqueue to struct pid, and wake up any waiters on process
> > > > death (A quick look shows task_struct can be mapped to its struct pid) and
> > > > also possibly optimize it using Steve's TIF flag idea. No new syscall is
> > > > needed then, let me know if I missed something?
> > >
> > > Huh, I thought that Daniel was against the poll/epoll solution?
> >
> > Hmm, going through earlier threads, I believe so now. Here was Daniel's
> > reasoning about avoiding a notification about process death through proc
> > directory fd: http://lkml.iu.edu/hypermail/linux/kernel/1811.0/00232.html
> >
> > May be a dedicated syscall for this would be cleaner after all.
>
> Ah, I wish I've seen that discussion before...
> syscall makes sense and it can be non-blocking and we can use
> select/poll/epoll if we use eventfd.

Thanks for taking a look.

> I would strongly advocate for
> non-blocking version or at least to have a non-blocking option.

Waiting for FD readiness is *already* blocking or non-blocking
according to the caller's desire --- users can pass options they want
to poll(2) or whatever. There's no need for any kind of special
configuration knob or non-blocking option. We already *have* a
non-blocking option that works universally for everything.

As I mentioned in the linked thread, waiting for process exit should
work just like waiting for bytes to appear on a pipe. Process exit
status is just another blob of bytes that a process might receive. A
process exit handle ought to be just another information source. The
reason the unix process API is so awful is that for whatever reason
the original designers treated processes as some special kind
of resource instead of fitting them into the otherwise general-purpose
unix data-handling API. Let's not repeat that mistake.

> Something like this:
>
> evfd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
> // register eventfd to receive death notification
> pidfd_wait(pid_to_kill, evfd);
> // kill the process
> pidfd_send_signal(pid_to_kill, ...)
> // tend to other things

Now you've lost me. pidfd_wait should return a *new* FD, not wire up
an eventfd.

Why? Because the new type FD can report process exit *status*
information (via read(2) after readability signal) as well as this
binary yes-or-no signal *that* a process exited, and this capability
is useful if you want the pidfd interface to be a good
general-purpose process management facility to replace the awful
wait() family of functions. You can't get an exit status from an
eventfd. Wiring up an eventfd the way you've proposed also complicates
wait-causality information, complicating both tracing and any priority
inheritance we might want in the future (because all the wakeups get
mixed into the eventfd and you can't unscramble an egg). And for what?
What do we gain by using an eventfd? Is the reason that exit.c would
be able to use eventfd_signal instead of poking a waitqueue directly?
How is that better? With an eventfd, you've increased path length on
process exit *and* complicated the API for no reason.

> ...
> // wait for the process to die
> poll_wait(evfd, ...);
>
> This simplifies userspace

Not relative to an exit handle it doesn't.

>, allows it to wait for multiple events using
> epoll

So does a process exit status handle.

> and I think kernel implementation will be also quite simple
> because it already implements eventfd_signal() that takes care of
> waitqueue handling.

What if there are multiple eventfds registered for the death of a
process? In any case, you need some mechanism to find, upon process
death, a list of waiters, then wake each of them up. That's either a
global search or a search in some list rooted in a task-related
structure (either struct task or one of its friends). Using an eventfd
here adds nothing, since upon death, you need this list search
regardless, and as I mentioned above, eventfd-wiring just makes the
API worse.

> If pidfd_send_signal could be extended to have an optional eventfd
> parameter then we would not even have to add a new syscall.

There is nothing wrong with adding a new system call. I don't know why
there's this idea circulating that adding system calls is something we
should bend over backwards to avoid. It's cheap, and support-wise,
kernel interface is kernel interface. Sending a signal has *nothing*
to do with wiring up some kind of notification and there's no reason
to mingle it with some kind of event registration.

^ permalink raw reply	[flat|nested] 113+ messages in thread

* Re: [RFC] simple_lmk: Introduce Simple Low Memory Killer for Android
  2019-03-16 18:00                                   ` Daniel Colascione
@ 2019-03-16 18:57                                     ` Christian Brauner
  2019-03-16 19:37                                       ` Suren Baghdasaryan
  0 siblings, 1 reply; 113+ messages in thread
From: Christian Brauner @ 2019-03-16 18:57 UTC (permalink / raw)
  To: Daniel Colascione
  Cc: Suren Baghdasaryan, Joel Fernandes, Steven Rostedt,
	Sultan Alsawaf, Tim Murray, Michal Hocko, Greg Kroah-Hartman,
	Arve Hjønnevåg, Todd Kjos, Martijn Coenen, Ingo Molnar,
	Peter Zijlstra, LKML, open list:ANDROID DRIVERS, linux-mm,
	kernel-team

On Sat, Mar 16, 2019 at 11:00:10AM -0700, Daniel Colascione wrote:
> On Sat, Mar 16, 2019 at 10:31 AM Suren Baghdasaryan <surenb@google.com> wrote:
> >
> > On Fri, Mar 15, 2019 at 11:49 AM Joel Fernandes <joel@joelfernandes.org> wrote:
> > >
> > > On Fri, Mar 15, 2019 at 07:24:28PM +0100, Christian Brauner wrote:
> > > [..]
> > > > > why do we want to add a new syscall (pidfd_wait) though? Why not just use
> > > > > standard poll/epoll interface on the proc fd like Daniel was suggesting.
> > > > > AFAIK, once the proc file is opened, the struct pid is essentially pinned
> > > > > even though the proc number may be reused. Then the caller can just poll.
> > > > > We can add a waitqueue to struct pid, and wake up any waiters on process
> > > > > death (A quick look shows task_struct can be mapped to its struct pid) and
> > > > > also possibly optimize it using Steve's TIF flag idea. No new syscall is
> > > > > needed then, let me know if I missed something?
> > > >
> > > > Huh, I thought that Daniel was against the poll/epoll solution?
> > >
> > > Hmm, going through earlier threads, I believe so now. Here was Daniel's
> > > reasoning about avoiding a notification about process death through proc
> > > directory fd: http://lkml.iu.edu/hypermail/linux/kernel/1811.0/00232.html
> > >
> > > May be a dedicated syscall for this would be cleaner after all.
> >
> > Ah, I wish I've seen that discussion before...
> > syscall makes sense and it can be non-blocking and we can use
> > select/poll/epoll if we use eventfd.
> 
> Thanks for taking a look.
> 
> > I would strongly advocate for
> > non-blocking version or at least to have a non-blocking option.
> 
> Waiting for FD readiness is *already* blocking or non-blocking
> according to the caller's desire --- users can pass options they want
> to poll(2) or whatever. There's no need for any kind of special
> configuration knob or non-blocking option. We already *have* a
> non-blocking option that works universally for everything.
> 
> As I mentioned in the linked thread, waiting for process exit should
> work just like waiting for bytes to appear on a pipe. Process exit
> status is just another blob of bytes that a process might receive. A
> process exit handle ought to be just another information source. The
> reason the unix process API is so awful is that for whatever reason
> the original designers treated processes as some kind of special kind
> of resource instead of fitting them into the otherwise general-purpose
> unix data-handling API. Let's not repeat that mistake.
> 
> > Something like this:
> >
> > evfd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
> > // register eventfd to receive death notification
> > pidfd_wait(pid_to_kill, evfd);
> > // kill the process
> > pidfd_send_signal(pid_to_kill, ...)
> > // tend to other things
> 
> Now you've lost me. pidfd_wait should return a *new* FD, not wire up
> an eventfd.
> 
> Why? Because the new type FD can report process exit *status*
> information (via read(2) after readability signal) as well as this
> binary yes-or-no signal *that* a process exited, and this capability
> is useful if you want to the pidfd interface to be a good
> general-purpose process management facility to replace the awful
> wait() family of functions. You can't get an exit status from an
> eventfd. Wiring up an eventfd the way you've proposed also complicates
> wait-causality information, complicating both tracing and any priority
> inheritance we might want in the future (because all the wakeups gets
> mixed into the eventfd and you can't unscramble an egg). And for what?
> What do we gain by using an eventfd? Is the reason that exit.c would
> be able to use eventfd_signal instead of poking a waitqueue directly?
> How is that better? With an eventfd, you've increased path length on
> process exit *and* complicated the API for no reason.
> 
> > ...
> > // wait for the process to die
> > poll_wait(evfd, ...);
> >
> > This simplifies userspace
> 
> Not relative to an exit handle it doesn't.
> 
> >, allows it to wait for multiple events using
> > epoll
> 
> So does a process exit status handle.
> 
> > and I think kernel implementation will be also quite simple
> > because it already implements eventfd_signal() that takes care of
> > waitqueue handling.
> 
> What if there are multiple eventfds registered for the death of a
> process? In any case, you need some mechanism to find, upon process
> death, a list of waiters, then wake each of them up. That's either a
> global search or a search in some list rooted in a task-related
> structure (either struct task or one of its friends). Using an eventfd
> here adds nothing, since upon death, you need this list search
> regardless, and as I mentioned above, eventfd-wiring just makes the
> API worse.
> 
> > If pidfd_send_signal could be extended to have an optional eventfd
> > parameter then we would not even have to add a new syscall.
> 
> There is nothing wrong with adding a new system call. I don't know why
> there's this idea circulating that adding system calls is something we
> should bend over backwards to avoid. It's cheap, and support-wise,
> kernel interface is kernel interface. Sending a signal has *nothing*
> to do with wiring up some kind of notification and there's no reason
> to mingle it with some kind of event registration.


I agree with Daniel.
One design goal is to not stuff clearly delineated tasks related to
process management into the same syscall. That will just leave us with a
confusing api. Sending signals is part of managing a process while it is
running. Waiting on a process to end is clearly separate from that.
It's important to keep in mind that the goal of the pidfd work is to end
up with an api that is of use to all of user space concerned with
process management not just a specific project.

^ permalink raw reply	[flat|nested] 113+ messages in thread

* Re: [RFC] simple_lmk: Introduce Simple Low Memory Killer for Android
  2019-03-16 18:57                                     ` Christian Brauner
@ 2019-03-16 19:37                                       ` Suren Baghdasaryan
  2019-03-17  1:53                                         ` Joel Fernandes
  0 siblings, 1 reply; 113+ messages in thread
From: Suren Baghdasaryan @ 2019-03-16 19:37 UTC (permalink / raw)
  To: Christian Brauner
  Cc: Daniel Colascione, Joel Fernandes, Steven Rostedt,
	Sultan Alsawaf, Tim Murray, Michal Hocko, Greg Kroah-Hartman,
	Arve Hjønnevåg, Todd Kjos, Martijn Coenen, Ingo Molnar,
	Peter Zijlstra, LKML, open list:ANDROID DRIVERS, linux-mm,
	kernel-team

On Sat, Mar 16, 2019 at 11:57 AM Christian Brauner <christian@brauner.io> wrote:
>
> On Sat, Mar 16, 2019 at 11:00:10AM -0700, Daniel Colascione wrote:
> > On Sat, Mar 16, 2019 at 10:31 AM Suren Baghdasaryan <surenb@google.com> wrote:
> > >
> > > On Fri, Mar 15, 2019 at 11:49 AM Joel Fernandes <joel@joelfernandes.org> wrote:
> > > >
> > > > On Fri, Mar 15, 2019 at 07:24:28PM +0100, Christian Brauner wrote:
> > > > [..]
> > > > > > why do we want to add a new syscall (pidfd_wait) though? Why not just use
> > > > > > standard poll/epoll interface on the proc fd like Daniel was suggesting.
> > > > > > AFAIK, once the proc file is opened, the struct pid is essentially pinned
> > > > > > even though the proc number may be reused. Then the caller can just poll.
> > > > > > We can add a waitqueue to struct pid, and wake up any waiters on process
> > > > > > death (A quick look shows task_struct can be mapped to its struct pid) and
> > > > > > also possibly optimize it using Steve's TIF flag idea. No new syscall is
> > > > > > needed then, let me know if I missed something?
> > > > >
> > > > > Huh, I thought that Daniel was against the poll/epoll solution?
> > > >
> > > > Hmm, going through earlier threads, I believe so now. Here was Daniel's
> > > > reasoning about avoiding a notification about process death through proc
> > > > directory fd: http://lkml.iu.edu/hypermail/linux/kernel/1811.0/00232.html
> > > >
> > > > May be a dedicated syscall for this would be cleaner after all.
> > >
> > > Ah, I wish I've seen that discussion before...
> > > syscall makes sense and it can be non-blocking and we can use
> > > select/poll/epoll if we use eventfd.
> >
> > Thanks for taking a look.
> >
> > > I would strongly advocate for
> > > non-blocking version or at least to have a non-blocking option.
> >
> > Waiting for FD readiness is *already* blocking or non-blocking
> > according to the caller's desire --- users can pass options they want
> > to poll(2) or whatever. There's no need for any kind of special
> > configuration knob or non-blocking option. We already *have* a
> > non-blocking option that works universally for everything.
> >
> > As I mentioned in the linked thread, waiting for process exit should
> > work just like waiting for bytes to appear on a pipe. Process exit
> > status is just another blob of bytes that a process might receive. A
> > process exit handle ought to be just another information source. The
> > reason the unix process API is so awful is that for whatever reason
> > the original designers treated processes as some kind of special kind
> > of resource instead of fitting them into the otherwise general-purpose
> > unix data-handling API. Let's not repeat that mistake.
> >
> > > Something like this:
> > >
> > > evfd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
> > > // register eventfd to receive death notification
> > > pidfd_wait(pid_to_kill, evfd);
> > > // kill the process
> > > pidfd_send_signal(pid_to_kill, ...)
> > > // tend to other things
> >
> > Now you've lost me. pidfd_wait should return a *new* FD, not wire up
> > an eventfd.
> >

Ok, I probably misunderstood your post linked by Joel. I thought your
original proposal was based on being able to poll a file under
/proc/pid and then you changed your mind to have a separate syscall
which I assumed would be a blocking one to wait for process exit.
Maybe you can describe the new interface you are thinking about in
terms of userspace usage like I did above? Several lines of code would
explain more than paragraphs of text.

> > Why? Because the new type FD can report process exit *status*
> > information (via read(2) after readability signal) as well as this
> > binary yes-or-no signal *that* a process exited, and this capability
> > is useful if you want to the pidfd interface to be a good
> > general-purpose process management facility to replace the awful
> > wait() family of functions. You can't get an exit status from an
> > eventfd. Wiring up an eventfd the way you've proposed also complicates
> > wait-causality information, complicating both tracing and any priority
> > inheritance we might want in the future (because all the wakeups gets
> > mixed into the eventfd and you can't unscramble an egg). And for what?
> > What do we gain by using an eventfd? Is the reason that exit.c would
> > be able to use eventfd_signal instead of poking a waitqueue directly?
> > How is that better? With an eventfd, you've increased path length on
> > process exit *and* complicated the API for no reason.
> >
> > > ...
> > > // wait for the process to die
> > > poll_wait(evfd, ...);
> > >
> > > This simplifies userspace
> >
> > Not relative to an exit handle it doesn't.
> >
> > >, allows it to wait for multiple events using
> > > epoll
> >
> > So does a process exit status handle.
> >
> > > and I think kernel implementation will be also quite simple
> > > because it already implements eventfd_signal() that takes care of
> > > waitqueue handling.
> >
> > What if there are multiple eventfds registered for the death of a
> > process? In any case, you need some mechanism to find, upon process
> > death, a list of waiters, then wake each of them up. That's either a
> > global search or a search in some list rooted in a task-related
> > structure (either struct task or one of its friends). Using an eventfd
> > here adds nothing, since upon death, you need this list search
> > regardless, and as I mentioned above, eventfd-wiring just makes the
> > API worse.
> >
> > > If pidfd_send_signal could be extended to have an optional eventfd
> > > parameter then we would not even have to add a new syscall.
> >
> > There is nothing wrong with adding a new system call. I don't know why
> > there's this idea circulating that adding system calls is something we
> > should bend over backwards to avoid. It's cheap, and support-wise,
> > kernel interface is kernel interface. Sending a signal has *nothing*
> > to do with wiring up some kind of notification and there's no reason
> > to mingle it with some kind of event registration.
>
>
> I agree with Daniel.
> One design goal is to not stuff clearly delinated tasks related to
> process management into the same syscall. That will just leave us with a
> confusing api. Sending signals is part of managing a process while it is
> running. Waiting on a process to end is clearly separate from that.
> It's important to keep in mind that the goal of the pidfd work is to end
> up with an api that is of use to all of user space concerned with
> process management not just a specific project.

I'm not bent on adding or not adding a new syscall as long as
functionality is there.
Thanks!

^ permalink raw reply	[flat|nested] 113+ messages in thread

* Re: [RFC] simple_lmk: Introduce Simple Low Memory Killer for Android
  2019-03-16 19:37                                       ` Suren Baghdasaryan
@ 2019-03-17  1:53                                         ` Joel Fernandes
  2019-03-17 11:42                                           ` Christian Brauner
  0 siblings, 1 reply; 113+ messages in thread
From: Joel Fernandes @ 2019-03-17  1:53 UTC (permalink / raw)
  To: Suren Baghdasaryan
  Cc: Christian Brauner, Daniel Colascione, Steven Rostedt,
	Sultan Alsawaf, Tim Murray, Michal Hocko, Greg Kroah-Hartman,
	Arve Hjønnevåg, Todd Kjos, Martijn Coenen, Ingo Molnar,
	Peter Zijlstra, LKML, open list:ANDROID DRIVERS, linux-mm,
	kernel-team

On Sat, Mar 16, 2019 at 12:37:18PM -0700, Suren Baghdasaryan wrote:
> On Sat, Mar 16, 2019 at 11:57 AM Christian Brauner <christian@brauner.io> wrote:
> >
> > On Sat, Mar 16, 2019 at 11:00:10AM -0700, Daniel Colascione wrote:
> > > On Sat, Mar 16, 2019 at 10:31 AM Suren Baghdasaryan <surenb@google.com> wrote:
> > > >
> > > > On Fri, Mar 15, 2019 at 11:49 AM Joel Fernandes <joel@joelfernandes.org> wrote:
> > > > >
> > > > > On Fri, Mar 15, 2019 at 07:24:28PM +0100, Christian Brauner wrote:
> > > > > [..]
> > > > > > > why do we want to add a new syscall (pidfd_wait) though? Why not just use
> > > > > > > standard poll/epoll interface on the proc fd like Daniel was suggesting.
> > > > > > > AFAIK, once the proc file is opened, the struct pid is essentially pinned
> > > > > > > even though the proc number may be reused. Then the caller can just poll.
> > > > > > > We can add a waitqueue to struct pid, and wake up any waiters on process
> > > > > > > death (A quick look shows task_struct can be mapped to its struct pid) and
> > > > > > > also possibly optimize it using Steve's TIF flag idea. No new syscall is
> > > > > > > needed then, let me know if I missed something?
> > > > > >
> > > > > > Huh, I thought that Daniel was against the poll/epoll solution?
> > > > >
> > > > > Hmm, going through earlier threads, I believe so now. Here was Daniel's
> > > > > reasoning about avoiding a notification about process death through proc
> > > > > directory fd: http://lkml.iu.edu/hypermail/linux/kernel/1811.0/00232.html
> > > > >
> > > > > May be a dedicated syscall for this would be cleaner after all.
> > > >
> > > > Ah, I wish I've seen that discussion before...
> > > > syscall makes sense and it can be non-blocking and we can use
> > > > select/poll/epoll if we use eventfd.
> > >
> > > Thanks for taking a look.
> > >
> > > > I would strongly advocate for
> > > > non-blocking version or at least to have a non-blocking option.
> > >
> > > Waiting for FD readiness is *already* blocking or non-blocking
> > > according to the caller's desire --- users can pass options they want
> > > to poll(2) or whatever. There's no need for any kind of special
> > > configuration knob or non-blocking option. We already *have* a
> > > non-blocking option that works universally for everything.
> > >
> > > As I mentioned in the linked thread, waiting for process exit should
> > > work just like waiting for bytes to appear on a pipe. Process exit
> > > status is just another blob of bytes that a process might receive. A
> > > process exit handle ought to be just another information source. The
> > > reason the unix process API is so awful is that for whatever reason
> > > the original designers treated processes as some kind of special kind
> > > of resource instead of fitting them into the otherwise general-purpose
> > > unix data-handling API. Let's not repeat that mistake.
> > >
> > > > Something like this:
> > > >
> > > > evfd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
> > > > // register eventfd to receive death notification
> > > > pidfd_wait(pid_to_kill, evfd);
> > > > // kill the process
> > > > pidfd_send_signal(pid_to_kill, ...)
> > > > // tend to other things
> > >
> > > Now you've lost me. pidfd_wait should return a *new* FD, not wire up
> > > an eventfd.
> > >
> 
> Ok, I probably misunderstood your post linked by Joel. I though your
> original proposal was based on being able to poll a file under
> /proc/pid and then you changed your mind to have a separate syscall
> which I assumed would be a blocking one to wait for process exit.
> Maybe you can describe the new interface you are thinking about in
> terms of userspace usage like I did above? Several lines of code would
> explain more than paragraphs of text.

Hey, Thanks Suren for the eventfd idea. I agree with Daniel on this. The idea
from Daniel here is to wait for process death and exit events by just
referring to a stable fd, independent of whatever is going on in /proc.

What is needed is something like this (in highly pseudo-code form):

pidfd = opendir("/proc/<pid>",..);
wait_fd = pidfd_wait(pidfd);
read or poll wait_fd (non-blocking or blocking whichever)

wait_fd will block until the task has either died or been reaped. In both these
cases, it can return a suitable string such as "dead" or "reaped" although an
integer with some predefined meaning is also Ok.

What that guarantees is, even if the task's PID has been reused, or the task
has already died or already died + reaped, all of these events cannot race
with the code above and the information passed to the user is race-free and
stable / guaranteed.

An eventfd seems to not fit well, because AFAICS passing the raw PID to
eventfd as in your example would still race since the PID could have been
reused by another process by the time the eventfd is created.

Also Andy's idea in [1] seems to use poll flags to communicate various things
which is still not as explicit about the PID's status so that's a poor API
choice compared to the explicit syscall.

I am planning to work on a prototype patch based on Daniel's idea and post something
soon (chatted with Daniel about it and will reference him in the posting as
well), during this posting I will also summarize all the previous discussions
and come up with some tests as well.  I hope to have something soon.

Let me know if I hit all the points correctly and I hope we are all on the
same page.

Thanks!

 - Joel

[1] http://lkml.iu.edu/hypermail//linux/kernel/1212.0/00808.html


^ permalink raw reply	[flat|nested] 113+ messages in thread

* Re: [RFC] simple_lmk: Introduce Simple Low Memory Killer for Android
  2019-03-17  1:53                                         ` Joel Fernandes
@ 2019-03-17 11:42                                           ` Christian Brauner
  2019-03-17 15:40                                             ` Daniel Colascione
  2019-03-17 16:35                                             ` Serge E. Hallyn
  0 siblings, 2 replies; 113+ messages in thread
From: Christian Brauner @ 2019-03-17 11:42 UTC (permalink / raw)
  To: Joel Fernandes
  Cc: Suren Baghdasaryan, Daniel Colascione, Steven Rostedt,
	Sultan Alsawaf, Tim Murray, Michal Hocko, Greg Kroah-Hartman,
	Arve Hjønnevåg, Todd Kjos, Martijn Coenen, Ingo Molnar,
	Peter Zijlstra, LKML, open list:ANDROID DRIVERS, linux-mm,
	kernel-team, oleg, luto, serge

On Sat, Mar 16, 2019 at 09:53:06PM -0400, Joel Fernandes wrote:
> On Sat, Mar 16, 2019 at 12:37:18PM -0700, Suren Baghdasaryan wrote:
> > On Sat, Mar 16, 2019 at 11:57 AM Christian Brauner <christian@brauner.io> wrote:
> > >
> > > On Sat, Mar 16, 2019 at 11:00:10AM -0700, Daniel Colascione wrote:
> > > > On Sat, Mar 16, 2019 at 10:31 AM Suren Baghdasaryan <surenb@google.com> wrote:
> > > > >
> > > > > On Fri, Mar 15, 2019 at 11:49 AM Joel Fernandes <joel@joelfernandes.org> wrote:
> > > > > >
> > > > > > On Fri, Mar 15, 2019 at 07:24:28PM +0100, Christian Brauner wrote:
> > > > > > [..]
> > > > > > > > why do we want to add a new syscall (pidfd_wait) though? Why not just use
> > > > > > > > standard poll/epoll interface on the proc fd like Daniel was suggesting.
> > > > > > > > AFAIK, once the proc file is opened, the struct pid is essentially pinned
> > > > > > > > even though the proc number may be reused. Then the caller can just poll.
> > > > > > > > We can add a waitqueue to struct pid, and wake up any waiters on process
> > > > > > > > death (A quick look shows task_struct can be mapped to its struct pid) and
> > > > > > > > also possibly optimize it using Steve's TIF flag idea. No new syscall is
> > > > > > > > needed then, let me know if I missed something?
> > > > > > >
> > > > > > > Huh, I thought that Daniel was against the poll/epoll solution?
> > > > > >
> > > > > > Hmm, going through earlier threads, I believe so now. Here was Daniel's
> > > > > > reasoning about avoiding a notification about process death through proc
> > > > > > directory fd: http://lkml.iu.edu/hypermail/linux/kernel/1811.0/00232.html
> > > > > >
> > > > > > May be a dedicated syscall for this would be cleaner after all.
> > > > >
> > > > > Ah, I wish I've seen that discussion before...
> > > > > syscall makes sense and it can be non-blocking and we can use
> > > > > select/poll/epoll if we use eventfd.
> > > >
> > > > Thanks for taking a look.
> > > >
> > > > > I would strongly advocate for
> > > > > non-blocking version or at least to have a non-blocking option.
> > > >
> > > > Waiting for FD readiness is *already* blocking or non-blocking
> > > > according to the caller's desire --- users can pass options they want
> > > > to poll(2) or whatever. There's no need for any kind of special
> > > > configuration knob or non-blocking option. We already *have* a
> > > > non-blocking option that works universally for everything.
> > > >
> > > > As I mentioned in the linked thread, waiting for process exit should
> > > > work just like waiting for bytes to appear on a pipe. Process exit
> > > > status is just another blob of bytes that a process might receive. A
> > > > process exit handle ought to be just another information source. The
> > > > reason the unix process API is so awful is that for whatever reason
> > > > the original designers treated processes as some kind of special kind
> > > > of resource instead of fitting them into the otherwise general-purpose
> > > > unix data-handling API. Let's not repeat that mistake.
> > > >
> > > > > Something like this:
> > > > >
> > > > > evfd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
> > > > > // register eventfd to receive death notification
> > > > > pidfd_wait(pid_to_kill, evfd);
> > > > > // kill the process
> > > > > pidfd_send_signal(pid_to_kill, ...)
> > > > > // tend to other things
> > > >
> > > > Now you've lost me. pidfd_wait should return a *new* FD, not wire up
> > > > an eventfd.
> > > >
> > 
> > Ok, I probably misunderstood your post linked by Joel. I though your
> > original proposal was based on being able to poll a file under
> > /proc/pid and then you changed your mind to have a separate syscall
> > which I assumed would be a blocking one to wait for process exit.
> > Maybe you can describe the new interface you are thinking about in
> > terms of userspace usage like I did above? Several lines of code would
> > explain more than paragraphs of text.
> 
> Hey, Thanks Suren for the eventfd idea. I agree with Daniel on this. The idea
> from Daniel here is to wait for process death and exit events by just
> referring to a stable fd, independent of whatever is going on in /proc.
> 
> What is needed is something like this (in highly pseudo-code form):
> 
> pidfd = opendir("/proc/<pid>",..);
> wait_fd = pidfd_wait(pidfd);
> read or poll wait_fd (non-blocking or blocking whichever)
> 
> wait_fd will block until the task has either died or reaped. In both these
> cases, it can return a suitable string such as "dead" or "reaped" although an
> integer with some predefined meaning is also Ok.
> 
> What that guarantees is, even if the task's PID has been reused, or the task
> has already died or already died + reaped, all of these events cannot race
> with the code above and the information passed to the user is race-free and
> stable / guaranteed.
> 
> An eventfd seems to not fit well, because AFAICS passing the raw PID to
> eventfd as in your example would still race since the PID could have been
> reused by another process by the time the eventfd is created.
> 
> Also Andy's idea in [1] seems to use poll flags to communicate various tihngs
> which is still not as explicit about the PID's status so that's a poor API
> choice compared to the explicit syscall.
> 
> I am planning to work on a prototype patch based on Daniel's idea and post something
> soon (chatted with Daniel about it and will reference him in the posting as
> well), during this posting I will also summarize all the previous discussions
> and come up with some tests as well.  I hope to have something soon.

Having pidfd_wait() return another fd will make the syscall harder to
swallow for a lot of people I reckon.
What exactly prevents us from making the pidfd itself readable/pollable
for the exit status? They are "special" fds anyway. I would really like
to avoid polluting the api with multiple different types of fds if possible.

ret = pidfd_wait(pidfd);
read or poll pidfd
(Note that I'm traveling so my responses might be delayed quite a bit.)
(Ccing a few people that might have an opinion here.)

Christian

^ permalink raw reply	[flat|nested] 113+ messages in thread

* Re: [RFC] simple_lmk: Introduce Simple Low Memory Killer for Android
  2019-03-17 11:42                                           ` Christian Brauner
@ 2019-03-17 15:40                                             ` Daniel Colascione
  2019-03-18  0:29                                               ` Christian Brauner
  2019-03-17 16:35                                             ` Serge E. Hallyn
  1 sibling, 1 reply; 113+ messages in thread
From: Daniel Colascione @ 2019-03-17 15:40 UTC (permalink / raw)
  To: Christian Brauner
  Cc: Joel Fernandes, Suren Baghdasaryan, Steven Rostedt,
	Sultan Alsawaf, Tim Murray, Michal Hocko, Greg Kroah-Hartman,
	Arve Hjønnevåg, Todd Kjos, Martijn Coenen, Ingo Molnar,
	Peter Zijlstra, LKML, open list:ANDROID DRIVERS, linux-mm,
	kernel-team, Oleg Nesterov, Andy Lutomirski, Serge E. Hallyn

On Sun, Mar 17, 2019 at 4:42 AM Christian Brauner <christian@brauner.io> wrote:
>
> On Sat, Mar 16, 2019 at 09:53:06PM -0400, Joel Fernandes wrote:
> > On Sat, Mar 16, 2019 at 12:37:18PM -0700, Suren Baghdasaryan wrote:
> > > On Sat, Mar 16, 2019 at 11:57 AM Christian Brauner <christian@brauner.io> wrote:
> > > >
> > > > On Sat, Mar 16, 2019 at 11:00:10AM -0700, Daniel Colascione wrote:
> > > > > On Sat, Mar 16, 2019 at 10:31 AM Suren Baghdasaryan <surenb@google.com> wrote:
> > > > > >
> > > > > > On Fri, Mar 15, 2019 at 11:49 AM Joel Fernandes <joel@joelfernandes.org> wrote:
> > > > > > >
> > > > > > > On Fri, Mar 15, 2019 at 07:24:28PM +0100, Christian Brauner wrote:
> > > > > > > [..]
> > > > > > > > > why do we want to add a new syscall (pidfd_wait) though? Why not just use
> > > > > > > > > standard poll/epoll interface on the proc fd like Daniel was suggesting.
> > > > > > > > > AFAIK, once the proc file is opened, the struct pid is essentially pinned
> > > > > > > > > even though the proc number may be reused. Then the caller can just poll.
> > > > > > > > > We can add a waitqueue to struct pid, and wake up any waiters on process
> > > > > > > > > death (A quick look shows task_struct can be mapped to its struct pid) and
> > > > > > > > > also possibly optimize it using Steve's TIF flag idea. No new syscall is
> > > > > > > > > needed then, let me know if I missed something?
> > > > > > > >
> > > > > > > > Huh, I thought that Daniel was against the poll/epoll solution?
> > > > > > >
> > > > > > > Hmm, going through earlier threads, I believe so now. Here was Daniel's
> > > > > > > reasoning about avoiding a notification about process death through proc
> > > > > > > directory fd: http://lkml.iu.edu/hypermail/linux/kernel/1811.0/00232.html
> > > > > > >
> > > > > > > May be a dedicated syscall for this would be cleaner after all.
> > > > > >
> > > > > > Ah, I wish I've seen that discussion before...
> > > > > > syscall makes sense and it can be non-blocking and we can use
> > > > > > select/poll/epoll if we use eventfd.
> > > > >
> > > > > Thanks for taking a look.
> > > > >
> > > > > > I would strongly advocate for
> > > > > > non-blocking version or at least to have a non-blocking option.
> > > > >
> > > > > Waiting for FD readiness is *already* blocking or non-blocking
> > > > > according to the caller's desire --- users can pass options they want
> > > > > to poll(2) or whatever. There's no need for any kind of special
> > > > > configuration knob or non-blocking option. We already *have* a
> > > > > non-blocking option that works universally for everything.
> > > > >
> > > > > As I mentioned in the linked thread, waiting for process exit should
> > > > > work just like waiting for bytes to appear on a pipe. Process exit
> > > > > status is just another blob of bytes that a process might receive. A
> > > > > process exit handle ought to be just another information source. The
> > > > > reason the unix process API is so awful is that for whatever reason
> > > > > the original designers treated processes as some kind of special kind
> > > > > of resource instead of fitting them into the otherwise general-purpose
> > > > > unix data-handling API. Let's not repeat that mistake.
> > > > >
> > > > > > Something like this:
> > > > > >
> > > > > > evfd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
> > > > > > // register eventfd to receive death notification
> > > > > > pidfd_wait(pid_to_kill, evfd);
> > > > > > // kill the process
> > > > > > pidfd_send_signal(pid_to_kill, ...)
> > > > > > // tend to other things
> > > > >
> > > > > Now you've lost me. pidfd_wait should return a *new* FD, not wire up
> > > > > an eventfd.
> > > > >
> > >
> > > Ok, I probably misunderstood your post linked by Joel. I though your
> > > original proposal was based on being able to poll a file under
> > > /proc/pid and then you changed your mind to have a separate syscall
> > > which I assumed would be a blocking one to wait for process exit.
> > > Maybe you can describe the new interface you are thinking about in
> > > terms of userspace usage like I did above? Several lines of code would
> > > explain more than paragraphs of text.
> >
> > Hey, Thanks Suren for the eventfd idea. I agree with Daniel on this. The idea
> > from Daniel here is to wait for process death and exit events by just
> > referring to a stable fd, independent of whatever is going on in /proc.
> >
> > What is needed is something like this (in highly pseudo-code form):
> >
> > pidfd = opendir("/proc/<pid>",..);
> > wait_fd = pidfd_wait(pidfd);
> > read or poll wait_fd (non-blocking or blocking whichever)
> >
> > wait_fd will block until the task has either died or reaped. In both these
> > cases, it can return a suitable string such as "dead" or "reaped" although an
> > integer with some predefined meaning is also Ok.

I want to return a siginfo_t: we already use this structure in other
contexts to report exit status.

> > What that guarantees is, even if the task's PID has been reused, or the task
> > has already died or already died + reaped, all of these events cannot race
> > with the code above and the information passed to the user is race-free and
> > stable / guaranteed.
> >
> > An eventfd seems to not fit well, because AFAICS passing the raw PID to
> > eventfd as in your example would still race since the PID could have been
> > reused by another process by the time the eventfd is created.
> >
> > Also Andy's idea in [1] seems to use poll flags to communicate various tihngs
> > which is still not as explicit about the PID's status so that's a poor API
> > choice compared to the explicit syscall.
> >
> > I am planning to work on a prototype patch based on Daniel's idea and post something
> > soon (chatted with Daniel about it and will reference him in the posting as
> > well), during this posting I will also summarize all the previous discussions
> > and come up with some tests as well.  I hope to have something soon.

Thanks.

> Having pidfd_wait() return another fd will make the syscall harder to
> swallow for a lot of people I reckon.
> What exactly prevents us from making the pidfd itself readable/pollable
> for the exit staus? They are "special" fds anyway. I would really like
> to avoid polluting the api with multiple different types of fds if possible.

If pidfds had been their own file type, I'd agree with you. But pidfds
are directories, which means that we're beholden to make them behave
like directories normally do. I'd rather introduce another FD than
heavily overload the semantics of a directory FD in one particular
context. In no other circumstances are directory FDs also weird
IO-data sources. Our providing a facility to get a new FD to which we
*can* give pipe-like behavior does no harm and makes *usage* cleaner and
easier to reason about.

^ permalink raw reply	[flat|nested] 113+ messages in thread

* Re: [RFC] simple_lmk: Introduce Simple Low Memory Killer for Android
  2019-03-17 11:42                                           ` Christian Brauner
  2019-03-17 15:40                                             ` Daniel Colascione
@ 2019-03-17 16:35                                             ` Serge E. Hallyn
  2019-03-17 17:11                                               ` Daniel Colascione
  1 sibling, 1 reply; 113+ messages in thread
From: Serge E. Hallyn @ 2019-03-17 16:35 UTC (permalink / raw)
  To: Christian Brauner
  Cc: Joel Fernandes, Suren Baghdasaryan, Daniel Colascione,
	Steven Rostedt, Sultan Alsawaf, Tim Murray, Michal Hocko,
	Greg Kroah-Hartman, Arve Hjønnevåg, Todd Kjos,
	Martijn Coenen, Ingo Molnar, Peter Zijlstra, LKML,
	open list:ANDROID DRIVERS, linux-mm, kernel-team, oleg, luto,
	serge

On Sun, Mar 17, 2019 at 12:42:40PM +0100, Christian Brauner wrote:
> On Sat, Mar 16, 2019 at 09:53:06PM -0400, Joel Fernandes wrote:
> > On Sat, Mar 16, 2019 at 12:37:18PM -0700, Suren Baghdasaryan wrote:
> > > On Sat, Mar 16, 2019 at 11:57 AM Christian Brauner <christian@brauner.io> wrote:
> > > >
> > > > On Sat, Mar 16, 2019 at 11:00:10AM -0700, Daniel Colascione wrote:
> > > > > On Sat, Mar 16, 2019 at 10:31 AM Suren Baghdasaryan <surenb@google.com> wrote:
> > > > > >
> > > > > > On Fri, Mar 15, 2019 at 11:49 AM Joel Fernandes <joel@joelfernandes.org> wrote:
> > > > > > >
> > > > > > > On Fri, Mar 15, 2019 at 07:24:28PM +0100, Christian Brauner wrote:
> > > > > > > [..]
> > > > > > > > > why do we want to add a new syscall (pidfd_wait) though? Why not just use
> > > > > > > > > standard poll/epoll interface on the proc fd like Daniel was suggesting.
> > > > > > > > > AFAIK, once the proc file is opened, the struct pid is essentially pinned
> > > > > > > > > even though the proc number may be reused. Then the caller can just poll.
> > > > > > > > > We can add a waitqueue to struct pid, and wake up any waiters on process
> > > > > > > > > death (A quick look shows task_struct can be mapped to its struct pid) and
> > > > > > > > > also possibly optimize it using Steve's TIF flag idea. No new syscall is
> > > > > > > > > needed then, let me know if I missed something?
> > > > > > > >
> > > > > > > > Huh, I thought that Daniel was against the poll/epoll solution?
> > > > > > >
> > > > > > > Hmm, going through earlier threads, I believe so now. Here was Daniel's
> > > > > > > reasoning about avoiding a notification about process death through proc
> > > > > > > directory fd: http://lkml.iu.edu/hypermail/linux/kernel/1811.0/00232.html
> > > > > > >
> > > > > > > May be a dedicated syscall for this would be cleaner after all.
> > > > > >
> > > > > > Ah, I wish I've seen that discussion before...
> > > > > > syscall makes sense and it can be non-blocking and we can use
> > > > > > select/poll/epoll if we use eventfd.
> > > > >
> > > > > Thanks for taking a look.
> > > > >
> > > > > > I would strongly advocate for
> > > > > > non-blocking version or at least to have a non-blocking option.
> > > > >
> > > > > Waiting for FD readiness is *already* blocking or non-blocking
> > > > > according to the caller's desire --- users can pass options they want
> > > > > to poll(2) or whatever. There's no need for any kind of special
> > > > > configuration knob or non-blocking option. We already *have* a
> > > > > non-blocking option that works universally for everything.
> > > > >
> > > > > As I mentioned in the linked thread, waiting for process exit should
> > > > > work just like waiting for bytes to appear on a pipe. Process exit
> > > > > status is just another blob of bytes that a process might receive. A
> > > > > process exit handle ought to be just another information source. The
> > > > > reason the unix process API is so awful is that for whatever reason
> > > > > the original designers treated processes as some kind of special kind
> > > > > of resource instead of fitting them into the otherwise general-purpose
> > > > > unix data-handling API. Let's not repeat that mistake.
> > > > >
> > > > > > Something like this:
> > > > > >
> > > > > > evfd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
> > > > > > // register eventfd to receive death notification
> > > > > > pidfd_wait(pid_to_kill, evfd);
> > > > > > // kill the process
> > > > > > pidfd_send_signal(pid_to_kill, ...)
> > > > > > // tend to other things
> > > > >
> > > > > Now you've lost me. pidfd_wait should return a *new* FD, not wire up
> > > > > an eventfd.
> > > > >
> > > 
> > > Ok, I probably misunderstood your post linked by Joel. I though your
> > > original proposal was based on being able to poll a file under
> > > /proc/pid and then you changed your mind to have a separate syscall
> > > which I assumed would be a blocking one to wait for process exit.
> > > Maybe you can describe the new interface you are thinking about in
> > > terms of userspace usage like I did above? Several lines of code would
> > > explain more than paragraphs of text.
> > 
> > Hey, Thanks Suren for the eventfd idea. I agree with Daniel on this. The idea
> > from Daniel here is to wait for process death and exit events by just
> > referring to a stable fd, independent of whatever is going on in /proc.
> > 
> > What is needed is something like this (in highly pseudo-code form):
> > 
> > pidfd = opendir("/proc/<pid>",..);
> > wait_fd = pidfd_wait(pidfd);
> > read or poll wait_fd (non-blocking or blocking whichever)
> > 
> > wait_fd will block until the task has either died or reaped. In both these
> > cases, it can return a suitable string such as "dead" or "reaped" although an
> > integer with some predefined meaning is also Ok.
> > 
> > What that guarantees is, even if the task's PID has been reused, or the task
> > has already died or already died + reaped, all of these events cannot race
> > with the code above and the information passed to the user is race-free and
> > stable / guaranteed.
> > 
> > An eventfd seems to not fit well, because AFAICS passing the raw PID to
> > eventfd as in your example would still race since the PID could have been
> > reused by another process by the time the eventfd is created.
> > 
> > Also Andy's idea in [1] seems to use poll flags to communicate various tihngs
> > which is still not as explicit about the PID's status so that's a poor API
> > choice compared to the explicit syscall.
> > 
> > I am planning to work on a prototype patch based on Daniel's idea and post something
> > soon (chatted with Daniel about it and will reference him in the posting as
> > well), during this posting I will also summarize all the previous discussions
> > and come up with some tests as well.  I hope to have something soon.
> 
> Having pidfd_wait() return another fd will make the syscall harder to
> swallow for a lot of people I reckon.
> What exactly prevents us from making the pidfd itself readable/pollable
> for the exit staus? They are "special" fds anyway. I would really like
> to avoid polluting the api with multiple different types of fds if possible.
> 
> ret = pidfd_wait(pidfd);
> read or poll pidfd

I'm not quite clear on what the two steps are doing here.  Is pidfd_wait()
doing a waitpid(2), and the read gets exit status info?

> (Note that I'm traveling so my responses might be delayed quite a bit.)
> (Ccing a few people that might have an opinion here.)
> 
> Christian

On its own, what you (Christian) show seems nicer.  But think about a main event
loop (like in lxc), where we just loop over epoll_wait() on various descriptors.
If we want to wait for any of several types of events - maybe a signalfd, socket
traffic, or a process death - it would be nice if we can treat them all the same
way, without having to set up a separate thread to watch the pidfd and send
data over another fd.  Is there a nice way we can provide that with what you've
got above?

-serge

^ permalink raw reply	[flat|nested] 113+ messages in thread

* Re: [RFC] simple_lmk: Introduce Simple Low Memory Killer for Android
  2019-03-17 16:35                                             ` Serge E. Hallyn
@ 2019-03-17 17:11                                               ` Daniel Colascione
  2019-03-17 17:16                                                 ` Serge E. Hallyn
  0 siblings, 1 reply; 113+ messages in thread
From: Daniel Colascione @ 2019-03-17 17:11 UTC (permalink / raw)
  To: Serge E. Hallyn
  Cc: Christian Brauner, Joel Fernandes, Suren Baghdasaryan,
	Steven Rostedt, Sultan Alsawaf, Tim Murray, Michal Hocko,
	Greg Kroah-Hartman, Arve Hjønnevåg, Todd Kjos,
	Martijn Coenen, Ingo Molnar, Peter Zijlstra, LKML,
	open list:ANDROID DRIVERS, linux-mm, kernel-team, Oleg Nesterov,
	Andy Lutomirski

On Sun, Mar 17, 2019 at 9:35 AM Serge E. Hallyn <serge@hallyn.com> wrote:
>
> On Sun, Mar 17, 2019 at 12:42:40PM +0100, Christian Brauner wrote:
> > On Sat, Mar 16, 2019 at 09:53:06PM -0400, Joel Fernandes wrote:
> > > On Sat, Mar 16, 2019 at 12:37:18PM -0700, Suren Baghdasaryan wrote:
> > > > On Sat, Mar 16, 2019 at 11:57 AM Christian Brauner <christian@brauner.io> wrote:
> > > > >
> > > > > On Sat, Mar 16, 2019 at 11:00:10AM -0700, Daniel Colascione wrote:
> > > > > > On Sat, Mar 16, 2019 at 10:31 AM Suren Baghdasaryan <surenb@google.com> wrote:
> > > > > > >
> > > > > > > On Fri, Mar 15, 2019 at 11:49 AM Joel Fernandes <joel@joelfernandes.org> wrote:
> > > > > > > >
> > > > > > > > On Fri, Mar 15, 2019 at 07:24:28PM +0100, Christian Brauner wrote:
> > > > > > > > [..]
> > > > > > > > > > why do we want to add a new syscall (pidfd_wait) though? Why not just use
> > > > > > > > > > standard poll/epoll interface on the proc fd like Daniel was suggesting.
> > > > > > > > > > AFAIK, once the proc file is opened, the struct pid is essentially pinned
> > > > > > > > > > even though the proc number may be reused. Then the caller can just poll.
> > > > > > > > > > We can add a waitqueue to struct pid, and wake up any waiters on process
> > > > > > > > > > death (A quick look shows task_struct can be mapped to its struct pid) and
> > > > > > > > > > also possibly optimize it using Steve's TIF flag idea. No new syscall is
> > > > > > > > > > needed then, let me know if I missed something?
> > > > > > > > >
> > > > > > > > > Huh, I thought that Daniel was against the poll/epoll solution?
> > > > > > > >
> > > > > > > > Hmm, going through earlier threads, I believe so now. Here was Daniel's
> > > > > > > > reasoning about avoiding a notification about process death through proc
> > > > > > > > directory fd: http://lkml.iu.edu/hypermail/linux/kernel/1811.0/00232.html
> > > > > > > >
> > > > > > > > May be a dedicated syscall for this would be cleaner after all.
> > > > > > >
> > > > > > > Ah, I wish I've seen that discussion before...
> > > > > > > syscall makes sense and it can be non-blocking and we can use
> > > > > > > select/poll/epoll if we use eventfd.
> > > > > >
> > > > > > Thanks for taking a look.
> > > > > >
> > > > > > > I would strongly advocate for
> > > > > > > non-blocking version or at least to have a non-blocking option.
> > > > > >
> > > > > > Waiting for FD readiness is *already* blocking or non-blocking
> > > > > > according to the caller's desire --- users can pass options they want
> > > > > > to poll(2) or whatever. There's no need for any kind of special
> > > > > > configuration knob or non-blocking option. We already *have* a
> > > > > > non-blocking option that works universally for everything.
> > > > > >
> > > > > > As I mentioned in the linked thread, waiting for process exit should
> > > > > > work just like waiting for bytes to appear on a pipe. Process exit
> > > > > > status is just another blob of bytes that a process might receive. A
> > > > > > process exit handle ought to be just another information source. The
> > > > > > reason the unix process API is so awful is that for whatever reason
> > > > > > the original designers treated processes as some kind of special kind
> > > > > > of resource instead of fitting them into the otherwise general-purpose
> > > > > > unix data-handling API. Let's not repeat that mistake.
> > > > > >
> > > > > > > Something like this:
> > > > > > >
> > > > > > > evfd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
> > > > > > > // register eventfd to receive death notification
> > > > > > > pidfd_wait(pid_to_kill, evfd);
> > > > > > > // kill the process
> > > > > > > pidfd_send_signal(pid_to_kill, ...)
> > > > > > > // tend to other things
> > > > > >
> > > > > > Now you've lost me. pidfd_wait should return a *new* FD, not wire up
> > > > > > an eventfd.
> > > > > >
> > > >
> > > > Ok, I probably misunderstood your post linked by Joel. I thought your
> > > > original proposal was based on being able to poll a file under
> > > > /proc/pid and then you changed your mind to have a separate syscall
> > > > which I assumed would be a blocking one to wait for process exit.
> > > > Maybe you can describe the new interface you are thinking about in
> > > > terms of userspace usage like I did above? Several lines of code would
> > > > explain more than paragraphs of text.
> > >
> > > Hey, Thanks Suren for the eventfd idea. I agree with Daniel on this. The idea
> > > from Daniel here is to wait for process death and exit events by just
> > > referring to a stable fd, independent of whatever is going on in /proc.
> > >
> > > What is needed is something like this (in highly pseudo-code form):
> > >
> > > pidfd = opendir("/proc/<pid>",..);
> > > wait_fd = pidfd_wait(pidfd);
> > > read or poll wait_fd (non-blocking or blocking whichever)
> > >
> > > wait_fd will block until the task has either died or been reaped. In both these
> > > cases, it can return a suitable string such as "dead" or "reaped" although an
> > > integer with some predefined meaning is also Ok.
> > >
> > > What that guarantees is, even if the task's PID has been reused, or the task
> > > has already died or already died + reaped, all of these events cannot race
> > > with the code above and the information passed to the user is race-free and
> > > stable / guaranteed.
> > >
> > > An eventfd seems to not fit well, because AFAICS passing the raw PID to
> > > eventfd as in your example would still race since the PID could have been
> > > reused by another process by the time the eventfd is created.
> > >
> > > Also Andy's idea in [1] seems to use poll flags to communicate various things
> > > which is still not as explicit about the PID's status so that's a poor API
> > > choice compared to the explicit syscall.
> > >
> > > I am planning to work on a prototype patch based on Daniel's idea and post something
> > > soon (chatted with Daniel about it and will reference him in the posting as
> > > well), during this posting I will also summarize all the previous discussions
> > > and come up with some tests as well.  I hope to have something soon.
> >
> > Having pidfd_wait() return another fd will make the syscall harder to
> > swallow for a lot of people I reckon.
> > What exactly prevents us from making the pidfd itself readable/pollable
> > > for the exit status? They are "special" fds anyway. I would really like
> > to avoid polluting the api with multiple different types of fds if possible.
> >
> > ret = pidfd_wait(pidfd);
> > read or poll pidfd
>
> I'm not quite clear on what the two steps are doing here.  Is pidfd_wait()
> doing a waitpid(2), and the read gets exit status info?

pidfd_wait on an open pidfd returns a "wait handle" FD. The wait
handle works just like a pipe: you can select/epoll/whatever for
readability. read(2) on the wait handle (which blocks unless you set
O_NONBLOCK, just like a pipe) completes with a siginfo_t when the
process to which the wait handle is attached exits. Roughly,

int kill_and_wait_for_exit(int pidfd) {
  int wait_handle = pidfd_wait(pidfd);
  pidfd_send_signal(pidfd, ...);
  siginfo_t exit_info;
  // Blocks because we haven't configured non-blocking behavior, just like a pipe.
  read(wait_handle, &exit_info, sizeof(exit_info));
  close(wait_handle);
  return exit_info.si_status;
}

>
> > (Note that I'm traveling so my responses might be delayed quite a bit.)
> > (Ccing a few people that might have an opinion here.)
> >
> > Christian
>
> On its own, what you (Christian) show seems nicer.  But think about a main event
> loop (like in lxc), where we just loop over epoll_wait() on various descriptors.
> If we want to wait for any of several types of events - maybe a signalfd, socket
> traffic, or a process death - it would be nice if we can treat them all the same
> way, without having to setup a separate thread to watch the pidfd and send
> data over another fd.  Is there a nice way we can provide that with what you've
> got above?

Nobody is proposing any kind of mechanism that would require a
separate thread. What I'm proposing works with poll and read and
should be trivial to integrate into any existing event loop: from the
perspective of the event loop, it looks just like a pipe.

^ permalink raw reply	[flat|nested] 113+ messages in thread

* Re: [RFC] simple_lmk: Introduce Simple Low Memory Killer for Android
  2019-03-17 17:11                                               ` Daniel Colascione
@ 2019-03-17 17:16                                                 ` Serge E. Hallyn
  2019-03-17 22:02                                                   ` Suren Baghdasaryan
  0 siblings, 1 reply; 113+ messages in thread
From: Serge E. Hallyn @ 2019-03-17 17:16 UTC (permalink / raw)
  To: Daniel Colascione
  Cc: Serge E. Hallyn, Christian Brauner, Joel Fernandes,
	Suren Baghdasaryan, Steven Rostedt, Sultan Alsawaf, Tim Murray,
	Michal Hocko, Greg Kroah-Hartman, Arve Hjønnevåg,
	Todd Kjos, Martijn Coenen, Ingo Molnar, Peter Zijlstra, LKML,
	open list:ANDROID DRIVERS, linux-mm, kernel-team, Oleg Nesterov,
	Andy Lutomirski

On Sun, Mar 17, 2019 at 10:11:10AM -0700, Daniel Colascione wrote:
> On Sun, Mar 17, 2019 at 9:35 AM Serge E. Hallyn <serge@hallyn.com> wrote:
> >
> > On Sun, Mar 17, 2019 at 12:42:40PM +0100, Christian Brauner wrote:
> > > On Sat, Mar 16, 2019 at 09:53:06PM -0400, Joel Fernandes wrote:
> > > > On Sat, Mar 16, 2019 at 12:37:18PM -0700, Suren Baghdasaryan wrote:
> > > > > On Sat, Mar 16, 2019 at 11:57 AM Christian Brauner <christian@brauner.io> wrote:
> > > > > >
> > > > > > On Sat, Mar 16, 2019 at 11:00:10AM -0700, Daniel Colascione wrote:
> > > > > > > On Sat, Mar 16, 2019 at 10:31 AM Suren Baghdasaryan <surenb@google.com> wrote:
> > > > > > > >
> > > > > > > > On Fri, Mar 15, 2019 at 11:49 AM Joel Fernandes <joel@joelfernandes.org> wrote:
> > > > > > > > >
> > > > > > > > > On Fri, Mar 15, 2019 at 07:24:28PM +0100, Christian Brauner wrote:
> > > > > > > > > [..]
> > > > > > > > > > > why do we want to add a new syscall (pidfd_wait) though? Why not just use
> > > > > > > > > > > standard poll/epoll interface on the proc fd like Daniel was suggesting.
> > > > > > > > > > > AFAIK, once the proc file is opened, the struct pid is essentially pinned
> > > > > > > > > > > even though the proc number may be reused. Then the caller can just poll.
> > > > > > > > > > > We can add a waitqueue to struct pid, and wake up any waiters on process
> > > > > > > > > > > death (A quick look shows task_struct can be mapped to its struct pid) and
> > > > > > > > > > > also possibly optimize it using Steve's TIF flag idea. No new syscall is
> > > > > > > > > > > needed then, let me know if I missed something?
> > > > > > > > > >
> > > > > > > > > > Huh, I thought that Daniel was against the poll/epoll solution?
> > > > > > > > >
> > > > > > > > > Hmm, going through earlier threads, I believe so now. Here was Daniel's
> > > > > > > > > reasoning about avoiding a notification about process death through proc
> > > > > > > > > directory fd: http://lkml.iu.edu/hypermail/linux/kernel/1811.0/00232.html
> > > > > > > > >
> > > > > > > > > May be a dedicated syscall for this would be cleaner after all.
> > > > > > > >
> > > > > > > > Ah, I wish I've seen that discussion before...
> > > > > > > > syscall makes sense and it can be non-blocking and we can use
> > > > > > > > select/poll/epoll if we use eventfd.
> > > > > > >
> > > > > > > Thanks for taking a look.
> > > > > > >
> > > > > > > > I would strongly advocate for
> > > > > > > > non-blocking version or at least to have a non-blocking option.
> > > > > > >
> > > > > > > Waiting for FD readiness is *already* blocking or non-blocking
> > > > > > > according to the caller's desire --- users can pass options they want
> > > > > > > to poll(2) or whatever. There's no need for any kind of special
> > > > > > > configuration knob or non-blocking option. We already *have* a
> > > > > > > non-blocking option that works universally for everything.
> > > > > > >
> > > > > > > As I mentioned in the linked thread, waiting for process exit should
> > > > > > > work just like waiting for bytes to appear on a pipe. Process exit
> > > > > > > status is just another blob of bytes that a process might receive. A
> > > > > > > process exit handle ought to be just another information source. The
> > > > > > > reason the unix process API is so awful is that for whatever reason
> > > > > > > the original designers treated processes as some kind of special kind
> > > > > > > of resource instead of fitting them into the otherwise general-purpose
> > > > > > > unix data-handling API. Let's not repeat that mistake.
> > > > > > >
> > > > > > > > Something like this:
> > > > > > > >
> > > > > > > > evfd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
> > > > > > > > // register eventfd to receive death notification
> > > > > > > > pidfd_wait(pid_to_kill, evfd);
> > > > > > > > // kill the process
> > > > > > > > pidfd_send_signal(pid_to_kill, ...)
> > > > > > > > // tend to other things
> > > > > > >
> > > > > > > Now you've lost me. pidfd_wait should return a *new* FD, not wire up
> > > > > > > an eventfd.
> > > > > > >
> > > > >
> > > > > Ok, I probably misunderstood your post linked by Joel. I thought your
> > > > > original proposal was based on being able to poll a file under
> > > > > /proc/pid and then you changed your mind to have a separate syscall
> > > > > which I assumed would be a blocking one to wait for process exit.
> > > > > Maybe you can describe the new interface you are thinking about in
> > > > > terms of userspace usage like I did above? Several lines of code would
> > > > > explain more than paragraphs of text.
> > > >
> > > > Hey, Thanks Suren for the eventfd idea. I agree with Daniel on this. The idea
> > > > from Daniel here is to wait for process death and exit events by just
> > > > referring to a stable fd, independent of whatever is going on in /proc.
> > > >
> > > > What is needed is something like this (in highly pseudo-code form):
> > > >
> > > > pidfd = opendir("/proc/<pid>",..);
> > > > wait_fd = pidfd_wait(pidfd);
> > > > read or poll wait_fd (non-blocking or blocking whichever)
> > > >
> > > > wait_fd will block until the task has either died or been reaped. In both these
> > > > cases, it can return a suitable string such as "dead" or "reaped" although an
> > > > integer with some predefined meaning is also Ok.
> > > >
> > > > What that guarantees is, even if the task's PID has been reused, or the task
> > > > has already died or already died + reaped, all of these events cannot race
> > > > with the code above and the information passed to the user is race-free and
> > > > stable / guaranteed.
> > > >
> > > > An eventfd seems to not fit well, because AFAICS passing the raw PID to
> > > > eventfd as in your example would still race since the PID could have been
> > > > reused by another process by the time the eventfd is created.
> > > >
> > > > Also Andy's idea in [1] seems to use poll flags to communicate various things
> > > > which is still not as explicit about the PID's status so that's a poor API
> > > > choice compared to the explicit syscall.
> > > >
> > > > I am planning to work on a prototype patch based on Daniel's idea and post something
> > > > soon (chatted with Daniel about it and will reference him in the posting as
> > > > well), during this posting I will also summarize all the previous discussions
> > > > and come up with some tests as well.  I hope to have something soon.
> > >
> > > Having pidfd_wait() return another fd will make the syscall harder to
> > > swallow for a lot of people I reckon.
> > > What exactly prevents us from making the pidfd itself readable/pollable
> > > > for the exit status? They are "special" fds anyway. I would really like
> > > to avoid polluting the api with multiple different types of fds if possible.
> > >
> > > ret = pidfd_wait(pidfd);
> > > read or poll pidfd
> >
> > I'm not quite clear on what the two steps are doing here.  Is pidfd_wait()
> > doing a waitpid(2), and the read gets exit status info?
> 
> pidfd_wait on an open pidfd returns a "wait handle" FD. The wait

That is what you are proposing.  I'm not sure that's what Christian
was proposing.  'ret' is ambiguous there.  Christian?

> handle works just like a pipe: you can select/epoll/whatever for
> readability. read(2) on the wait handle (which blocks unless you set
> O_NONBLOCK, just like a pipe) completes with a siginfo_t when the
> process to which the wait handle is attached exits. Roughly,
> 
> int kill_and_wait_for_exit(int pidfd) {
>   int wait_handle = pidfd_wait(pidfd);
>   pidfd_send_signal(pidfd, ...);
>   siginfo_t exit_info;
>   // Blocks because we haven't configured non-blocking behavior, just like a pipe.
>   read(wait_handle, &exit_info, sizeof(exit_info));
>   close(wait_handle);
>   return exit_info.si_status;
> }
> 
> >
> > > (Note that I'm traveling so my responses might be delayed quite a bit.)
> > > (Ccing a few people that might have an opinion here.)
> > >
> > > Christian
> >
> > On its own, what you (Christian) show seems nicer.  But think about a main event
> > loop (like in lxc), where we just loop over epoll_wait() on various descriptors.
> > If we want to wait for any of several types of events - maybe a signalfd, socket
> > traffic, or a process death - it would be nice if we can treat them all the same
> > way, without having to setup a separate thread to watch the pidfd and send
> > data over another fd.  Is there a nice way we can provide that with what you've
> > got above?
> 
> Nobody is proposing any kind of mechanism that would require a
> separate thread. What I'm proposing works with poll and read and
> should be trivial to integrate into any existing event loop: from the
> perspective of the event loop, it looks just like a pipe.

(yes, I understood your proposal)

-serge

^ permalink raw reply	[flat|nested] 113+ messages in thread

* Re: [RFC] simple_lmk: Introduce Simple Low Memory Killer for Android
  2019-03-17 17:16                                                 ` Serge E. Hallyn
@ 2019-03-17 22:02                                                   ` Suren Baghdasaryan
  0 siblings, 0 replies; 113+ messages in thread
From: Suren Baghdasaryan @ 2019-03-17 22:02 UTC (permalink / raw)
  To: Serge E. Hallyn
  Cc: Daniel Colascione, Christian Brauner, Joel Fernandes,
	Steven Rostedt, Sultan Alsawaf, Tim Murray, Michal Hocko,
	Greg Kroah-Hartman, Arve Hjønnevåg, Todd Kjos,
	Martijn Coenen, Ingo Molnar, Peter Zijlstra, LKML,
	open list:ANDROID DRIVERS, linux-mm, kernel-team, Oleg Nesterov,
	Andy Lutomirski

On Sun, Mar 17, 2019 at 10:16 AM Serge E. Hallyn <serge@hallyn.com> wrote:
>
> On Sun, Mar 17, 2019 at 10:11:10AM -0700, Daniel Colascione wrote:
> > On Sun, Mar 17, 2019 at 9:35 AM Serge E. Hallyn <serge@hallyn.com> wrote:
> > >
> > > On Sun, Mar 17, 2019 at 12:42:40PM +0100, Christian Brauner wrote:
> > > > On Sat, Mar 16, 2019 at 09:53:06PM -0400, Joel Fernandes wrote:
> > > > > On Sat, Mar 16, 2019 at 12:37:18PM -0700, Suren Baghdasaryan wrote:
> > > > > > On Sat, Mar 16, 2019 at 11:57 AM Christian Brauner <christian@brauner.io> wrote:
> > > > > > >
> > > > > > > On Sat, Mar 16, 2019 at 11:00:10AM -0700, Daniel Colascione wrote:
> > > > > > > > On Sat, Mar 16, 2019 at 10:31 AM Suren Baghdasaryan <surenb@google.com> wrote:
> > > > > > > > >
> > > > > > > > > On Fri, Mar 15, 2019 at 11:49 AM Joel Fernandes <joel@joelfernandes.org> wrote:
> > > > > > > > > >
> > > > > > > > > > On Fri, Mar 15, 2019 at 07:24:28PM +0100, Christian Brauner wrote:
> > > > > > > > > > [..]
> > > > > > > > > > > > why do we want to add a new syscall (pidfd_wait) though? Why not just use
> > > > > > > > > > > > standard poll/epoll interface on the proc fd like Daniel was suggesting.
> > > > > > > > > > > > AFAIK, once the proc file is opened, the struct pid is essentially pinned
> > > > > > > > > > > > even though the proc number may be reused. Then the caller can just poll.
> > > > > > > > > > > > We can add a waitqueue to struct pid, and wake up any waiters on process
> > > > > > > > > > > > death (A quick look shows task_struct can be mapped to its struct pid) and
> > > > > > > > > > > > also possibly optimize it using Steve's TIF flag idea. No new syscall is
> > > > > > > > > > > > needed then, let me know if I missed something?
> > > > > > > > > > >
> > > > > > > > > > > Huh, I thought that Daniel was against the poll/epoll solution?
> > > > > > > > > >
> > > > > > > > > > Hmm, going through earlier threads, I believe so now. Here was Daniel's
> > > > > > > > > > reasoning about avoiding a notification about process death through proc
> > > > > > > > > > directory fd: http://lkml.iu.edu/hypermail/linux/kernel/1811.0/00232.html
> > > > > > > > > >
> > > > > > > > > > May be a dedicated syscall for this would be cleaner after all.
> > > > > > > > >
> > > > > > > > > Ah, I wish I've seen that discussion before...
> > > > > > > > > syscall makes sense and it can be non-blocking and we can use
> > > > > > > > > select/poll/epoll if we use eventfd.
> > > > > > > >
> > > > > > > > Thanks for taking a look.
> > > > > > > >
> > > > > > > > > I would strongly advocate for
> > > > > > > > > non-blocking version or at least to have a non-blocking option.
> > > > > > > >
> > > > > > > > Waiting for FD readiness is *already* blocking or non-blocking
> > > > > > > > according to the caller's desire --- users can pass options they want
> > > > > > > > to poll(2) or whatever. There's no need for any kind of special
> > > > > > > > configuration knob or non-blocking option. We already *have* a
> > > > > > > > non-blocking option that works universally for everything.
> > > > > > > >
> > > > > > > > As I mentioned in the linked thread, waiting for process exit should
> > > > > > > > work just like waiting for bytes to appear on a pipe. Process exit
> > > > > > > > status is just another blob of bytes that a process might receive. A
> > > > > > > > process exit handle ought to be just another information source. The
> > > > > > > > reason the unix process API is so awful is that for whatever reason
> > > > > > > > the original designers treated processes as some kind of special kind
> > > > > > > > of resource instead of fitting them into the otherwise general-purpose
> > > > > > > > unix data-handling API. Let's not repeat that mistake.
> > > > > > > >
> > > > > > > > > Something like this:
> > > > > > > > >
> > > > > > > > > evfd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
> > > > > > > > > // register eventfd to receive death notification
> > > > > > > > > pidfd_wait(pid_to_kill, evfd);
> > > > > > > > > // kill the process
> > > > > > > > > pidfd_send_signal(pid_to_kill, ...)
> > > > > > > > > // tend to other things
> > > > > > > >
> > > > > > > > Now you've lost me. pidfd_wait should return a *new* FD, not wire up
> > > > > > > > an eventfd.
> > > > > > > >
> > > > > >
> > > > > > Ok, I probably misunderstood your post linked by Joel. I thought your
> > > > > > original proposal was based on being able to poll a file under
> > > > > > /proc/pid and then you changed your mind to have a separate syscall
> > > > > > which I assumed would be a blocking one to wait for process exit.
> > > > > > Maybe you can describe the new interface you are thinking about in
> > > > > > terms of userspace usage like I did above? Several lines of code would
> > > > > > explain more than paragraphs of text.
> > > > >
> > > > > Hey, Thanks Suren for the eventfd idea. I agree with Daniel on this. The idea
> > > > > from Daniel here is to wait for process death and exit events by just
> > > > > referring to a stable fd, independent of whatever is going on in /proc.
> > > > >
> > > > > What is needed is something like this (in highly pseudo-code form):
> > > > >
> > > > > pidfd = opendir("/proc/<pid>",..);
> > > > > wait_fd = pidfd_wait(pidfd);
> > > > > read or poll wait_fd (non-blocking or blocking whichever)
> > > > >

Thanks for the explanation, Joel. Now I understand the proposal. I will
think about it some more, and I am looking forward to the implementation
patch.

> > > > > wait_fd will block until the task has either died or been reaped. In both these
> > > > > cases, it can return a suitable string such as "dead" or "reaped" although an
> > > > > integer with some predefined meaning is also Ok.
> > > > >
> > > > > What that guarantees is, even if the task's PID has been reused, or the task
> > > > > has already died or already died + reaped, all of these events cannot race
> > > > > with the code above and the information passed to the user is race-free and
> > > > > stable / guaranteed.
> > > > >
> > > > > An eventfd seems to not fit well, because AFAICS passing the raw PID to
> > > > > eventfd as in your example would still race since the PID could have been
> > > > > reused by another process by the time the eventfd is created.
> > > > > Also Andy's idea in [1] seems to use poll flags to communicate various things
> > > > > which is still not as explicit about the PID's status so that's a poor API
> > > > > choice compared to the explicit syscall.
> > > > >
> > > > > I am planning to work on a prototype patch based on Daniel's idea and post something
> > > > > soon (chatted with Daniel about it and will reference him in the posting as
> > > > > well), during this posting I will also summarize all the previous discussions
> > > > > and come up with some tests as well.  I hope to have something soon.
> > > >
> > > > Having pidfd_wait() return another fd will make the syscall harder to
> > > > swallow for a lot of people I reckon.
> > > > What exactly prevents us from making the pidfd itself readable/pollable
> > > > > for the exit status? They are "special" fds anyway. I would really like
> > > > to avoid polluting the api with multiple different types of fds if possible.
> > > >
> > > > ret = pidfd_wait(pidfd);
> > > > read or poll pidfd
> > >
> > > I'm not quite clear on what the two steps are doing here.  Is pidfd_wait()
> > > doing a waitpid(2), and the read gets exit status info?
> >
> > pidfd_wait on an open pidfd returns a "wait handle" FD. The wait
>
> That is what you are proposing.  I'm not sure that's what Christian
> was proposing.  'ret' is ambiguous there.  Christian?
>
> > handle works just like a pipe: you can select/epoll/whatever for
> > readability. read(2) on the wait handle (which blocks unless you set
> > O_NONBLOCK, just like a pipe) completes with a siginfo_t when the
> > process to which the wait handle is attached exits. Roughly,
> >
> > int kill_and_wait_for_exit(int pidfd) {
> >   int wait_handle = pidfd_wait(pidfd);
> >   pidfd_send_signal(pidfd, ...);
> >   siginfo_t exit_info;
> >   // Blocks because we haven't configured non-blocking behavior, just like a pipe.
> >   read(wait_handle, &exit_info, sizeof(exit_info));
> >   close(wait_handle);
> >   return exit_info.si_status;
> > }
> >
> > >
> > > > (Note that I'm traveling so my responses might be delayed quite a bit.)
> > > > (Ccing a few people that might have an opinion here.)
> > > >
> > > > Christian
> > >
> > > On its own, what you (Christian) show seems nicer.  But think about a main event
> > > loop (like in lxc), where we just loop over epoll_wait() on various descriptors.
> > > If we want to wait for any of several types of events - maybe a signalfd, socket
> > > traffic, or a process death - it would be nice if we can treat them all the same
> > > way, without having to setup a separate thread to watch the pidfd and send
> > > data over another fd.  Is there a nice way we can provide that with what you've
> > > got above?
> >
> > Nobody is proposing any kind of mechanism that would require a
> > separate thread. What I'm proposing works with poll and read and
> > should be trivial to integrate into any existing event loop: from the
> > perspective of the event loop, it looks just like a pipe.
>
> (yes, I understood your proposal)
>
> -serge

^ permalink raw reply	[flat|nested] 113+ messages in thread

* Re: [RFC] simple_lmk: Introduce Simple Low Memory Killer for Android
  2019-03-17 15:40                                             ` Daniel Colascione
@ 2019-03-18  0:29                                               ` Christian Brauner
  2019-03-18 23:50                                                 ` Joel Fernandes
  0 siblings, 1 reply; 113+ messages in thread
From: Christian Brauner @ 2019-03-18  0:29 UTC (permalink / raw)
  To: Daniel Colascione
  Cc: Joel Fernandes, Suren Baghdasaryan, Steven Rostedt,
	Sultan Alsawaf, Tim Murray, Michal Hocko, Greg Kroah-Hartman,
	Arve Hjønnevåg, Todd Kjos, Martijn Coenen, Ingo Molnar,
	Peter Zijlstra, LKML, open list:ANDROID DRIVERS, linux-mm,
	kernel-team, Oleg Nesterov, Andy Lutomirski, Serge E. Hallyn

On Sun, Mar 17, 2019 at 08:40:19AM -0700, Daniel Colascione wrote:
> On Sun, Mar 17, 2019 at 4:42 AM Christian Brauner <christian@brauner.io> wrote:
> >
> > On Sat, Mar 16, 2019 at 09:53:06PM -0400, Joel Fernandes wrote:
> > > On Sat, Mar 16, 2019 at 12:37:18PM -0700, Suren Baghdasaryan wrote:
> > > > On Sat, Mar 16, 2019 at 11:57 AM Christian Brauner <christian@brauner.io> wrote:
> > > > >
> > > > > On Sat, Mar 16, 2019 at 11:00:10AM -0700, Daniel Colascione wrote:
> > > > > > On Sat, Mar 16, 2019 at 10:31 AM Suren Baghdasaryan <surenb@google.com> wrote:
> > > > > > >
> > > > > > > On Fri, Mar 15, 2019 at 11:49 AM Joel Fernandes <joel@joelfernandes.org> wrote:
> > > > > > > >
> > > > > > > > On Fri, Mar 15, 2019 at 07:24:28PM +0100, Christian Brauner wrote:
> > > > > > > > [..]
> > > > > > > > > > why do we want to add a new syscall (pidfd_wait) though? Why not just use
> > > > > > > > > > standard poll/epoll interface on the proc fd like Daniel was suggesting.
> > > > > > > > > > AFAIK, once the proc file is opened, the struct pid is essentially pinned
> > > > > > > > > > even though the proc number may be reused. Then the caller can just poll.
> > > > > > > > > > We can add a waitqueue to struct pid, and wake up any waiters on process
> > > > > > > > > > death (A quick look shows task_struct can be mapped to its struct pid) and
> > > > > > > > > > also possibly optimize it using Steve's TIF flag idea. No new syscall is
> > > > > > > > > > needed then, let me know if I missed something?
> > > > > > > > >
> > > > > > > > > Huh, I thought that Daniel was against the poll/epoll solution?
> > > > > > > >
> > > > > > > > Hmm, going through earlier threads, I believe so now. Here was Daniel's
> > > > > > > > reasoning about avoiding a notification about process death through proc
> > > > > > > > directory fd: http://lkml.iu.edu/hypermail/linux/kernel/1811.0/00232.html
> > > > > > > >
> > > > > > > > Maybe a dedicated syscall for this would be cleaner after all.
> > > > > > >
> > > > > > > Ah, I wish I'd seen that discussion before...
> > > > > > > syscall makes sense and it can be non-blocking and we can use
> > > > > > > select/poll/epoll if we use eventfd.
> > > > > >
> > > > > > Thanks for taking a look.
> > > > > >
> > > > > > > I would strongly advocate for
> > > > > > > non-blocking version or at least to have a non-blocking option.
> > > > > >
> > > > > > Waiting for FD readiness is *already* blocking or non-blocking
> > > > > > according to the caller's desire --- users can pass options they want
> > > > > > to poll(2) or whatever. There's no need for any kind of special
> > > > > > configuration knob or non-blocking option. We already *have* a
> > > > > > non-blocking option that works universally for everything.
> > > > > >
> > > > > > As I mentioned in the linked thread, waiting for process exit should
> > > > > > work just like waiting for bytes to appear on a pipe. Process exit
> > > > > > status is just another blob of bytes that a process might receive. A
> > > > > > process exit handle ought to be just another information source. The
> > > > > > reason the unix process API is so awful is that for whatever reason
> > > > > > the original designers treated processes as some kind of special kind
> > > > > > of resource instead of fitting them into the otherwise general-purpose
> > > > > > unix data-handling API. Let's not repeat that mistake.
> > > > > >
> > > > > > > Something like this:
> > > > > > >
> > > > > > > evfd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
> > > > > > > // register eventfd to receive death notification
> > > > > > > pidfd_wait(pid_to_kill, evfd);
> > > > > > > // kill the process
> > > > > > > pidfd_send_signal(pid_to_kill, ...)
> > > > > > > // tend to other things
> > > > > >
> > > > > > Now you've lost me. pidfd_wait should return a *new* FD, not wire up
> > > > > > an eventfd.
> > > > > >
> > > >
> > > > Ok, I probably misunderstood your post linked by Joel. I thought your
> > > > original proposal was based on being able to poll a file under
> > > > /proc/pid and then you changed your mind to have a separate syscall
> > > > which I assumed would be a blocking one to wait for process exit.
> > > > Maybe you can describe the new interface you are thinking about in
> > > > terms of userspace usage like I did above? Several lines of code would
> > > > explain more than paragraphs of text.
> > >
> > > Hey, Thanks Suren for the eventfd idea. I agree with Daniel on this. The idea
> > > from Daniel here is to wait for process death and exit events by just
> > > referring to a stable fd, independent of whatever is going on in /proc.
> > >
> > > What is needed is something like this (in highly pseudo-code form):
> > >
> > > pidfd = opendir("/proc/<pid>",..);
> > > wait_fd = pidfd_wait(pidfd);
> > > read or poll wait_fd (non-blocking or blocking whichever)
> > >
> > > wait_fd will block until the task has either died or been reaped. In both these
> > > cases, it can return a suitable string such as "dead" or "reaped" although an
> > > integer with some predefined meaning is also Ok.
> 
> I want to return a siginfo_t: we already use this structure in other
> contexts to report exit status.
> 
> > > What that guarantees is, even if the task's PID has been reused, or the task
> > > has already died or already died + reaped, all of these events cannot race
> > > with the code above and the information passed to the user is race-free and
> > > stable / guaranteed.
> > >
> > > An eventfd seems to not fit well, because AFAICS passing the raw PID to
> > > eventfd as in your example would still race since the PID could have been
> > > reused by another process by the time the eventfd is created.
> > >
> > > Also Andy's idea in [1] seems to use poll flags to communicate various things
> > > which is still not as explicit about the PID's status so that's a poor API
> > > choice compared to the explicit syscall.
> > >
> > > I am planning to work on a prototype patch based on Daniel's idea and post something
> > > soon (chatted with Daniel about it and will reference him in the posting as
> > > well), during this posting I will also summarize all the previous discussions
> > > and come up with some tests as well.  I hope to have something soon.
> 
> Thanks.
> 
> > Having pidfd_wait() return another fd will make the syscall harder to
> > swallow for a lot of people I reckon.
> > What exactly prevents us from making the pidfd itself readable/pollable
> > for the exit status? They are "special" fds anyway. I would really like
> > to avoid polluting the api with multiple different types of fds if possible.
> 
> If pidfds had been their own file type, I'd agree with you. But pidfds
> are directories, which means that we're beholden to make them behave
> like directories normally do. I'd rather introduce another FD than
> heavily overload the semantics of a directory FD in one particular
> context. In no other circumstances are directory FDs also weird
> IO-data sources. Our providing a facility to get a new FD to which we
> *can* give pipe-like behavior does no harm and makes *usage* cleaner and
> easier to reason about.

I have two things I'm currently working on:
- hijacking translate_pid()
- pidfd_clone() essentially

My first goal is to talk to Eric about taking the translate_pid()
syscall that has been sitting in his tree and expanding it.
translate_pid() currently allows you to either get an fd for the pid
namespace a pid resides in or the pid number of a given process in
another pid namespace relative to a passed in pid namespace fd. I would
like to make it possible for this syscall to also give us back pidfds.
One question I'm currently struggling with is exactly what you said
above: what type of file descriptor these are going to give back to us.
It seems that a regular file instead of directory would make the most
sense and would lead to a nicer API and I'm very much leaning towards
that.

Christian

^ permalink raw reply	[flat|nested] 113+ messages in thread

* Re: [RFC] simple_lmk: Introduce Simple Low Memory Killer for Android
  2019-03-18  0:29                                               ` Christian Brauner
@ 2019-03-18 23:50                                                 ` Joel Fernandes
  2019-03-19 22:14                                                   ` Christian Brauner
  0 siblings, 1 reply; 113+ messages in thread
From: Joel Fernandes @ 2019-03-18 23:50 UTC (permalink / raw)
  To: Christian Brauner
  Cc: Daniel Colascione, Suren Baghdasaryan, Steven Rostedt,
	Sultan Alsawaf, Tim Murray, Michal Hocko, Greg Kroah-Hartman,
	Arve Hjønnevåg, Todd Kjos, Martijn Coenen, Ingo Molnar,
	Peter Zijlstra, LKML, open list:ANDROID DRIVERS, linux-mm,
	kernel-team, Oleg Nesterov, Andy Lutomirski, Serge E. Hallyn

On Mon, Mar 18, 2019 at 01:29:51AM +0100, Christian Brauner wrote:
> On Sun, Mar 17, 2019 at 08:40:19AM -0700, Daniel Colascione wrote:
> > On Sun, Mar 17, 2019 at 4:42 AM Christian Brauner <christian@brauner.io> wrote:
> > >
> > > On Sat, Mar 16, 2019 at 09:53:06PM -0400, Joel Fernandes wrote:
> > > > On Sat, Mar 16, 2019 at 12:37:18PM -0700, Suren Baghdasaryan wrote:
> > > > > On Sat, Mar 16, 2019 at 11:57 AM Christian Brauner <christian@brauner.io> wrote:
> > > > > >
> > > > > > On Sat, Mar 16, 2019 at 11:00:10AM -0700, Daniel Colascione wrote:
> > > > > > > On Sat, Mar 16, 2019 at 10:31 AM Suren Baghdasaryan <surenb@google.com> wrote:
> > > > > > > >
> > > > > > > > On Fri, Mar 15, 2019 at 11:49 AM Joel Fernandes <joel@joelfernandes.org> wrote:
> > > > > > > > >
> > > > > > > > > On Fri, Mar 15, 2019 at 07:24:28PM +0100, Christian Brauner wrote:
> > > > > > > > > [..]
> > > > > > > > > > > why do we want to add a new syscall (pidfd_wait) though? Why not just use
> > > > > > > > > > > standard poll/epoll interface on the proc fd like Daniel was suggesting.
> > > > > > > > > > > AFAIK, once the proc file is opened, the struct pid is essentially pinned
> > > > > > > > > > > even though the proc number may be reused. Then the caller can just poll.
> > > > > > > > > > > We can add a waitqueue to struct pid, and wake up any waiters on process
> > > > > > > > > > > death (A quick look shows task_struct can be mapped to its struct pid) and
> > > > > > > > > > > also possibly optimize it using Steve's TIF flag idea. No new syscall is
> > > > > > > > > > > needed then, let me know if I missed something?
> > > > > > > > > >
> > > > > > > > > > Huh, I thought that Daniel was against the poll/epoll solution?
> > > > > > > > >
> > > > > > > > > Hmm, going through earlier threads, I believe so now. Here was Daniel's
> > > > > > > > > reasoning about avoiding a notification about process death through proc
> > > > > > > > > directory fd: http://lkml.iu.edu/hypermail/linux/kernel/1811.0/00232.html
> > > > > > > > >
> > > > > > > > > May be a dedicated syscall for this would be cleaner after all.
> > > > > > > >
> > > > > > > > Ah, I wish I've seen that discussion before...
> > > > > > > > syscall makes sense and it can be non-blocking and we can use
> > > > > > > > select/poll/epoll if we use eventfd.
> > > > > > >
> > > > > > > Thanks for taking a look.
> > > > > > >
> > > > > > > > I would strongly advocate for
> > > > > > > > non-blocking version or at least to have a non-blocking option.
> > > > > > >
> > > > > > > Waiting for FD readiness is *already* blocking or non-blocking
> > > > > > > according to the caller's desire --- users can pass options they want
> > > > > > > to poll(2) or whatever. There's no need for any kind of special
> > > > > > > configuration knob or non-blocking option. We already *have* a
> > > > > > > non-blocking option that works universally for everything.
> > > > > > >
> > > > > > > As I mentioned in the linked thread, waiting for process exit should
> > > > > > > work just like waiting for bytes to appear on a pipe. Process exit
> > > > > > > status is just another blob of bytes that a process might receive. A
> > > > > > > process exit handle ought to be just another information source. The
> > > > > > > reason the unix process API is so awful is that for whatever reason
> > > > > > > the original designers treated processes as some kind of special kind
> > > > > > > of resource instead of fitting them into the otherwise general-purpose
> > > > > > > unix data-handling API. Let's not repeat that mistake.
> > > > > > >
> > > > > > > > Something like this:
> > > > > > > >
> > > > > > > > evfd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
> > > > > > > > // register eventfd to receive death notification
> > > > > > > > pidfd_wait(pid_to_kill, evfd);
> > > > > > > > // kill the process
> > > > > > > > pidfd_send_signal(pid_to_kill, ...)
> > > > > > > > // tend to other things
> > > > > > >
> > > > > > > Now you've lost me. pidfd_wait should return a *new* FD, not wire up
> > > > > > > an eventfd.
> > > > > > >
> > > > >
> > > > > Ok, I probably misunderstood your post linked by Joel. I though your
> > > > > original proposal was based on being able to poll a file under
> > > > > /proc/pid and then you changed your mind to have a separate syscall
> > > > > which I assumed would be a blocking one to wait for process exit.
> > > > > Maybe you can describe the new interface you are thinking about in
> > > > > terms of userspace usage like I did above? Several lines of code would
> > > > > explain more than paragraphs of text.
> > > >
> > > > Hey, Thanks Suren for the eventfd idea. I agree with Daniel on this. The idea
> > > > from Daniel here is to wait for process death and exit events by just
> > > > referring to a stable fd, independent of whatever is going on in /proc.
> > > >
> > > > What is needed is something like this (in highly pseudo-code form):
> > > >
> > > > pidfd = opendir("/proc/<pid>",..);
> > > > wait_fd = pidfd_wait(pidfd);
> > > > read or poll wait_fd (non-blocking or blocking whichever)
> > > >
> > > > wait_fd will block until the task has either died or reaped. In both these
> > > > cases, it can return a suitable string such as "dead" or "reaped" although an
> > > > integer with some predefined meaning is also Ok.
> > 
> > I want to return a siginfo_t: we already use this structure in other
> > contexts to report exit status.
> > 

Fine with me. I did a prototype (code is below) as a string but I can change
that to siginfo_t in the future.

> > > Having pidfd_wait() return another fd will make the syscall harder to
> > > swallow for a lot of people I reckon.
> > > What exactly prevents us from making the pidfd itself readable/pollable
> > > for the exit staus? They are "special" fds anyway. I would really like
> > > to avoid polluting the api with multiple different types of fds if possible.
> > 
> > If pidfds had been their own file type, I'd agree with you. But pidfds
> > are directories, which means that we're beholden to make them behave
> > like directories normally do. I'd rather introduce another FD than
> > heavily overload the semantics of a directory FD in one particular
> > context. In no other circumstances are directory FDs also weird
> > IO-data sources. Our providing a facility to get a new FD to which we
> > *can* give pipe-like behavior does no harm and *usage* cleaner and
> > easier to reason about.
> 
> I have two things I'm currently working on:
> - hijacking translate_pid()
> - pidfd_clone() essentially
> 
> My first goal is to talk to Eric about taking the translate_pid()
> syscall that has been sitting in his tree and expanding it.
> translate_pid() currently allows you to either get an fd for the pid
> namespace a pid resides in or the pid number of a given process in
> another pid namespace relative to a passed in pid namespace fd.

That's good to know. More comments below:

> I would
> like to make it possible for this syscall to also give us back pidfds.
> One question I'm currently struggling with is exactly what you said
> above: what type of file descriptor these are going to give back to us.
> It seems that a regular file instead of directory would make the most
> sense and would lead to a nicer API and I'm very much leaning towards
> that.

How about something like the following? We can plumb the new file as a pseudo
file that is invisible and linked to the fd. This is extremely rough (does
not do error handling, synchronization, etc.) but I just wanted to share the idea
of what the "frontend" could look like. It is also missing all the actual pid
status messages. It just takes care of the part that creates the new fd from
the pidfd and provides file read ops returning the "status" string.  It is also
written in signal.c and should likely go into proc fs files under fs.
Appreciate any suggestions (a test program did prove it works).

Also, I was able to translate a pidfd to a pid_namespace by referring to some
existing code but perhaps you may be able to suggest something better for
such translation..

---8<-----------------------

From: Joel Fernandes <joelaf@google.com>
Subject: [PATCH] Partial skeleton prototype of pidfd_wait frontend

Signed-off-by: Joel Fernandes <joelaf@google.com>
---
 arch/x86/entry/syscalls/syscall_32.tbl |  1 +
 arch/x86/entry/syscalls/syscall_64.tbl |  1 +
 include/linux/syscalls.h               |  1 +
 include/uapi/asm-generic/unistd.h      |  4 +-
 kernel/signal.c                        | 62 ++++++++++++++++++++++++++
 kernel/sys_ni.c                        |  3 ++
 6 files changed, 71 insertions(+), 1 deletion(-)

diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 1f9607ed087c..2a63f1896b63 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -433,3 +433,4 @@
 425	i386	io_uring_setup		sys_io_uring_setup		__ia32_sys_io_uring_setup
 426	i386	io_uring_enter		sys_io_uring_enter		__ia32_sys_io_uring_enter
 427	i386	io_uring_register	sys_io_uring_register		__ia32_sys_io_uring_register
+428	i386	pidfd_wait		sys_pidfd_wait			__ia32_sys_pidfd_wait
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 92ee0b4378d4..cf2e08a8053b 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -349,6 +349,7 @@
 425	common	io_uring_setup		__x64_sys_io_uring_setup
 426	common	io_uring_enter		__x64_sys_io_uring_enter
 427	common	io_uring_register	__x64_sys_io_uring_register
+428	common	pidfd_wait		__x64_sys_pidfd_wait
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index e446806a561f..62160970ed3f 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -988,6 +988,7 @@ asmlinkage long sys_rseq(struct rseq __user *rseq, uint32_t rseq_len,
 asmlinkage long sys_pidfd_send_signal(int pidfd, int sig,
 				       siginfo_t __user *info,
 				       unsigned int flags);
+asmlinkage long sys_pidfd_wait(int pidfd);
 
 /*
  * Architecture-specific system calls
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index dee7292e1df6..137aa8662230 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -832,9 +832,11 @@ __SYSCALL(__NR_io_uring_setup, sys_io_uring_setup)
 __SYSCALL(__NR_io_uring_enter, sys_io_uring_enter)
 #define __NR_io_uring_register 427
 __SYSCALL(__NR_io_uring_register, sys_io_uring_register)
+#define __NR_pidfd_wait 428
+__SYSCALL(__NR_pidfd_wait, sys_pidfd_wait)
 
 #undef __NR_syscalls
-#define __NR_syscalls 428
+#define __NR_syscalls 429
 
 /*
  * 32 bit systems traditionally used different
diff --git a/kernel/signal.c b/kernel/signal.c
index b7953934aa99..ebb550b87044 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -3550,6 +3550,68 @@ static int copy_siginfo_from_user_any(kernel_siginfo_t *kinfo, siginfo_t *info)
 	return copy_siginfo_from_user(kinfo, info);
 }
 
+static ssize_t pidfd_wait_read_iter(struct kiocb *iocb, struct iov_iter *to)
+{
+	/*
+	 * This is just a test string, it will contain the actual
+	 * status of the pidfd in the future.
+	 */
+	char buf[] = "status";
+
+	return copy_to_iter(buf, strlen(buf)+1, to);
+}
+
+static const struct file_operations pidfd_wait_file_ops = {
+	.read_iter	= pidfd_wait_read_iter,
+};
+
+static struct inode *pidfd_wait_get_inode(struct super_block *sb)
+{
+	struct inode *inode = new_inode(sb);
+
+	inode->i_ino = get_next_ino();
+	inode_init_owner(inode, NULL, S_IFREG);
+
+	inode->i_op		= &simple_dir_inode_operations;
+	inode->i_fop		= &pidfd_wait_file_ops;
+
+	return inode;
+}
+
+SYSCALL_DEFINE1(pidfd_wait, int, pidfd)
+{
+	struct fd f;
+	struct inode *inode;
+	struct file *file;
+	int new_fd;
+	struct pid_namespace *pid_ns;
+	struct super_block *sb;
+	struct vfsmount *mnt;
+
+	f = fdget_raw(pidfd);
+	if (!f.file)
+		return -EBADF;
+
+	sb = file_inode(f.file)->i_sb;
+	pid_ns = sb->s_fs_info;
+
+	inode = pidfd_wait_get_inode(sb);
+
+	mnt = pid_ns->proc_mnt;
+
+	file = alloc_file_pseudo(inode, mnt, "pidfd_wait", O_RDONLY,
+			&pidfd_wait_file_ops);
+
+	file->f_mode |= FMODE_PREAD;
+
+	new_fd = get_unused_fd_flags(0);
+	fd_install(new_fd, file);
+
+	fdput(f);
+
+	return new_fd;
+}
+
 /**
  * sys_pidfd_send_signal - send a signal to a process through a task file
  *                          descriptor
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index d21f4befaea4..f52c4d864038 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -450,3 +450,6 @@ COND_SYSCALL(setuid16);
 
 /* restartable sequence */
 COND_SYSCALL(rseq);
+
+/* pidfd */
+COND_SYSCALL(pidfd_wait);
-- 
2.21.0.225.g810b269d1ac-goog


^ permalink raw reply related	[flat|nested] 113+ messages in thread

* Re: [RFC] simple_lmk: Introduce Simple Low Memory Killer for Android
  2019-03-18 23:50                                                 ` Joel Fernandes
@ 2019-03-19 22:14                                                   ` Christian Brauner
  2019-03-19 22:26                                                     ` Joel Fernandes
  2019-03-19 22:48                                                     ` Daniel Colascione
  0 siblings, 2 replies; 113+ messages in thread
From: Christian Brauner @ 2019-03-19 22:14 UTC (permalink / raw)
  To: Joel Fernandes
  Cc: Daniel Colascione, Suren Baghdasaryan, Steven Rostedt,
	Sultan Alsawaf, Tim Murray, Michal Hocko, Greg Kroah-Hartman,
	Arve Hjønnevåg, Todd Kjos, Martijn Coenen, Ingo Molnar,
	Peter Zijlstra, LKML, open list:ANDROID DRIVERS, linux-mm,
	kernel-team, Oleg Nesterov, Andy Lutomirski, Serge E. Hallyn,
	keescook

On Mon, Mar 18, 2019 at 07:50:52PM -0400, Joel Fernandes wrote:
> On Mon, Mar 18, 2019 at 01:29:51AM +0100, Christian Brauner wrote:
> > On Sun, Mar 17, 2019 at 08:40:19AM -0700, Daniel Colascione wrote:
> > > On Sun, Mar 17, 2019 at 4:42 AM Christian Brauner <christian@brauner.io> wrote:
> > > >
> > > > On Sat, Mar 16, 2019 at 09:53:06PM -0400, Joel Fernandes wrote:
> > > > > On Sat, Mar 16, 2019 at 12:37:18PM -0700, Suren Baghdasaryan wrote:
> > > > > > On Sat, Mar 16, 2019 at 11:57 AM Christian Brauner <christian@brauner.io> wrote:
> > > > > > >
> > > > > > > On Sat, Mar 16, 2019 at 11:00:10AM -0700, Daniel Colascione wrote:
> > > > > > > > On Sat, Mar 16, 2019 at 10:31 AM Suren Baghdasaryan <surenb@google.com> wrote:
> > > > > > > > >
> > > > > > > > > On Fri, Mar 15, 2019 at 11:49 AM Joel Fernandes <joel@joelfernandes.org> wrote:
> > > > > > > > > >
> > > > > > > > > > On Fri, Mar 15, 2019 at 07:24:28PM +0100, Christian Brauner wrote:
> > > > > > > > > > [..]
> > > > > > > > > > > > why do we want to add a new syscall (pidfd_wait) though? Why not just use
> > > > > > > > > > > > standard poll/epoll interface on the proc fd like Daniel was suggesting.
> > > > > > > > > > > > AFAIK, once the proc file is opened, the struct pid is essentially pinned
> > > > > > > > > > > > even though the proc number may be reused. Then the caller can just poll.
> > > > > > > > > > > > We can add a waitqueue to struct pid, and wake up any waiters on process
> > > > > > > > > > > > death (A quick look shows task_struct can be mapped to its struct pid) and
> > > > > > > > > > > > also possibly optimize it using Steve's TIF flag idea. No new syscall is
> > > > > > > > > > > > needed then, let me know if I missed something?
> > > > > > > > > > >
> > > > > > > > > > > Huh, I thought that Daniel was against the poll/epoll solution?
> > > > > > > > > >
> > > > > > > > > > Hmm, going through earlier threads, I believe so now. Here was Daniel's
> > > > > > > > > > reasoning about avoiding a notification about process death through proc
> > > > > > > > > > directory fd: http://lkml.iu.edu/hypermail/linux/kernel/1811.0/00232.html
> > > > > > > > > >
> > > > > > > > > > May be a dedicated syscall for this would be cleaner after all.
> > > > > > > > >
> > > > > > > > > Ah, I wish I've seen that discussion before...
> > > > > > > > > syscall makes sense and it can be non-blocking and we can use
> > > > > > > > > select/poll/epoll if we use eventfd.
> > > > > > > >
> > > > > > > > Thanks for taking a look.
> > > > > > > >
> > > > > > > > > I would strongly advocate for
> > > > > > > > > non-blocking version or at least to have a non-blocking option.
> > > > > > > >
> > > > > > > > Waiting for FD readiness is *already* blocking or non-blocking
> > > > > > > > according to the caller's desire --- users can pass options they want
> > > > > > > > to poll(2) or whatever. There's no need for any kind of special
> > > > > > > > configuration knob or non-blocking option. We already *have* a
> > > > > > > > non-blocking option that works universally for everything.
> > > > > > > >
> > > > > > > > As I mentioned in the linked thread, waiting for process exit should
> > > > > > > > work just like waiting for bytes to appear on a pipe. Process exit
> > > > > > > > status is just another blob of bytes that a process might receive. A
> > > > > > > > process exit handle ought to be just another information source. The
> > > > > > > > reason the unix process API is so awful is that for whatever reason
> > > > > > > > the original designers treated processes as some kind of special kind
> > > > > > > > of resource instead of fitting them into the otherwise general-purpose
> > > > > > > > unix data-handling API. Let's not repeat that mistake.
> > > > > > > >
> > > > > > > > > Something like this:
> > > > > > > > >
> > > > > > > > > evfd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
> > > > > > > > > // register eventfd to receive death notification
> > > > > > > > > pidfd_wait(pid_to_kill, evfd);
> > > > > > > > > // kill the process
> > > > > > > > > pidfd_send_signal(pid_to_kill, ...)
> > > > > > > > > // tend to other things
> > > > > > > >
> > > > > > > > Now you've lost me. pidfd_wait should return a *new* FD, not wire up
> > > > > > > > an eventfd.
> > > > > > > >
> > > > > >
> > > > > > Ok, I probably misunderstood your post linked by Joel. I though your
> > > > > > original proposal was based on being able to poll a file under
> > > > > > /proc/pid and then you changed your mind to have a separate syscall
> > > > > > which I assumed would be a blocking one to wait for process exit.
> > > > > > Maybe you can describe the new interface you are thinking about in
> > > > > > terms of userspace usage like I did above? Several lines of code would
> > > > > > explain more than paragraphs of text.
> > > > >
> > > > > Hey, Thanks Suren for the eventfd idea. I agree with Daniel on this. The idea
> > > > > from Daniel here is to wait for process death and exit events by just
> > > > > referring to a stable fd, independent of whatever is going on in /proc.
> > > > >
> > > > > What is needed is something like this (in highly pseudo-code form):
> > > > >
> > > > > pidfd = opendir("/proc/<pid>",..);
> > > > > wait_fd = pidfd_wait(pidfd);
> > > > > read or poll wait_fd (non-blocking or blocking whichever)
> > > > >
> > > > > wait_fd will block until the task has either died or reaped. In both these
> > > > > cases, it can return a suitable string such as "dead" or "reaped" although an
> > > > > integer with some predefined meaning is also Ok.
> > > 
> > > I want to return a siginfo_t: we already use this structure in other
> > > contexts to report exit status.
> > > 
> 
> Fine with me. I did a prototype (code is below) as a string but I can change
> that to siginfo_t in the future.
> 
> > > > Having pidfd_wait() return another fd will make the syscall harder to
> > > > swallow for a lot of people I reckon.
> > > > What exactly prevents us from making the pidfd itself readable/pollable
> > > > for the exit staus? They are "special" fds anyway. I would really like
> > > > to avoid polluting the api with multiple different types of fds if possible.
> > > 
> > > If pidfds had been their own file type, I'd agree with you. But pidfds
> > > are directories, which means that we're beholden to make them behave
> > > like directories normally do. I'd rather introduce another FD than
> > > heavily overload the semantics of a directory FD in one particular
> > > context. In no other circumstances are directory FDs also weird
> > > IO-data sources. Our providing a facility to get a new FD to which we
> > > *can* give pipe-like behavior does no harm and *usage* cleaner and
> > > easier to reason about.
> > 
> > I have two things I'm currently working on:
> > - hijacking translate_pid()
> > - pidfd_clone() essentially
> > 
> > My first goal is to talk to Eric about taking the translate_pid()
> > syscall that has been sitting in his tree and expanding it.
> > translate_pid() currently allows you to either get an fd for the pid
> > namespace a pid resides in or the pid number of a given process in
> > another pid namespace relative to a passed in pid namespace fd.
> 
> That's good to know. More comments below:

Sorry for the delay; I'm still traveling. I'll be back on a fully
functional schedule starting Monday.

> 
> > I would
> > like to make it possible for this syscall to also give us back pidfds.
> > One question I'm currently struggling with is exactly what you said
> > above: what type of file descriptor these are going to give back to us.
> > It seems that a regular file instead of directory would make the most
> > sense and would lead to a nicer API and I'm very much leaning towards
> > that.
> 
> How about something like the following? We can plumb the new file as a pseudo
> file that is invisible and linked to the fd. This is extremely rough (does
> not do error handling, synchronization etc) but just wanted to share the idea
> of what the "frontend" could look like. It is also missing all the actual pid
> status messages. It just takes care of the creating new fd from the pidfd
> part and providing file read ops returning the "status" string.  It is also
> written in signal.c and should likely go into proc fs files under fs.
> Appreciate any suggestions (a test program did prove it works).
> 
> Also, I was able to translate a pidfd to a pid_namespace by referring to some
> existing code but perhaps you may be able to suggest something better for
> such translation..

Yeah, there are better ways, but I think there's another issue. See below.

> 
> ---8<-----------------------
> 
> From: Joel Fernandes <joelaf@google.com>
> Subject: [PATCH] Partial skeleton prototype of pidfd_wait frontend
> 
> Signed-off-by: Joel Fernandes <joelaf@google.com>
> ---
>  arch/x86/entry/syscalls/syscall_32.tbl |  1 +
>  arch/x86/entry/syscalls/syscall_64.tbl |  1 +
>  include/linux/syscalls.h               |  1 +
>  include/uapi/asm-generic/unistd.h      |  4 +-
>  kernel/signal.c                        | 62 ++++++++++++++++++++++++++
>  kernel/sys_ni.c                        |  3 ++
>  6 files changed, 71 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
> index 1f9607ed087c..2a63f1896b63 100644
> --- a/arch/x86/entry/syscalls/syscall_32.tbl
> +++ b/arch/x86/entry/syscalls/syscall_32.tbl
> @@ -433,3 +433,4 @@
>  425	i386	io_uring_setup		sys_io_uring_setup		__ia32_sys_io_uring_setup
>  426	i386	io_uring_enter		sys_io_uring_enter		__ia32_sys_io_uring_enter
>  427	i386	io_uring_register	sys_io_uring_register		__ia32_sys_io_uring_register
> +428	i386	pidfd_wait		sys_pidfd_wait			__ia32_sys_pidfd_wait
> diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
> index 92ee0b4378d4..cf2e08a8053b 100644
> --- a/arch/x86/entry/syscalls/syscall_64.tbl
> +++ b/arch/x86/entry/syscalls/syscall_64.tbl
> @@ -349,6 +349,7 @@
>  425	common	io_uring_setup		__x64_sys_io_uring_setup
>  426	common	io_uring_enter		__x64_sys_io_uring_enter
>  427	common	io_uring_register	__x64_sys_io_uring_register
> +428	common	pidfd_wait		__x64_sys_pidfd_wait
>  
>  #
>  # x32-specific system call numbers start at 512 to avoid cache impact
> diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
> index e446806a561f..62160970ed3f 100644
> --- a/include/linux/syscalls.h
> +++ b/include/linux/syscalls.h
> @@ -988,6 +988,7 @@ asmlinkage long sys_rseq(struct rseq __user *rseq, uint32_t rseq_len,
>  asmlinkage long sys_pidfd_send_signal(int pidfd, int sig,
>  				       siginfo_t __user *info,
>  				       unsigned int flags);
> +asmlinkage long sys_pidfd_wait(int pidfd);
>  
>  /*
>   * Architecture-specific system calls
> diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
> index dee7292e1df6..137aa8662230 100644
> --- a/include/uapi/asm-generic/unistd.h
> +++ b/include/uapi/asm-generic/unistd.h
> @@ -832,9 +832,11 @@ __SYSCALL(__NR_io_uring_setup, sys_io_uring_setup)
>  __SYSCALL(__NR_io_uring_enter, sys_io_uring_enter)
>  #define __NR_io_uring_register 427
>  __SYSCALL(__NR_io_uring_register, sys_io_uring_register)
> +#define __NR_pidfd_wait 428
> +__SYSCALL(__NR_pidfd_wait, sys_pidfd_wait)
>  
>  #undef __NR_syscalls
> -#define __NR_syscalls 428
> +#define __NR_syscalls 429
>  
>  /*
>   * 32 bit systems traditionally used different
> diff --git a/kernel/signal.c b/kernel/signal.c
> index b7953934aa99..ebb550b87044 100644
> --- a/kernel/signal.c
> +++ b/kernel/signal.c
> @@ -3550,6 +3550,68 @@ static int copy_siginfo_from_user_any(kernel_siginfo_t *kinfo, siginfo_t *info)
>  	return copy_siginfo_from_user(kinfo, info);
>  }
>  
> +static ssize_t pidfd_wait_read_iter(struct kiocb *iocb, struct iov_iter *to)
> +{
> +	/*
> +	 * This is just a test string, it will contain the actual
> +	 * status of the pidfd in the future.
> +	 */
> +	char buf[] = "status";
> +
> +	return copy_to_iter(buf, strlen(buf)+1, to);
> +}
> +
> +static const struct file_operations pidfd_wait_file_ops = {
> +	.read_iter	= pidfd_wait_read_iter,
> +};
> +
> +static struct inode *pidfd_wait_get_inode(struct super_block *sb)
> +{
> +	struct inode *inode = new_inode(sb);
> +
> +	inode->i_ino = get_next_ino();
> +	inode_init_owner(inode, NULL, S_IFREG);
> +
> +	inode->i_op		= &simple_dir_inode_operations;
> +	inode->i_fop		= &pidfd_wait_file_ops;
> +
> +	return inode;
> +}
> +
> +SYSCALL_DEFINE1(pidfd_wait, int, pidfd)
> +{
> +	struct fd f;
> +	struct inode *inode;
> +	struct file *file;
> +	int new_fd;
> +	struct pid_namespace *pid_ns;
> +	struct super_block *sb;
> +	struct vfsmount *mnt;
> +
> +	f = fdget_raw(pidfd);
> +	if (!f.file)
> +		return -EBADF;
> +
> +	sb = file_inode(f.file)->i_sb;
> +	pid_ns = sb->s_fs_info;
> +
> +	inode = pidfd_wait_get_inode(sb);
> +
> +	mnt = pid_ns->proc_mnt;
> +
> +	file = alloc_file_pseudo(inode, mnt, "pidfd_wait", O_RDONLY,
> +			&pidfd_wait_file_ops);

So I dislike the idea of allocating new inodes from the procfs super
block. I would like to avoid pinning the whole pidfd concept exclusively
to proc. The idea is that the pidfd API will be useable through procfs
via open("/proc/<pid>") because that is what users expect and really
wanted to have for a long time. So it makes sense to have this working.
But it should really be useable without it. That's why translate_pid()
and pidfd_clone() are on the table.  What I'm saying is, once the pidfd
api is "complete" you should be able to set CONFIG_PROCFS=N - even
though that's crazy - and still be able to use pidfds. This is also a
point akpm asked about when I did the pidfd_send_signal work.

So instead of going through proc we should probably do what David has
been doing in the mount API and come to rely on anon_inode. So
something like:

fd = anon_inode_getfd("pidfd", &pidfd_fops, file_priv_data, flags);

and stash information such as pid namespace etc. in a pidfd struct or
something that we then can stash in file->private_data of the new file.
This also lets us avoid all this open coding done here.
Another advantage is that anon_inodes is its own kernel-internal
filesystem.

Christian

> +
> +	file->f_mode |= FMODE_PREAD;
> +
> +	new_fd = get_unused_fd_flags(0);
> +	fd_install(new_fd, file);
> +
> +	fdput(f);
> +
> +	return new_fd;
> +}
> +
>  /**
>   * sys_pidfd_send_signal - send a signal to a process through a task file
>   *                          descriptor
> diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
> index d21f4befaea4..f52c4d864038 100644
> --- a/kernel/sys_ni.c
> +++ b/kernel/sys_ni.c
> @@ -450,3 +450,6 @@ COND_SYSCALL(setuid16);
>  
>  /* restartable sequence */
>  COND_SYSCALL(rseq);
> +
> +/* pidfd */
> +COND_SYSCALL(pidfd_wait);
> -- 
> 2.21.0.225.g810b269d1ac-goog
> 

^ permalink raw reply	[flat|nested] 113+ messages in thread

* Re: [RFC] simple_lmk: Introduce Simple Low Memory Killer for Android
  2019-03-19 22:14                                                   ` Christian Brauner
@ 2019-03-19 22:26                                                     ` Joel Fernandes
  2019-03-19 22:48                                                     ` Daniel Colascione
  1 sibling, 0 replies; 113+ messages in thread
From: Joel Fernandes @ 2019-03-19 22:26 UTC (permalink / raw)
  To: Christian Brauner
  Cc: Daniel Colascione, Suren Baghdasaryan, Steven Rostedt,
	Sultan Alsawaf, Tim Murray, Michal Hocko, Greg Kroah-Hartman,
	Arve Hjønnevåg, Todd Kjos, Martijn Coenen, Ingo Molnar,
	Peter Zijlstra, LKML, open list:ANDROID DRIVERS, linux-mm,
	kernel-team, Oleg Nesterov, Andy Lutomirski, Serge E. Hallyn,
	keescook

On Tue, Mar 19, 2019 at 11:14:17PM +0100, Christian Brauner wrote:
[snip] 
> > 
> > ---8<-----------------------
> > 
> > From: Joel Fernandes <joelaf@google.com>
> > Subject: [PATCH] Partial skeleton prototype of pidfd_wait frontend
> > 
> > Signed-off-by: Joel Fernandes <joelaf@google.com>
> > ---
> >  arch/x86/entry/syscalls/syscall_32.tbl |  1 +
> >  arch/x86/entry/syscalls/syscall_64.tbl |  1 +
> >  include/linux/syscalls.h               |  1 +
> >  include/uapi/asm-generic/unistd.h      |  4 +-
> >  kernel/signal.c                        | 62 ++++++++++++++++++++++++++
> >  kernel/sys_ni.c                        |  3 ++
> >  6 files changed, 71 insertions(+), 1 deletion(-)
> > 
> > diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
> > index 1f9607ed087c..2a63f1896b63 100644
> > --- a/arch/x86/entry/syscalls/syscall_32.tbl
> > +++ b/arch/x86/entry/syscalls/syscall_32.tbl
> > @@ -433,3 +433,4 @@
> >  425	i386	io_uring_setup		sys_io_uring_setup		__ia32_sys_io_uring_setup
> >  426	i386	io_uring_enter		sys_io_uring_enter		__ia32_sys_io_uring_enter
> >  427	i386	io_uring_register	sys_io_uring_register		__ia32_sys_io_uring_register
> > +428	i386	pidfd_wait		sys_pidfd_wait			__ia32_sys_pidfd_wait
> > diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
> > index 92ee0b4378d4..cf2e08a8053b 100644
> > --- a/arch/x86/entry/syscalls/syscall_64.tbl
> > +++ b/arch/x86/entry/syscalls/syscall_64.tbl
> > @@ -349,6 +349,7 @@
> >  425	common	io_uring_setup		__x64_sys_io_uring_setup
> >  426	common	io_uring_enter		__x64_sys_io_uring_enter
> >  427	common	io_uring_register	__x64_sys_io_uring_register
> > +428	common	pidfd_wait		__x64_sys_pidfd_wait
> >  
> >  #
> >  # x32-specific system call numbers start at 512 to avoid cache impact
> > diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
> > index e446806a561f..62160970ed3f 100644
> > --- a/include/linux/syscalls.h
> > +++ b/include/linux/syscalls.h
> > @@ -988,6 +988,7 @@ asmlinkage long sys_rseq(struct rseq __user *rseq, uint32_t rseq_len,
> >  asmlinkage long sys_pidfd_send_signal(int pidfd, int sig,
> >  				       siginfo_t __user *info,
> >  				       unsigned int flags);
> > +asmlinkage long sys_pidfd_wait(int pidfd);
> >  
> >  /*
> >   * Architecture-specific system calls
> > diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
> > index dee7292e1df6..137aa8662230 100644
> > --- a/include/uapi/asm-generic/unistd.h
> > +++ b/include/uapi/asm-generic/unistd.h
> > @@ -832,9 +832,11 @@ __SYSCALL(__NR_io_uring_setup, sys_io_uring_setup)
> >  __SYSCALL(__NR_io_uring_enter, sys_io_uring_enter)
> >  #define __NR_io_uring_register 427
> >  __SYSCALL(__NR_io_uring_register, sys_io_uring_register)
> > +#define __NR_pidfd_wait 428
> > +__SYSCALL(__NR_pidfd_wait, sys_pidfd_wait)
> >  
> >  #undef __NR_syscalls
> > -#define __NR_syscalls 428
> > +#define __NR_syscalls 429
> >  
> >  /*
> >   * 32 bit systems traditionally used different
> > diff --git a/kernel/signal.c b/kernel/signal.c
> > index b7953934aa99..ebb550b87044 100644
> > --- a/kernel/signal.c
> > +++ b/kernel/signal.c
> > @@ -3550,6 +3550,68 @@ static int copy_siginfo_from_user_any(kernel_siginfo_t *kinfo, siginfo_t *info)
> >  	return copy_siginfo_from_user(kinfo, info);
> >  }
> >  
> > +static ssize_t pidfd_wait_read_iter(struct kiocb *iocb, struct iov_iter *to)
> > +{
> > +	/*
> > +	 * This is just a test string, it will contain the actual
> > +	 * status of the pidfd in the future.
> > +	 */
> > +	char buf[] = "status";
> > +
> > +	return copy_to_iter(buf, strlen(buf)+1, to);
> > +}
> > +
> > +static const struct file_operations pidfd_wait_file_ops = {
> > +	.read_iter	= pidfd_wait_read_iter,
> > +};
> > +
> > +static struct inode *pidfd_wait_get_inode(struct super_block *sb)
> > +{
> > +	struct inode *inode = new_inode(sb);
> > +
> > +	inode->i_ino = get_next_ino();
> > +	inode_init_owner(inode, NULL, S_IFREG);
> > +
> > +	inode->i_op		= &simple_dir_inode_operations;
> > +	inode->i_fop		= &pidfd_wait_file_ops;
> > +
> > +	return inode;
> > +}
> > +
> > +SYSCALL_DEFINE1(pidfd_wait, int, pidfd)
> > +{
> > +	struct fd f;
> > +	struct inode *inode;
> > +	struct file *file;
> > +	int new_fd;
> > +	struct pid_namespace *pid_ns;
> > +	struct super_block *sb;
> > +	struct vfsmount *mnt;
> > +
> > +	f = fdget_raw(pidfd);
> > +	if (!f.file)
> > +		return -EBADF;
> > +
> > +	sb = file_inode(f.file)->i_sb;
> > +	pid_ns = sb->s_fs_info;
> > +
> > +	inode = pidfd_wait_get_inode(sb);
> > +
> > +	mnt = pid_ns->proc_mnt;
> > +
> > +	file = alloc_file_pseudo(inode, mnt, "pidfd_wait", O_RDONLY,
> > +			&pidfd_wait_file_ops);
> 
> So I dislike the idea of allocating new inodes from the procfs super
> block. I would like to avoid pinning the whole pidfd concept exclusively
> to proc. The idea is that the pidfd API will be useable through procfs
> via open("/proc/<pid>") because that is what users expect and really
> wanted to have for a long time. So it makes sense to have this working.
> But it should really be useable without it. That's why translate_pid()
> and pidfd_clone() are on the table.  What I'm saying is, once the pidfd
> api is "complete" you should be able to set CONFIG_PROCFS=N - even
> though that's crazy - and still be able to use pidfds. This is also a
> point akpm asked about when I did the pidfd_send_signal work.

Oh, ok. Somehow 'proc' and 'pid' sound very similar in terminology so
naturally I felt the proc fs superblock would be a fit, but I see your point.

> So instead of going through proc we should probably do what David has
> been doing in the mount API and come to rely on anon_inode. So
> something like:
> 
> fd = anon_inode_getfd("pidfd", &pidfd_fops, file_priv_data, flags);
> 
> and stash information such as pid namespace etc. in a pidfd struct or
> something that we then can stash file->private_data of the new file.
> This also lets us avoid all this open coding done here.
> Another advantage is that anon_inodes is its own kernel-internal
> filesystem.

Thanks for the suggestion! Agreed this is better and will do it this way then. 

thanks,

 - Joel

^ permalink raw reply	[flat|nested] 113+ messages in thread

* Re: [RFC] simple_lmk: Introduce Simple Low Memory Killer for Android
  2019-03-19 22:14                                                   ` Christian Brauner
  2019-03-19 22:26                                                     ` Joel Fernandes
@ 2019-03-19 22:48                                                     ` Daniel Colascione
  2019-03-19 23:10                                                       ` Christian Brauner
  1 sibling, 1 reply; 113+ messages in thread
From: Daniel Colascione @ 2019-03-19 22:48 UTC (permalink / raw)
  To: Christian Brauner
  Cc: Joel Fernandes, Suren Baghdasaryan, Steven Rostedt,
	Sultan Alsawaf, Tim Murray, Michal Hocko, Greg Kroah-Hartman,
	Arve Hjønnevåg, Todd Kjos, Martijn Coenen, Ingo Molnar,
	Peter Zijlstra, LKML, open list:ANDROID DRIVERS, linux-mm,
	kernel-team, Oleg Nesterov, Andy Lutomirski, Serge E. Hallyn,
	Kees Cook

On Tue, Mar 19, 2019 at 3:14 PM Christian Brauner <christian@brauner.io> wrote:
> So I dislike the idea of allocating new inodes from the procfs super
> block. I would like to avoid pinning the whole pidfd concept exclusively
> to proc. The idea is that the pidfd API will be useable through procfs
> via open("/proc/<pid>") because that is what users expect and really
> wanted to have for a long time. So it makes sense to have this working.
> But it should really be useable without it. That's why translate_pid()
> and pidfd_clone() are on the table.  What I'm saying is, once the pidfd
> api is "complete" you should be able to set CONFIG_PROCFS=N - even
> though that's crazy - and still be able to use pidfds. This is also a
> point akpm asked about when I did the pidfd_send_signal work.

I agree that you shouldn't need CONFIG_PROCFS=Y to use pidfds. One
crazy idea that I was discussing with Joel the other day is to just
make CONFIG_PROCFS=Y mandatory and provide a new get_procfs_root()
system call that returned, out of thin air and independent of the
mount table, a procfs root directory file descriptor for the caller's
PID namespace and suitable for use with openat(2).

C'mon: /proc is used by everyone today and almost every program breaks
if it's not around. The string "/proc" is already de facto kernel ABI.
Let's just drop the pretense of /proc being optional and bake it into
the kernel proper, then give programs a way to get to /proc that isn't
tied to any particular mount configuration. This way, we don't need a
translate_pid(), since callers can just use procfs to do the same
thing. (That is, if I understand correctly what translate_pid does.)

We still need a pidfd_clone() for atomicity reasons, but that's a
separate story. My goal is to be able to write a library that
transparently creates and manages a helper child process even in a
"hostile" process environment in which some other uncoordinated thread
is constantly doing a waitpid(-1) (e.g., the JVM).

> So instead of going through proc we should probably do what David has
> been doing in the mount API and come to rely on anon_inode. So
> something like:
>
> fd = anon_inode_getfd("pidfd", &pidfd_fops, file_priv_data, flags);
>
> and stash information such as pid namespace etc. in a pidfd struct or
> something that we then can stash file->private_data of the new file.
> This also lets us avoid all this open coding done here.
> Another advantage is that anon_inodes is its own kernel-internal
> filesystem.

Sure. That works too.

^ permalink raw reply	[flat|nested] 113+ messages in thread

* Re: [RFC] simple_lmk: Introduce Simple Low Memory Killer for Android
  2019-03-19 22:48                                                     ` Daniel Colascione
@ 2019-03-19 23:10                                                       ` Christian Brauner
  2019-03-20  1:52                                                         ` Joel Fernandes
  0 siblings, 1 reply; 113+ messages in thread
From: Christian Brauner @ 2019-03-19 23:10 UTC (permalink / raw)
  To: Daniel Colascione
  Cc: Joel Fernandes, Suren Baghdasaryan, Steven Rostedt,
	Sultan Alsawaf, Tim Murray, Michal Hocko, Greg Kroah-Hartman,
	Arve Hjønnevåg, Todd Kjos, Martijn Coenen, Ingo Molnar,
	Peter Zijlstra, LKML, open list:ANDROID DRIVERS, linux-mm,
	kernel-team, Oleg Nesterov, Andy Lutomirski, Serge E. Hallyn,
	Kees Cook

On Tue, Mar 19, 2019 at 03:48:32PM -0700, Daniel Colascione wrote:
> On Tue, Mar 19, 2019 at 3:14 PM Christian Brauner <christian@brauner.io> wrote:
> > So I dislike the idea of allocating new inodes from the procfs super
> > block. I would like to avoid pinning the whole pidfd concept exclusively
> > to proc. The idea is that the pidfd API will be useable through procfs
> > via open("/proc/<pid>") because that is what users expect and really
> > wanted to have for a long time. So it makes sense to have this working.
> > But it should really be useable without it. That's why translate_pid()
> > and pidfd_clone() are on the table.  What I'm saying is, once the pidfd
> > api is "complete" you should be able to set CONFIG_PROCFS=N - even
> > though that's crazy - and still be able to use pidfds. This is also a
> > point akpm asked about when I did the pidfd_send_signal work.
> 
> I agree that you shouldn't need CONFIG_PROCFS=Y to use pidfds. One
> crazy idea that I was discussing with Joel the other day is to just
> make CONFIG_PROCFS=Y mandatory and provide a new get_procfs_root()
> system call that returned, out of thin air and independent of the
> mount table, a procfs root directory file descriptor for the caller's
> PID namespace and suitable for use with openat(2).

Even if this works I'm pretty sure that Al and a lot of others will not
be happy about this. A syscall to get an fd to /proc? That's not going
to happen and I don't see the need for a separate syscall just for that.
(I do see the point of making CONFIG_PROCFS=y the default btw.)

Inode allocation from the procfs mount for the file descriptors Joel
wants is not correct. They're not really procfs file descriptors so this
is a nack. We can't just hook into proc that way.

> 
> C'mon: /proc is used by everyone today and almost every program breaks
> if it's not around. The string "/proc" is already de facto kernel ABI.
> Let's just drop the pretense of /proc being optional and bake it into
> the kernel proper, then give programs a way to get to /proc that isn't
> tied to any particular mount configuration. This way, we don't need a
> translate_pid(), since callers can just use procfs to do the same
> thing. (That is, if I understand correctly what translate_pid does.)

I'm not sure what you think translate_pid() is doing since you're not
saying what you think it does.
Examples from the old patchset:
translate_pid(pid, ns, -1)      - get pid in our pid namespace
translate_pid(pid, -1, ns)      - get pid in other pid namespace
translate_pid(1, ns, -1)        - get pid of init task for namespace
translate_pid(pid, -1, ns) > 0  - is pid reachable from ns?
translate_pid(1, ns1, ns2) > 0  - is ns1 inside ns2?
translate_pid(1, ns1, ns2) == 0 - is ns1 outside ns2?
translate_pid(1, ns1, ns2) == 1 - is ns1 equal ns2?

Allowing this syscall to yield pidfds as proper regular file fds and
also taking pidfds as argument is an excellent way to kill a few
problems at once:
- cheap pid namespace introspection
- creates a bridge between the "old" pid-based api and the "new" pidfd api
- allows us to get proper non-directory file descriptors for any pids we
  like

The additional advantage is that people are already happy to add this
syscall so simply extending it and routing it through the pidfd tree or
Eric's tree is reasonable. (It should probably grow a flag argument. I
need to start prototyping this.)

> 
> We still need a pidfd_clone() for atomicity reasons, but that's a
> separate story. My goal is to be able to write a library that

Yes, on my todo list and I have a ported patch based on prior work
rotting somewhere on my git server.

> transparently creates and manages a helper child process even in a
> "hostile" process environment in which some other uncoordinated thread
> is constantly doing a waitpid(-1) (e.g., the JVM).
> 
> > So instead of going through proc we should probably do what David has
> > been doing in the mount API and come to rely on anon_inode. So
> > something like:
> >
> > fd = anon_inode_getfd("pidfd", &pidfd_fops, file_priv_data, flags);
> >
> > and stash information such as pid namespace etc. in a pidfd struct or
> > something that we then can stash file->private_data of the new file.
> > This also lets us avoid all this open coding done here.
> > Another advantage is that anon_inodes is its own kernel-internal
> > filesystem.
> 
> Sure. That works too.

Great.

^ permalink raw reply	[flat|nested] 113+ messages in thread

* Re: [RFC] simple_lmk: Introduce Simple Low Memory Killer for Android
  2019-03-19 23:10                                                       ` Christian Brauner
@ 2019-03-20  1:52                                                         ` Joel Fernandes
  2019-03-20  2:42                                                           ` pidfd design Daniel Colascione
  2019-05-07  2:16                                                           ` [RFC] simple_lmk: Introduce Simple Low Memory Killer for Android Sultan Alsawaf
  0 siblings, 2 replies; 113+ messages in thread
From: Joel Fernandes @ 2019-03-20  1:52 UTC (permalink / raw)
  To: Christian Brauner
  Cc: Daniel Colascione, Suren Baghdasaryan, Steven Rostedt,
	Sultan Alsawaf, Tim Murray, Michal Hocko, Greg Kroah-Hartman,
	Arve Hjønnevåg, Todd Kjos, Martijn Coenen, Ingo Molnar,
	Peter Zijlstra, LKML, open list:ANDROID DRIVERS, linux-mm,
	kernel-team, Oleg Nesterov, Andy Lutomirski, Serge E. Hallyn,
	Kees Cook

On Wed, Mar 20, 2019 at 12:10:23AM +0100, Christian Brauner wrote:
> On Tue, Mar 19, 2019 at 03:48:32PM -0700, Daniel Colascione wrote:
> > On Tue, Mar 19, 2019 at 3:14 PM Christian Brauner <christian@brauner.io> wrote:
> > > So I dislike the idea of allocating new inodes from the procfs super
> > > block. I would like to avoid pinning the whole pidfd concept exclusively
> > > to proc. The idea is that the pidfd API will be useable through procfs
> > > via open("/proc/<pid>") because that is what users expect and really
> > > wanted to have for a long time. So it makes sense to have this working.
> > > But it should really be useable without it. That's why translate_pid()
> > > and pidfd_clone() are on the table.  What I'm saying is, once the pidfd
> > > api is "complete" you should be able to set CONFIG_PROCFS=N - even
> > > though that's crazy - and still be able to use pidfds. This is also a
> > > point akpm asked about when I did the pidfd_send_signal work.
> > 
> > I agree that you shouldn't need CONFIG_PROCFS=Y to use pidfds. One
> > crazy idea that I was discussing with Joel the other day is to just
> > make CONFIG_PROCFS=Y mandatory and provide a new get_procfs_root()
> > system call that returned, out of thin air and independent of the
> > mount table, a procfs root directory file descriptor for the caller's
> > PID namespace and suitable for use with openat(2).
> 
> Even if this works I'm pretty sure that Al and a lot of others will not
> be happy about this. A syscall to get an fd to /proc? That's not going
> to happen and I don't see the need for a separate syscall just for that.
> (I do see the point of making CONFIG_PROCFS=y the default btw.)

I think his point here was that he wanted a handle to procfs no matter where
it was mounted and then can later use openat on that. Agreed that it may be
unnecessary unless there is a usecase for it, and especially if the /proc
directory being the de facto mountpoint for procfs is a universal convention.

> Inode allocation from the procfs mount for the file descriptors Joel
> wants is not correct. They're not really procfs file descriptors so this
> is a nack. We can't just hook into proc that way.

I was not particular about using procfs mount for the FDs but that's the only
way I knew how to do it until you pointed out anon_inode (my grep skills
missed that), so thank you!

> > C'mon: /proc is used by everyone today and almost every program breaks
> > if it's not around. The string "/proc" is already de facto kernel ABI.
> > Let's just drop the pretense of /proc being optional and bake it into
> > the kernel proper, then give programs a way to get to /proc that isn't
> > tied to any particular mount configuration. This way, we don't need a
> > translate_pid(), since callers can just use procfs to do the same
> > thing. (That is, if I understand correctly what translate_pid does.)
> 
> I'm not sure what you think translate_pid() is doing since you're not
> saying what you think it does.
> Examples from the old patchset:
> translate_pid(pid, ns, -1)      - get pid in our pid namespace
> translate_pid(pid, -1, ns)      - get pid in other pid namespace
> translate_pid(1, ns, -1)        - get pid of init task for namespace
> translate_pid(pid, -1, ns) > 0  - is pid reachable from ns?
> translate_pid(1, ns1, ns2) > 0  - is ns1 inside ns2?
> translate_pid(1, ns1, ns2) == 0 - is ns1 outside ns2?
> translate_pid(1, ns1, ns2) == 1 - is ns1 equal ns2?
> 
> Allowing this syscall to yield pidfds as proper regular file fds and
> also taking pidfds as argument is an excellent way to kill a few
> problems at once:
> - cheap pid namespace introspection
> - creates a bridge between the "old" pid-based api and the "new" pidfd api

This second point would solve the problem of getting a new pidfd given a pid
indeed, without depending on /proc/<pid> at all. So kudos for that and I am
glad you are making it return pidfds (but correct me if I misunderstood what
you're planning to do with translate_pid). It also obviates any need for
dealing with procfs mount points.

> - allows us to get proper non-directory file descriptors for any pids we
>   like

Here I got a bit lost. AIUI pidfd is a directory fd. Why would we want it to
not be a directory fd? That would be ambiguous with what pidfd_send_signal
expects.

Also would it be a bad idea to extend translate_pid to also do what we want
for the pidfd_wait syscall?  So translate_pid in this case would return an fd
that is just used for the pid's death status.

All of these extensions seem to mean translate_pid should probably take a
fourth parameter that tells it the target translation type?

The way I am hypothesizing translate_pid, it should probably support:
- translation to a pid in some ns
- translation of a pid to a pidfd
- translation of a pid to a "wait" fd which returns the death/reap process status.

If that makes sense, that would also avoid the need for a new syscall we are adding.

> The additional advantage is that people are already happy to add this
> syscall so simply extending it and routing it through the pidfd tree or
> Eric's tree is reasonable. (It should probably grow a flag argument. I
> need to start prototyping this.)

Great!

> > 
> > We still need a pidfd_clone() for atomicity reasons, but that's a
> > separate story. My goal is to be able to write a library that
> 
> Yes, on my todo list and I have a ported patch based on prior work
> rotting somewhere on my git server.

Is that different from using dup2 on a pidfd?  Sorry I don't follow what is
pidfd_clone / why it is needed.

thanks,

 - Joel

^ permalink raw reply	[flat|nested] 113+ messages in thread

* pidfd design
  2019-03-20  1:52                                                         ` Joel Fernandes
@ 2019-03-20  2:42                                                           ` Daniel Colascione
  2019-03-20  3:59                                                             ` Christian Brauner
  2019-05-07  2:16                                                           ` [RFC] simple_lmk: Introduce Simple Low Memory Killer for Android Sultan Alsawaf
  1 sibling, 1 reply; 113+ messages in thread
From: Daniel Colascione @ 2019-03-20  2:42 UTC (permalink / raw)
  To: Joel Fernandes
  Cc: Christian Brauner, Suren Baghdasaryan, Steven Rostedt,
	Sultan Alsawaf, Tim Murray, Michal Hocko, Greg Kroah-Hartman,
	Arve Hjønnevåg, Todd Kjos, Martijn Coenen, Ingo Molnar,
	Peter Zijlstra, LKML, open list:ANDROID DRIVERS, linux-mm,
	kernel-team, Oleg Nesterov, Andy Lutomirski, Serge E. Hallyn,
	Kees Cook

On Tue, Mar 19, 2019 at 6:52 PM Joel Fernandes <joel@joelfernandes.org> wrote:
>
> On Wed, Mar 20, 2019 at 12:10:23AM +0100, Christian Brauner wrote:
> > On Tue, Mar 19, 2019 at 03:48:32PM -0700, Daniel Colascione wrote:
> > > On Tue, Mar 19, 2019 at 3:14 PM Christian Brauner <christian@brauner.io> wrote:
> > > > So I dislike the idea of allocating new inodes from the procfs super
> > > > block. I would like to avoid pinning the whole pidfd concept exclusively
> > > > to proc. The idea is that the pidfd API will be useable through procfs
> > > > via open("/proc/<pid>") because that is what users expect and really
> > > > wanted to have for a long time. So it makes sense to have this working.
> > > > But it should really be useable without it. That's why translate_pid()
> > > > and pidfd_clone() are on the table.  What I'm saying is, once the pidfd
> > > > api is "complete" you should be able to set CONFIG_PROCFS=N - even
> > > > though that's crazy - and still be able to use pidfds. This is also a
> > > > point akpm asked about when I did the pidfd_send_signal work.
> > >
> > > I agree that you shouldn't need CONFIG_PROCFS=Y to use pidfds. One
> > > crazy idea that I was discussing with Joel the other day is to just
> > > make CONFIG_PROCFS=Y mandatory and provide a new get_procfs_root()
> > > system call that returned, out of thin air and independent of the
> > > mount table, a procfs root directory file descriptor for the caller's
> > > PID namespace and suitable for use with openat(2).
> >
> > Even if this works I'm pretty sure that Al and a lot of others will not
> > be happy about this. A syscall to get an fd to /proc?

Why not? procfs provides access to a lot of core kernel functionality.
Why should you need a mountpoint to get to it?

> That's not going
> > to happen and I don't see the need for a separate syscall just for that.

We need a system call for the same reason we need a getrandom(2): you
have to bootstrap somehow when you're in a minimal environment.

> > (I do see the point of making CONFIG_PROCFS=y the default btw.)

I'm not proposing that we make CONFIG_PROCFS=y the default. I'm
proposing that we *hardwire* it as the default and just declare that
it's not possible to build a Linux kernel that doesn't include procfs.
Why do we even have that button?

> I think his point here was that he wanted a handle to procfs no matter where
> it was mounted and then can later use openat on that. Agreed that it may be
> unnecessary unless there is a usecase for it, and especially if the /proc
> directory being the defacto mountpoint for procfs is a universal convention.

If it's a universal convention and, in practice, everyone needs proc
mounted anyway, then what's the harm in hardwiring CONFIG_PROCFS=y? If
we advertise /proc as not merely some kind of optional debug interface
but *the* way certain kernel features are exposed --- and there's
nothing wrong with that --- then we should give programs access to
these core kernel features in a way that doesn't depend on userspace
kernel configuration, and you do that by either providing a
procfs-root-getting system call or just hardwiring the "/proc/" prefix
into VFS.

> > Inode allocation from the procfs mount for the file descriptors Joel
> > wants is not correct. They're not really procfs file descriptors so this
> > is a nack. We can't just hook into proc that way.
>
> I was not particular about using procfs mount for the FDs but that's the only
> way I knew how to do it until you pointed out anon_inode (my grep skills
> missed that), so thank you!
>
> > > C'mon: /proc is used by everyone today and almost every program breaks
> > > if it's not around. The string "/proc" is already de facto kernel ABI.
> > > Let's just drop the pretense of /proc being optional and bake it into
> > > the kernel proper, then give programs a way to get to /proc that isn't
> > > tied to any particular mount configuration. This way, we don't need a
> > > translate_pid(), since callers can just use procfs to do the same
> > > thing. (That is, if I understand correctly what translate_pid does.)
> >
> > I'm not sure what you think translate_pid() is doing since you're not
> > saying what you think it does.
> > Examples from the old patchset:
> > translate_pid(pid, ns, -1)      - get pid in our pid namespace

Ah, it's a bit different from what I had in mind. It's fair to want to
translate PIDs between namespaces, but the only way to make the
translate_pid under discussion robust is to have it accept and produce
pidfds. (At that point, you might as well call it translate_pidfd.) We
should not be adding new APIs to the kernel that accept numeric PIDs:
it's not possible to use these APIs correctly except under very
limited circumstances --- mostly, talking about init or a parent
talking about its child.

Really, we need a few related operations, and we shouldn't necessarily
mingle them.

1) Given a numeric PID, give me a pidfd: that works today: you just
open /proc/<pid>

2) Given a pidfd, give me a numeric PID: that works today: you just
openat(pidfd, "stat", O_RDONLY) and read the first token (which is
always the numeric PID).

3) Given a pidfd, send a signal: that's what pidfd_send_signal does,
and it's a good start on the rest of these operations.

4) Given a pidfd, wait for the named process to exit: that's what my
original exithand thing did, and that's what Joel's helpfully agreed
to start hacking on.

5) Given a pidfd in NS1, get a pidfd in NS2. That's what translate_pid
is for. My preferred signature for this routine is translate_pid(int
pidfd, int nsfd) -> pidfd. We don't need two namespace arguments. Why
not? Because the pidfd *already* names a single process, uniquely!

6) Make a new process and atomically give me a pidfd for it. We need a
new kind of clone(2) for that. People have been proposing some kind of
FD-based fork/spawn/etc. thing for ages, and we can finally provide
it. Yay.

7) Retrieve miscellaneous information about a process identified by a
pidfd: openat(2) handles this case today.

This is a decent framework for a good general-purpose process API that
builds on the one the kernel already provides. With this API, people
should never have to touch the old unix process API except to talk to
humans and other legacy systems. It's a big project, but worthwhile,
and we can do it piecemeal.

Christian, what worries me is that you want to make this project 10x
harder, both in technical and lkml-political terms, by making it work
without CONFIG_PROCFS=y. Without procfs, all the operations above that
involve the word "openat" or "/proc" break, which means that our
general-purpose process API needs to provide its own equivalents to
these operations, and on top of these, its own non-procfs pidfd FD
type --- let's call it pidfd_2. (Let's call a directory FD on
/proc/<pid> a pidfd_1.) Under this scheme, we have to have all
operations that accept a pidfd_1 (like pidfd_send_signal) and have
them accept pidfd_2 file descriptors as well in general fashion. (The
difference between pidfd_1 and pidfd_2 is visible to users who can
call fstat and look at st_dev.) We'd also need an API to translate a
pidfd_2 to a pidfd_1 so you could call openat on it to look at
/proc/<pid> data files, to support operation #7 above.  The
alternative to provide #7 is some kind of new general-purpose
process-information-retrieval interface that mirrors the functionality
/proc/<pid> already provides --- e.g., getting the smaps list for a
process.

To sum it up, we can

A) declare that pidfds don't work without CONFIG_PROCFS=y,
B) hardwire CONFIG_PROCFS=y in all configurations, or
C) support both procfs-based pidfd_1 FDs and non-procfs pidfd_2 FDs.

Option C seems like pointless complexity to me, as I described above.
Option C means that we have to duplicate a lot of existing and
perfectly good functionality.

Option A is fine by me, since I think CONFIG_PROCFS=n is just a
bonkers and broken configuration that's barely even Linux.

From a design perspective, I prefer option B: it turns a de-facto
guaranteed /proc ABI into a de-jure guaranteed ABI, and that's just
more straightforward for everyone --- especially since it reduces the
complexity of the Linux core by deleting all the !CONFIG_PROCFS code
paths. My point about the procfs system call is that *if* we go with
option B and make procfs mandatory, we're essentially stating that
certain kernel functionality is always available, and because (as a
general rule) kernel functionality that's always available should be
available to every process, we should provide a way to *use* this
always-present kernel functionality that doesn't depend on the mount
table --- thus my proposed get_procfs_root(2).

We don't have to decide between A and B right now. We can continue
pidfd development under the assumption we're going with option
A, and when option B seems like a good idea, we can just switch with
minimal hassle. On the other hand, if we did implement option C and,
later, it became apparent that option B was right after all, all the
work needed for option C would have been a waste.

^ permalink raw reply	[flat|nested] 113+ messages in thread

* Re: pidfd design
  2019-03-20  2:42                                                           ` pidfd design Daniel Colascione
@ 2019-03-20  3:59                                                             ` Christian Brauner
  2019-03-20  7:02                                                               ` Daniel Colascione
  0 siblings, 1 reply; 113+ messages in thread
From: Christian Brauner @ 2019-03-20  3:59 UTC (permalink / raw)
  To: Daniel Colascione
  Cc: Joel Fernandes, Suren Baghdasaryan, Steven Rostedt,
	Sultan Alsawaf, Tim Murray, Michal Hocko, Greg Kroah-Hartman,
	Arve Hjønnevåg, Todd Kjos, Martijn Coenen, Ingo Molnar,
	Peter Zijlstra, LKML, open list:ANDROID DRIVERS, linux-mm,
	kernel-team, Oleg Nesterov, Andy Lutomirski, Serge E. Hallyn,
	Kees Cook

On Tue, Mar 19, 2019 at 07:42:52PM -0700, Daniel Colascione wrote:
> On Tue, Mar 19, 2019 at 6:52 PM Joel Fernandes <joel@joelfernandes.org> wrote:
> >
> > On Wed, Mar 20, 2019 at 12:10:23AM +0100, Christian Brauner wrote:
> > > On Tue, Mar 19, 2019 at 03:48:32PM -0700, Daniel Colascione wrote:
> > > > On Tue, Mar 19, 2019 at 3:14 PM Christian Brauner <christian@brauner.io> wrote:
> > > > > So I dislike the idea of allocating new inodes from the procfs super
> > > > > block. I would like to avoid pinning the whole pidfd concept exclusively
> > > > > to proc. The idea is that the pidfd API will be useable through procfs
> > > > > via open("/proc/<pid>") because that is what users expect and really
> > > > > wanted to have for a long time. So it makes sense to have this working.
> > > > > But it should really be useable without it. That's why translate_pid()
> > > > > and pidfd_clone() are on the table.  What I'm saying is, once the pidfd
> > > > > api is "complete" you should be able to set CONFIG_PROCFS=N - even
> > > > > though that's crazy - and still be able to use pidfds. This is also a
> > > > > point akpm asked about when I did the pidfd_send_signal work.
> > > >
> > > > I agree that you shouldn't need CONFIG_PROCFS=Y to use pidfds. One
> > > > crazy idea that I was discussing with Joel the other day is to just
> > > > make CONFIG_PROCFS=Y mandatory and provide a new get_procfs_root()
> > > > system call that returned, out of thin air and independent of the
> > > > mount table, a procfs root directory file descriptor for the caller's
> > > > PID namspace and suitable for use with openat(2).
> > >
> > > Even if this works I'm pretty sure that Al and a lot of others will not
> > > be happy about this. A syscall to get an fd to /proc?
> 
> Why not? procfs provides access to a lot of core kernel functionality.
> Why should you need a mountpoint to get to it?
> 
> > That's not going
> > > to happen and I don't see the need for a separate syscall just for that.
> 
> We need a system call for the same reason we need a getrandom(2): you
> have to bootstrap somehow when you're in a minimal environment.
> 
> > > (I do see the point of making CONFIG_PROCFS=y the default btw.)
> 
> I'm not proposing that we make CONFIG_PROCFS=y the default. I'm
> proposing that we *hardwire* it as the default and just declare that
> it's not possible to build a Linux kernel that doesn't include procfs.
> Why do we even have that button?
> 
> > I think his point here was that he wanted a handle to procfs no matter where
> > it was mounted and then can later use openat on that. Agreed that it may be
> > unnecessary unless there is a usecase for it, and especially if the /proc
> > directory being the defacto mountpoint for procfs is a universal convention.
> 
> If it's a universal convention and, in practice, everyone needs proc
> mounted anyway, so what's the harm in hardwiring CONFIG_PROCFS=y? If
> we advertise /proc as not merely some kind of optional debug interface
> but *the* way certain kernel features are exposed --- and there's
> nothing wrong with that --- then we should give programs access to
> these core kernel features in a way that doesn't depend on userspace
> kernel configuration, and you do that by either providing a
> procfs-root-getting system call or just hardwiring the "/proc/" prefix
> into VFS.
> 
> > > Inode allocation from the procfs mount for the file descriptors Joel
> > > wants is not correct. Their not really procfs file descriptors so this
> > > is a nack. We can't just hook into proc that way.
> >
> > I was not particular about using procfs mount for the FDs but that's the only
> > way I knew how to do it until you pointed out anon_inode (my grep skills
> > missed that), so thank you!
> >
> > > > C'mon: /proc is used by everyone today and almost every program breaks
> > > > if it's not around. The string "/proc" is already de facto kernel ABI.
> > > > Let's just drop the pretense of /proc being optional and bake it into
> > > > the kernel proper, then give programs a way to get to /proc that isn't
> > > > tied to any particular mount configuration. This way, we don't need a
> > > > translate_pid(), since callers can just use procfs to do the same
> > > > thing. (That is, if I understand correctly what translate_pid does.)
> > >
> > > I'm not sure what you think translate_pid() is doing since you're not
> > > saying what you think it does.
> > > Examples from the old patchset:
> > > translate_pid(pid, ns, -1)      - get pid in our pid namespace
> 
> Ah, it's a bit different from what I had in mind. It's fair to want to
> translate PIDs between namespaces, but the only way to make the
> translate_pid under discussion robust is to have it accept and produce
> pidfds. (At that point, you might as well call it translate_pidfd.) We
> should not be adding new APIs to the kernel that accept numeric PIDs:

The traditional pid-based api is not going away. There are users that
have the requirement to translate pids between namespaces and also doing
introspection on these namespaces independent of pidfds. We will not
restrict the usefulness of this syscall by making it only work with
pidfds.

> it's not possible to use these APIs correctly except under very
> limited circumstances --- mostly, talking about init or a parent

The pid-based api is one of the most widely used apis of the kernel and
people have been using it quite successfully for a long time. Yes, it's
racy, but it's here to stay.

> talking about its child.
> 
> Really, we need a few related operations, and we shouldn't necessarily
> mingle them.

Yes, we've established that previously.

> 
> 1) Given a numeric PID, give me a pidfd: that works today: you just
> open /proc/<pid>

Agreed.

> 
> 2) Given a pidfd, give me a numeric PID: that works today: you just
> openat(pidfd, "stat", O_RDONLY) and read the first token (which is
> always the numeric PID).

Agreed.

> 
> 3) Given a pidfd, send a signal: that's what pidfd_send_signal does,
> and it's a good start on the rest of these operations.

Agreed.

> 5) Given a pidfd in NS1, get a pidfd in NS2. That's what translate_pid
> is for. My preferred signature for this routine is translate_pid(int
> pidfd, int nsfd) -> pidfd. We don't need two namespace arguments. Why
> not? Because the pidfd *already* names a single process, uniquely!

Given that people are interested in pids we can't just always return a
pidfd. That would mean a user would need to get the pidfd, read from
<pidfd>/stat and then close the pidfd. If you do that for a 100 pids or
more you end up allocating and closing file descriptors constantly for
no reason. We can't just debate pids away. So it will also need to be
able to yield pids e.g. through a flag argument.

> 
> 6) Make a new process and atomically give me a pidfd for it. We need a
> new kind of clone(2) for that. People have been proposing some kind of
> FD-based fork/spawn/etc. thing for ages, and we can finally provide
> it. Yay.

Agreed.

^ permalink raw reply	[flat|nested] 113+ messages in thread

* Re: pidfd design
  2019-03-20  3:59                                                             ` Christian Brauner
@ 2019-03-20  7:02                                                               ` Daniel Colascione
  2019-03-20 11:33                                                                 ` Joel Fernandes
  0 siblings, 1 reply; 113+ messages in thread
From: Daniel Colascione @ 2019-03-20  7:02 UTC (permalink / raw)
  To: Christian Brauner
  Cc: Joel Fernandes, Suren Baghdasaryan, Steven Rostedt,
	Sultan Alsawaf, Tim Murray, Michal Hocko, Greg Kroah-Hartman,
	Arve Hjønnevåg, Todd Kjos, Martijn Coenen, Ingo Molnar,
	Peter Zijlstra, LKML, open list:ANDROID DRIVERS, linux-mm,
	kernel-team, Oleg Nesterov, Andy Lutomirski, Serge E. Hallyn,
	Kees Cook

On Tue, Mar 19, 2019 at 8:59 PM Christian Brauner <christian@brauner.io> wrote:
>
> On Tue, Mar 19, 2019 at 07:42:52PM -0700, Daniel Colascione wrote:
> > On Tue, Mar 19, 2019 at 6:52 PM Joel Fernandes <joel@joelfernandes.org> wrote:
> > >
> > > On Wed, Mar 20, 2019 at 12:10:23AM +0100, Christian Brauner wrote:
> > > > On Tue, Mar 19, 2019 at 03:48:32PM -0700, Daniel Colascione wrote:
> > > > > On Tue, Mar 19, 2019 at 3:14 PM Christian Brauner <christian@brauner.io> wrote:
> > > > > > So I dislike the idea of allocating new inodes from the procfs super
> > > > > > block. I would like to avoid pinning the whole pidfd concept exclusively
> > > > > > to proc. The idea is that the pidfd API will be useable through procfs
> > > > > > via open("/proc/<pid>") because that is what users expect and really
> > > > > > wanted to have for a long time. So it makes sense to have this working.
> > > > > > But it should really be useable without it. That's why translate_pid()
> > > > > > and pidfd_clone() are on the table.  What I'm saying is, once the pidfd
> > > > > > api is "complete" you should be able to set CONFIG_PROCFS=N - even
> > > > > > though that's crazy - and still be able to use pidfds. This is also a
> > > > > > point akpm asked about when I did the pidfd_send_signal work.
> > > > >
> > > > > I agree that you shouldn't need CONFIG_PROCFS=Y to use pidfds. One
> > > > > crazy idea that I was discussing with Joel the other day is to just
> > > > > make CONFIG_PROCFS=Y mandatory and provide a new get_procfs_root()
> > > > > system call that returned, out of thin air and independent of the
> > > > > mount table, a procfs root directory file descriptor for the caller's
> > > > > PID namspace and suitable for use with openat(2).
> > > >
> > > > Even if this works I'm pretty sure that Al and a lot of others will not
> > > > be happy about this. A syscall to get an fd to /proc?
> >
> > Why not? procfs provides access to a lot of core kernel functionality.
> > Why should you need a mountpoint to get to it?
> >
> > > That's not going
> > > > to happen and I don't see the need for a separate syscall just for that.
> >
> > We need a system call for the same reason we need a getrandom(2): you
> > have to bootstrap somehow when you're in a minimal environment.
> >
> > > > (I do see the point of making CONFIG_PROCFS=y the default btw.)
> >
> > I'm not proposing that we make CONFIG_PROCFS=y the default. I'm
> > proposing that we *hardwire* it as the default and just declare that
> > it's not possible to build a Linux kernel that doesn't include procfs.
> > Why do we even have that button?
> >
> > > I think his point here was that he wanted a handle to procfs no matter where
> > > it was mounted and then can later use openat on that. Agreed that it may be
> > > unnecessary unless there is a usecase for it, and especially if the /proc
> > > directory being the defacto mountpoint for procfs is a universal convention.
> >
> > If it's a universal convention and, in practice, everyone needs proc
> > mounted anyway, so what's the harm in hardwiring CONFIG_PROCFS=y? If
> > we advertise /proc as not merely some kind of optional debug interface
> > but *the* way certain kernel features are exposed --- and there's
> > nothing wrong with that --- then we should give programs access to
> > these core kernel features in a way that doesn't depend on userspace
> > kernel configuration, and you do that by either providing a
> > procfs-root-getting system call or just hardwiring the "/proc/" prefix
> > into VFS.
> >
> > > > Inode allocation from the procfs mount for the file descriptors Joel
> > > > wants is not correct. Their not really procfs file descriptors so this
> > > > is a nack. We can't just hook into proc that way.
> > >
> > > I was not particular about using procfs mount for the FDs but that's the only
> > > way I knew how to do it until you pointed out anon_inode (my grep skills
> > > missed that), so thank you!
> > >
> > > > > C'mon: /proc is used by everyone today and almost every program breaks
> > > > > if it's not around. The string "/proc" is already de facto kernel ABI.
> > > > > Let's just drop the pretense of /proc being optional and bake it into
> > > > > the kernel proper, then give programs a way to get to /proc that isn't
> > > > > tied to any particular mount configuration. This way, we don't need a
> > > > > translate_pid(), since callers can just use procfs to do the same
> > > > > thing. (That is, if I understand correctly what translate_pid does.)
> > > >
> > > > I'm not sure what you think translate_pid() is doing since you're not
> > > > saying what you think it does.
> > > > Examples from the old patchset:
> > > > translate_pid(pid, ns, -1)      - get pid in our pid namespace
> >
> > Ah, it's a bit different from what I had in mind. It's fair to want to
> > translate PIDs between namespaces, but the only way to make the
> > translate_pid under discussion robust is to have it accept and produce
> > pidfds. (At that point, you might as well call it translate_pidfd.) We
> > should not be adding new APIs to the kernel that accept numeric PIDs:
>
> The traditional pid-based api is not going away. There are users that
> have the requirement to translate pids between namespaces and also doing
> introspection on these namespaces independent of pidfds. We will not
> restrict the usefulness of this syscall by making it only work with
> pidfds.
>
> > it's not possible to use these APIs correctly except under very
> > limited circumstances --- mostly, talking about init or a parent
>
> The pid-based api is one of the most widely used apis of the kernel and
> people have been using it quite successfully for a long time. Yes, it's
> rac, but it's here to stay.
>
> > talking about its child.
> >
> > Really, we need a few related operations, and we shouldn't necessarily
> > mingle them.
>
> Yes, we've established that previously.
>
> >
> > 1) Given a numeric PID, give me a pidfd: that works today: you just
> > open /proc/<pid>
>
> Agreed.
>
> >
> > 2) Given a pidfd, give me a numeric PID: that works today: you just
> > openat(pidfd, "stat", O_RDONLY) and read the first token (which is
> > always the numeric PID).
>
> Agreed.
>
> >
> > 3) Given a pidfd, send a signal: that's what pidfd_send_signal does,
> > and it's a good start on the rest of these operations.
>
> Agreed.
>
> > 5) Given a pidfd in NS1, get a pidfd in NS2. That's what translate_pid
> > is for. My preferred signature for this routine is translate_pid(int
> > pidfd, int nsfd) -> pidfd. We don't need two namespace arguments. Why
> > not? Because the pidfd *already* names a single process, uniquely!
>
> Given that people are interested in pids we can't just always return a
> pidfd. That would mean a user would need to do get the pidfd read from
> <pidfd>/stat and then close the pidfd. If you do that for a 100 pids or
> more you end up allocating and closing file descriptors constantly for
> no reason. We can't just debate pids away. So it will also need to be
> able to yield pids e.g. through a flag argument.

Sure, but that's still not a reason that we should care about pidfds
working separately from procfs.

^ permalink raw reply	[flat|nested] 113+ messages in thread

* Re: pidfd design
  2019-03-20  7:02                                                               ` Daniel Colascione
@ 2019-03-20 11:33                                                                 ` Joel Fernandes
  2019-03-20 18:26                                                                   ` Christian Brauner
  0 siblings, 1 reply; 113+ messages in thread
From: Joel Fernandes @ 2019-03-20 11:33 UTC (permalink / raw)
  To: Daniel Colascione, Christian Brauner
  Cc: Suren Baghdasaryan, Steven Rostedt, Sultan Alsawaf, Tim Murray,
	Michal Hocko, Greg Kroah-Hartman, Arve Hjønnevåg,
	Todd Kjos, Martijn Coenen, Ingo Molnar, Peter Zijlstra, LKML,
	open list:ANDROID DRIVERS, linux-mm, kernel-team, Oleg Nesterov,
	Andy Lutomirski, Serge E. Hallyn, Kees Cook



On March 20, 2019 3:02:32 AM EDT, Daniel Colascione <dancol@google.com> wrote:
>On Tue, Mar 19, 2019 at 8:59 PM Christian Brauner
><christian@brauner.io> wrote:
>>
>> On Tue, Mar 19, 2019 at 07:42:52PM -0700, Daniel Colascione wrote:
>> > On Tue, Mar 19, 2019 at 6:52 PM Joel Fernandes
><joel@joelfernandes.org> wrote:
>> > >
>> > > On Wed, Mar 20, 2019 at 12:10:23AM +0100, Christian Brauner
>wrote:
>> > > > On Tue, Mar 19, 2019 at 03:48:32PM -0700, Daniel Colascione
>wrote:
>> > > > > On Tue, Mar 19, 2019 at 3:14 PM Christian Brauner
><christian@brauner.io> wrote:
>> > > > > > So I dislike the idea of allocating new inodes from the
>procfs super
>> > > > > > block. I would like to avoid pinning the whole pidfd
>concept exclusively
>> > > > > > to proc. The idea is that the pidfd API will be useable
>through procfs
>> > > > > > via open("/proc/<pid>") because that is what users expect
>and really
>> > > > > > wanted to have for a long time. So it makes sense to have
>this working.
>> > > > > > But it should really be useable without it. That's why
>translate_pid()
>> > > > > > and pidfd_clone() are on the table.  What I'm saying is,
>once the pidfd
>> > > > > > api is "complete" you should be able to set CONFIG_PROCFS=N
>- even
>> > > > > > though that's crazy - and still be able to use pidfds. This
>is also a
>> > > > > > point akpm asked about when I did the pidfd_send_signal
>work.
>> > > > >
>> > > > > I agree that you shouldn't need CONFIG_PROCFS=Y to use
>pidfds. One
>> > > > > crazy idea that I was discussing with Joel the other day is
>to just
>> > > > > make CONFIG_PROCFS=Y mandatory and provide a new
>get_procfs_root()
>> > > > > system call that returned, out of thin air and independent of
>the
>> > > > > mount table, a procfs root directory file descriptor for the
>caller's
>> > > > > PID namspace and suitable for use with openat(2).
>> > > >
>> > > > Even if this works I'm pretty sure that Al and a lot of others
>will not
>> > > > be happy about this. A syscall to get an fd to /proc?
>> >
>> > Why not? procfs provides access to a lot of core kernel
>functionality.
>> > Why should you need a mountpoint to get to it?
>> >
>> > > That's not going
>> > > > to happen and I don't see the need for a separate syscall just
>for that.
>> >
>> > We need a system call for the same reason we need a getrandom(2):
>you
>> > have to bootstrap somehow when you're in a minimal environment.
>> >
>> > > > (I do see the point of making CONFIG_PROCFS=y the default btw.)
>> >
>> > I'm not proposing that we make CONFIG_PROCFS=y the default. I'm
>> > proposing that we *hardwire* it as the default and just declare
>that
>> > it's not possible to build a Linux kernel that doesn't include
>procfs.
>> > Why do we even have that button?
>> >
>> > > I think his point here was that he wanted a handle to procfs no
>matter where
>> > > it was mounted and then can later use openat on that. Agreed that
>it may be
>> > > unnecessary unless there is a usecase for it, and especially if
>the /proc
>> > > directory being the defacto mountpoint for procfs is a universal
>convention.
>> >
>> > If it's a universal convention and, in practice, everyone needs
>proc
>> > mounted anyway, so what's the harm in hardwiring CONFIG_PROCFS=y?
>If
>> > we advertise /proc as not merely some kind of optional debug
>interface
>> > but *the* way certain kernel features are exposed --- and there's
>> > nothing wrong with that --- then we should give programs access to
>> > these core kernel features in a way that doesn't depend on
>userspace
>> > kernel configuration, and you do that by either providing a
>> > procfs-root-getting system call or just hardwiring the "/proc/"
>prefix
>> > into VFS.
>> >
>> > > > Inode allocation from the procfs mount for the file descriptors
>Joel
>> > > > wants is not correct. Their not really procfs file descriptors
>so this
>> > > > is a nack. We can't just hook into proc that way.
>> > >
>> > > I was not particular about using procfs mount for the FDs but
>that's the only
>> > > way I knew how to do it until you pointed out anon_inode (my grep
>skills
>> > > missed that), so thank you!
>> > >
>> > > > > C'mon: /proc is used by everyone today and almost every
>program breaks
>> > > > > if it's not around. The string "/proc" is already de facto
>kernel ABI.
>> > > > > Let's just drop the pretense of /proc being optional and bake
>it into
>> > > > > the kernel proper, then give programs a way to get to /proc
>that isn't
>> > > > > tied to any particular mount configuration. This way, we
>don't need a
>> > > > > translate_pid(), since callers can just use procfs to do the
>same
>> > > > > thing. (That is, if I understand correctly what translate_pid
>does.)
>> > > >
>> > > > I'm not sure what you think translate_pid() is doing since
>you're not
>> > > > saying what you think it does.
>> > > > Examples from the old patchset:
>> > > > translate_pid(pid, ns, -1)      - get pid in our pid namespace
>> >
>> > Ah, it's a bit different from what I had in mind. It's fair to want
>to
>> > translate PIDs between namespaces, but the only way to make the
>> > translate_pid under discussion robust is to have it accept and
>produce
>> > pidfds. (At that point, you might as well call it translate_pidfd.)
>We
>> > should not be adding new APIs to the kernel that accept numeric
>PIDs:
>>
>> The traditional pid-based api is not going away. There are users that
>> have the requirement to translate pids between namespaces and also
>doing
>> introspection on these namespaces independent of pidfds. We will not
>> restrict the usefulness of this syscall by making it only work with
>> pidfds.
>>
>> > it's not possible to use these APIs correctly except under very
>> > limited circumstances --- mostly, talking about init or a parent
>>
>> The pid-based api is one of the most widely used apis of the kernel
>and
>> people have been using it quite successfully for a long time. Yes,
>it's
>> rac, but it's here to stay.
>>
>> > talking about its child.
>> >
>> > Really, we need a few related operations, and we shouldn't
>necessarily
>> > mingle them.
>>
>> Yes, we've established that previously.
>>
>> >
>> > 1) Given a numeric PID, give me a pidfd: that works today: you just
>> > open /proc/<pid>
>>
>> Agreed.
>>
>> >
>> > 2) Given a pidfd, give me a numeric PID: that works today: you just
>> > openat(pidfd, "stat", O_RDONLY) and read the first token (which is
>> > always the numeric PID).
>>
>> Agreed.
>>
>> >
>> > 3) Given a pidfd, send a signal: that's what pidfd_send_signal
>does,
>> > and it's a good start on the rest of these operations.
>>
>> Agreed.
>>
>> > 5) Given a pidfd in NS1, get a pidfd in NS2. That's what
>translate_pid
>> > is for. My preferred signature for this routine is
>translate_pid(int
>> > pidfd, int nsfd) -> pidfd. We don't need two namespace arguments.
>Why
>> > not? Because the pidfd *already* names a single process, uniquely!
>>
>> Given that people are interested in pids we can't just always return
>a
>> pidfd. That would mean a user would need to do get the pidfd read
>from
>> <pidfd>/stat and then close the pidfd. If you do that for a 100 pids
>or
>> more you end up allocating and closing file descriptors constantly
>for
>> no reason. We can't just debate pids away. So it will also need to be
>> able to yield pids e.g. through a flag argument.
>
>Sure, but that's still not a reason that we should care about pidfds
>working separately from procfs..

Agreed. I can't imagine pidfd being anything but a proc pid directory handle. So I am confused about what Christian meant. Pidfd *is* always a procfs directory fd. That's what I gathered from his pidfd_send_signal patch but let me know if I'm way off in the woods.

For my next revision, I am thinking of adding the flag argument Christian mentioned to make translate_pid return an anon_inode FD which can be used for death status, given a <pid>. Since it is thought that translate_pid can be made to return a pid FD, I think it is ok to have it return a pid status FD for the purposes of the death status as well.

Joel Fernandes, Android kernel team
Sent from k9-mail on Android

^ permalink raw reply	[flat|nested] 113+ messages in thread

* Re: pidfd design
  2019-03-20 11:33                                                                 ` Joel Fernandes
@ 2019-03-20 18:26                                                                   ` Christian Brauner
  2019-03-20 18:38                                                                     ` Daniel Colascione
  2019-03-20 19:11                                                                     ` Joel Fernandes
  0 siblings, 2 replies; 113+ messages in thread
From: Christian Brauner @ 2019-03-20 18:26 UTC (permalink / raw)
  To: Joel Fernandes
  Cc: Daniel Colascione, Suren Baghdasaryan, Steven Rostedt,
	Sultan Alsawaf, Tim Murray, Michal Hocko, Greg Kroah-Hartman,
	Arve Hjønnevåg, Todd Kjos, Martijn Coenen, Ingo Molnar,
	Peter Zijlstra, LKML, open list:ANDROID DRIVERS, linux-mm,
	kernel-team, Oleg Nesterov, Andy Lutomirski, Serge E. Hallyn,
	Kees Cook

On Wed, Mar 20, 2019 at 07:33:51AM -0400, Joel Fernandes wrote:
> 
> 
> On March 20, 2019 3:02:32 AM EDT, Daniel Colascione <dancol@google.com> wrote:
> >On Tue, Mar 19, 2019 at 8:59 PM Christian Brauner
> ><christian@brauner.io> wrote:
> >>
> >> On Tue, Mar 19, 2019 at 07:42:52PM -0700, Daniel Colascione wrote:
> >> > On Tue, Mar 19, 2019 at 6:52 PM Joel Fernandes
> ><joel@joelfernandes.org> wrote:
> >> > >
> >> > > On Wed, Mar 20, 2019 at 12:10:23AM +0100, Christian Brauner
> >wrote:
> >> > > > On Tue, Mar 19, 2019 at 03:48:32PM -0700, Daniel Colascione
> >wrote:
> >> > > > > On Tue, Mar 19, 2019 at 3:14 PM Christian Brauner
> ><christian@brauner.io> wrote:
> >> > > > > > So I dislike the idea of allocating new inodes from the
> >procfs super
> >> > > > > > block. I would like to avoid pinning the whole pidfd
> >concept exclusively
> >> > > > > > to proc. The idea is that the pidfd API will be useable
> >through procfs
> >> > > > > > via open("/proc/<pid>") because that is what users expect
> >and really
> >> > > > > > wanted to have for a long time. So it makes sense to have
> >this working.
> >> > > > > > But it should really be useable without it. That's why
> >translate_pid()
> >> > > > > > and pidfd_clone() are on the table.  What I'm saying is,
> >once the pidfd
> >> > > > > > api is "complete" you should be able to set CONFIG_PROCFS=N
> >- even
> >> > > > > > though that's crazy - and still be able to use pidfds. This
> >is also a
> >> > > > > > point akpm asked about when I did the pidfd_send_signal
> >work.
> >> > > > >
> >> > > > > I agree that you shouldn't need CONFIG_PROCFS=Y to use
> >pidfds. One
> >> > > > > crazy idea that I was discussing with Joel the other day is
> >to just
> >> > > > > make CONFIG_PROCFS=Y mandatory and provide a new
> >get_procfs_root()
> >> > > > > system call that returned, out of thin air and independent of
> >the
> >> > > > > mount table, a procfs root directory file descriptor for the
> >caller's
> >> > > > > PID namspace and suitable for use with openat(2).
> >> > > >
> >> > > > Even if this works I'm pretty sure that Al and a lot of others
> >will not
> >> > > > be happy about this. A syscall to get an fd to /proc?
> >> >
> >> > Why not? procfs provides access to a lot of core kernel
> >functionality.
> >> > Why should you need a mountpoint to get to it?
> >> >
> >> > > That's not going
> >> > > > to happen and I don't see the need for a separate syscall just
> >for that.
> >> >
> >> > We need a system call for the same reason we need a getrandom(2):
> >you
> >> > have to bootstrap somehow when you're in a minimal environment.
> >> >
> >> > > > (I do see the point of making CONFIG_PROCFS=y the default btw.)
> >> >
> >> > I'm not proposing that we make CONFIG_PROCFS=y the default. I'm
> >> > proposing that we *hardwire* it as the default and just declare
> >that
> >> > it's not possible to build a Linux kernel that doesn't include
> >procfs.
> >> > Why do we even have that button?
> >> >
> >> > > I think his point here was that he wanted a handle to procfs no
> >matter where
> >> > > it was mounted and then can later use openat on that. Agreed that
> >it may be
> >> > > unnecessary unless there is a usecase for it, and especially if
> >the /proc
> >> > > directory being the defacto mountpoint for procfs is a universal
> >convention.
> >> >
> >> > If it's a universal convention and, in practice, everyone needs
> >proc
> >> > mounted anyway, so what's the harm in hardwiring CONFIG_PROCFS=y?
> >If
> >> > we advertise /proc as not merely some kind of optional debug
> >interface
> >> > but *the* way certain kernel features are exposed --- and there's
> >> > nothing wrong with that --- then we should give programs access to
> >> > these core kernel features in a way that doesn't depend on
> >userspace
> >> > kernel configuration, and you do that by either providing a
> >> > procfs-root-getting system call or just hardwiring the "/proc/"
> >prefix
> >> > into VFS.
> >> >
> >> > > > Inode allocation from the procfs mount for the file descriptors
> >Joel
> >> > > > wants is not correct. Their not really procfs file descriptors
> >so this
> >> > > > is a nack. We can't just hook into proc that way.
> >> > >
> >> > > I was not particular about using procfs mount for the FDs but
> >that's the only
> >> > > way I knew how to do it until you pointed out anon_inode (my grep
> >skills
> >> > > missed that), so thank you!
> >> > >
> >> > > > > C'mon: /proc is used by everyone today and almost every
> >program breaks
> >> > > > > if it's not around. The string "/proc" is already de facto
> >kernel ABI.
> >> > > > > Let's just drop the pretense of /proc being optional and bake
> >it into
> >> > > > > the kernel proper, then give programs a way to get to /proc
> >that isn't
> >> > > > > tied to any particular mount configuration. This way, we
> >don't need a
> >> > > > > translate_pid(), since callers can just use procfs to do the
> >same
> >> > > > > thing. (That is, if I understand correctly what translate_pid
> >does.)
> >> > > >
> >> > > > I'm not sure what you think translate_pid() is doing since
> >you're not
> >> > > > saying what you think it does.
> >> > > > Examples from the old patchset:
> >> > > > translate_pid(pid, ns, -1)      - get pid in our pid namespace
> >> >
> >> > Ah, it's a bit different from what I had in mind. It's fair to want
> >to
> >> > translate PIDs between namespaces, but the only way to make the
> >> > translate_pid under discussion robust is to have it accept and
> >produce
> >> > pidfds. (At that point, you might as well call it translate_pidfd.)
> >We
> >> > should not be adding new APIs to the kernel that accept numeric
> >PIDs:
> >>
> >> The traditional pid-based api is not going away. There are users that
> >> have the requirement to translate pids between namespaces and also
> >doing
> >> introspection on these namespaces independent of pidfds. We will not
> >> restrict the usefulness of this syscall by making it only work with
> >> pidfds.
> >>
> >> > it's not possible to use these APIs correctly except under very
> >> > limited circumstances --- mostly, talking about init or a parent
> >>
> >> The pid-based api is one of the most widely used apis of the kernel
> >and
> >> people have been using it quite successfully for a long time. Yes,
> >it's
> >> rac, but it's here to stay.
> >>
> >> > talking about its child.
> >> >
> >> > Really, we need a few related operations, and we shouldn't
> >necessarily
> >> > mingle them.
> >>
> >> Yes, we've established that previously.
> >>
> >> >
> >> > 1) Given a numeric PID, give me a pidfd: that works today: you just
> >> > open /proc/<pid>
> >>
> >> Agreed.
> >>
> >> >
> >> > 2) Given a pidfd, give me a numeric PID: that works today: you just
> >> > openat(pidfd, "stat", O_RDONLY) and read the first token (which is
> >> > always the numeric PID).
> >>
> >> Agreed.
> >>
> >> >
> >> > 3) Given a pidfd, send a signal: that's what pidfd_send_signal
> >does,
> >> > and it's a good start on the rest of these operations.
> >>
> >> Agreed.
> >>
> >> > 5) Given a pidfd in NS1, get a pidfd in NS2. That's what
> >translate_pid
> >> > is for. My preferred signature for this routine is
> >translate_pid(int
> >> > pidfd, int nsfd) -> pidfd. We don't need two namespace arguments.
> >Why
> >> > not? Because the pidfd *already* names a single process, uniquely!
> >>
> >> Given that people are interested in pids we can't just always return
> >a
> >> pidfd. That would mean a user would need to do get the pidfd read
> >from
> >> <pidfd>/stat and then close the pidfd. If you do that for a 100 pids
> >or
> >> more you end up allocating and closing file descriptors constantly
> >for
> >> no reason. We can't just debate pids away. So it will also need to be
> >> able to yield pids e.g. through a flag argument.
> >
> >Sure, but that's still not a reason that we should care about pidfds
> >working separately from procfs..

That's unrelated to the point made in the above paragraph.
Please note, I said that the pidfd api should work when proc is not
available, not that they can't be dirfds.

> 
> Agreed. I can't imagine pidfd being anything but a proc pid directory handle. So I am confused what Christian meant. Pidfd *is* a procfs directory fid  always. That's what I gathered from his pidfd_send_signal patch but let me know if I'm way off in the woods.

(K9 Mail still hasn't learned to wrap lines at 80 it seems. :))

Again, I never said that pidfds should be a directory handle.
(Though I would like to point out that one of the original ideas I
discussed at LPC was to have something like this to get regular file
descriptors instead of dirfds:
https://gist.github.com/brauner/59eec91550c5624c9999eaebd95a70df)

> 
> For my next revision, I am thinking of adding the flag argument Christian mentioned to make translate_pid return an anon_inode FD which can be used for death status, given a <pid>. Since it is thought that translate_pid can be made to return a pid FD, I think it is ok to have it return a pid status FD for the purposes of the death status as well.

translate_pid() should just return you a pidfd. Having it return a pidfd
and a status fd feels like stuffing too much functionality in there. If
you're fine with it I'll finish prototyping what I had in mind. As I
said in previous mails I'm already working on this.

Would you be ok with prototyping the pidfd_wait() syscall you had in
mind? Especially the wait_fd part that you want to have I would like to
see how that is supposed to work, e.g. who is allowed to wait on the
process and how notifications will work for non-parent processes and so
on. I feel we won't get anywhere by talking in the abstract and other
people are far more likely to review/comment once there's actual code.

Christian

^ permalink raw reply	[flat|nested] 113+ messages in thread

* Re: pidfd design
  2019-03-20 18:26                                                                   ` Christian Brauner
@ 2019-03-20 18:38                                                                     ` Daniel Colascione
  2019-03-20 18:51                                                                       ` Christian Brauner
  2019-03-20 19:11                                                                     ` Joel Fernandes
  1 sibling, 1 reply; 113+ messages in thread
From: Daniel Colascione @ 2019-03-20 18:38 UTC (permalink / raw)
  To: Christian Brauner
  Cc: Joel Fernandes, Suren Baghdasaryan, Steven Rostedt,
	Sultan Alsawaf, Tim Murray, Michal Hocko, Greg Kroah-Hartman,
	Arve Hjønnevåg, Todd Kjos, Martijn Coenen, Ingo Molnar,
	Peter Zijlstra, LKML, open list:ANDROID DRIVERS, linux-mm,
	kernel-team, Oleg Nesterov, Andy Lutomirski, Serge E. Hallyn,
	Kees Cook

On Wed, Mar 20, 2019 at 11:26 AM Christian Brauner <christian@brauner.io> wrote:
> On Wed, Mar 20, 2019 at 07:33:51AM -0400, Joel Fernandes wrote:
> >
> >
> > On March 20, 2019 3:02:32 AM EDT, Daniel Colascione <dancol@google.com> wrote:
> > >On Tue, Mar 19, 2019 at 8:59 PM Christian Brauner
> > ><christian@brauner.io> wrote:
> > >>
> > >> On Tue, Mar 19, 2019 at 07:42:52PM -0700, Daniel Colascione wrote:
> > >> > On Tue, Mar 19, 2019 at 6:52 PM Joel Fernandes
> > ><joel@joelfernandes.org> wrote:
> > >> > >
> > >> > > On Wed, Mar 20, 2019 at 12:10:23AM +0100, Christian Brauner
> > >wrote:
> > >> > > > On Tue, Mar 19, 2019 at 03:48:32PM -0700, Daniel Colascione
> > >wrote:
> > >> > > > > On Tue, Mar 19, 2019 at 3:14 PM Christian Brauner
> > ><christian@brauner.io> wrote:
> > >> > > > > > So I dislike the idea of allocating new inodes from the
> > >procfs super
> > >> > > > > > block. I would like to avoid pinning the whole pidfd
> > >concept exclusively
> > >> > > > > > to proc. The idea is that the pidfd API will be useable
> > >through procfs
> > >> > > > > > via open("/proc/<pid>") because that is what users expect
> > >and really
> > >> > > > > > wanted to have for a long time. So it makes sense to have
> > >this working.
> > >> > > > > > But it should really be useable without it. That's why
> > >translate_pid()
> > >> > > > > > and pidfd_clone() are on the table.  What I'm saying is,
> > >once the pidfd
> > >> > > > > > api is "complete" you should be able to set CONFIG_PROCFS=N
> > >- even
> > >> > > > > > though that's crazy - and still be able to use pidfds. This
> > >is also a
> > >> > > > > > point akpm asked about when I did the pidfd_send_signal
> > >work.
> > >> > > > >
> > >> > > > > I agree that you shouldn't need CONFIG_PROCFS=Y to use
> > >pidfds. One
> > >> > > > > crazy idea that I was discussing with Joel the other day is
> > >to just
> > >> > > > > make CONFIG_PROCFS=Y mandatory and provide a new
> > >get_procfs_root()
> > >> > > > > system call that returned, out of thin air and independent of
> > >the
> > >> > > > > mount table, a procfs root directory file descriptor for the
> > >caller's
> > >> > > > > PID namspace and suitable for use with openat(2).
> > >> > > >
> > >> > > > Even if this works I'm pretty sure that Al and a lot of others
> > >will not
> > >> > > > be happy about this. A syscall to get an fd to /proc?
> > >> >
> > >> > Why not? procfs provides access to a lot of core kernel
> > >functionality.
> > >> > Why should you need a mountpoint to get to it?
> > >> >
> > >> > > That's not going
> > >> > > > to happen and I don't see the need for a separate syscall just
> > >for that.
> > >> >
> > >> > We need a system call for the same reason we need a getrandom(2):
> > >you
> > >> > have to bootstrap somehow when you're in a minimal environment.
> > >> >
> > >> > > > (I do see the point of making CONFIG_PROCFS=y the default btw.)
> > >> >
> > >> > I'm not proposing that we make CONFIG_PROCFS=y the default. I'm
> > >> > proposing that we *hardwire* it as the default and just declare
> > >that
> > >> > it's not possible to build a Linux kernel that doesn't include
> > >procfs.
> > >> > Why do we even have that button?
> > >> >
> > >> > > I think his point here was that he wanted a handle to procfs no
> > >matter where
> > >> > > it was mounted and then can later use openat on that. Agreed that
> > >it may be
> > >> > > unnecessary unless there is a usecase for it, and especially if
> > >the /proc
> > >> > > directory being the defacto mountpoint for procfs is a universal
> > >convention.
> > >> >
> > >> > If it's a universal convention and, in practice, everyone needs
> > >proc
> > >> > mounted anyway, so what's the harm in hardwiring CONFIG_PROCFS=y?
> > >If
> > >> > we advertise /proc as not merely some kind of optional debug
> > >interface
> > >> > but *the* way certain kernel features are exposed --- and there's
> > >> > nothing wrong with that --- then we should give programs access to
> > >> > these core kernel features in a way that doesn't depend on
> > >userspace
> > >> > kernel configuration, and you do that by either providing a
> > >> > procfs-root-getting system call or just hardwiring the "/proc/"
> > >prefix
> > >> > into VFS.
> > >> >
> > >> > > > Inode allocation from the procfs mount for the file descriptors
> > >Joel
> > >> > > > wants is not correct. Their not really procfs file descriptors
> > >so this
> > >> > > > is a nack. We can't just hook into proc that way.
> > >> > >
> > >> > > I was not particular about using procfs mount for the FDs but
> > >that's the only
> > >> > > way I knew how to do it until you pointed out anon_inode (my grep
> > >skills
> > >> > > missed that), so thank you!
> > >> > >
> > >> > > > > C'mon: /proc is used by everyone today and almost every
> > >program breaks
> > >> > > > > if it's not around. The string "/proc" is already de facto
> > >kernel ABI.
> > >> > > > > Let's just drop the pretense of /proc being optional and bake
> > >it into
> > >> > > > > the kernel proper, then give programs a way to get to /proc
> > >that isn't
> > >> > > > > tied to any particular mount configuration. This way, we
> > >don't need a
> > >> > > > > translate_pid(), since callers can just use procfs to do the
> > >same
> > >> > > > > thing. (That is, if I understand correctly what translate_pid
> > >does.)
> > >> > > >
> > >> > > > I'm not sure what you think translate_pid() is doing since
> > >you're not
> > >> > > > saying what you think it does.
> > >> > > > Examples from the old patchset:
> > >> > > > translate_pid(pid, ns, -1)      - get pid in our pid namespace
> > >> >
> > >> > Ah, it's a bit different from what I had in mind. It's fair to want
> > >to
> > >> > translate PIDs between namespaces, but the only way to make the
> > >> > translate_pid under discussion robust is to have it accept and
> > >produce
> > >> > pidfds. (At that point, you might as well call it translate_pidfd.)
> > >We
> > >> > should not be adding new APIs to the kernel that accept numeric
> > >PIDs:
> > >>
> > >> The traditional pid-based api is not going away. There are users that
> > >> have the requirement to translate pids between namespaces and also
> > >doing
> > >> introspection on these namespaces independent of pidfds. We will not
> > >> restrict the usefulness of this syscall by making it only work with
> > >> pidfds.
> > >>
> > >> > it's not possible to use these APIs correctly except under very
> > >> > limited circumstances --- mostly, talking about init or a parent
> > >>
> > >> The pid-based api is one of the most widely used apis of the kernel
> > >and
> > >> people have been using it quite successfully for a long time. Yes,
> > >it's
> > >> rac, but it's here to stay.
> > >>
> > >> > talking about its child.
> > >> >
> > >> > Really, we need a few related operations, and we shouldn't
> > >necessarily
> > >> > mingle them.
> > >>
> > >> Yes, we've established that previously.
> > >>
> > >> >
> > >> > 1) Given a numeric PID, give me a pidfd: that works today: you just
> > >> > open /proc/<pid>
> > >>
> > >> Agreed.
> > >>
> > >> >
> > >> > 2) Given a pidfd, give me a numeric PID: that works today: you just
> > >> > openat(pidfd, "stat", O_RDONLY) and read the first token (which is
> > >> > always the numeric PID).
> > >>
> > >> Agreed.
> > >>
> > >> >
> > >> > 3) Given a pidfd, send a signal: that's what pidfd_send_signal
> > >does,
> > >> > and it's a good start on the rest of these operations.
> > >>
> > >> Agreed.
> > >>
> > >> > 5) Given a pidfd in NS1, get a pidfd in NS2. That's what
> > >translate_pid
> > >> > is for. My preferred signature for this routine is
> > >translate_pid(int
> > >> > pidfd, int nsfd) -> pidfd. We don't need two namespace arguments.
> > >Why
> > >> > not? Because the pidfd *already* names a single process, uniquely!
> > >>
> > >> Given that people are interested in pids we can't just always return
> > >a
> > >> pidfd. That would mean a user would need to do get the pidfd read
> > >from
> > >> <pidfd>/stat and then close the pidfd. If you do that for a 100 pids
> > >or
> > >> more you end up allocating and closing file descriptors constantly
> > >for
> > >> no reason. We can't just debate pids away. So it will also need to be
> > >> able to yield pids e.g. through a flag argument.
> > >
> > >Sure, but that's still not a reason that we should care about pidfds
> > >working separately from procfs..
>
> That's unrelated to the point made in the above paragraph.
> Please note, I said that the pidfd api should work when proc is not
> available not that they can't be dirfds.

What do you mean by "not available"? CONFIG_PROCFS=n? If pidfds are
supposed to work when proc is unavailable yet also be directory FDs,
to what directory should the FD refer? As I mentioned in my previous
message, trying to make pidfd work without CONFIG_PROCFS is a very bad
idea.

>
> >
> > Agreed. I can't imagine pidfd being anything but a proc pid directory handle. So I am confused what Christian meant. Pidfd *is* a procfs directory fid  always. That's what I gathered from his pidfd_send_signal patch but let me know if I'm way off in the woods.
>
> (K9 Mail still hasn't learned to wrap lines at 80 it seems. :))
>
> Again, I never said that pidfds should be a directory handle.
> (Though I would like to point out that one of the original ideas I
> discussed at LPC was to have something like this to get regular file
> descriptors instead of dirfds:
> https://gist.github.com/brauner/59eec91550c5624c9999eaebd95a70df)

As I mentioned in my original email on this thread, if you have
regular file descriptors instead of directory FDs, you have to use
some special new API instead of openat to get metadata about a
process. That's pointless duplication of functionality considering
that a directory FD gives you that information automatically.

> > For my next revision, I am thinking of adding the flag argument Christian mentioned to make translate_pid return an anon_inode FD which can be used for death status, given a <pid>. Since it is thought that translate_pid can be made to return a pid FD, I think it is ok to have it return a pid status FD for the purposes of the death status as well.

> translate_pid() should just return you a pidfd. Having it return a pidfd
> and a status fd feels like stuffing too much functionality in there. If
> you're fine with it I'll finish prototyping what I had in mind. As I
> said in previous mails I'm already working on this.

translate_pid also needs to *accept* pidfds, at least optionally.
Unless you have a function from pidfd to pidfd, you race.

^ permalink raw reply	[flat|nested] 113+ messages in thread

* Re: pidfd design
  2019-03-20 18:38                                                                     ` Daniel Colascione
@ 2019-03-20 18:51                                                                       ` Christian Brauner
  2019-03-20 18:58                                                                         ` Andy Lutomirski
                                                                                           ` (2 more replies)
  0 siblings, 3 replies; 113+ messages in thread
From: Christian Brauner @ 2019-03-20 18:51 UTC (permalink / raw)
  To: Daniel Colascione
  Cc: Joel Fernandes, Suren Baghdasaryan, Steven Rostedt,
	Sultan Alsawaf, Tim Murray, Michal Hocko, Greg Kroah-Hartman,
	Arve Hjønnevåg, Todd Kjos, Martijn Coenen, Ingo Molnar,
	Peter Zijlstra, LKML, open list:ANDROID DRIVERS, linux-mm,
	kernel-team, Oleg Nesterov, Andy Lutomirski, Serge E. Hallyn,
	Kees Cook

On Wed, Mar 20, 2019 at 11:38:35AM -0700, Daniel Colascione wrote:
> On Wed, Mar 20, 2019 at 11:26 AM Christian Brauner <christian@brauner.io> wrote:
> > On Wed, Mar 20, 2019 at 07:33:51AM -0400, Joel Fernandes wrote:
> > >
> > >
> > > On March 20, 2019 3:02:32 AM EDT, Daniel Colascione <dancol@google.com> wrote:
> > > >On Tue, Mar 19, 2019 at 8:59 PM Christian Brauner
> > > ><christian@brauner.io> wrote:
> > > >>
> > > >> On Tue, Mar 19, 2019 at 07:42:52PM -0700, Daniel Colascione wrote:
> > > >> > On Tue, Mar 19, 2019 at 6:52 PM Joel Fernandes
> > > ><joel@joelfernandes.org> wrote:
> > > >> > >
> > > >> > > On Wed, Mar 20, 2019 at 12:10:23AM +0100, Christian Brauner
> > > >wrote:
> > > >> > > > On Tue, Mar 19, 2019 at 03:48:32PM -0700, Daniel Colascione
> > > >wrote:
> > > >> > > > > On Tue, Mar 19, 2019 at 3:14 PM Christian Brauner
> > > ><christian@brauner.io> wrote:
> > > >> > > > > > So I dislike the idea of allocating new inodes from the
> > > >procfs super
> > > >> > > > > > block. I would like to avoid pinning the whole pidfd
> > > >concept exclusively
> > > >> > > > > > to proc. The idea is that the pidfd API will be useable
> > > >through procfs
> > > >> > > > > > via open("/proc/<pid>") because that is what users expect
> > > >and really
> > > >> > > > > > wanted to have for a long time. So it makes sense to have
> > > >this working.
> > > >> > > > > > But it should really be useable without it. That's why
> > > >translate_pid()
> > > >> > > > > > and pidfd_clone() are on the table.  What I'm saying is,
> > > >once the pidfd
> > > >> > > > > > api is "complete" you should be able to set CONFIG_PROCFS=N
> > > >- even
> > > >> > > > > > though that's crazy - and still be able to use pidfds. This
> > > >is also a
> > > >> > > > > > point akpm asked about when I did the pidfd_send_signal
> > > >work.
> > > >> > > > >
> > > >> > > > > I agree that you shouldn't need CONFIG_PROCFS=Y to use
> > > >pidfds. One
> > > >> > > > > crazy idea that I was discussing with Joel the other day is
> > > >to just
> > > >> > > > > make CONFIG_PROCFS=Y mandatory and provide a new
> > > >get_procfs_root()
> > > >> > > > > system call that returned, out of thin air and independent of
> > > >the
> > > >> > > > > mount table, a procfs root directory file descriptor for the
> > > >caller's
> > > >> > > > > PID namspace and suitable for use with openat(2).
> > > >> > > >
> > > >> > > > Even if this works I'm pretty sure that Al and a lot of others
> > > >will not
> > > >> > > > be happy about this. A syscall to get an fd to /proc?
> > > >> >
> > > >> > Why not? procfs provides access to a lot of core kernel
> > > >functionality.
> > > >> > Why should you need a mountpoint to get to it?
> > > >> >
> > > >> > > That's not going
> > > >> > > > to happen and I don't see the need for a separate syscall just
> > > >for that.
> > > >> >
> > > >> > We need a system call for the same reason we need a getrandom(2):
> > > >you
> > > >> > have to bootstrap somehow when you're in a minimal environment.
> > > >> >
> > > >> > > > (I do see the point of making CONFIG_PROCFS=y the default btw.)
> > > >> >
> > > >> > I'm not proposing that we make CONFIG_PROCFS=y the default. I'm
> > > >> > proposing that we *hardwire* it as the default and just declare
> > > >that
> > > >> > it's not possible to build a Linux kernel that doesn't include
> > > >procfs.
> > > >> > Why do we even have that button?
> > > >> >
> > > >> > > I think his point here was that he wanted a handle to procfs no
> > > >matter where
> > > >> > > it was mounted and then can later use openat on that. Agreed that
> > > >it may be
> > > >> > > unnecessary unless there is a usecase for it, and especially if
> > > >the /proc
> > > >> > > directory being the defacto mountpoint for procfs is a universal
> > > >convention.
> > > >> >
> > > >> > If it's a universal convention and, in practice, everyone needs
> > > >proc
> > > >> > mounted anyway, so what's the harm in hardwiring CONFIG_PROCFS=y?
> > > >If
> > > >> > we advertise /proc as not merely some kind of optional debug
> > > >interface
> > > >> > but *the* way certain kernel features are exposed --- and there's
> > > >> > nothing wrong with that --- then we should give programs access to
> > > >> > these core kernel features in a way that doesn't depend on
> > > >userspace
> > > >> > kernel configuration, and you do that by either providing a
> > > >> > procfs-root-getting system call or just hardwiring the "/proc/"
> > > >prefix
> > > >> > into VFS.
> > > >> >
> > > >> > > > Inode allocation from the procfs mount for the file descriptors
> > > >Joel
> > > >> > > > wants is not correct. Their not really procfs file descriptors
> > > >so this
> > > >> > > > is a nack. We can't just hook into proc that way.
> > > >> > >
> > > >> > > I was not particular about using procfs mount for the FDs but
> > > >that's the only
> > > >> > > way I knew how to do it until you pointed out anon_inode (my grep
> > > >skills
> > > >> > > missed that), so thank you!
> > > >> > >
> > > >> > > > > C'mon: /proc is used by everyone today and almost every
> > > >program breaks
> > > >> > > > > if it's not around. The string "/proc" is already de facto
> > > >kernel ABI.
> > > >> > > > > Let's just drop the pretense of /proc being optional and bake
> > > >it into
> > > >> > > > > the kernel proper, then give programs a way to get to /proc
> > > >that isn't
> > > >> > > > > tied to any particular mount configuration. This way, we
> > > >don't need a
> > > >> > > > > translate_pid(), since callers can just use procfs to do the
> > > >same
> > > >> > > > > thing. (That is, if I understand correctly what translate_pid
> > > >does.)
> > > >> > > >
> > > >> > > > I'm not sure what you think translate_pid() is doing since
> > > >you're not
> > > >> > > > saying what you think it does.
> > > >> > > > Examples from the old patchset:
> > > >> > > > translate_pid(pid, ns, -1)      - get pid in our pid namespace
> > > >> >
> > > >> > Ah, it's a bit different from what I had in mind. It's fair to want
> > > >to
> > > >> > translate PIDs between namespaces, but the only way to make the
> > > >> > translate_pid under discussion robust is to have it accept and
> > > >produce
> > > >> > pidfds. (At that point, you might as well call it translate_pidfd.)
> > > >We
> > > >> > should not be adding new APIs to the kernel that accept numeric
> > > >PIDs:
> > > >>
> > > >> The traditional pid-based api is not going away. There are users that
> > > >> have the requirement to translate pids between namespaces and also
> > > >doing
> > > >> introspection on these namespaces independent of pidfds. We will not
> > > >> restrict the usefulness of this syscall by making it only work with
> > > >> pidfds.
> > > >>
> > > >> > it's not possible to use these APIs correctly except under very
> > > >> > limited circumstances --- mostly, talking about init or a parent
> > > >>
> > > >> The pid-based api is one of the most widely used apis of the kernel
> > > >and
> > > >> people have been using it quite successfully for a long time. Yes,
> > > >it's
> > > >> rac, but it's here to stay.
> > > >>
> > > >> > talking about its child.
> > > >> >
> > > >> > Really, we need a few related operations, and we shouldn't
> > > >necessarily
> > > >> > mingle them.
> > > >>
> > > >> Yes, we've established that previously.
> > > >>
> > > >> >
> > > >> > 1) Given a numeric PID, give me a pidfd: that works today: you just
> > > >> > open /proc/<pid>
> > > >>
> > > >> Agreed.
> > > >>
> > > >> >
> > > >> > 2) Given a pidfd, give me a numeric PID: that works today: you just
> > > >> > openat(pidfd, "stat", O_RDONLY) and read the first token (which is
> > > >> > always the numeric PID).
> > > >>
> > > >> Agreed.
> > > >>
> > > >> >
> > > >> > 3) Given a pidfd, send a signal: that's what pidfd_send_signal
> > > >does,
> > > >> > and it's a good start on the rest of these operations.
> > > >>
> > > >> Agreed.
> > > >>
> > > >> > 5) Given a pidfd in NS1, get a pidfd in NS2. That's what
> > > >translate_pid
> > > >> > is for. My preferred signature for this routine is
> > > >translate_pid(int
> > > >> > pidfd, int nsfd) -> pidfd. We don't need two namespace arguments.
> > > >Why
> > > >> > not? Because the pidfd *already* names a single process, uniquely!
> > > >>
> > > >> Given that people are interested in pids we can't just always return
> > > >a
> > > >> pidfd. That would mean a user would need to do get the pidfd read
> > > >from
> > > >> <pidfd>/stat and then close the pidfd. If you do that for a 100 pids
> > > >or
> > > >> more you end up allocating and closing file descriptors constantly
> > > >for
> > > >> no reason. We can't just debate pids away. So it will also need to be
> > > >> able to yield pids e.g. through a flag argument.
> > > >
> > > >Sure, but that's still not a reason that we should care about pidfds
> > > >working separately from procfs..
> >
> > That's unrelated to the point made in the above paragraph.
> > Please note, I said that the pidfd api should work when proc is not
> > available not that they can't be dirfds.
> 
> What do you mean by "not available"? CONFIG_PROCFS=n? If pidfds

I'm talking about the ability to clone processes and get fd handles on
them via pidfd_clone() or CLONE_NEWFD.

> 
> > translate_pid() should just return you a pidfd. Having it return a pidfd
> > and a status fd feels like stuffing too much functionality in there. If
> > you're fine with it I'll finish prototyping what I had in mind. As I
> > said in previous mails I'm already working on this.
> 
> translate_pid also needs to *accept* pidfds, at least optionally.
> Unless you have a function from pidfd to pidfd, you race.

You're misunderstanding. Again, I said in my previous mails it should
accept pidfds optionally as arguments, yes. But I don't want it to
return the status fds that you previously wanted pidfd_wait() to return.
I really want to see Joel's pidfd_wait() patchset and have more people
review the actual code.

^ permalink raw reply	[flat|nested] 113+ messages in thread

* Re: pidfd design
  2019-03-20 18:51                                                                       ` Christian Brauner
@ 2019-03-20 18:58                                                                         ` Andy Lutomirski
  2019-03-20 19:14                                                                           ` Christian Brauner
  2019-03-20 19:19                                                                         ` Joel Fernandes
  2019-03-20 19:29                                                                         ` Daniel Colascione
  2 siblings, 1 reply; 113+ messages in thread
From: Andy Lutomirski @ 2019-03-20 18:58 UTC (permalink / raw)
  To: Christian Brauner
  Cc: Daniel Colascione, Joel Fernandes, Suren Baghdasaryan,
	Steven Rostedt, Sultan Alsawaf, Tim Murray, Michal Hocko,
	Greg Kroah-Hartman, Arve Hjønnevåg, Todd Kjos,
	Martijn Coenen, Ingo Molnar, Peter Zijlstra, LKML,
	open list:ANDROID DRIVERS, linux-mm, kernel-team, Oleg Nesterov,
	Serge E. Hallyn, Kees Cook

On Wed, Mar 20, 2019 at 11:52 AM Christian Brauner <christian@brauner.io> wrote:
>
> You're misunderstanding. Again, I said in my previous mails it should
> accept pidfds optionally as arguments, yes. But I don't want it to
> return the status fds that you previously wanted pidfd_wait() to return.
> I really want to see Joel's pidfd_wait() patchset and have more people
> review the actual code.

Just to make sure that no one is forgetting a material security consideration:

$ ls /proc/self
attr             exe        mountinfo      projid_map    status
autogroup        fd         mounts         root          syscall
auxv             fdinfo     mountstats     sched         task
cgroup           gid_map    net            schedstat     timers
clear_refs       io         ns             sessionid     timerslack_ns
cmdline          latency    numa_maps      setgroups     uid_map
comm             limits     oom_adj        smaps         wchan
coredump_filter  loginuid   oom_score      smaps_rollup
cpuset           map_files  oom_score_adj  stack
cwd              maps       pagemap        stat
environ          mem        personality    statm

A bunch of this stuff makes sense to make accessible through a syscall
interface that we expect to be used even in sandboxes.  But a bunch of
it does not.  For example, *_map, mounts, mountstats, and net are all
namespace-wide things that certain policies expect to be unavailable.
stack, for example, is a potential attack surface.  Etc.

As it stands, if you create a fresh userns and mountns and try to
mount /proc, there are some really awful and hideous rules that are
checked for security reasons.  All these new APIs either need to
return something more restrictive than a proc dirfd or they need to
follow the same rules.  And I'm afraid that the latter may be a
nonstarter if you expect these APIs to be used in libraries.

Yes, this is unfortunate, but it is indeed the current situation.  I
suppose that we could return magic restricted dirfds, or we could
return things that aren't dirfds at all and have some API that gives
you the dirfd associated with a procfd but only if you can see
/proc/PID.

--Andy

^ permalink raw reply	[flat|nested] 113+ messages in thread

* Re: pidfd design
  2019-03-20 18:26                                                                   ` Christian Brauner
  2019-03-20 18:38                                                                     ` Daniel Colascione
@ 2019-03-20 19:11                                                                     ` Joel Fernandes
  1 sibling, 0 replies; 113+ messages in thread
From: Joel Fernandes @ 2019-03-20 19:11 UTC (permalink / raw)
  To: Christian Brauner
  Cc: Daniel Colascione, Suren Baghdasaryan, Steven Rostedt,
	Sultan Alsawaf, Tim Murray, Michal Hocko, Greg Kroah-Hartman,
	Arve Hjønnevåg, Todd Kjos, Martijn Coenen, Ingo Molnar,
	Peter Zijlstra, LKML, open list:ANDROID DRIVERS, linux-mm,
	kernel-team, Oleg Nesterov, Andy Lutomirski, Serge E. Hallyn,
	Kees Cook

On Wed, Mar 20, 2019 at 07:26:50PM +0100, Christian Brauner wrote:
> On Wed, Mar 20, 2019 at 07:33:51AM -0400, Joel Fernandes wrote:
> > 
> > 
> > On March 20, 2019 3:02:32 AM EDT, Daniel Colascione <dancol@google.com> wrote:
> > >On Tue, Mar 19, 2019 at 8:59 PM Christian Brauner
> > ><christian@brauner.io> wrote:
> > >>
> > >> On Tue, Mar 19, 2019 at 07:42:52PM -0700, Daniel Colascione wrote:
> > >> > On Tue, Mar 19, 2019 at 6:52 PM Joel Fernandes
> > ><joel@joelfernandes.org> wrote:
> > >> > >
> > >> > > On Wed, Mar 20, 2019 at 12:10:23AM +0100, Christian Brauner
> > >wrote:
> > >> > > > On Tue, Mar 19, 2019 at 03:48:32PM -0700, Daniel Colascione
> > >wrote:
> > >> > > > > On Tue, Mar 19, 2019 at 3:14 PM Christian Brauner
> > ><christian@brauner.io> wrote:
> > >> > > > > > So I dislike the idea of allocating new inodes from the
> > >procfs super
> > >> > > > > > block. I would like to avoid pinning the whole pidfd
> > >concept exclusively
> > >> > > > > > to proc. The idea is that the pidfd API will be useable
> > >through procfs
> > >> > > > > > via open("/proc/<pid>") because that is what users expect
> > >and really
> > >> > > > > > wanted to have for a long time. So it makes sense to have
> > >this working.
> > >> > > > > > But it should really be useable without it. That's why
> > >translate_pid()
> > >> > > > > > and pidfd_clone() are on the table.  What I'm saying is,
> > >once the pidfd
> > >> > > > > > api is "complete" you should be able to set CONFIG_PROCFS=N
> > >- even
> > >> > > > > > though that's crazy - and still be able to use pidfds. This
> > >is also a
> > >> > > > > > point akpm asked about when I did the pidfd_send_signal
> > >work.
> > >> > > > >
> > >> > > > > I agree that you shouldn't need CONFIG_PROCFS=Y to use
> > >pidfds. One
> > >> > > > > crazy idea that I was discussing with Joel the other day is
> > >to just
> > >> > > > > make CONFIG_PROCFS=Y mandatory and provide a new
> > >get_procfs_root()
> > >> > > > > system call that returned, out of thin air and independent of
> > >the
> > >> > > > > mount table, a procfs root directory file descriptor for the
> > >caller's
> > > >> > > > > PID namespace and suitable for use with openat(2).
> > >> > > >
> > >> > > > Even if this works I'm pretty sure that Al and a lot of others
> > >will not
> > >> > > > be happy about this. A syscall to get an fd to /proc?
> > >> >
> > >> > Why not? procfs provides access to a lot of core kernel
> > >functionality.
> > >> > Why should you need a mountpoint to get to it?
> > >> >
> > >> > > That's not going
> > >> > > > to happen and I don't see the need for a separate syscall just
> > >for that.
> > >> >
> > >> > We need a system call for the same reason we need a getrandom(2):
> > >you
> > >> > have to bootstrap somehow when you're in a minimal environment.
> > >> >
> > >> > > > (I do see the point of making CONFIG_PROCFS=y the default btw.)
> > >> >
> > >> > I'm not proposing that we make CONFIG_PROCFS=y the default. I'm
> > >> > proposing that we *hardwire* it as the default and just declare
> > >that
> > >> > it's not possible to build a Linux kernel that doesn't include
> > >procfs.
> > >> > Why do we even have that button?
> > >> >
> > >> > > I think his point here was that he wanted a handle to procfs no
> > >matter where
> > >> > > it was mounted and then can later use openat on that. Agreed that
> > >it may be
> > >> > > unnecessary unless there is a usecase for it, and especially if
> > >the /proc
> > >> > > directory being the defacto mountpoint for procfs is a universal
> > >convention.
> > >> >
> > >> > If it's a universal convention and, in practice, everyone needs
> > >proc
> > >> > mounted anyway, so what's the harm in hardwiring CONFIG_PROCFS=y?
> > >If
> > >> > we advertise /proc as not merely some kind of optional debug
> > >interface
> > >> > but *the* way certain kernel features are exposed --- and there's
> > >> > nothing wrong with that --- then we should give programs access to
> > >> > these core kernel features in a way that doesn't depend on
> > >userspace
> > >> > kernel configuration, and you do that by either providing a
> > >> > procfs-root-getting system call or just hardwiring the "/proc/"
> > >prefix
> > >> > into VFS.
> > >> >
> > >> > > > Inode allocation from the procfs mount for the file descriptors
> > >Joel
> > > >> > > > wants is not correct. They're not really procfs file descriptors
> > >so this
> > >> > > > is a nack. We can't just hook into proc that way.
> > >> > >
> > >> > > I was not particular about using procfs mount for the FDs but
> > >that's the only
> > >> > > way I knew how to do it until you pointed out anon_inode (my grep
> > >skills
> > >> > > missed that), so thank you!
> > >> > >
> > >> > > > > C'mon: /proc is used by everyone today and almost every
> > >program breaks
> > >> > > > > if it's not around. The string "/proc" is already de facto
> > >kernel ABI.
> > >> > > > > Let's just drop the pretense of /proc being optional and bake
> > >it into
> > >> > > > > the kernel proper, then give programs a way to get to /proc
> > >that isn't
> > >> > > > > tied to any particular mount configuration. This way, we
> > >don't need a
> > >> > > > > translate_pid(), since callers can just use procfs to do the
> > >same
> > >> > > > > thing. (That is, if I understand correctly what translate_pid
> > >does.)
> > >> > > >
> > >> > > > I'm not sure what you think translate_pid() is doing since
> > >you're not
> > >> > > > saying what you think it does.
> > >> > > > Examples from the old patchset:
> > >> > > > translate_pid(pid, ns, -1)      - get pid in our pid namespace
> > >> >
> > >> > Ah, it's a bit different from what I had in mind. It's fair to want
> > >to
> > >> > translate PIDs between namespaces, but the only way to make the
> > >> > translate_pid under discussion robust is to have it accept and
> > >produce
> > >> > pidfds. (At that point, you might as well call it translate_pidfd.)
> > >We
> > >> > should not be adding new APIs to the kernel that accept numeric
> > >PIDs:
> > >>
> > >> The traditional pid-based api is not going away. There are users that
> > >> have the requirement to translate pids between namespaces and also
> > >doing
> > >> introspection on these namespaces independent of pidfds. We will not
> > >> restrict the usefulness of this syscall by making it only work with
> > >> pidfds.
> > >>
> > >> > it's not possible to use these APIs correctly except under very
> > >> > limited circumstances --- mostly, talking about init or a parent
> > >>
> > >> The pid-based api is one of the most widely used apis of the kernel
> > >and
> > >> people have been using it quite successfully for a long time. Yes,
> > >it's
> > > >> racy, but it's here to stay.
> > >>
> > >> > talking about its child.
> > >> >
> > >> > Really, we need a few related operations, and we shouldn't
> > >necessarily
> > >> > mingle them.
> > >>
> > >> Yes, we've established that previously.
> > >>
> > >> >
> > >> > 1) Given a numeric PID, give me a pidfd: that works today: you just
> > >> > open /proc/<pid>
> > >>
> > >> Agreed.
> > >>
> > >> >
> > >> > 2) Given a pidfd, give me a numeric PID: that works today: you just
> > >> > openat(pidfd, "stat", O_RDONLY) and read the first token (which is
> > >> > always the numeric PID).
> > >>
> > >> Agreed.
> > >>
> > >> >
> > >> > 3) Given a pidfd, send a signal: that's what pidfd_send_signal
> > >does,
> > >> > and it's a good start on the rest of these operations.
> > >>
> > >> Agreed.
> > >>
> > >> > 5) Given a pidfd in NS1, get a pidfd in NS2. That's what
> > >translate_pid
> > >> > is for. My preferred signature for this routine is
> > >translate_pid(int
> > >> > pidfd, int nsfd) -> pidfd. We don't need two namespace arguments.
> > >Why
> > >> > not? Because the pidfd *already* names a single process, uniquely!
> > >>
> > >> Given that people are interested in pids we can't just always return
> > >a
> > >> pidfd. That would mean a user would need to do get the pidfd read
> > >from
> > >> <pidfd>/stat and then close the pidfd. If you do that for a 100 pids
> > >or
> > >> more you end up allocating and closing file descriptors constantly
> > >for
> > >> no reason. We can't just debate pids away. So it will also need to be
> > >> able to yield pids e.g. through a flag argument.
> > >
> > >Sure, but that's still not a reason that we should care about pidfds
> > >working separately from procfs..
> 
> That's unrelated to the point made in the above paragraph.
> Please note, I said that the pidfd api should work when proc is not
> available not that they can't be dirfds.
> 
> > 
> > > Agreed. I can't imagine pidfd being anything but a proc pid directory handle. So I am confused about what Christian meant. Pidfd *is* a procfs directory fd always. That's what I gathered from his pidfd_send_signal patch but let me know if I'm way off in the woods.
> 
> (K9 Mail still hasn't learned to wrap lines at 80 it seems. :))

Indeed, or I misconfigured it :) Just set it up recently so I'm still messing
with it.

The other issue is it does wrapping on quoted lines too, and there's a bug
filed somewhere for that.

> Again, I never said that pidfds should be a directory handle.
> (Though I would like to point out that one of the original ideas I
> discussed at LPC was to have something like this to get regular file
> descriptors instead of dirfds:
> https://gist.github.com/brauner/59eec91550c5624c9999eaebd95a70df)

Ok. I was just going by this code in your send_signal patch where you error
out if the pidfd is not a directory.
 
+struct pid *tgid_pidfd_to_pid(const struct file *file)
+{
+	if (!d_is_dir(file->f_path.dentry) ||
+	    (file->f_op != &proc_tgid_base_operations))
+		return ERR_PTR(-EBADF);

> > For my next revision, I am thinking of adding the flag argument Christian mentioned to make translate_pid return an anon_inode FD which can be used for death status, given a <pid>. Since it is thought that translate_pid can be made to return a pid FD, I think it is ok to have it return a pid status FD for the purposes of the death status as well.
> 
> translate_pid() should just return you a pidfd. Having it return a pidfd
> and a status fd feels like stuffing too much functionality in there. If
> you're fine with it I'll finish prototyping what I had in mind. As I
> said in previous mails I'm already working on this.

Yes, please continue to work on it. No problem.

> Would you be ok with prototyping the pidfd_wait() syscall you had in
> mind?

Yes, of course, I am working on it. No problem. It is still good to discuss
these ideas and to know what my direction should be, so I appreciate the
conversation here.

> Especially the wait_fd part that you want to have I would like to
> see how that is supposed to work, e.g. who is allowed to wait on the
> process and how notifications will work for non-parent processes and so
> on. I feel we won't get anywhere by talking in the abstract and other
> people are far more likely to review/comment once there's actual code.

Got it. Let's chat more once I post something.

thanks,

 - Joel


^ permalink raw reply	[flat|nested] 113+ messages in thread

* Re: pidfd design
  2019-03-20 18:58                                                                         ` Andy Lutomirski
@ 2019-03-20 19:14                                                                           ` Christian Brauner
  2019-03-20 19:40                                                                             ` Daniel Colascione
  0 siblings, 1 reply; 113+ messages in thread
From: Christian Brauner @ 2019-03-20 19:14 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Daniel Colascione, Joel Fernandes, Suren Baghdasaryan,
	Steven Rostedt, Sultan Alsawaf, Tim Murray, Michal Hocko,
	Greg Kroah-Hartman, Arve Hjønnevåg, Todd Kjos,
	Martijn Coenen, Ingo Molnar, Peter Zijlstra, LKML,
	open list:ANDROID DRIVERS, linux-mm, kernel-team, Oleg Nesterov,
	Serge E. Hallyn, Kees Cook

On Wed, Mar 20, 2019 at 11:58:57AM -0700, Andy Lutomirski wrote:
> On Wed, Mar 20, 2019 at 11:52 AM Christian Brauner <christian@brauner.io> wrote:
> >
> > You're misunderstanding. Again, I said in my previous mails it should
> > accept pidfds optionally as arguments, yes. But I don't want it to
> > return the status fds that you previously wanted pidfd_wait() to return.
> > I really want to see Joel's pidfd_wait() patchset and have more people
> > review the actual code.
> 
> Just to make sure that no one is forgetting a material security consideration:

Andy, thanks for commenting!

> 
> $ ls /proc/self
> attr             exe        mountinfo      projid_map    status
> autogroup        fd         mounts         root          syscall
> auxv             fdinfo     mountstats     sched         task
> cgroup           gid_map    net            schedstat     timers
> clear_refs       io         ns             sessionid     timerslack_ns
> cmdline          latency    numa_maps      setgroups     uid_map
> comm             limits     oom_adj        smaps         wchan
> coredump_filter  loginuid   oom_score      smaps_rollup
> cpuset           map_files  oom_score_adj  stack
> cwd              maps       pagemap        stat
> environ          mem        personality    statm
> 
> A bunch of this stuff makes sense to make accessible through a syscall
> interface that we expect to be used even in sandboxes.  But a bunch of
> it does not.  For example, *_map, mounts, mountstats, and net are all
> namespace-wide things that certain policies expect to be unavailable.
> stack, for example, is a potential attack surface.  Etc.
> 
> As it stands, if you create a fresh userns and mountns and try to
> mount /proc, there are some really awful and hideous rules that are
> checked for security reasons.  All these new APIs either need to
> return something more restrictive than a proc dirfd or they need to
> follow the same rules.  And I'm afraid that the latter may be a
> nonstarter if you expect these APIs to be used in libraries.
> 
> Yes, this is unfortunate, but it is indeed the current situation.  I
> suppose that we could return magic restricted dirfds, or we could
> return things that aren't dirfds at all and have some API that gives
> you the dirfd associated with a procfd but only if you can see
> /proc/PID.

What would be your opinion on having a
/proc/<pid>/handle
file instead of having a dirfd? Essentially, what I initially proposed
at LPC. The change relative to what we currently have in master would be:
https://gist.github.com/brauner/59eec91550c5624c9999eaebd95a70df

^ permalink raw reply	[flat|nested] 113+ messages in thread

* Re: pidfd design
  2019-03-20 18:51                                                                       ` Christian Brauner
  2019-03-20 18:58                                                                         ` Andy Lutomirski
@ 2019-03-20 19:19                                                                         ` Joel Fernandes
  2019-03-20 19:29                                                                         ` Daniel Colascione
  2 siblings, 0 replies; 113+ messages in thread
From: Joel Fernandes @ 2019-03-20 19:19 UTC (permalink / raw)
  To: Christian Brauner
  Cc: Daniel Colascione, Suren Baghdasaryan, Steven Rostedt,
	Sultan Alsawaf, Tim Murray, Michal Hocko, Greg Kroah-Hartman,
	Arve Hjønnevåg, Todd Kjos, Martijn Coenen, Ingo Molnar,
	Peter Zijlstra, LKML, open list:ANDROID DRIVERS, linux-mm,
	kernel-team, Oleg Nesterov, Andy Lutomirski, Serge E. Hallyn,
	Kees Cook

On Wed, Mar 20, 2019 at 07:51:57PM +0100, Christian Brauner wrote:
[snip]
> > > translate_pid() should just return you a pidfd. Having it return a pidfd
> > > and a status fd feels like stuffing too much functionality in there. If
> > > you're fine with it I'll finish prototyping what I had in mind. As I
> > > said in previous mails I'm already working on this.
> > 
> > translate_pid also needs to *accept* pidfds, at least optionally.
> > Unless you have a function from pidfd to pidfd, you race.
> 
> You're misunderstanding. Again, I said in my previous mails it should
> accept pidfds optionally as arguments, yes. But I don't want it to
> return the status fds that you previously wanted pidfd_wait() to return.
> I really want to see Joel's pidfd_wait() patchset and have more people
> review the actual code.

No problem, pidfd_wait is also fine with me and we can change it later to
translate_pid or something else if needed.

Agreed, let's get to writing some code now that (I hope) we are all on the
same page, and discuss the actual code.

 - Joel


^ permalink raw reply	[flat|nested] 113+ messages in thread

* Re: pidfd design
  2019-03-20 18:51                                                                       ` Christian Brauner
  2019-03-20 18:58                                                                         ` Andy Lutomirski
  2019-03-20 19:19                                                                         ` Joel Fernandes
@ 2019-03-20 19:29                                                                         ` Daniel Colascione
  2019-03-24 14:44                                                                           ` Serge E. Hallyn
  2 siblings, 1 reply; 113+ messages in thread
From: Daniel Colascione @ 2019-03-20 19:29 UTC (permalink / raw)
  To: Christian Brauner
  Cc: Joel Fernandes, Suren Baghdasaryan, Steven Rostedt,
	Sultan Alsawaf, Tim Murray, Michal Hocko, Greg Kroah-Hartman,
	Arve Hjønnevåg, Todd Kjos, Martijn Coenen, Ingo Molnar,
	Peter Zijlstra, LKML, open list:ANDROID DRIVERS, linux-mm,
	kernel-team, Oleg Nesterov, Andy Lutomirski, Serge E. Hallyn,
	Kees Cook

On Wed, Mar 20, 2019 at 11:52 AM Christian Brauner <christian@brauner.io> wrote:
>
> On Wed, Mar 20, 2019 at 11:38:35AM -0700, Daniel Colascione wrote:
> > On Wed, Mar 20, 2019 at 11:26 AM Christian Brauner <christian@brauner.io> wrote:
> > > On Wed, Mar 20, 2019 at 07:33:51AM -0400, Joel Fernandes wrote:
> > > >
> > > >
> > > > On March 20, 2019 3:02:32 AM EDT, Daniel Colascione <dancol@google.com> wrote:
> > > > >On Tue, Mar 19, 2019 at 8:59 PM Christian Brauner
> > > > ><christian@brauner.io> wrote:
> > > > >>
> > > > >> On Tue, Mar 19, 2019 at 07:42:52PM -0700, Daniel Colascione wrote:
> > > > >> > On Tue, Mar 19, 2019 at 6:52 PM Joel Fernandes
> > > > ><joel@joelfernandes.org> wrote:
> > > > >> > >
> > > > >> > > On Wed, Mar 20, 2019 at 12:10:23AM +0100, Christian Brauner
> > > > >wrote:
> > > > >> > > > On Tue, Mar 19, 2019 at 03:48:32PM -0700, Daniel Colascione
> > > > >wrote:
> > > > >> > > > > On Tue, Mar 19, 2019 at 3:14 PM Christian Brauner
> > > > ><christian@brauner.io> wrote:
> > > > >> > > > > > So I dislike the idea of allocating new inodes from the
> > > > >procfs super
> > > > >> > > > > > block. I would like to avoid pinning the whole pidfd
> > > > >concept exclusively
> > > > >> > > > > > to proc. The idea is that the pidfd API will be useable
> > > > >through procfs
> > > > >> > > > > > via open("/proc/<pid>") because that is what users expect
> > > > >and really
> > > > >> > > > > > wanted to have for a long time. So it makes sense to have
> > > > >this working.
> > > > >> > > > > > But it should really be useable without it. That's why
> > > > >translate_pid()
> > > > >> > > > > > and pidfd_clone() are on the table.  What I'm saying is,
> > > > >once the pidfd
> > > > >> > > > > > api is "complete" you should be able to set CONFIG_PROCFS=N
> > > > >- even
> > > > >> > > > > > though that's crazy - and still be able to use pidfds. This
> > > > >is also a
> > > > >> > > > > > point akpm asked about when I did the pidfd_send_signal
> > > > >work.
> > > > >> > > > >
> > > > >> > > > > I agree that you shouldn't need CONFIG_PROCFS=Y to use
> > > > >pidfds. One
> > > > >> > > > > crazy idea that I was discussing with Joel the other day is
> > > > >to just
> > > > >> > > > > make CONFIG_PROCFS=Y mandatory and provide a new
> > > > >get_procfs_root()
> > > > >> > > > > system call that returned, out of thin air and independent of
> > > > >the
> > > > >> > > > > mount table, a procfs root directory file descriptor for the
> > > > >caller's
> > > > > >> > > > > PID namespace and suitable for use with openat(2).
> > > > >> > > >
> > > > >> > > > Even if this works I'm pretty sure that Al and a lot of others
> > > > >will not
> > > > >> > > > be happy about this. A syscall to get an fd to /proc?
> > > > >> >
> > > > >> > Why not? procfs provides access to a lot of core kernel
> > > > >functionality.
> > > > >> > Why should you need a mountpoint to get to it?
> > > > >> >
> > > > >> > > That's not going
> > > > >> > > > to happen and I don't see the need for a separate syscall just
> > > > >for that.
> > > > >> >
> > > > >> > We need a system call for the same reason we need a getrandom(2):
> > > > >you
> > > > >> > have to bootstrap somehow when you're in a minimal environment.
> > > > >> >
> > > > >> > > > (I do see the point of making CONFIG_PROCFS=y the default btw.)
> > > > >> >
> > > > >> > I'm not proposing that we make CONFIG_PROCFS=y the default. I'm
> > > > >> > proposing that we *hardwire* it as the default and just declare
> > > > >that
> > > > >> > it's not possible to build a Linux kernel that doesn't include
> > > > >procfs.
> > > > >> > Why do we even have that button?
> > > > >> >
> > > > >> > > I think his point here was that he wanted a handle to procfs no
> > > > >matter where
> > > > >> > > it was mounted and then can later use openat on that. Agreed that
> > > > >it may be
> > > > >> > > unnecessary unless there is a usecase for it, and especially if
> > > > >the /proc
> > > > >> > > directory being the defacto mountpoint for procfs is a universal
> > > > >convention.
> > > > >> >
> > > > >> > If it's a universal convention and, in practice, everyone needs
> > > > >proc
> > > > >> > mounted anyway, so what's the harm in hardwiring CONFIG_PROCFS=y?
> > > > >If
> > > > >> > we advertise /proc as not merely some kind of optional debug
> > > > >interface
> > > > >> > but *the* way certain kernel features are exposed --- and there's
> > > > >> > nothing wrong with that --- then we should give programs access to
> > > > >> > these core kernel features in a way that doesn't depend on
> > > > >userspace
> > > > >> > kernel configuration, and you do that by either providing a
> > > > >> > procfs-root-getting system call or just hardwiring the "/proc/"
> > > > >prefix
> > > > >> > into VFS.
> > > > >> >
> > > > >> > > > Inode allocation from the procfs mount for the file descriptors
> > > > >Joel
> > > > > >> > > > wants is not correct. They're not really procfs file descriptors
> > > > >so this
> > > > >> > > > is a nack. We can't just hook into proc that way.
> > > > >> > >
> > > > >> > > I was not particular about using procfs mount for the FDs but
> > > > >that's the only
> > > > >> > > way I knew how to do it until you pointed out anon_inode (my grep
> > > > >skills
> > > > >> > > missed that), so thank you!
> > > > >> > >
> > > > >> > > > > C'mon: /proc is used by everyone today and almost every
> > > > >program breaks
> > > > >> > > > > if it's not around. The string "/proc" is already de facto
> > > > >kernel ABI.
> > > > >> > > > > Let's just drop the pretense of /proc being optional and bake
> > > > >it into
> > > > >> > > > > the kernel proper, then give programs a way to get to /proc
> > > > >that isn't
> > > > >> > > > > tied to any particular mount configuration. This way, we
> > > > >don't need a
> > > > >> > > > > translate_pid(), since callers can just use procfs to do the
> > > > >same
> > > > >> > > > > thing. (That is, if I understand correctly what translate_pid
> > > > >does.)
> > > > >> > > >
> > > > >> > > > I'm not sure what you think translate_pid() is doing since
> > > > >you're not
> > > > >> > > > saying what you think it does.
> > > > >> > > > Examples from the old patchset:
> > > > >> > > > translate_pid(pid, ns, -1)      - get pid in our pid namespace
> > > > >> >
> > > > >> > Ah, it's a bit different from what I had in mind. It's fair to want
> > > > >to
> > > > >> > translate PIDs between namespaces, but the only way to make the
> > > > >> > translate_pid under discussion robust is to have it accept and
> > > > >produce
> > > > >> > pidfds. (At that point, you might as well call it translate_pidfd.)
> > > > >We
> > > > >> > should not be adding new APIs to the kernel that accept numeric
> > > > >PIDs:
> > > > >>
> > > > >> The traditional pid-based api is not going away. There are users that
> > > > >> have the requirement to translate pids between namespaces and also
> > > > >doing
> > > > >> introspection on these namespaces independent of pidfds. We will not
> > > > >> restrict the usefulness of this syscall by making it only work with
> > > > >> pidfds.
> > > > >>
> > > > >> > it's not possible to use these APIs correctly except under very
> > > > >> > limited circumstances --- mostly, talking about init or a parent
> > > > >>
> > > > >> The pid-based api is one of the most widely used apis of the kernel
> > > > >and
> > > > >> people have been using it quite successfully for a long time. Yes,
> > > > >it's
> > > > > >> racy, but it's here to stay.
> > > > >>
> > > > >> > talking about its child.
> > > > >> >
> > > > >> > Really, we need a few related operations, and we shouldn't
> > > > >necessarily
> > > > >> > mingle them.
> > > > >>
> > > > >> Yes, we've established that previously.
> > > > >>
> > > > >> >
> > > > >> > 1) Given a numeric PID, give me a pidfd: that works today: you just
> > > > >> > open /proc/<pid>
> > > > >>
> > > > >> Agreed.
> > > > >>
> > > > >> >
> > > > >> > 2) Given a pidfd, give me a numeric PID: that works today: you just
> > > > >> > openat(pidfd, "stat", O_RDONLY) and read the first token (which is
> > > > >> > always the numeric PID).
> > > > >>
> > > > >> Agreed.
> > > > >>
> > > > >> >
> > > > >> > 3) Given a pidfd, send a signal: that's what pidfd_send_signal
> > > > >does,
> > > > >> > and it's a good start on the rest of these operations.
> > > > >>
> > > > >> Agreed.
> > > > >>
> > > > >> > 5) Given a pidfd in NS1, get a pidfd in NS2. That's what
> > > > >translate_pid
> > > > >> > is for. My preferred signature for this routine is
> > > > >translate_pid(int
> > > > >> > pidfd, int nsfd) -> pidfd. We don't need two namespace arguments.
> > > > >Why
> > > > >> > not? Because the pidfd *already* names a single process, uniquely!
> > > > >>
> > > > >> Given that people are interested in pids we can't just always return
> > > > >a
> > > > >> pidfd. That would mean a user would need to do get the pidfd read
> > > > >from
> > > > >> <pidfd>/stat and then close the pidfd. If you do that for a 100 pids
> > > > >or
> > > > >> more you end up allocating and closing file descriptors constantly
> > > > >for
> > > > >> no reason. We can't just debate pids away. So it will also need to be
> > > > >> able to yield pids e.g. through a flag argument.
> > > > >
> > > > >Sure, but that's still not a reason that we should care about pidfds
> > > > >working separately from procfs..
> > >
> > > That's unrelated to the point made in the above paragraph.
> > > Please note, I said that the pidfd api should work when proc is not
> > > available not that they can't be dirfds.
> >
> > What do you mean by "not available"? CONFIG_PROCFS=n? If pidfds
>
> I'm talking about the ability to clone processes and get fd handles on
> them via pidfd_clone() or CLONE_NEWFD.

I wouldn't call that situation "proc [not being] available". We need
pidfd_clone to return a pidfd for atomicity reasons, not /proc
availability reasons. Again, it doesn't make any sense to support this
stuff when CONFIG_PROCFS=n, and CONFIG_PROCFS=n shouldn't even be a
supported configuration.

> > > translate_pid() should just return you a pidfd. Having it return a pidfd
> > > and a status fd feels like stuffing too much functionality in there. If
> > > you're fine with it I'll finish prototyping what I had in mind. As I
> > > said in previous mails I'm already working on this.
> >
> > translate_pid also needs to *accept* pidfds, at least optionally.
> > Unless you have a function from pidfd to pidfd, you race.
>
> You're misunderstanding. Again, I said in my previous mails it should
> accept pidfds optionally as arguments, yes. But I don't want it to
> return the status fds that you previously wanted pidfd_wait() to return.

Agreed. There should be a different way to get these wait handle FDs.

> I really want to see Joel's pidfd_wait() patchset and have more people
> review the actual code.

Sure. But it's also unpleasant to have people write code and then have
to throw it away due to guessing incorrectly about unclear
requirements.

^ permalink raw reply	[flat|nested] 113+ messages in thread

* Re: pidfd design
  2019-03-20 19:14                                                                           ` Christian Brauner
@ 2019-03-20 19:40                                                                             ` Daniel Colascione
  2019-03-21 17:02                                                                               ` Andy Lutomirski
  0 siblings, 1 reply; 113+ messages in thread
From: Daniel Colascione @ 2019-03-20 19:40 UTC (permalink / raw)
  To: Christian Brauner
  Cc: Andy Lutomirski, Joel Fernandes, Suren Baghdasaryan,
	Steven Rostedt, Sultan Alsawaf, Tim Murray, Michal Hocko,
	Greg Kroah-Hartman, Arve Hjønnevåg, Todd Kjos,
	Martijn Coenen, Ingo Molnar, Peter Zijlstra, LKML,
	open list:ANDROID DRIVERS, linux-mm, kernel-team, Oleg Nesterov,
	Serge E. Hallyn, Kees Cook

On Wed, Mar 20, 2019 at 12:14 PM Christian Brauner <christian@brauner.io> wrote:
>
> On Wed, Mar 20, 2019 at 11:58:57AM -0700, Andy Lutomirski wrote:
> > On Wed, Mar 20, 2019 at 11:52 AM Christian Brauner <christian@brauner.io> wrote:
> > >
> > > You're misunderstanding. Again, I said in my previous mails it should
> > > accept pidfds optionally as arguments, yes. But I don't want it to
> > > return the status fds that you previously wanted pidfd_wait() to return.
> > > I really want to see Joel's pidfd_wait() patchset and have more people
> > > review the actual code.
> >
> > Just to make sure that no one is forgetting a material security consideration:
>
> Andy, thanks for commenting!
>
> >
> > $ ls /proc/self
> > attr             exe        mountinfo      projid_map    status
> > autogroup        fd         mounts         root          syscall
> > auxv             fdinfo     mountstats     sched         task
> > cgroup           gid_map    net            schedstat     timers
> > clear_refs       io         ns             sessionid     timerslack_ns
> > cmdline          latency    numa_maps      setgroups     uid_map
> > comm             limits     oom_adj        smaps         wchan
> > coredump_filter  loginuid   oom_score      smaps_rollup
> > cpuset           map_files  oom_score_adj  stack
> > cwd              maps       pagemap        stat
> > environ          mem        personality    statm
> >
> > A bunch of this stuff makes sense to make accessible through a syscall
> > interface that we expect to be used even in sandboxes.  But a bunch of
> > it does not.  For example, *_map, mounts, mountstats, and net are all
> > namespace-wide things that certain policies expect to be unavailable.
> > stack, for example, is a potential attack surface.  Etc.

If you can access these files via open(2) on /proc/<pid>, you
should be able to access them via a pidfd. If you can't, you
shouldn't. Which /proc? The one you'd get by mounting procfs. I don't
see how pidfd makes any material changes to anyone's security. As far
as I'm concerned, if a sandbox can't mount /proc at all, it's just a
broken and unsupported configuration.

An actual threat model and real thought paid to access capabilities
would help. Almost everything around the interaction of Linux kernel
namespaces and security feels like a jumble of ad-hoc patches added as
afterthoughts in response to random objections.

> > All these new APIs either need to
> > return something more restrictive than a proc dirfd or they need to
> > follow the same rules.

What's wrong with the latter?

> > And I'm afraid that the latter may be a
> > nonstarter if you expect these APIs to be used in libraries.

What's special about libraries? How is a library any worse-off using
openat(2) on a pidfd than it would be just opening the file called
"/proc/$apid"?

> > Yes, this is unfortunate, but it is indeed the current situation.  I
> > suppose that we could return magic restricted dirfds, or we could
> > return things that aren't dirfds and all and have some API that gives
> > you the dirfd associated with a procfd but only if you can see
> > /proc/PID.
>
> What would be your opinion to having a
> /proc/<pid>/handle
> file instead of having a dirfd. Essentially, what I initially proposed
> at LPC. The change on what we currently have in master would be:
> https://gist.github.com/brauner/59eec91550c5624c9999eaebd95a70df

And how do you propose, given one of these handle objects, getting a
process's current priority, or its current oom score, or its list of
memory maps? As I mentioned in my original email, and which nobody has
addressed, if you don't use a dirfd as your process handle or you
don't provide an easy way to get one of these proc directory FDs, you
need to duplicate a lot of metadata access interfaces.

^ permalink raw reply	[flat|nested] 113+ messages in thread

* Re: pidfd design
  2019-03-20 19:40                                                                             ` Daniel Colascione
@ 2019-03-21 17:02                                                                               ` Andy Lutomirski
  2019-03-25 20:13                                                                                 ` Jann Horn
  0 siblings, 1 reply; 113+ messages in thread
From: Andy Lutomirski @ 2019-03-21 17:02 UTC (permalink / raw)
  To: Daniel Colascione
  Cc: Christian Brauner, Andy Lutomirski, Joel Fernandes,
	Suren Baghdasaryan, Steven Rostedt, Sultan Alsawaf, Tim Murray,
	Michal Hocko, Greg Kroah-Hartman, Arve Hjønnevåg,
	Todd Kjos, Martijn Coenen, Ingo Molnar, Peter Zijlstra, LKML,
	open list:ANDROID DRIVERS, linux-mm, kernel-team, Oleg Nesterov,
	Serge E. Hallyn, Kees Cook

On Wed, Mar 20, 2019 at 12:40 PM Daniel Colascione <dancol@google.com> wrote:
>
> On Wed, Mar 20, 2019 at 12:14 PM Christian Brauner <christian@brauner.io> wrote:
> >
> > On Wed, Mar 20, 2019 at 11:58:57AM -0700, Andy Lutomirski wrote:
> > > On Wed, Mar 20, 2019 at 11:52 AM Christian Brauner <christian@brauner.io> wrote:
> > > >
> > > > You're misunderstanding. Again, I said in my previous mails it should
> > > > accept pidfds optionally as arguments, yes. But I don't want it to
> > > > return the status fds that you previously wanted pidfd_wait() to return.
> > > > I really want to see Joel's pidfd_wait() patchset and have more people
> > > > review the actual code.
> > >
> > > Just to make sure that no one is forgetting a material security consideration:
> >
> > Andy, thanks for commenting!
> >
> > >
> > > $ ls /proc/self
> > > attr             exe        mountinfo      projid_map    status
> > > autogroup        fd         mounts         root          syscall
> > > auxv             fdinfo     mountstats     sched         task
> > > cgroup           gid_map    net            schedstat     timers
> > > clear_refs       io         ns             sessionid     timerslack_ns
> > > cmdline          latency    numa_maps      setgroups     uid_map
> > > comm             limits     oom_adj        smaps         wchan
> > > coredump_filter  loginuid   oom_score      smaps_rollup
> > > cpuset           map_files  oom_score_adj  stack
> > > cwd              maps       pagemap        stat
> > > environ          mem        personality    statm
> > >
> > > A bunch of this stuff makes sense to make accessible through a syscall
> > > interface that we expect to be used even in sandboxes.  But a bunch of
> > > it does not.  For example, *_map, mounts, mountstats, and net are all
> > > namespace-wide things that certain policies expect to be unavailable.
> > > stack, for example, is a potential attack surface.  Etc.
>
> If you can access these files sources via open(2) on /proc/<pid>, you
> should be able to access them via a pidfd. If you can't, you
> shouldn't. Which /proc? The one you'd get by mounting procfs. I don't
> see how pidfd makes any material changes to anyone's security. As far
> as I'm concerned, if a sandbox can't mount /proc at all, it's just a
> broken and unsupported configuration.

It's not "broken and unsupported".  I know of an actual working,
deployed container-ish sandbox that does exactly this.  I would also
guess that quite a few not-at-all-container-like sandboxes work like
this.  (The obvious seccomp + unshare + pivot_root
deny-myself-access-to-lots-of-things trick results in no /proc, which
is by design.)

>
> An actual threat model and real thought paid to access capabilities
> would help. Almost everything around the interaction of Linux kernel
> namespaces and security feels like a jumble of ad-hoc patches added as
> afterthoughts in response to random objections.

I fully agree.  But if you start thinking for real about access
capabilities, there's no way that you're going to conclude that a
capability to access some process implies a capability to access the
settings of its network namespace.

>
> > > All these new APIs either need to
> > > return something more restrictive than a proc dirfd or they need to
> > > follow the same rules.
>

...

> What's special about libraries? How is a library any worse-off using
> openat(2) on a pidfd than it would be just opening the file called
> "/proc/$apid"?

Because most libraries actually work, right now, without /proc.  Even
libraries that spawn subprocesses.  If we make the new API have the
property that it doesn't work if you're in a non-root user namespace
and /proc isn't mounted, the result will be an utter mess.

>
> > > Yes, this is unfortunate, but it is indeed the current situation.  I
> > > suppose that we could return magic restricted dirfds, or we could
> > > return things that aren't dirfds and all and have some API that gives
> > > you the dirfd associated with a procfd but only if you can see
> > > /proc/PID.
> >
> > What would be your opinion to having a
> > /proc/<pid>/handle
> > file instead of having a dirfd. Essentially, what I initially proposed
> > at LPC. The change on what we currently have in master would be:
> > https://gist.github.com/brauner/59eec91550c5624c9999eaebd95a70df
>
> And how do you propose, given one of these handle objects, getting a
> process's current priority, or its current oom score, or its list of
> memory maps? As I mentioned in my original email, and which nobody has
> addressed, if you don't use a dirfd as your process handle or you
> don't provide an easy way to get one of these proc directory FDs, you
> need to duplicate a lot of metadata access interfaces.

An API that takes a process handle object and an fd pointing at /proc
(the root of the proc fs) and gives you back a proc dirfd would do the
trick.  You could do this with no new kernel features at all if you're
willing to read the pid, call openat(2), and handle the races in user
code.

^ permalink raw reply	[flat|nested] 113+ messages in thread

* Re: pidfd design
  2019-03-20 19:29                                                                         ` Daniel Colascione
@ 2019-03-24 14:44                                                                           ` Serge E. Hallyn
  2019-03-24 18:48                                                                             ` Joel Fernandes
  0 siblings, 1 reply; 113+ messages in thread
From: Serge E. Hallyn @ 2019-03-24 14:44 UTC (permalink / raw)
  To: Daniel Colascione
  Cc: Christian Brauner, Joel Fernandes, Suren Baghdasaryan,
	Steven Rostedt, Sultan Alsawaf, Tim Murray, Michal Hocko,
	Greg Kroah-Hartman, Arve Hjønnevåg, Todd Kjos,
	Martijn Coenen, Ingo Molnar, Peter Zijlstra, LKML,
	open list:ANDROID DRIVERS, linux-mm, kernel-team, Oleg Nesterov,
	Andy Lutomirski, Serge E. Hallyn, Kees Cook

On Wed, Mar 20, 2019 at 12:29:31PM -0700, Daniel Colascione wrote:
> On Wed, Mar 20, 2019 at 11:52 AM Christian Brauner <christian@brauner.io> wrote:
> > I really want to see Joel's pidfd_wait() patchset and have more people
> > review the actual code.
> 
> Sure. But it's also unpleasant to have people write code and then have
> to throw it away due to guessing incorrectly about unclear
> requirements.

No, it is not.  It is not unpleasant.  And it is useful.  It is the best way to
identify and resolve those incorrect guesses and unclear requirements.

^ permalink raw reply	[flat|nested] 113+ messages in thread

* Re: pidfd design
  2019-03-24 14:44                                                                           ` Serge E. Hallyn
@ 2019-03-24 18:48                                                                             ` Joel Fernandes
  0 siblings, 0 replies; 113+ messages in thread
From: Joel Fernandes @ 2019-03-24 18:48 UTC (permalink / raw)
  To: Serge E. Hallyn
  Cc: Daniel Colascione, Christian Brauner, Joel Fernandes,
	Suren Baghdasaryan, Steven Rostedt, Sultan Alsawaf, Tim Murray,
	Michal Hocko, Greg Kroah-Hartman, Arve Hjønnevåg,
	Todd Kjos, Martijn Coenen, Ingo Molnar, Peter Zijlstra, LKML,
	open list:ANDROID DRIVERS, linux-mm, kernel-team, Oleg Nesterov,
	Andy Lutomirski, Kees Cook

On Sun, Mar 24, 2019 at 10:44 AM Serge E. Hallyn <serge@hallyn.com> wrote:
>
> On Wed, Mar 20, 2019 at 12:29:31PM -0700, Daniel Colascione wrote:
> > On Wed, Mar 20, 2019 at 11:52 AM Christian Brauner <christian@brauner.io> wrote:
> > > I really want to see Joel's pidfd_wait() patchset and have more people
> > > review the actual code.
> >
> > Sure. But it's also unpleasant to have people write code and then have
> > to throw it away due to guessing incorrectly about unclear
> > requirements.
>
> No, it is not.  It is not unpleasant.  And it is useful.  It is the best way to
> identify and resolve those incorrect guesses and unclear requirements.

No problem, a bit of discussion helped set the direction. Personally
it did help clarify a lot of things for me.  We are hard at work
coming up with an implementation and are looking at posting something
soon. I agree that it is best to discuss actual code where
possible.

^ permalink raw reply	[flat|nested] 113+ messages in thread

* Re: pidfd design
  2019-03-21 17:02                                                                               ` Andy Lutomirski
@ 2019-03-25 20:13                                                                                 ` Jann Horn
  2019-03-25 20:23                                                                                   ` Daniel Colascione
  0 siblings, 1 reply; 113+ messages in thread
From: Jann Horn @ 2019-03-25 20:13 UTC (permalink / raw)
  To: Andy Lutomirski, Christian Brauner
  Cc: Daniel Colascione, Joel Fernandes, Suren Baghdasaryan,
	Steven Rostedt, Sultan Alsawaf, Tim Murray, Michal Hocko,
	Greg Kroah-Hartman, Arve Hjønnevåg, Todd Kjos,
	Martijn Coenen, Ingo Molnar, Peter Zijlstra, LKML,
	open list:ANDROID DRIVERS, kernel-team, Oleg Nesterov,
	Serge E. Hallyn, Kees Cook, Jonathan Kowalski, Linux API

On Mon, Mar 25, 2019 at 8:44 PM Andy Lutomirski <luto@kernel.org> wrote:
> On Wed, Mar 20, 2019 at 12:40 PM Daniel Colascione <dancol@google.com> wrote:
> > On Wed, Mar 20, 2019 at 12:14 PM Christian Brauner <christian@brauner.io> wrote:
> > > On Wed, Mar 20, 2019 at 11:58:57AM -0700, Andy Lutomirski wrote:
> > > > On Wed, Mar 20, 2019 at 11:52 AM Christian Brauner <christian@brauner.io> wrote:
> > > > >
> > > > > You're misunderstanding. Again, I said in my previous mails it should
> > > > > accept pidfds optionally as arguments, yes. But I don't want it to
> > > > > return the status fds that you previously wanted pidfd_wait() to return.
> > > > > I really want to see Joel's pidfd_wait() patchset and have more people
> > > > > review the actual code.
> > > >
> > > > Just to make sure that no one is forgetting a material security consideration:
> > >
> > > Andy, thanks for commenting!
> > >
> > > >
> > > > $ ls /proc/self
> > > > attr             exe        mountinfo      projid_map    status
> > > > autogroup        fd         mounts         root          syscall
> > > > auxv             fdinfo     mountstats     sched         task
> > > > cgroup           gid_map    net            schedstat     timers
> > > > clear_refs       io         ns             sessionid     timerslack_ns
> > > > cmdline          latency    numa_maps      setgroups     uid_map
> > > > comm             limits     oom_adj        smaps         wchan
> > > > coredump_filter  loginuid   oom_score      smaps_rollup
> > > > cpuset           map_files  oom_score_adj  stack
> > > > cwd              maps       pagemap        stat
> > > > environ          mem        personality    statm
> > > >
> > > > A bunch of this stuff makes sense to make accessible through a syscall
> > > > interface that we expect to be used even in sandboxes.  But a bunch of
> > > > it does not.  For example, *_map, mounts, mountstats, and net are all
> > > > namespace-wide things that certain policies expect to be unavailable.
> > > > stack, for example, is a potential attack surface.  Etc.
> >
> > If you can access these files sources via open(2) on /proc/<pid>, you
> > should be able to access them via a pidfd. If you can't, you
> > shouldn't. Which /proc? The one you'd get by mounting procfs. I don't
> > see how pidfd makes any material changes to anyone's security. As far
> > as I'm concerned, if a sandbox can't mount /proc at all, it's just a
> > broken and unsupported configuration.
>
> It's not "broken and unsupported".  I know of an actual working,
> deployed container-ish sandbox that does exactly this.  I would also
> guess that quite a few not-at-all-container-like sandboxes work like
> this.  (The obvious seccomp + unshare + pivot_root
> deny-myself-access-to-lots-of-things trick results in no /proc, which
> is by design.)
>
> >
> > An actual threat model and real thought paid to access capabilities
> > would help. Almost everything around the interaction of Linux kernel
> > namespaces and security feels like a jumble of ad-hoc patches added as
> > afterthoughts in response to random objections.
>
> I fully agree.  But if you start thinking for real about access
> capabilities, there's no way that you're going to conclude that a
> capability to access some process implies a capability to access the
> settings of its network namespace.
>
> >
> > > > All these new APIs either need to
> > > > return something more restrictive than a proc dirfd or they need to
> > > > follow the same rules.
> >
>
> ...
>
> > What's special about libraries? How is a library any worse-off using
> > openat(2) on a pidfd than it would be just opening the file called
> > "/proc/$apid"?
>
> Because most libraries actually work, right now, without /proc.  Even
> libraries that spawn subprocesses.  If we make the new API have the
> property that it doesn't work if you're in a non-root user namespace
> and /proc isn't mounted, the result will be an utter mess.
>
> >
> > > > Yes, this is unfortunate, but it is indeed the current situation.  I
> > > > suppose that we could return magic restricted dirfds, or we could
> > > > return things that aren't dirfds and all and have some API that gives
> > > > you the dirfd associated with a procfd but only if you can see
> > > > /proc/PID.
> > >
> > > What would be your opinion to having a
> > > /proc/<pid>/handle
> > > file instead of having a dirfd. Essentially, what I initially proposed
> > > at LPC. The change on what we currently have in master would be:
> > > https://gist.github.com/brauner/59eec91550c5624c9999eaebd95a70df
> >
> > And how do you propose, given one of these handle objects, getting a
> > process's current priority, or its current oom score, or its list of
> > memory maps? As I mentioned in my original email, and which nobody has
> > addressed, if you don't use a dirfd as your process handle or you
> > don't provide an easy way to get one of these proc directory FDs, you
> > need to duplicate a lot of metadata access interfaces.
>
> An API that takes a process handle object and an fd pointing at /proc
> (the root of the proc fs) and gives you back a proc dirfd would do the
> trick.  You could do this with no new kernel features at all if you're
> willing to read the pid, call openat(2), and handle the races in user
> code.

This seems like something that might be a good fit for two ioctls?

One ioctl on procfs roots to translate pidfds into that procfs,
subject to both the normal lookup permission checks and only working
if the pidfd has a translation into the procfs:

int proc_root_fd = open("/proc", O_RDONLY);
int proc_dir_fd = ioctl(proc_root_fd, PROC_PIDFD_TO_PROCFSFD, pidfd);

And one ioctl on procfs directories to translate from PGIDs and PIDs to pidfds:

int proc_pgid_fd = open("/proc/self", O_RDONLY);
int self_pg_pidfd = ioctl(proc_pgid_fd, PROC_PROCFSFD_TO_PIDFD, 0);
int proc_pid_fd = open("/proc/thread-self", O_RDONLY);
int self_p_pidfd = ioctl(proc_pid_fd, PROC_PROCFSFD_TO_PIDFD, 0);


And then, as you proposed, the new sys_clone() can just return a
pidfd, and you can convert it into a procfs fd yourself if you want.

^ permalink raw reply	[flat|nested] 113+ messages in thread

* Re: pidfd design
  2019-03-25 20:13                                                                                 ` Jann Horn
@ 2019-03-25 20:23                                                                                   ` Daniel Colascione
  2019-03-25 23:42                                                                                     ` Andy Lutomirski
  0 siblings, 1 reply; 113+ messages in thread
From: Daniel Colascione @ 2019-03-25 20:23 UTC (permalink / raw)
  To: Jann Horn
  Cc: Andy Lutomirski, Christian Brauner, Joel Fernandes,
	Suren Baghdasaryan, Steven Rostedt, Sultan Alsawaf, Tim Murray,
	Michal Hocko, Greg Kroah-Hartman, Arve Hjønnevåg,
	Todd Kjos, Martijn Coenen, Ingo Molnar, Peter Zijlstra, LKML,
	open list:ANDROID DRIVERS, kernel-team, Oleg Nesterov,
	Serge E. Hallyn, Kees Cook, Jonathan Kowalski, Linux API

On Mon, Mar 25, 2019 at 1:14 PM Jann Horn <jannh@google.com> wrote:
>
> On Mon, Mar 25, 2019 at 8:44 PM Andy Lutomirski <luto@kernel.org> wrote:
> > On Wed, Mar 20, 2019 at 12:40 PM Daniel Colascione <dancol@google.com> wrote:
> > > On Wed, Mar 20, 2019 at 12:14 PM Christian Brauner <christian@brauner.io> wrote:
> > > > On Wed, Mar 20, 2019 at 11:58:57AM -0700, Andy Lutomirski wrote:
> > > > > On Wed, Mar 20, 2019 at 11:52 AM Christian Brauner <christian@brauner.io> wrote:
> > > > > >
> > > > > > You're misunderstanding. Again, I said in my previous mails it should
> > > > > > accept pidfds optionally as arguments, yes. But I don't want it to
> > > > > > return the status fds that you previously wanted pidfd_wait() to return.
> > > > > > I really want to see Joel's pidfd_wait() patchset and have more people
> > > > > > review the actual code.
> > > > >
> > > > > Just to make sure that no one is forgetting a material security consideration:
> > > >
> > > > Andy, thanks for commenting!
> > > >
> > > > >
> > > > > $ ls /proc/self
> > > > > attr             exe        mountinfo      projid_map    status
> > > > > autogroup        fd         mounts         root          syscall
> > > > > auxv             fdinfo     mountstats     sched         task
> > > > > cgroup           gid_map    net            schedstat     timers
> > > > > clear_refs       io         ns             sessionid     timerslack_ns
> > > > > cmdline          latency    numa_maps      setgroups     uid_map
> > > > > comm             limits     oom_adj        smaps         wchan
> > > > > coredump_filter  loginuid   oom_score      smaps_rollup
> > > > > cpuset           map_files  oom_score_adj  stack
> > > > > cwd              maps       pagemap        stat
> > > > > environ          mem        personality    statm
> > > > >
> > > > > A bunch of this stuff makes sense to make accessible through a syscall
> > > > > interface that we expect to be used even in sandboxes.  But a bunch of
> > > > > it does not.  For example, *_map, mounts, mountstats, and net are all
> > > > > namespace-wide things that certain policies expect to be unavailable.
> > > > > stack, for example, is a potential attack surface.  Etc.
> > >
> > > If you can access these files sources via open(2) on /proc/<pid>, you
> > > should be able to access them via a pidfd. If you can't, you
> > > shouldn't. Which /proc? The one you'd get by mounting procfs. I don't
> > > see how pidfd makes any material changes to anyone's security. As far
> > > as I'm concerned, if a sandbox can't mount /proc at all, it's just a
> > > broken and unsupported configuration.
> >
> > It's not "broken and unsupported".  I know of an actual working,
> > deployed container-ish sandbox that does exactly this.  I would also
> > guess that quite a few not-at-all-container-like sandboxes work like
> > this.  (The obvious seccomp + unshare + pivot_root
> > deny-myself-access-to-lots-of-things trick results in no /proc, which
> > is by design.)
> >
> > >
> > > An actual threat model and real thought paid to access capabilities
> > > would help. Almost everything around the interaction of Linux kernel
> > > namespaces and security feels like a jumble of ad-hoc patches added as
> > > afterthoughts in response to random objections.
> >
> > I fully agree.  But if you start thinking for real about access
> > capabilities, there's no way that you're going to conclude that a
> > capability to access some process implies a capability to access the
> > settings of its network namespace.
> >
> > >
> > > > > All these new APIs either need to
> > > > > return something more restrictive than a proc dirfd or they need to
> > > > > follow the same rules.
> > >
> >
> > ...
> >
> > > What's special about libraries? How is a library any worse-off using
> > > openat(2) on a pidfd than it would be just opening the file called
> > > "/proc/$apid"?
> >
> > Because most libraries actually work, right now, without /proc.  Even
> > libraries that spawn subprocesses.  If we make the new API have the
> > property that it doesn't work if you're in a non-root user namespace
> > and /proc isn't mounted, the result will be an utter mess.
> >
> > >
> > > > > Yes, this is unfortunate, but it is indeed the current situation.  I
> > > > > suppose that we could return magic restricted dirfds, or we could
> > > > > return things that aren't dirfds and all and have some API that gives
> > > > > you the dirfd associated with a procfd but only if you can see
> > > > > /proc/PID.
> > > >
> > > > What would be your opinion to having a
> > > > /proc/<pid>/handle
> > > > file instead of having a dirfd. Essentially, what I initially proposed
> > > > at LPC. The change on what we currently have in master would be:
> > > > https://gist.github.com/brauner/59eec91550c5624c9999eaebd95a70df
> > >
> > > And how do you propose, given one of these handle objects, getting a
> > > process's current priority, or its current oom score, or its list of
> > > memory maps? As I mentioned in my original email, and which nobody has
> > > addressed, if you don't use a dirfd as your process handle or you
> > > don't provide an easy way to get one of these proc directory FDs, you
> > > need to duplicate a lot of metadata access interfaces.
> >
> > An API that takes a process handle object and an fd pointing at /proc
> > (the root of the proc fs) and gives you back a proc dirfd would do the
> > trick.  You could do this with no new kernel features at all if you're
> > willing to read the pid, call openat(2), and handle the races in user
> > code.
>
> This seems like something that might be a good fit for two ioctls?

As an aside, we had a long discussion about why fundamental facilities
like this should be system calls, not ioctls. I think the arguments
still apply.

> One ioctl on procfs roots to translate pidfds into that procfs,
> subject to both the normal lookup permission checks and only working
> if the pidfd has a translation into the procfs:
>
> int proc_root_fd = open("/proc", O_RDONLY);
> int proc_dir_fd = ioctl(proc_root_fd, PROC_PIDFD_TO_PROCFSFD, pidfd);
>
> And one ioctl on procfs directories to translate from PGIDs and PIDs to pidfds:
>
> int proc_pgid_fd = open("/proc/self", O_RDONLY);
> int self_pg_pidfd = ioctl(proc_pgid_fd, PROC_PROCFSFD_TO_PIDFD, 0);
> int proc_pid_fd = open("/proc/thread-self", O_RDONLY);
> int self_p_pidfd = ioctl(proc_pid_fd, PROC_PROCFSFD_TO_PIDFD, 0);
>
>
> And then, as you proposed, the new sys_clone() can just return a
> pidfd, and you can convert it into a procfs fd yourself if you want.

I think that's the consensus we reached on the other thread. The
O_DIRECTORY open on /proc/self/fd/mypidfd seems like it'd work well
enough.

^ permalink raw reply	[flat|nested] 113+ messages in thread

* Re: pidfd design
  2019-03-25 20:23                                                                                   ` Daniel Colascione
@ 2019-03-25 23:42                                                                                     ` Andy Lutomirski
  2019-03-25 23:45                                                                                       ` Christian Brauner
  0 siblings, 1 reply; 113+ messages in thread
From: Andy Lutomirski @ 2019-03-25 23:42 UTC (permalink / raw)
  To: Daniel Colascione
  Cc: Jann Horn, Andy Lutomirski, Christian Brauner, Joel Fernandes,
	Suren Baghdasaryan, Steven Rostedt, Sultan Alsawaf, Tim Murray,
	Michal Hocko, Greg Kroah-Hartman, Arve Hjønnevåg,
	Todd Kjos, Martijn Coenen, Ingo Molnar, Peter Zijlstra, LKML,
	open list:ANDROID DRIVERS, kernel-team, Oleg Nesterov,
	Serge E. Hallyn, Kees Cook, Jonathan Kowalski, Linux API

On Mon, Mar 25, 2019 at 1:23 PM Daniel Colascione <dancol@google.com> wrote:
>
> On Mon, Mar 25, 2019 at 1:14 PM Jann Horn <jannh@google.com> wrote:
> >
> > On Mon, Mar 25, 2019 at 8:44 PM Andy Lutomirski <luto@kernel.org> wrote:

> > One ioctl on procfs roots to translate pidfds into that procfs,
> > subject to both the normal lookup permission checks and only working
> > if the pidfd has a translation into the procfs:
> >
> > int proc_root_fd = open("/proc", O_RDONLY);
> > int proc_dir_fd = ioctl(proc_root_fd, PROC_PIDFD_TO_PROCFSFD, pidfd);
> >
> > And one ioctl on procfs directories to translate from PGIDs and PIDs to pidfds:
> >
> > int proc_pgid_fd = open("/proc/self", O_RDONLY);
> > int self_pg_pidfd = ioctl(proc_pgid_fd, PROC_PROCFSFD_TO_PIDFD, 0);
> > int proc_pid_fd = open("/proc/thread-self", O_RDONLY);
> > int self_p_pidfd = ioctl(proc_pid_fd, PROC_PROCFSFD_TO_PIDFD, 0);
> >

This sounds okay to me.  Or we could make it so that a procfs
directory fd also works as a pidfd, but that seems more likely to be
problematic than just allowing two-way translation like this.

> >
> > And then, as you proposed, the new sys_clone() can just return a
> > pidfd, and you can convert it into a procfs fd yourself if you want.
>
> I think that's the consensus we reached on the other thread. The
> O_DIRECTORY open on /proc/self/fd/mypidfd seems like it'd work well
> enough.

I must have missed this particular email.

IMO, if /proc/self/fd/mypidfd allows O_DIRECTORY open to work, then it
really ought to function just like /proc/self/fd/mypidfd/. and
/proc/self/fd/mypidfd/status should work.  And these latter two
options seem nutty.

Also, this O_DIRECTORY thing is missing the entire point of the ioctl
interface -- it doesn't require procfs access.

--Andy

^ permalink raw reply	[flat|nested] 113+ messages in thread

* Re: pidfd design
  2019-03-25 23:42                                                                                     ` Andy Lutomirski
@ 2019-03-25 23:45                                                                                       ` Christian Brauner
  2019-03-26  0:00                                                                                         ` Andy Lutomirski
  0 siblings, 1 reply; 113+ messages in thread
From: Christian Brauner @ 2019-03-25 23:45 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Daniel Colascione, Jann Horn, Joel Fernandes, Suren Baghdasaryan,
	Steven Rostedt, Sultan Alsawaf, Tim Murray, Michal Hocko,
	Greg Kroah-Hartman, Arve Hjønnevåg, Todd Kjos,
	Martijn Coenen, Ingo Molnar, Peter Zijlstra, LKML,
	open list:ANDROID DRIVERS, kernel-team, Oleg Nesterov,
	Serge E. Hallyn, Kees Cook, Jonathan Kowalski, Linux API

On Mon, Mar 25, 2019 at 04:42:14PM -0700, Andy Lutomirski wrote:
> On Mon, Mar 25, 2019 at 1:23 PM Daniel Colascione <dancol@google.com> wrote:
> >
> > On Mon, Mar 25, 2019 at 1:14 PM Jann Horn <jannh@google.com> wrote:
> > >
> > > On Mon, Mar 25, 2019 at 8:44 PM Andy Lutomirski <luto@kernel.org> wrote:
> 
> > > One ioctl on procfs roots to translate pidfds into that procfs,
> > > subject to both the normal lookup permission checks and only working
> > > if the pidfd has a translation into the procfs:
> > >
> > > int proc_root_fd = open("/proc", O_RDONLY);
> > > int proc_dir_fd = ioctl(proc_root_fd, PROC_PIDFD_TO_PROCFSFD, pidfd);
> > >
> > > And one ioctl on procfs directories to translate from PGIDs and PIDs to pidfds:
> > >
> > > int proc_pgid_fd = open("/proc/self", O_RDONLY);
> > > int self_pg_pidfd = ioctl(proc_pgid_fd, PROC_PROCFSFD_TO_PIDFD, 0);
> > > int proc_pid_fd = open("/proc/thread-self", O_RDONLY);
> > > int self_p_pidfd = ioctl(proc_pid_fd, PROC_PROCFSFD_TO_PIDFD, 0);
> > >
> 
> This sounds okay to me.  Or we could make it so that a procfs
> directory fd also works as a pidfd, but that seems more likely to be
> problematic than just allowing two-way translation like this
> 
> > >
> > > And then, as you proposed, the new sys_clone() can just return a
> > > pidfd, and you can convert it into a procfs fd yourself if you want.
> >
> > I think that's the consensus we reached on the other thread. The
> > O_DIRECTORY open on /proc/self/fd/mypidfd seems like it'd work well
> > enough.
> 
> I must have missed this particular email.
> 
> IMO, if /proc/self/fd/mypidfd allows O_DIRECTORY open to work, then it
> really ought to do function just like /proc/self/fd/mypidfd/. and
> /proc/self/fd/mypidfd/status should work.  And these latter two
> options seem nutty.
> 
> Also, this O_DIRECTORY thing is missing the entire point of the ioctl
> interface -- it doesn't require procfs access.

The other option was to encode the pid in the caller's pid namespace into
the pidfd's fdinfo so that you can parse it out and open /proc/<pid>.
You'd just need an event on the pidfd to tell you when the process has
died. Jonathan and I just discussed this.

^ permalink raw reply	[flat|nested] 113+ messages in thread

* Re: pidfd design
  2019-03-25 23:45                                                                                       ` Christian Brauner
@ 2019-03-26  0:00                                                                                         ` Andy Lutomirski
  2019-03-26  0:12                                                                                           ` Christian Brauner
  0 siblings, 1 reply; 113+ messages in thread
From: Andy Lutomirski @ 2019-03-26  0:00 UTC (permalink / raw)
  To: Christian Brauner
  Cc: Andy Lutomirski, Daniel Colascione, Jann Horn, Joel Fernandes,
	Suren Baghdasaryan, Steven Rostedt, Sultan Alsawaf, Tim Murray,
	Michal Hocko, Greg Kroah-Hartman, Arve Hjønnevåg,
	Todd Kjos, Martijn Coenen, Ingo Molnar, Peter Zijlstra, LKML,
	open list:ANDROID DRIVERS, kernel-team, Oleg Nesterov,
	Serge E. Hallyn, Kees Cook, Jonathan Kowalski, Linux API

On Mon, Mar 25, 2019 at 4:45 PM Christian Brauner <christian@brauner.io> wrote:
>
> On Mon, Mar 25, 2019 at 04:42:14PM -0700, Andy Lutomirski wrote:
> > On Mon, Mar 25, 2019 at 1:23 PM Daniel Colascione <dancol@google.com> wrote:
> > >
> > > On Mon, Mar 25, 2019 at 1:14 PM Jann Horn <jannh@google.com> wrote:
> > > >
> > > > On Mon, Mar 25, 2019 at 8:44 PM Andy Lutomirski <luto@kernel.org> wrote:
> >
> > > > One ioctl on procfs roots to translate pidfds into that procfs,
> > > > subject to both the normal lookup permission checks and only working
> > > > if the pidfd has a translation into the procfs:
> > > >
> > > > int proc_root_fd = open("/proc", O_RDONLY);
> > > > int proc_dir_fd = ioctl(proc_root_fd, PROC_PIDFD_TO_PROCFSFD, pidfd);
> > > >
> > > > And one ioctl on procfs directories to translate from PGIDs and PIDs to pidfds:
> > > >
> > > > int proc_pgid_fd = open("/proc/self", O_RDONLY);
> > > > int self_pg_pidfd = ioctl(proc_pgid_fd, PROC_PROCFSFD_TO_PIDFD, 0);
> > > > int proc_pid_fd = open("/proc/thread-self", O_RDONLY);
> > > > int self_p_pidfd = ioctl(proc_pid_fd, PROC_PROCFSFD_TO_PIDFD, 0);
> > > >
> >
> > This sounds okay to me.  Or we could make it so that a procfs
> > directory fd also works as a pidfd, but that seems more likely to be
> > problematic than just allowing two-way translation like this
> >
> > > >
> > > > And then, as you proposed, the new sys_clone() can just return a
> > > > pidfd, and you can convert it into a procfs fd yourself if you want.
> > >
> > > I think that's the consensus we reached on the other thread. The
> > > O_DIRECTORY open on /proc/self/fd/mypidfd seems like it'd work well
> > > enough.
> >
> > I must have missed this particular email.
> >
> > IMO, if /proc/self/fd/mypidfd allows O_DIRECTORY open to work, then it
> > really ought to do function just like /proc/self/fd/mypidfd/. and
> > /proc/self/fd/mypidfd/status should work.  And these latter two
> > options seem nutty.
> >
> > Also, this O_DIRECTORY thing is missing the entire point of the ioctl
> > interface -- it doesn't require procfs access.
>
> The other option was to encode the pid in the callers pid namespace into
> the pidfd's fdinfo so that you can parse it out and open /proc/<pid>.
> You'd just need an event on the pidfd to tell you when the process has
> died. Jonathan and I just discussed this.

From an application developer's POV, the ioctl interface sounds much,
much nicer.

^ permalink raw reply	[flat|nested] 113+ messages in thread

* Re: pidfd design
  2019-03-26  0:00                                                                                         ` Andy Lutomirski
@ 2019-03-26  0:12                                                                                           ` Christian Brauner
  2019-03-26  0:24                                                                                             ` Andy Lutomirski
  0 siblings, 1 reply; 113+ messages in thread
From: Christian Brauner @ 2019-03-26  0:12 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Daniel Colascione, Jann Horn, Joel Fernandes, Suren Baghdasaryan,
	Steven Rostedt, Sultan Alsawaf, Tim Murray, Michal Hocko,
	Greg Kroah-Hartman, Arve Hjønnevåg, Todd Kjos,
	Martijn Coenen, Ingo Molnar, Peter Zijlstra, LKML,
	open list:ANDROID DRIVERS, kernel-team, Oleg Nesterov,
	Serge E. Hallyn, Kees Cook, Jonathan Kowalski, Linux API

On Mon, Mar 25, 2019 at 05:00:17PM -0700, Andy Lutomirski wrote:
> On Mon, Mar 25, 2019 at 4:45 PM Christian Brauner <christian@brauner.io> wrote:
> >
> > On Mon, Mar 25, 2019 at 04:42:14PM -0700, Andy Lutomirski wrote:
> > > On Mon, Mar 25, 2019 at 1:23 PM Daniel Colascione <dancol@google.com> wrote:
> > > >
> > > > On Mon, Mar 25, 2019 at 1:14 PM Jann Horn <jannh@google.com> wrote:
> > > > >
> > > > > On Mon, Mar 25, 2019 at 8:44 PM Andy Lutomirski <luto@kernel.org> wrote:
> > >
> > > > > One ioctl on procfs roots to translate pidfds into that procfs,
> > > > > subject to both the normal lookup permission checks and only working
> > > > > if the pidfd has a translation into the procfs:
> > > > >
> > > > > int proc_root_fd = open("/proc", O_RDONLY);
> > > > > int proc_dir_fd = ioctl(proc_root_fd, PROC_PIDFD_TO_PROCFSFD, pidfd);
> > > > >
> > > > > And one ioctl on procfs directories to translate from PGIDs and PIDs to pidfds:
> > > > >
> > > > > int proc_pgid_fd = open("/proc/self", O_RDONLY);
> > > > > int self_pg_pidfd = ioctl(proc_pgid_fd, PROC_PROCFSFD_TO_PIDFD, 0);
> > > > > int proc_pid_fd = open("/proc/thread-self", O_RDONLY);
> > > > > int self_p_pidfd = ioctl(proc_pid_fd, PROC_PROCFSFD_TO_PIDFD, 0);
> > > > >
> > >
> > > This sounds okay to me.  Or we could make it so that a procfs
> > > directory fd also works as a pidfd, but that seems more likely to be
> > > problematic than just allowing two-way translation like this
> > >
> > > > >
> > > > > And then, as you proposed, the new sys_clone() can just return a
> > > > > pidfd, and you can convert it into a procfs fd yourself if you want.
> > > >
> > > > I think that's the consensus we reached on the other thread. The
> > > > O_DIRECTORY open on /proc/self/fd/mypidfd seems like it'd work well
> > > > enough.
> > >
> > > I must have missed this particular email.
> > >
> > > IMO, if /proc/self/fd/mypidfd allows O_DIRECTORY open to work, then it
> > > really ought to do function just like /proc/self/fd/mypidfd/. and
> > > /proc/self/fd/mypidfd/status should work.  And these latter two
> > > options seem nutty.
> > >
> > > Also, this O_DIRECTORY thing is missing the entire point of the ioctl
> > > interface -- it doesn't require procfs access.
> >
> > The other option was to encode the pid in the callers pid namespace into
> > the pidfd's fdinfo so that you can parse it out and open /proc/<pid>.
> > You'd just need an event on the pidfd to tell you when the process has
> > died. Jonathan and I just discussed this.
> 
> From an application developer's POV, the ioctl interface sounds much,
> much nicer.

Some people are strongly against ioctl()s, some aren't. I'm not against
them so both options are fine with me if people can agree.

Christian

^ permalink raw reply	[flat|nested] 113+ messages in thread

* Re: pidfd design
  2019-03-26  0:12                                                                                           ` Christian Brauner
@ 2019-03-26  0:24                                                                                             ` Andy Lutomirski
  2019-03-28  9:21                                                                                               ` Christian Brauner
  0 siblings, 1 reply; 113+ messages in thread
From: Andy Lutomirski @ 2019-03-26  0:24 UTC (permalink / raw)
  To: Christian Brauner
  Cc: Andy Lutomirski, Daniel Colascione, Jann Horn, Joel Fernandes,
	Suren Baghdasaryan, Steven Rostedt, Sultan Alsawaf, Tim Murray,
	Michal Hocko, Greg Kroah-Hartman, Arve Hjønnevåg,
	Todd Kjos, Martijn Coenen, Ingo Molnar, Peter Zijlstra, LKML,
	open list:ANDROID DRIVERS, kernel-team, Oleg Nesterov,
	Serge E. Hallyn, Kees Cook, Jonathan Kowalski, Linux API

On Mon, Mar 25, 2019 at 5:12 PM Christian Brauner <christian@brauner.io> wrote:
>
> On Mon, Mar 25, 2019 at 05:00:17PM -0700, Andy Lutomirski wrote:
> > On Mon, Mar 25, 2019 at 4:45 PM Christian Brauner <christian@brauner.io> wrote:
> > >
> > > On Mon, Mar 25, 2019 at 04:42:14PM -0700, Andy Lutomirski wrote:
> > > > On Mon, Mar 25, 2019 at 1:23 PM Daniel Colascione <dancol@google.com> wrote:
> > > > >
> > > > > On Mon, Mar 25, 2019 at 1:14 PM Jann Horn <jannh@google.com> wrote:
> > > > > >
> > > > > > On Mon, Mar 25, 2019 at 8:44 PM Andy Lutomirski <luto@kernel.org> wrote:
> > > >
> > > > > > One ioctl on procfs roots to translate pidfds into that procfs,
> > > > > > subject to both the normal lookup permission checks and only working
> > > > > > if the pidfd has a translation into the procfs:
> > > > > >
> > > > > > int proc_root_fd = open("/proc", O_RDONLY);
> > > > > > int proc_dir_fd = ioctl(proc_root_fd, PROC_PIDFD_TO_PROCFSFD, pidfd);
> > > > > >
> > > > > > And one ioctl on procfs directories to translate from PGIDs and PIDs to pidfds:
> > > > > >
> > > > > > int proc_pgid_fd = open("/proc/self", O_RDONLY);
> > > > > > int self_pg_pidfd = ioctl(proc_pgid_fd, PROC_PROCFSFD_TO_PIDFD, 0);
> > > > > > int proc_pid_fd = open("/proc/thread-self", O_RDONLY);
> > > > > > int self_p_pidfd = ioctl(proc_pid_fd, PROC_PROCFSFD_TO_PIDFD, 0);
> > > > > >
> > > >
> > > > This sounds okay to me.  Or we could make it so that a procfs
> > > > directory fd also works as a pidfd, but that seems more likely to be
> > > > problematic than just allowing two-way translation like this
> > > >
> > > > > >
> > > > > > And then, as you proposed, the new sys_clone() can just return a
> > > > > > pidfd, and you can convert it into a procfs fd yourself if you want.
> > > > >
> > > > > I think that's the consensus we reached on the other thread. The
> > > > > O_DIRECTORY open on /proc/self/fd/mypidfd seems like it'd work well
> > > > > enough.
> > > >
> > > > I must have missed this particular email.
> > > >
> > > > IMO, if /proc/self/fd/mypidfd allows O_DIRECTORY open to work, then it
> > > > really ought to do function just like /proc/self/fd/mypidfd/. and
> > > > /proc/self/fd/mypidfd/status should work.  And these latter two
> > > > options seem nutty.
> > > >
> > > > Also, this O_DIRECTORY thing is missing the entire point of the ioctl
> > > > interface -- it doesn't require procfs access.
> > >
> > > The other option was to encode the pid in the callers pid namespace into
> > > the pidfd's fdinfo so that you can parse it out and open /proc/<pid>.
> > > You'd just need an event on the pidfd to tell you when the process has
> > > died. Jonathan and I just discussed this.
> >
> > From an application developer's POV, the ioctl interface sounds much,
> > much nicer.
>
> Some people are strongly against ioctl()s some don't. I'm not against
> them so both options are fine with me if people can agree.
>

There are certainly non-ioctl alternatives that are functionally
equivalent.  For example, there could be a syscall
procfs_open_pidfd(procfs_fd, pid_fd).  I personally don't really mind
ioctl() when it's really an operation on an fd.

^ permalink raw reply	[flat|nested] 113+ messages in thread

* Re: pidfd design
  2019-03-26  0:24                                                                                             ` Andy Lutomirski
@ 2019-03-28  9:21                                                                                               ` Christian Brauner
  0 siblings, 0 replies; 113+ messages in thread
From: Christian Brauner @ 2019-03-28  9:21 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Daniel Colascione, Jann Horn, Joel Fernandes, Suren Baghdasaryan,
	Steven Rostedt, Sultan Alsawaf, Tim Murray, Michal Hocko,
	Greg Kroah-Hartman, Arve Hjønnevåg, Todd Kjos,
	Martijn Coenen, Ingo Molnar, Peter Zijlstra, LKML,
	open list:ANDROID DRIVERS, kernel-team, Oleg Nesterov,
	Serge E. Hallyn, Kees Cook, Jonathan Kowalski, Linux API

On Mon, Mar 25, 2019 at 05:24:49PM -0700, Andy Lutomirski wrote:
> On Mon, Mar 25, 2019 at 5:12 PM Christian Brauner <christian@brauner.io> wrote:
> >
> > On Mon, Mar 25, 2019 at 05:00:17PM -0700, Andy Lutomirski wrote:
> > > On Mon, Mar 25, 2019 at 4:45 PM Christian Brauner <christian@brauner.io> wrote:
> > > >
> > > > On Mon, Mar 25, 2019 at 04:42:14PM -0700, Andy Lutomirski wrote:
> > > > > On Mon, Mar 25, 2019 at 1:23 PM Daniel Colascione <dancol@google.com> wrote:
> > > > > >
> > > > > > On Mon, Mar 25, 2019 at 1:14 PM Jann Horn <jannh@google.com> wrote:
> > > > > > >
> > > > > > > On Mon, Mar 25, 2019 at 8:44 PM Andy Lutomirski <luto@kernel.org> wrote:
> > > > >
> > > > > > > One ioctl on procfs roots to translate pidfds into that procfs,
> > > > > > > subject to both the normal lookup permission checks and only working
> > > > > > > if the pidfd has a translation into the procfs:
> > > > > > >
> > > > > > > int proc_root_fd = open("/proc", O_RDONLY);
> > > > > > > int proc_dir_fd = ioctl(proc_root_fd, PROC_PIDFD_TO_PROCFSFD, pidfd);
> > > > > > >
> > > > > > > And one ioctl on procfs directories to translate from PGIDs and PIDs to pidfds:
> > > > > > >
> > > > > > > int proc_pgid_fd = open("/proc/self", O_RDONLY);
> > > > > > > int self_pg_pidfd = ioctl(proc_pgid_fd, PROC_PROCFSFD_TO_PIDFD, 0);
> > > > > > > int proc_pid_fd = open("/proc/thread-self", O_RDONLY);
> > > > > > > int self_p_pidfd = ioctl(proc_pid_fd, PROC_PROCFSFD_TO_PIDFD, 0);
> > > > > > >
> > > > >
> > > > > This sounds okay to me.  Or we could make it so that a procfs
> > > > > directory fd also works as a pidfd, but that seems more likely to be
> > > > > problematic than just allowing two-way translation like this
> > > > >
> > > > > > >
> > > > > > > And then, as you proposed, the new sys_clone() can just return a
> > > > > > > pidfd, and you can convert it into a procfs fd yourself if you want.
> > > > > >
> > > > > > I think that's the consensus we reached on the other thread. The
> > > > > > O_DIRECTORY open on /proc/self/fd/mypidfd seems like it'd work well
> > > > > > enough.
> > > > >
> > > > > I must have missed this particular email.
> > > > >
> > > > > IMO, if /proc/self/fd/mypidfd allows O_DIRECTORY open to work, then it
> > > > > really ought to do function just like /proc/self/fd/mypidfd/. and
> > > > > /proc/self/fd/mypidfd/status should work.  And these latter two
> > > > > options seem nutty.
> > > > >
> > > > > Also, this O_DIRECTORY thing is missing the entire point of the ioctl
> > > > > interface -- it doesn't require procfs access.
> > > >
> > > > The other option was to encode the pid in the callers pid namespace into
> > > > the pidfd's fdinfo so that you can parse it out and open /proc/<pid>.
> > > > You'd just need an event on the pidfd to tell you when the process has
> > > > died. Jonathan and I just discussed this.
> > >
> > > From an application developer's POV, the ioctl interface sounds much,
> > > much nicer.
> >
> > Some people are strongly against ioctl()s some don't. I'm not against
> > them so both options are fine with me if people can agree.
> >
> 
> There are certainly non-ioctl equivalents that are functionally
> equivalent.  For example, there could be a syscall
> procfs_open_pidfd(procfs_fd, pid_fd).  I personally don't really mind
> ioctl() when it's really an operation on an fd.

I totally missed that mail somehow.
Yes, I agree that an ioctl() makes sense for that.

^ permalink raw reply	[flat|nested] 113+ messages in thread

* Re: [RFC] simple_lmk: Introduce Simple Low Memory Killer for Android
  2019-03-20  1:52                                                         ` Joel Fernandes
  2019-03-20  2:42                                                           ` pidfd design Daniel Colascione
@ 2019-05-07  2:16                                                           ` Sultan Alsawaf
  2019-05-07  7:04                                                             ` Greg Kroah-Hartman
                                                                               ` (2 more replies)
  1 sibling, 3 replies; 113+ messages in thread
From: Sultan Alsawaf @ 2019-05-07  2:16 UTC (permalink / raw)
  Cc: Christian Brauner, Daniel Colascione, Suren Baghdasaryan,
	Steven Rostedt, Tim Murray, Michal Hocko, Greg Kroah-Hartman,
	Arve Hjønnevåg, Todd Kjos, Martijn Coenen, Ingo Molnar,
	Peter Zijlstra, LKML, open list:ANDROID DRIVERS, linux-mm,
	kernel-team, Oleg Nesterov, Andy Lutomirski, Serge E. Hallyn,
	Kees Cook, Joel Fernandes

This is a complete low memory killer solution for Android that is small
and simple. Processes are killed according to the priorities that
Android gives them, so that the least important processes are always
killed first. Processes are killed until memory deficits are satisfied,
as observed from kswapd struggling to free up pages. Simple LMK stops
killing processes when kswapd finally goes back to sleep.

The only tunables are the desired amount of memory to be freed per
reclaim event and desired frequency of reclaim events. Simple LMK tries
to free at least the desired amount of memory per reclaim and waits
until all of its victims' memory is freed before proceeding to kill more
processes.

Signed-off-by: Sultan Alsawaf <sultan@kerneltoast.com>
---
Hello everyone,

I've addressed some of the concerns that were brought up with the first version
of the Simple LMK patch. I understand that a kernel-based solution like this
that contains policy decisions for a specific userspace is not the way to go,
but the Android ecosystem still has a pressing need for a low memory killer that
works well.

Most Android devices still use the ancient and deprecated lowmemorykiller.c
kernel driver; Simple LMK seeks to replace that, at the very least until PSI and
a userspace daemon utilizing PSI are ready for *all* Android devices, and not
just the privileged Pixel phone line.

The feedback I received last time was quite helpful. This Simple LMK patch works
significantly better than the first, improving memory management by a large
margin while being immune to transient spikes in memory usage (since the signal
to start killing processes is determined by how hard kswapd tries to free up
pages, which is something that occurs over a span of time and not a single point
in time).

I'd love to hear some feedback on this new patch. I do encourage those who are
interested to take it for a spin on an Android device. This patch has been
tested successfully on Android 4.4 and 4.9 kernels. For the sake of review here,
I have adapted the patch to 5.1.

Thanks,
Sultan

 drivers/android/Kconfig      |  33 ++++
 drivers/android/Makefile     |   1 +
 drivers/android/simple_lmk.c | 315 +++++++++++++++++++++++++++++++++++
 include/linux/mm_types.h     |   4 +
 include/linux/simple_lmk.h   |  11 ++
 kernel/fork.c                |  13 ++
 mm/vmscan.c                  |  12 ++
 7 files changed, 389 insertions(+)
 create mode 100644 drivers/android/simple_lmk.c
 create mode 100644 include/linux/simple_lmk.h

diff --git a/drivers/android/Kconfig b/drivers/android/Kconfig
index 6fdf2abe4..bdd429338 100644
--- a/drivers/android/Kconfig
+++ b/drivers/android/Kconfig
@@ -54,6 +54,39 @@ config ANDROID_BINDER_IPC_SELFTEST
 	  exhaustively with combinations of various buffer sizes and
 	  alignments.
 
+config ANDROID_SIMPLE_LMK
+	bool "Simple Android Low Memory Killer"
+	depends on !ANDROID_LOW_MEMORY_KILLER && !MEMCG
+	---help---
+	  This is a complete low memory killer solution for Android that is
+	  small and simple. Processes are killed according to the priorities
+	  that Android gives them, so that the least important processes are
+	  always killed first. Processes are killed until memory deficits are
+	  satisfied, as observed from kswapd struggling to free up pages. Simple
+	  LMK stops killing processes when kswapd finally goes back to sleep.
+
+if ANDROID_SIMPLE_LMK
+
+config ANDROID_SIMPLE_LMK_AGGRESSION
+	int "Reclaim frequency selection"
+	range 1 3
+	default 1
+	help
+	  This value determines how frequently Simple LMK will perform memory
+	  reclaims. A lower value corresponds to less frequent reclaims, which
+	  maximizes memory usage. The range of values has an exponential
+	  correlation; 2 is twice as aggressive as 1, and 3 is twice as
+	  aggressive as 2, which makes 3 four times as aggressive as 1.
+
+config ANDROID_SIMPLE_LMK_MINFREE
+	int "Minimum MiB of memory to free per reclaim"
+	range 8 512
+	default 64
+	help
+	  Simple LMK will try to free at least this much memory per reclaim.
+
+endif
+
 endif # if ANDROID
 
 endmenu
diff --git a/drivers/android/Makefile b/drivers/android/Makefile
index c7856e320..7c91293b6 100644
--- a/drivers/android/Makefile
+++ b/drivers/android/Makefile
@@ -3,3 +3,4 @@ ccflags-y += -I$(src)			# needed for trace events
 obj-$(CONFIG_ANDROID_BINDERFS)		+= binderfs.o
 obj-$(CONFIG_ANDROID_BINDER_IPC)	+= binder.o binder_alloc.o
 obj-$(CONFIG_ANDROID_BINDER_IPC_SELFTEST) += binder_alloc_selftest.o
+obj-$(CONFIG_ANDROID_SIMPLE_LMK)	+= simple_lmk.o
diff --git a/drivers/android/simple_lmk.c b/drivers/android/simple_lmk.c
new file mode 100644
index 000000000..a2ffb57b5
--- /dev/null
+++ b/drivers/android/simple_lmk.c
@@ -0,0 +1,315 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2019 Sultan Alsawaf <sultan@kerneltoast.com>.
+ */
+
+#define pr_fmt(fmt) "simple_lmk: " fmt
+
+#include <linux/delay.h>
+#include <linux/kthread.h>
+#include <linux/mm.h>
+#include <linux/moduleparam.h>
+#include <linux/oom.h>
+#include <linux/sort.h>
+#include <linux/version.h>
+
+/* The sched_param struct is located elsewhere in newer kernels */
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 10, 0)
+#include <uapi/linux/sched/types.h>
+#endif
+
+/* SEND_SIG_FORCED isn't present in newer kernels */
+#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 19, 0)
+#define SIG_INFO_TYPE SEND_SIG_FORCED
+#else
+#define SIG_INFO_TYPE SEND_SIG_PRIV
+#endif
+
+/* The minimum number of pages to free per reclaim */
+#define MIN_FREE_PAGES (CONFIG_ANDROID_SIMPLE_LMK_MINFREE * SZ_1M / PAGE_SIZE)
+
+/* Kill up to this many victims per reclaim. This is limited by stack size. */
+#define MAX_VICTIMS 64
+
+struct victim_info {
+	struct task_struct *tsk;
+	unsigned long size;
+};
+
+/* Pulled from the Android framework. Lower ADJ means higher priority. */
+static const short int adj_prio[] = {
+	906, /* CACHED_APP_MAX_ADJ */
+	905, /* Cached app */
+	904, /* Cached app */
+	903, /* Cached app */
+	902, /* Cached app */
+	901, /* Cached app */
+	900, /* CACHED_APP_MIN_ADJ */
+	800, /* SERVICE_B_ADJ */
+	700, /* PREVIOUS_APP_ADJ */
+	600, /* HOME_APP_ADJ */
+	500, /* SERVICE_ADJ */
+	400, /* HEAVY_WEIGHT_APP_ADJ */
+	300, /* BACKUP_APP_ADJ */
+	200, /* PERCEPTIBLE_APP_ADJ */
+	100, /* VISIBLE_APP_ADJ */
+	0    /* FOREGROUND_APP_ADJ */
+};
+
+static DECLARE_WAIT_QUEUE_HEAD(oom_waitq);
+static bool needs_reclaim;
+
+static int victim_info_cmp(const void *lhs_ptr, const void *rhs_ptr)
+{
+	const struct victim_info *lhs = (typeof(lhs))lhs_ptr;
+	const struct victim_info *rhs = (typeof(rhs))rhs_ptr;
+
+	return rhs->size - lhs->size;
+}
+
+static bool mm_is_duplicate(struct victim_info *varr, int vlen,
+			    struct mm_struct *mm)
+{
+	int i;
+
+	for (i = 0; i < vlen; i++) {
+		if (varr[i].tsk->mm == mm)
+			return true;
+	}
+
+	return false;
+}
+
+static bool vtsk_is_duplicate(struct victim_info *varr, int vlen,
+			      struct task_struct *vtsk)
+{
+	int i;
+
+	for (i = 0; i < vlen; i++) {
+		if (same_thread_group(varr[i].tsk, vtsk))
+			return true;
+	}
+
+	return false;
+}
+
+static unsigned long find_victims(struct victim_info *varr, int *vindex,
+				  int vmaxlen, int min_adj, int max_adj)
+{
+	unsigned long pages_found = 0;
+	int old_vindex = *vindex;
+	struct task_struct *tsk;
+
+	for_each_process(tsk) {
+		struct task_struct *vtsk;
+		unsigned long tasksize;
+		short oom_score_adj;
+
+		/* Make sure there's space left in the victim array */
+		if (*vindex == vmaxlen)
+			break;
+
+		/* Don't kill current, kthreads, init, or duplicates */
+		if (same_thread_group(tsk, current) ||
+		    tsk->flags & PF_KTHREAD ||
+		    is_global_init(tsk) ||
+		    vtsk_is_duplicate(varr, *vindex, tsk))
+			continue;
+
+		vtsk = find_lock_task_mm(tsk);
+		if (!vtsk)
+			continue;
+
+		/* Skip tasks that lack memory or have a redundant mm */
+		if (test_tsk_thread_flag(vtsk, TIF_MEMDIE) ||
+		    mm_is_duplicate(varr, *vindex, vtsk->mm))
+			goto unlock_mm;
+
+		/* Check the task's importance (adj) to see if it's in range */
+		oom_score_adj = vtsk->signal->oom_score_adj;
+		if (oom_score_adj < min_adj || oom_score_adj > max_adj)
+			goto unlock_mm;
+
+		/* Get the total number of physical pages in use by the task */
+		tasksize = get_mm_rss(vtsk->mm);
+		if (!tasksize)
+			goto unlock_mm;
+
+		/* Store this potential victim away for later */
+		varr[*vindex].tsk = vtsk;
+		varr[*vindex].size = tasksize;
+		(*vindex)++;
+
+		/* Keep track of the number of pages that have been found */
+		pages_found += tasksize;
+		continue;
+
+unlock_mm:
+		task_unlock(vtsk);
+	}
+
+	/*
+	 * Sort the victims in descending order of size to prioritize killing
+	 * the larger ones first.
+	 */
+	if (pages_found)
+		sort(&varr[old_vindex], *vindex - old_vindex, sizeof(*varr),
+		     victim_info_cmp, NULL);
+
+	return pages_found;
+}
+
+static void scan_and_kill(unsigned long pages_needed)
+{
+	static DECLARE_WAIT_QUEUE_HEAD(victim_waitq);
+	struct victim_info victims[MAX_VICTIMS];
+	int i, nr_to_kill = 0, nr_victims = 0;
+	unsigned long pages_found = 0;
+	atomic_t victim_count;
+
+	/*
+	 * Hold the tasklist lock so tasks don't disappear while scanning. This
+	 * is preferred to holding an RCU read lock so that the list of tasks
+	 * is guaranteed to be up to date. Keep preemption disabled until the
+	 * SIGKILLs are sent so the victim kill process isn't interrupted.
+	 */
+	read_lock(&tasklist_lock);
+	preempt_disable();
+	for (i = 1; i < ARRAY_SIZE(adj_prio); i++) {
+		pages_found += find_victims(victims, &nr_victims, MAX_VICTIMS,
+					    adj_prio[i], adj_prio[i - 1]);
+		if (pages_found >= pages_needed || nr_victims == MAX_VICTIMS)
+			break;
+	}
+
+	/*
+	 * Calculate the number of tasks that need to be killed and quickly
+	 * release the references to those that'll live.
+	 */
+	for (i = 0, pages_found = 0; i < nr_victims; i++) {
+		struct victim_info *victim = &victims[i];
+		struct task_struct *vtsk = victim->tsk;
+
+		/* The victims' mm lock is taken in find_victims; release it */
+		if (pages_found >= pages_needed) {
+			task_unlock(vtsk);
+			continue;
+		}
+
+		/*
+		 * Grab a reference to the victim so it doesn't disappear after
+		 * the tasklist lock is released.
+		 */
+		get_task_struct(vtsk);
+		pages_found += victim->size;
+		nr_to_kill++;
+	}
+	read_unlock(&tasklist_lock);
+
+	/* Kill the victims */
+	victim_count = (atomic_t)ATOMIC_INIT(nr_to_kill);
+	for (i = 0; i < nr_to_kill; i++) {
+		struct victim_info *victim = &victims[i];
+		struct task_struct *vtsk = victim->tsk;
+
+		pr_info("Killing %s with adj %d to free %lu kiB\n", vtsk->comm,
+			vtsk->signal->oom_score_adj,
+			victim->size << (PAGE_SHIFT - 10));
+
+		/* Configure the victim's mm to notify us when it's freed */
+		vtsk->mm->slmk_waitq = &victim_waitq;
+		vtsk->mm->slmk_counter = &victim_count;
+
+		/* Accelerate the victim's death by forcing the kill signal */
+		do_send_sig_info(SIGKILL, SIG_INFO_TYPE, vtsk, true);
+
+		/* Finally release the victim's mm lock */
+		task_unlock(vtsk);
+	}
+	preempt_enable_no_resched();
+
+	/* Try to speed up the death process now that we can schedule again */
+	for (i = 0; i < nr_to_kill; i++) {
+		struct task_struct *vtsk = victims[i].tsk;
+
+		/* Increase the victim's priority to make it die faster */
+		set_user_nice(vtsk, MIN_NICE);
+
+		/* Allow the victim to run on any CPU */
+		set_cpus_allowed_ptr(vtsk, cpu_all_mask);
+
+		/* Finally release the victim reference acquired earlier */
+		put_task_struct(vtsk);
+	}
+
+	/* Wait until all the victims die */
+	wait_event(victim_waitq, !atomic_read(&victim_count));
+}
+
+static int simple_lmk_reclaim_thread(void *data)
+{
+	static const struct sched_param sched_max_rt_prio = {
+		.sched_priority = MAX_RT_PRIO - 1
+	};
+
+	sched_setscheduler_nocheck(current, SCHED_FIFO, &sched_max_rt_prio);
+
+	while (1) {
+		bool should_stop;
+
+		wait_event(oom_waitq, (should_stop = kthread_should_stop()) ||
+				      READ_ONCE(needs_reclaim));
+
+		if (should_stop)
+			break;
+
+		/*
+		 * Kill a batch of processes and wait for their memory to be
+		 * freed. After their memory is freed, sleep for 20 ms to give
+		 * OOM'd allocations a chance to scavenge for the newly-freed
+		 * pages. Rinse and repeat while there are still OOM'd
+		 * allocations.
+		 */
+		do {
+			scan_and_kill(MIN_FREE_PAGES);
+			msleep(20);
+		} while (READ_ONCE(needs_reclaim));
+	}
+
+	return 0;
+}
+
+void simple_lmk_start_reclaim(void)
+{
+	WRITE_ONCE(needs_reclaim, true);
+	wake_up(&oom_waitq);
+}
+
+void simple_lmk_stop_reclaim(void)
+{
+	WRITE_ONCE(needs_reclaim, false);
+}
+
+/* Initialize Simple LMK when lmkd in Android writes to the minfree parameter */
+static int simple_lmk_init_set(const char *val, const struct kernel_param *kp)
+{
+	static atomic_t init_done = ATOMIC_INIT(0);
+	struct task_struct *thread;
+
+	if (atomic_cmpxchg(&init_done, 0, 1))
+		return 0;
+
+	thread = kthread_run(simple_lmk_reclaim_thread, NULL, "simple_lmkd");
+	BUG_ON(IS_ERR(thread));
+
+	return 0;
+}
+
+static const struct kernel_param_ops simple_lmk_init_ops = {
+	.set = simple_lmk_init_set
+};
+
+/* Needed to prevent Android from thinking there's no LMK and thus rebooting */
+#undef MODULE_PARAM_PREFIX
+#define MODULE_PARAM_PREFIX "lowmemorykiller."
+module_param_cb(minfree, &simple_lmk_init_ops, NULL, 0200);
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 4ef4bbe78..a02852d6d 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -501,6 +501,10 @@ struct mm_struct {
 #if IS_ENABLED(CONFIG_HMM)
 		/* HMM needs to track a few things per mm */
 		struct hmm *hmm;
+#endif
+#ifdef CONFIG_ANDROID_SIMPLE_LMK
+		wait_queue_head_t *slmk_waitq;
+		atomic_t *slmk_counter;
 #endif
 	} __randomize_layout;
 
diff --git a/include/linux/simple_lmk.h b/include/linux/simple_lmk.h
new file mode 100644
index 000000000..e2cd56f1f
--- /dev/null
+++ b/include/linux/simple_lmk.h
@@ -0,0 +1,11 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2019 Sultan Alsawaf <sultan@kerneltoast.com>.
+ */
+#ifndef _SIMPLE_LMK_H_
+#define _SIMPLE_LMK_H_
+
+void simple_lmk_start_reclaim(void);
+void simple_lmk_stop_reclaim(void);
+
+#endif /* _SIMPLE_LMK_H_ */
diff --git a/kernel/fork.c b/kernel/fork.c
index 9dcd18aa2..f41bef5fe 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -995,6 +995,9 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
 	mm->pmd_huge_pte = NULL;
 #endif
 	mm_init_uprobes_state(mm);
+#ifdef CONFIG_ANDROID_SIMPLE_LMK
+	mm->slmk_waitq = NULL;
+#endif
 
 	if (current->mm) {
 		mm->flags = current->mm->flags & MMF_INIT_MASK;
@@ -1037,6 +1040,10 @@ struct mm_struct *mm_alloc(void)
 
 static inline void __mmput(struct mm_struct *mm)
 {
+#ifdef CONFIG_ANDROID_SIMPLE_LMK
+	wait_queue_head_t *slmk_waitq = mm->slmk_waitq;
+	atomic_t *slmk_counter = mm->slmk_counter;
+#endif
 	VM_BUG_ON(atomic_read(&mm->mm_users));
 
 	uprobe_clear_state(mm);
@@ -1054,6 +1061,12 @@ static inline void __mmput(struct mm_struct *mm)
 	if (mm->binfmt)
 		module_put(mm->binfmt->module);
 	mmdrop(mm);
+#ifdef CONFIG_ANDROID_SIMPLE_LMK
+	if (slmk_waitq) {
+		atomic_dec(slmk_counter);
+		wake_up(slmk_waitq);
+	}
+#endif
 }
 
 /*
diff --git a/mm/vmscan.c b/mm/vmscan.c
index a815f73ee..f4fb91b53 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -51,6 +51,7 @@
 #include <linux/printk.h>
 #include <linux/dax.h>
 #include <linux/psi.h>
+#include <linux/simple_lmk.h>
 
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
@@ -3541,6 +3542,11 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
 		bool balanced;
 		bool ret;
 
+#ifdef CONFIG_ANDROID_SIMPLE_LMK
+		if (sc.priority == CONFIG_ANDROID_SIMPLE_LMK_AGGRESSION)
+			simple_lmk_start_reclaim();
+#endif
+
 		sc.reclaim_idx = classzone_idx;
 
 		/*
@@ -3737,6 +3743,9 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o
 	 * succeed.
 	 */
 	if (prepare_kswapd_sleep(pgdat, reclaim_order, classzone_idx)) {
+#ifdef CONFIG_ANDROID_SIMPLE_LMK
+		simple_lmk_stop_reclaim();
+#endif
 		/*
 		 * Compaction records what page blocks it recently failed to
 		 * isolate pages from and skips them in the future scanning.
@@ -3773,6 +3782,9 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o
 	 */
 	if (!remaining &&
 	    prepare_kswapd_sleep(pgdat, reclaim_order, classzone_idx)) {
+#ifdef CONFIG_ANDROID_SIMPLE_LMK
+		simple_lmk_stop_reclaim();
+#endif
 		trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
 
 		/*
-- 
2.21.0


^ permalink raw reply related	[flat|nested] 113+ messages in thread

* Re: [RFC] simple_lmk: Introduce Simple Low Memory Killer for Android
  2019-05-07  2:16                                                           ` [RFC] simple_lmk: Introduce Simple Low Memory Killer for Android Sultan Alsawaf
@ 2019-05-07  7:04                                                             ` Greg Kroah-Hartman
  2019-05-07  7:27                                                               ` Sultan Alsawaf
  2019-05-07 12:26                                                             ` Michal Hocko
  2019-05-07 15:31                                                             ` Oleg Nesterov
  2 siblings, 1 reply; 113+ messages in thread
From: Greg Kroah-Hartman @ 2019-05-07  7:04 UTC (permalink / raw)
  To: Sultan Alsawaf
  Cc: open list:ANDROID DRIVERS, Daniel Colascione, kernel-team,
	Todd Kjos, Kees Cook, Peter Zijlstra, LKML, Tim Murray,
	Michal Hocko, linux-mm, Arve Hjønnevåg, Ingo Molnar,
	Martijn Coenen, Steven Rostedt, Oleg Nesterov, Joel Fernandes,
	Andy Lutomirski, Suren Baghdasaryan, Christian Brauner

On Mon, May 06, 2019 at 07:16:22PM -0700, Sultan Alsawaf wrote:
> This is a complete low memory killer solution for Android that is small
> and simple. Processes are killed according to the priorities that
> Android gives them, so that the least important processes are always
> killed first. Processes are killed until memory deficits are satisfied,
> as observed from kswapd struggling to free up pages. Simple LMK stops
> killing processes when kswapd finally goes back to sleep.
> 
> The only tunables are the desired amount of memory to be freed per
> reclaim event and desired frequency of reclaim events. Simple LMK tries
> to free at least the desired amount of memory per reclaim and waits
> until all of its victims' memory is freed before proceeding to kill more
> processes.
> 
> Signed-off-by: Sultan Alsawaf <sultan@kerneltoast.com>
> ---
> Hello everyone,
> 
> I've addressed some of the concerns that were brought up with the first version
> of the Simple LMK patch. I understand that a kernel-based solution like this
> that contains policy decisions for a specific userspace is not the way to go,
> but the Android ecosystem still has a pressing need for a low memory killer that
> works well.
> 
> Most Android devices still use the ancient and deprecated lowmemorykiller.c
> kernel driver; Simple LMK seeks to replace that, at the very least until PSI and
> a userspace daemon utilizing PSI are ready for *all* Android devices, and not
> just the privileged Pixel phone line.

Um, why can't "all" Android devices take the same patches that the Pixel
phones are using today?  They should all be in the public android-common
kernel repositories that all Android devices should be syncing with on a
weekly/monthly basis anyway, right?

thanks,

greg k-h

^ permalink raw reply	[flat|nested] 113+ messages in thread

* Re: [RFC] simple_lmk: Introduce Simple Low Memory Killer for Android
  2019-05-07  7:04                                                             ` Greg Kroah-Hartman
@ 2019-05-07  7:27                                                               ` Sultan Alsawaf
  2019-05-07  7:43                                                                 ` Greg Kroah-Hartman
  0 siblings, 1 reply; 113+ messages in thread
From: Sultan Alsawaf @ 2019-05-07  7:27 UTC (permalink / raw)
  To: Greg Kroah-Hartman
  Cc: open list:ANDROID DRIVERS, Daniel Colascione, kernel-team,
	Todd Kjos, Kees Cook, Peter Zijlstra, LKML, Tim Murray,
	Michal Hocko, linux-mm, Arve Hjønnevåg, Ingo Molnar,
	Martijn Coenen, Steven Rostedt, Oleg Nesterov, Joel Fernandes,
	Andy Lutomirski, Suren Baghdasaryan, Christian Brauner

On Tue, May 07, 2019 at 09:04:30AM +0200, Greg Kroah-Hartman wrote:
> Um, why can't "all" Android devices take the same patches that the Pixel
> phones are using today?  They should all be in the public android-common
> kernel repositories that all Android devices should be syncing with on a
> weekly/monthly basis anyway, right?
> 
> thanks,
> 
> greg k-h

Hi Greg,

I only see PSI present in the android-common kernels for 4.9 and above. The vast
majority of Android devices do not run a 4.9+ kernel. It seems unreasonable to
expect OEMs to toil with backporting PSI themselves to get decent memory
management.

But even if they did backport PSI, it wouldn't help too much because a
PSI-enabled LMKD solution is not ready yet. It looks like a PSI-based LMKD is
still under heavy development and won't be ready for all Android devices for
quite some time.

Additionally, it looks like the supposedly-dead lowmemorykiller.c is still being
actively tweaked by Google [1], which does not instill confidence that a
definitive LMK solution a la PSI is coming any time soon.

Thanks,
Sultan

[1] https://android.googlesource.com/kernel/common/+/152bacdd85c46f0c76b00c4acc253e414513634c

^ permalink raw reply	[flat|nested] 113+ messages in thread

* Re: [RFC] simple_lmk: Introduce Simple Low Memory Killer for Android
  2019-05-07  7:27                                                               ` Sultan Alsawaf
@ 2019-05-07  7:43                                                                 ` Greg Kroah-Hartman
  2019-05-07  8:12                                                                   ` Sultan Alsawaf
  0 siblings, 1 reply; 113+ messages in thread
From: Greg Kroah-Hartman @ 2019-05-07  7:43 UTC (permalink / raw)
  To: Sultan Alsawaf
  Cc: open list:ANDROID DRIVERS, Daniel Colascione, Todd Kjos,
	Kees Cook, Peter Zijlstra, Martijn Coenen, LKML, Tim Murray,
	Michal Hocko, Suren Baghdasaryan, linux-mm,
	Arve Hjønnevåg, Ingo Molnar, Steven Rostedt,
	Oleg Nesterov, Joel Fernandes, Andy Lutomirski, kernel-team,
	Christian Brauner

On Tue, May 07, 2019 at 12:27:21AM -0700, Sultan Alsawaf wrote:
> On Tue, May 07, 2019 at 09:04:30AM +0200, Greg Kroah-Hartman wrote:
> > Um, why can't "all" Android devices take the same patches that the Pixel
> > phones are using today?  They should all be in the public android-common
> > kernel repositories that all Android devices should be syncing with on a
> > weekly/monthly basis anyway, right?
> > 
> > thanks,
> > 
> > greg k-h
> 
> Hi Greg,
> 
> I only see PSI present in the android-common kernels for 4.9 and above. The vast
> majority of Android devices do not run a 4.9+ kernel. It seems unreasonable to
> expect OEMs to toil with backporting PSI themselves to get decent memory
> management.

Given that any "new" android device that gets shipped "soon" should be
using 4.9.y or newer, is this a real issue?

And if it is, I'm sure that asking for those patches to be backported to
4.4.y would be just fine, have you asked?

Note that I know of Android Go devices, running 3.18.y kernels, do NOT
use the in-kernel memory killer, but instead use the userspace solution
today.  So trying to get another in-kernel memory killer solution added
anywhere seems quite odd.

thanks,

greg k-h

^ permalink raw reply	[flat|nested] 113+ messages in thread

* Re: [RFC] simple_lmk: Introduce Simple Low Memory Killer for Android
  2019-05-07  7:43                                                                 ` Greg Kroah-Hartman
@ 2019-05-07  8:12                                                                   ` Sultan Alsawaf
  2019-05-07 10:58                                                                     ` Christian Brauner
  2019-05-07 11:09                                                                     ` Greg Kroah-Hartman
  0 siblings, 2 replies; 113+ messages in thread
From: Sultan Alsawaf @ 2019-05-07  8:12 UTC (permalink / raw)
  To: Greg Kroah-Hartman
  Cc: open list:ANDROID DRIVERS, Daniel Colascione, Todd Kjos,
	Kees Cook, Peter Zijlstra, Martijn Coenen, LKML, Tim Murray,
	Michal Hocko, Suren Baghdasaryan, linux-mm,
	Arve Hjønnevåg, Ingo Molnar, Steven Rostedt,
	Oleg Nesterov, Joel Fernandes, Andy Lutomirski, kernel-team,
	Christian Brauner

On Tue, May 07, 2019 at 09:43:34AM +0200, Greg Kroah-Hartman wrote:
> Given that any "new" android device that gets shipped "soon" should be
> using 4.9.y or newer, is this a real issue?

It's certainly a real issue for those who can't buy brand new Android devices
without software bugs every six months :)

> And if it is, I'm sure that asking for those patches to be backported to
> 4.4.y would be just fine, have you asked?
>
> Note that I know of Android Go devices, running 3.18.y kernels, do NOT
> use the in-kernel memory killer, but instead use the userspace solution
> today.  So trying to get another in-kernel memory killer solution added
> anywhere seems quite odd.

It's even more odd that although a userspace solution is touted as the proper
way to go on LKML, almost no Android OEMs are using it, and even in that commit
I linked in the previous message, Google made a rather large set of
modifications to the supposedly-defunct lowmemorykiller.c not one month ago.
What's going on?

Qualcomm still uses lowmemorykiller.c [1] on the Snapdragon 845. If PSI were
backported to 4.4, or even 3.18, would it really be used? I don't really
understand the aversion to an in-kernel memory killer on LKML despite the rest
of the industry's attraction to it. Perhaps there's some inherently great cost
in using the userspace solution that I'm unaware of?

Regardless, even if PSI were backported, a full-fledged LMKD using it has yet to
be made, so it wouldn't be of much use now.

Thanks,
Sultan

[1] https://source.codeaurora.org/quic/la/kernel/msm-4.9/tree/arch/arm64/configs/sdm845_defconfig?h=LA.UM.7.3.r1-07400-sdm845.0#n492

^ permalink raw reply	[flat|nested] 113+ messages in thread

* Re: [RFC] simple_lmk: Introduce Simple Low Memory Killer for Android
  2019-05-07  8:12                                                                   ` Sultan Alsawaf
@ 2019-05-07 10:58                                                                     ` Christian Brauner
  2019-05-07 16:28                                                                       ` Suren Baghdasaryan
  2019-05-07 17:17                                                                       ` Sultan Alsawaf
  2019-05-07 11:09                                                                     ` Greg Kroah-Hartman
  1 sibling, 2 replies; 113+ messages in thread
From: Christian Brauner @ 2019-05-07 10:58 UTC (permalink / raw)
  To: Sultan Alsawaf
  Cc: Greg Kroah-Hartman, open list:ANDROID DRIVERS, Daniel Colascione,
	Todd Kjos, Kees Cook, Peter Zijlstra, Martijn Coenen, LKML,
	Tim Murray, Michal Hocko, Suren Baghdasaryan, linux-mm,
	Arve Hjønnevåg, Ingo Molnar, Steven Rostedt,
	Oleg Nesterov, Joel Fernandes, Andy Lutomirski, kernel-team

On Tue, May 07, 2019 at 01:12:36AM -0700, Sultan Alsawaf wrote:
> On Tue, May 07, 2019 at 09:43:34AM +0200, Greg Kroah-Hartman wrote:
> > Given that any "new" android device that gets shipped "soon" should be
> > using 4.9.y or newer, is this a real issue?
> 
> It's certainly a real issue for those who can't buy brand new Android devices
> without software bugs every six months :)
> 
> > And if it is, I'm sure that asking for those patches to be backported to
> > 4.4.y would be just fine, have you asked?
> >
> > Note that I know of Android Go devices, running 3.18.y kernels, do NOT
> > use the in-kernel memory killer, but instead use the userspace solution
> > today.  So trying to get another in-kernel memory killer solution added
> > anywhere seems quite odd.
> 
> It's even more odd that although a userspace solution is touted as the proper
> way to go on LKML, almost no Android OEMs are using it, and even in that commit

That's probably because without proper kernel changes this is rather
tricky to use safely (see below).

> I linked in the previous message, Google made a rather large set of
> modifications to the supposedly-defunct lowmemorykiller.c not one month ago.
> What's going on?
> 
> Qualcomm still uses lowmemorykiller.c [1] on the Snapdragon 845. If PSI were
> backported to 4.4, or even 3.18, would it really be used? I don't really
> understand the aversion to an in-kernel memory killer on LKML despite the rest
> of the industry's attraction to it. Perhaps there's some inherently great cost
> in using the userspace solution that I'm unaware of?
> 
> Regardless, even if PSI were backported, a full-fledged LMKD using it has yet to
> be made, so it wouldn't be of much use now.

This is work that is ongoing and requires kernel changes to make it
feasible. One of the things that I have been working on for quite a
while is the whole file descriptor for processes thing that is important
for LMKD (Even though I never thought about this use-case when I started
pitching this.). Joel and Daniel have joined in and are working on
making LMKD possible.
What I find odd is that every couple of weeks different solutions to the
low memory problem are pitched. There is simple_lmk, there is LMKD, and
there was a patchset that wanted to speed up memory reclaim at process
kill-time by adding a new flag to the new pidfd_send_signal() syscall.
That all seems - though related - rather uncoordinated. Now granted,
coordinated is usually not how kernel development necessarily works but
it would probably be good to have some sort of direction and from what I
have seen LMKD seems to be the most coordinated effort. But that might
just be my impression.

Christian

^ permalink raw reply	[flat|nested] 113+ messages in thread

* Re: [RFC] simple_lmk: Introduce Simple Low Memory Killer for Android
  2019-05-07  8:12                                                                   ` Sultan Alsawaf
  2019-05-07 10:58                                                                     ` Christian Brauner
@ 2019-05-07 11:09                                                                     ` Greg Kroah-Hartman
  1 sibling, 0 replies; 113+ messages in thread
From: Greg Kroah-Hartman @ 2019-05-07 11:09 UTC (permalink / raw)
  To: Sultan Alsawaf
  Cc: open list:ANDROID DRIVERS, Daniel Colascione, Todd Kjos,
	Kees Cook, Peter Zijlstra, Martijn Coenen, LKML, Tim Murray,
	Michal Hocko, Suren Baghdasaryan, linux-mm,
	Arve Hjønnevåg, Ingo Molnar, Steven Rostedt,
	Oleg Nesterov, Joel Fernandes, Andy Lutomirski, kernel-team,
	Christian Brauner

On Tue, May 07, 2019 at 01:12:36AM -0700, Sultan Alsawaf wrote:
> On Tue, May 07, 2019 at 09:43:34AM +0200, Greg Kroah-Hartman wrote:
> > Given that any "new" android device that gets shipped "soon" should be
> > using 4.9.y or newer, is this a real issue?
> 
> It's certainly a real issue for those who can't buy brand new Android devices
> without software bugs every six months :)

Heh.

But, your "new code" isn't going to be going into any existing device,
or any device that will come out this year.  The soonest it would be
would be next year, and by then, 4.9.y is fine.

> > And if it is, I'm sure that asking for those patches to be backported to
> > 4.4.y would be just fine, have you asked?
> >
> > Note that I know of Android Go devices, running 3.18.y kernels, do NOT
> > use the in-kernel memory killer, but instead use the userspace solution
> > today.  So trying to get another in-kernel memory killer solution added
> > anywhere seems quite odd.
> 
> It's even more odd that although a userspace solution is touted as the proper
> way to go on LKML, almost no Android OEMs are using it, and even in that commit
> I linked in the previous message, Google made a rather large set of
> modifications to the supposedly-defunct lowmemorykiller.c not one month ago.
> What's going on?

"almost no"?  Again, Android Go is doing that, right?

And yes, there is still some 4.4 android-common work happening in this
area, see this patch that just got merged:
	https://android-review.googlesource.com/c/kernel/common/+/953354

So, for 4.4.y based devices, that should be enough, right?

> Qualcomm still uses lowmemorykiller.c [1] on the Snapdragon 845.

Qualcomm should never be used as an example of a company that has any
idea of what to do in their kernel :)

> If PSI were backported to 4.4, or even 3.18, would it really be used?

Why wouldn't it, if it worked properly?

> I don't really understand the aversion to an in-kernel memory killer
> on LKML despite the rest of the industry's attraction to it. Perhaps
> there's some inherently great cost in using the userspace solution
> that I'm unaware of?

Please see the work that went into PSI and the patches around it.
There's also a lwn.net article last week about the further work ongoing
in this area.  With all of that, you should see how in-kernel memory
killers are NOT the way to go.

> Regardless, even if PSI were backported, a full-fledged LMKD using it has yet to
> be made, so it wouldn't be of much use now.

"LMKD"?  Again, PSI is in the 4.9.y android-common tree, so the
userspace side should be in AOSP, right?

thanks,

greg k-h

^ permalink raw reply	[flat|nested] 113+ messages in thread

* Re: [RFC] simple_lmk: Introduce Simple Low Memory Killer for Android
  2019-05-07  2:16                                                           ` [RFC] simple_lmk: Introduce Simple Low Memory Killer for Android Sultan Alsawaf
  2019-05-07  7:04                                                             ` Greg Kroah-Hartman
@ 2019-05-07 12:26                                                             ` Michal Hocko
  2019-05-07 15:31                                                             ` Oleg Nesterov
  2 siblings, 0 replies; 113+ messages in thread
From: Michal Hocko @ 2019-05-07 12:26 UTC (permalink / raw)
  To: Sultan Alsawaf
  Cc: Christian Brauner, Daniel Colascione, Suren Baghdasaryan,
	Steven Rostedt, Tim Murray, Greg Kroah-Hartman,
	Arve Hjønnevåg, Todd Kjos, Martijn Coenen, Ingo Molnar,
	Peter Zijlstra, LKML, open list:ANDROID DRIVERS, linux-mm,
	kernel-team, Oleg Nesterov, Andy Lutomirski, Serge E. Hallyn,
	Kees Cook, Joel Fernandes

On Mon 06-05-19 19:16:22, Sultan Alsawaf wrote:
> This is a complete low memory killer solution for Android that is small
> and simple. Processes are killed according to the priorities that
> Android gives them, so that the least important processes are always
> killed first. Processes are killed until memory deficits are satisfied,
> as observed from kswapd struggling to free up pages. Simple LMK stops
> killing processes when kswapd finally goes back to sleep.
> 
> The only tunables are the desired amount of memory to be freed per
> reclaim event and desired frequency of reclaim events. Simple LMK tries
> to free at least the desired amount of memory per reclaim and waits
> until all of its victims' memory is freed before proceeding to kill more
> processes.

Why do we need something like that in the kernel? I really do not like
an idea of having two OOM killer implementations in the kernel. As
already pointed out newer kernels can do PSI and older kernels can live
with an out of tree code to achieve what they need. I do not see why we
really need this code in the upstream kernel.
-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 113+ messages in thread

* Re: [RFC] simple_lmk: Introduce Simple Low Memory Killer for Android
  2019-05-07  2:16                                                           ` [RFC] simple_lmk: Introduce Simple Low Memory Killer for Android Sultan Alsawaf
  2019-05-07  7:04                                                             ` Greg Kroah-Hartman
  2019-05-07 12:26                                                             ` Michal Hocko
@ 2019-05-07 15:31                                                             ` Oleg Nesterov
  2019-05-07 16:35                                                               ` Sultan Alsawaf
  2 siblings, 1 reply; 113+ messages in thread
From: Oleg Nesterov @ 2019-05-07 15:31 UTC (permalink / raw)
  To: Sultan Alsawaf
  Cc: Christian Brauner, Daniel Colascione, Suren Baghdasaryan,
	Steven Rostedt, Tim Murray, Michal Hocko, Greg Kroah-Hartman,
	Arve Hjønnevåg, Todd Kjos, Martijn Coenen, Ingo Molnar,
	Peter Zijlstra, LKML, open list:ANDROID DRIVERS, linux-mm,
	kernel-team, Andy Lutomirski, Serge E. Hallyn, Kees Cook,
	Joel Fernandes

I am not going to comment on the intent, but to be honest I am skeptical too.

On 05/06, Sultan Alsawaf wrote:
>
> +static unsigned long find_victims(struct victim_info *varr, int *vindex,
> +				  int vmaxlen, int min_adj, int max_adj)
> +{
> +	unsigned long pages_found = 0;
> +	int old_vindex = *vindex;
> +	struct task_struct *tsk;
> +
> +	for_each_process(tsk) {
> +		struct task_struct *vtsk;
> +		unsigned long tasksize;
> +		short oom_score_adj;
> +
> +		/* Make sure there's space left in the victim array */
> +		if (*vindex == vmaxlen)
> +			break;
> +
> +		/* Don't kill current, kthreads, init, or duplicates */
> +		if (same_thread_group(tsk, current) ||
> +		    tsk->flags & PF_KTHREAD ||
> +		    is_global_init(tsk) ||
> +		    vtsk_is_duplicate(varr, *vindex, tsk))
> +			continue;
> +
> +		vtsk = find_lock_task_mm(tsk);

Did you test this patch with lockdep enabled?

If I read the patch correctly, lockdep should complain. vtsk_is_duplicate()
ensures that we do not take the same ->alloc_lock twice or more, but lockdep
can't know this.

> +static void scan_and_kill(unsigned long pages_needed)
> +{
> +	static DECLARE_WAIT_QUEUE_HEAD(victim_waitq);
> +	struct victim_info victims[MAX_VICTIMS];
> +	int i, nr_to_kill = 0, nr_victims = 0;
> +	unsigned long pages_found = 0;
> +	atomic_t victim_count;
> +
> +	/*
> +	 * Hold the tasklist lock so tasks don't disappear while scanning. This
> +	 * is preferred to holding an RCU read lock so that the list of tasks
> +	 * is guaranteed to be up to date. Keep preemption disabled until the
> +	 * SIGKILLs are sent so the victim kill process isn't interrupted.
> +	 */
> +	read_lock(&tasklist_lock);
> +	preempt_disable();

read_lock() disables preemption, every task_lock() too, so this looks
unnecessary.

> +	for (i = 1; i < ARRAY_SIZE(adj_prio); i++) {
> +		pages_found += find_victims(victims, &nr_victims, MAX_VICTIMS,
> +					    adj_prio[i], adj_prio[i - 1]);
> +		if (pages_found >= pages_needed || nr_victims == MAX_VICTIMS)
> +			break;
> +	}
> +
> +	/*
> +	 * Calculate the number of tasks that need to be killed and quickly
> +	 * release the references to those that'll live.
> +	 */
> +	for (i = 0, pages_found = 0; i < nr_victims; i++) {
> +		struct victim_info *victim = &victims[i];
> +		struct task_struct *vtsk = victim->tsk;
> +
> +		/* The victims' mm lock is taken in find_victims; release it */
> +		if (pages_found >= pages_needed) {
> +			task_unlock(vtsk);
> +			continue;
> +		}
> +
> +		/*
> +		 * Grab a reference to the victim so it doesn't disappear after
> +		 * the tasklist lock is released.
> +		 */
> +		get_task_struct(vtsk);

The comment doesn't look correct. The victim can't disappear until task_unlock()
below, it can't pass exit_mm().

> +		pages_found += victim->size;
> +		nr_to_kill++;
> +	}
> +	read_unlock(&tasklist_lock);
> +
> +	/* Kill the victims */
> +	victim_count = (atomic_t)ATOMIC_INIT(nr_to_kill);
> +	for (i = 0; i < nr_to_kill; i++) {
> +		struct victim_info *victim = &victims[i];
> +		struct task_struct *vtsk = victim->tsk;
> +
> +		pr_info("Killing %s with adj %d to free %lu kiB\n", vtsk->comm,
> +			vtsk->signal->oom_score_adj,
> +			victim->size << (PAGE_SHIFT - 10));
> +
> +		/* Configure the victim's mm to notify us when it's freed */
> +		vtsk->mm->slmk_waitq = &victim_waitq;
> +		vtsk->mm->slmk_counter = &victim_count;
> +
> +		/* Accelerate the victim's death by forcing the kill signal */
> +		do_send_sig_info(SIGKILL, SIG_INFO_TYPE, vtsk, true);
                                                               ^^^^
this should be PIDTYPE_TGID

> +
> +		/* Finally release the victim's mm lock */
> +		task_unlock(vtsk);
> +	}
> +	preempt_enable_no_resched();

See above. And I don't understand how can _no_resched() really help...

> +
> +	/* Try to speed up the death process now that we can schedule again */
> +	for (i = 0; i < nr_to_kill; i++) {
> +		struct task_struct *vtsk = victims[i].tsk;
> +
> +		/* Increase the victim's priority to make it die faster */
> +		set_user_nice(vtsk, MIN_NICE);
> +
> +		/* Allow the victim to run on any CPU */
> +		set_cpus_allowed_ptr(vtsk, cpu_all_mask);
> +
> +		/* Finally release the victim reference acquired earlier */
> +		put_task_struct(vtsk);
> +	}
> +
> +	/* Wait until all the victims die */
> +	wait_event(victim_waitq, !atomic_read(&victim_count));

Can't we avoid the new slmk_waitq/slmk_counter members in mm_struct?

I mean, can't we export victim_waitq and victim_count and, say, set/test
MMF_OOM_VICTIM. In fact I think you should try to re-use mark_oom_victim()
at least.

Oleg.


^ permalink raw reply	[flat|nested] 113+ messages in thread

* Re: [RFC] simple_lmk: Introduce Simple Low Memory Killer for Android
  2019-05-07 10:58                                                                     ` Christian Brauner
@ 2019-05-07 16:28                                                                       ` Suren Baghdasaryan
  2019-05-07 16:38                                                                         ` Christian Brauner
                                                                                           ` (2 more replies)
  2019-05-07 17:17                                                                       ` Sultan Alsawaf
  1 sibling, 3 replies; 113+ messages in thread
From: Suren Baghdasaryan @ 2019-05-07 16:28 UTC (permalink / raw)
  To: Christian Brauner
  Cc: Sultan Alsawaf, Greg Kroah-Hartman, open list:ANDROID DRIVERS,
	Daniel Colascione, Todd Kjos, Kees Cook, Peter Zijlstra,
	Martijn Coenen, LKML, Tim Murray, Michal Hocko, linux-mm,
	Arve Hjønnevåg, Ingo Molnar, Steven Rostedt,
	Oleg Nesterov, Joel Fernandes, Andy Lutomirski, kernel-team

From: Christian Brauner <christian@brauner.io>
Date: Tue, May 7, 2019 at 3:58 AM
To: Sultan Alsawaf
Cc: Greg Kroah-Hartman, open list:ANDROID DRIVERS, Daniel Colascione,
Todd Kjos, Kees Cook, Peter Zijlstra, Martijn Coenen, LKML, Tim
Murray, Michal Hocko, Suren Baghdasaryan, linux-mm, Arve Hjønnevåg,
Ingo Molnar, Steven Rostedt, Oleg Nesterov, Joel Fernandes, Andy
Lutomirski, kernel-team

> On Tue, May 07, 2019 at 01:12:36AM -0700, Sultan Alsawaf wrote:
> > On Tue, May 07, 2019 at 09:43:34AM +0200, Greg Kroah-Hartman wrote:
> > > Given that any "new" android device that gets shipped "soon" should be
> > > using 4.9.y or newer, is this a real issue?
> >
> > It's certainly a real issue for those who can't buy brand new Android devices
> > without software bugs every six months :)
> >

Hi Sultan,
Looks like you are posting this patch for devices that do not use
userspace LMKD solution due to them using older kernels or due to
their vendors sticking to an in-kernel solution. If so, I see a couple of
logistical issues with this patch. I don't see it being adopted in
upstream kernel 5.x since it re-implements a deprecated mechanism even
though vendors still use it. Vendors on the other hand, will not adopt
it until you show evidence that it works way better than what
lowmemorykiller driver does now. You would have to provide measurable
data and explain your tests before they would consider spending time
on this.
On the implementation side I'm not convinced at all that this would
work better on all devices and in all circumstances. We had cases when
a new mechanism would show very good results until one usecase
completely broke it. Bulk killing of processes that you are doing in
your patch was a very good example of such a decision which later on
we had to rethink. That's why baking these policies into kernel is
very problematic. Another problem I see with the implementation is that
it ties process killing with the reclaim scan depth. It's very similar
to how vmpressure works and vmpressure in my experience is very
unpredictable.

> > > And if it is, I'm sure that asking for those patches to be backported to
> > > 4.4.y would be just fine, have you asked?
> > >
> > > Note that I know of Android Go devices, running 3.18.y kernels, do NOT
> > > use the in-kernel memory killer, but instead use the userspace solution
> > > today.  So trying to get another in-kernel memory killer solution added
> > > anywhere seems quite odd.
> >
> > It's even more odd that although a userspace solution is touted as the proper
> > way to go on LKML, almost no Android OEMs are using it, and even in that commit
>
> That's probably because without proper kernel changes this is rather
> tricky to use safely (see below).
>
> > I linked in the previous message, Google made a rather large set of
> > modifications to the supposedly-defunct lowmemorykiller.c not one month ago.
> > What's going on?

If you look into that commit, it adds ability to report kill stats. If
that was a change in how that driver works it would be rejected.

> >
> > Qualcomm still uses lowmemorykiller.c [1] on the Snapdragon 845. If PSI were
> > backported to 4.4, or even 3.18, would it really be used? I don't really
> > understand the aversion to an in-kernel memory killer on LKML despite the rest
> > of the industry's attraction to it. Perhaps there's some inherently great cost
> > in using the userspace solution that I'm unaware of?

Vendors are cautious about adopting userspace solution and it is a
process to address all concerns but we are getting there.

> > Regardless, even if PSI were backported, a full-fledged LMKD using it has yet to
> > be made, so it wouldn't be of much use now.
>
> This is work that is ongoing and requires kernel changes to make it
> feasible. One of the things that I have been working on for quite a
> while is the whole file descriptor for processes thing that is important
> for LMKD (Even though I never thought about this use-case when I started
> pitching this.). Joel and Daniel have joined in and are working on
> making LMKD possible.
> What I find odd is that every couple of weeks different solutions to the
> low memory problem are pitched. There is simple_lmk, there is LMKD, and
> there was a patchset that wanted to speed up memory reclaim at process
> kill-time by adding a new flag to the new pidfd_send_signal() syscall.
> That all seems - though related - rather uncoordinated.

I'm not sure why pidfd_wait and expedited reclaim are seen as
uncoordinated efforts. All of them are done to improve userspace LMKD.

> Now granted,
> coordinated is usually not how kernel development necessarily works but
> it would probably be good to have some sort of direction and from what I
> have seen LMKD seems to be the most coordinated effort. But that might
> just be my impression.
>
> Christian

Thanks,
Suren.

^ permalink raw reply	[flat|nested] 113+ messages in thread

* Re: [RFC] simple_lmk: Introduce Simple Low Memory Killer for Android
  2019-05-07 15:31                                                             ` Oleg Nesterov
@ 2019-05-07 16:35                                                               ` Sultan Alsawaf
  2019-05-09 15:56                                                                 ` Oleg Nesterov
  0 siblings, 1 reply; 113+ messages in thread
From: Sultan Alsawaf @ 2019-05-07 16:35 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Christian Brauner, Daniel Colascione, Suren Baghdasaryan,
	Steven Rostedt, Tim Murray, Michal Hocko, Greg Kroah-Hartman,
	Arve Hjønnevåg, Todd Kjos, Martijn Coenen, Ingo Molnar,
	Peter Zijlstra, LKML, open list:ANDROID DRIVERS, linux-mm,
	kernel-team, Andy Lutomirski, Serge E. Hallyn, Kees Cook,
	Joel Fernandes

On Tue, May 07, 2019 at 05:31:54PM +0200, Oleg Nesterov wrote:
> I am not going to comment the intent, but to be honest I am skeptical too.

The general sentiment has been that this is a really bad idea, but I'm just a
frustrated Android user who wants his phone to not require mountains of zRAM
only to still manage memory poorly. Until I can go out and buy a non-Pixel phone
that uses PSI to make these decisions (and does a good job of it), I'm going to
stick to my hacky little driver for my personal devices. Many others who like to
hack their Android devices to make them last longer will probably find value in
this as well, since there are millions of people who use devices that'll never
see any of PSI ported to their ancient 3.x kernels.

And yes, I know this would never be accepted to upstream in a million years. I
mostly wanted some code review and guidance, since mm code is pretty tricky :)

> On 05/06, Sultan Alsawaf wrote:
> >
> > +static unsigned long find_victims(struct victim_info *varr, int *vindex,
> > +				  int vmaxlen, int min_adj, int max_adj)
> > +{
> > +	unsigned long pages_found = 0;
> > +	int old_vindex = *vindex;
> > +	struct task_struct *tsk;
> > +
> > +	for_each_process(tsk) {
> > +		struct task_struct *vtsk;
> > +		unsigned long tasksize;
> > +		short oom_score_adj;
> > +
> > +		/* Make sure there's space left in the victim array */
> > +		if (*vindex == vmaxlen)
> > +			break;
> > +
> > +		/* Don't kill current, kthreads, init, or duplicates */
> > +		if (same_thread_group(tsk, current) ||
> > +		    tsk->flags & PF_KTHREAD ||
> > +		    is_global_init(tsk) ||
> > +		    vtsk_is_duplicate(varr, *vindex, tsk))
> > +			continue;
> > +
> > +		vtsk = find_lock_task_mm(tsk);
> 
> Did you test this patch with lockdep enabled?
> 
> If I read the patch correctly, lockdep should complain. vtsk_is_duplicate()
> ensures that we do not take the same ->alloc_lock twice or more, but lockdep
> can't know this.

Yeah, lockdep is fine with this, at least on 4.4.

> > +static void scan_and_kill(unsigned long pages_needed)
> > +{
> > +	static DECLARE_WAIT_QUEUE_HEAD(victim_waitq);
> > +	struct victim_info victims[MAX_VICTIMS];
> > +	int i, nr_to_kill = 0, nr_victims = 0;
> > +	unsigned long pages_found = 0;
> > +	atomic_t victim_count;
> > +
> > +	/*
> > +	 * Hold the tasklist lock so tasks don't disappear while scanning. This
> > +	 * is preferred to holding an RCU read lock so that the list of tasks
> > +	 * is guaranteed to be up to date. Keep preemption disabled until the
> > +	 * SIGKILLs are sent so the victim kill process isn't interrupted.
> > +	 */
> > +	read_lock(&tasklist_lock);
> > +	preempt_disable();
> 
> read_lock() disables preemption, every task_lock() too, so this looks
> unnecessary.

Good point.

> > +	for (i = 1; i < ARRAY_SIZE(adj_prio); i++) {
> > +		pages_found += find_victims(victims, &nr_victims, MAX_VICTIMS,
> > +					    adj_prio[i], adj_prio[i - 1]);
> > +		if (pages_found >= pages_needed || nr_victims == MAX_VICTIMS)
> > +			break;
> > +	}
> > +
> > +	/*
> > +	 * Calculate the number of tasks that need to be killed and quickly
> > +	 * release the references to those that'll live.
> > +	 */
> > +	for (i = 0, pages_found = 0; i < nr_victims; i++) {
> > +		struct victim_info *victim = &victims[i];
> > +		struct task_struct *vtsk = victim->tsk;
> > +
> > +		/* The victims' mm lock is taken in find_victims; release it */
> > +		if (pages_found >= pages_needed) {
> > +			task_unlock(vtsk);
> > +			continue;
> > +		}
> > +
> > +		/*
> > +		 * Grab a reference to the victim so it doesn't disappear after
> > +		 * the tasklist lock is released.
> > +		 */
> > +		get_task_struct(vtsk);
> 
> The comment doesn't look correct. The victim can't disappear until task_unlock()
> below, it can't pass exit_mm().

I was always unsure about this and decided to hold a reference to the
task_struct to be safe. Thanks for clearing that up.

> > +		pages_found += victim->size;
> > +		nr_to_kill++;
> > +	}
> > +	read_unlock(&tasklist_lock);
> > +
> > +	/* Kill the victims */
> > +	victim_count = (atomic_t)ATOMIC_INIT(nr_to_kill);
> > +	for (i = 0; i < nr_to_kill; i++) {
> > +		struct victim_info *victim = &victims[i];
> > +		struct task_struct *vtsk = victim->tsk;
> > +
> > +		pr_info("Killing %s with adj %d to free %lu kiB\n", vtsk->comm,
> > +			vtsk->signal->oom_score_adj,
> > +			victim->size << (PAGE_SHIFT - 10));
> > +
> > +		/* Configure the victim's mm to notify us when it's freed */
> > +		vtsk->mm->slmk_waitq = &victim_waitq;
> > +		vtsk->mm->slmk_counter = &victim_count;
> > +
> > +		/* Accelerate the victim's death by forcing the kill signal */
> > +		do_send_sig_info(SIGKILL, SIG_INFO_TYPE, vtsk, true);
>                                                                ^^^^
> this should be PIDTYPE_TGID

Thanks, I didn't realize the last argument to do_send_sig_info changed in newer
kernels. The compiler didn't complain, so it went over my head.

> > +
> > +		/* Finally release the victim's mm lock */
> > +		task_unlock(vtsk);
> > +	}
> > +	preempt_enable_no_resched();
> 
> See above. And I don't understand how can _no_resched() really help...

Yeah, good point.

> > +
> > +	/* Try to speed up the death process now that we can schedule again */
> > +	for (i = 0; i < nr_to_kill; i++) {
> > +		struct task_struct *vtsk = victims[i].tsk;
> > +
> > +		/* Increase the victim's priority to make it die faster */
> > +		set_user_nice(vtsk, MIN_NICE);
> > +
> > +		/* Allow the victim to run on any CPU */
> > +		set_cpus_allowed_ptr(vtsk, cpu_all_mask);
> > +
> > +		/* Finally release the victim reference acquired earlier */
> > +		put_task_struct(vtsk);
> > +	}
> > +
> > +	/* Wait until all the victims die */
> > +	wait_event(victim_waitq, !atomic_read(&victim_count));
> 
> Can't we avoid the new slmk_waitq/slmk_counter members in mm_struct?
> 
> I mean, can't we export victim_waitq and victim_count and, say, set/test
> MMF_OOM_VICTIM. In fact I think you should try to re-use mark_oom_victim()
> at least.

This makes the patch less portable across different kernel versions, which is
kind of one of its major goals.

Thanks for the code review, Oleg.

Sultan

^ permalink raw reply	[flat|nested] 113+ messages in thread

* Re: [RFC] simple_lmk: Introduce Simple Low Memory Killer for Android
  2019-05-07 16:28                                                                       ` Suren Baghdasaryan
@ 2019-05-07 16:38                                                                         ` Christian Brauner
  2019-05-07 16:53                                                                         ` Sultan Alsawaf
  2019-05-07 18:46                                                                         ` Joel Fernandes
  2 siblings, 0 replies; 113+ messages in thread
From: Christian Brauner @ 2019-05-07 16:38 UTC (permalink / raw)
  To: Suren Baghdasaryan
  Cc: Sultan Alsawaf, Greg Kroah-Hartman, open list:ANDROID DRIVERS,
	Daniel Colascione, Todd Kjos, Kees Cook, Peter Zijlstra,
	Martijn Coenen, LKML, Tim Murray, Michal Hocko, linux-mm,
	Arve Hjønnevåg, Ingo Molnar, Steven Rostedt,
	Oleg Nesterov, Joel Fernandes, Andy Lutomirski, kernel-team

On Tue, May 07, 2019 at 09:28:47AM -0700, Suren Baghdasaryan wrote:
> From: Christian Brauner <christian@brauner.io>
> Date: Tue, May 7, 2019 at 3:58 AM
> To: Sultan Alsawaf
> Cc: Greg Kroah-Hartman, open list:ANDROID DRIVERS, Daniel Colascione,
> Todd Kjos, Kees Cook, Peter Zijlstra, Martijn Coenen, LKML, Tim
> Murray, Michal Hocko, Suren Baghdasaryan, linux-mm, Arve Hjønnevåg,
> Ingo Molnar, Steven Rostedt, Oleg Nesterov, Joel Fernandes, Andy
> Lutomirski, kernel-team
> 
> > On Tue, May 07, 2019 at 01:12:36AM -0700, Sultan Alsawaf wrote:
> > > On Tue, May 07, 2019 at 09:43:34AM +0200, Greg Kroah-Hartman wrote:
> > > > Given that any "new" android device that gets shipped "soon" should be
> > > > using 4.9.y or newer, is this a real issue?
> > >
> > > It's certainly a real issue for those who can't buy brand new Android devices
> > > without software bugs every six months :)
> > >
> 
> Hi Sultan,
> Looks like you are posting this patch for devices that do not use
> userspace LMKD solution due to them using older kernels or due to
> their vendors sticking to in-kernel solution. If so, I see a couple of
> logistical issues with this patch. I don't see it being adopted in
> upstream kernel 5.x since it re-implements a deprecated mechanism even
> though vendors still use it. Vendors on the other hand, will not adopt
> it until you show evidence that it works way better than what
> lowmemorykiller driver does now. You would have to provide measurable
> data and explain your tests before they would consider spending time
> on this.
> On the implementation side I'm not convinced at all that this would
> work better on all devices and in all circumstances. We had cases when
> a new mechanism would show very good results until one usecase
> completely broke it. Bulk killing of processes that you are doing in
> your patch was a very good example of such a decision which later on
> we had to rethink. That's why baking these policies into kernel is
> very problematic. Another problem I see with the implementation is that
> it ties process killing with the reclaim scan depth. It's very similar
> to how vmpressure works and vmpressure in my experience is very
> unpredictable.
> 
> > > > And if it is, I'm sure that asking for those patches to be backported to
> > > > 4.4.y would be just fine, have you asked?
> > > >
> > > > Note that I know of Android Go devices, running 3.18.y kernels, do NOT
> > > > use the in-kernel memory killer, but instead use the userspace solution
> > > > today.  So trying to get another in-kernel memory killer solution added
> > > > anywhere seems quite odd.
> > >
> > > It's even more odd that although a userspace solution is touted as the proper
> > > way to go on LKML, almost no Android OEMs are using it, and even in that commit
> >
> > That's probably because without proper kernel changes this is rather
> > tricky to use safely (see below).
> >
> > > I linked in the previous message, Google made a rather large set of
> > > modifications to the supposedly-defunct lowmemorykiller.c not one month ago.
> > > What's going on?
> 
> If you look into that commit, it adds ability to report kill stats. If
> that was a change in how that driver works it would be rejected.
> 
> > >
> > > Qualcomm still uses lowmemorykiller.c [1] on the Snapdragon 845. If PSI were
> > > backported to 4.4, or even 3.18, would it really be used? I don't really
> > > understand the aversion to an in-kernel memory killer on LKML despite the rest
> > > of the industry's attraction to it. Perhaps there's some inherently great cost
> > > in using the userspace solution that I'm unaware of?
> 
> Vendors are cautious about adopting a userspace solution and it is a
> process to address all concerns, but we are getting there.
> 
> > > Regardless, even if PSI were backported, a full-fledged LMKD using it has yet to
> > > be made, so it wouldn't be of much use now.
> >
> > This is work that is ongoing and requires kernel changes to make it
> > feasible. One of the things that I have been working on for quite a
> > while is the whole file descriptor for processes thing that is important
> > for LMKD (Even though I never thought about this use-case when I started
> > pitching this.). Joel and Daniel have joined in and are working on
> > making LMKD possible.
> > What I find odd is that every couple of weeks different solutions to the
> > low memory problem are pitched. There is simple_lmk, there is LMKD, and
> > there was a patchset that wanted to speed up memory reclaim at process
> > kill-time by adding a new flag to the new pidfd_send_signal() syscall.
> > That all seems - though related - rather uncoordinated.
> 
> I'm not sure why pidfd_wait and expedited reclaim are seen as
> uncoordinated efforts. All of them are done to improve userspace LMKD.

If so that wasn't very obvious and there was some disagreement there as
well whether this would be the right solution.
In any case, the point is that LMKD seems to be the way forward and with
all of the arguments brought forward here this patchset seems like it's
going in the wrong direction.

Christian

> 
> > Now granted,
> > coordinated is usually not how kernel development necessarily works but
> > it would probably be good to have some sort of direction and from what I
> > have seen LMKD seems to be the most coordinated effort. But that might
> > just be my impression.
> >
> > Christian
> 
> Thanks,
> Suren.

^ permalink raw reply	[flat|nested] 113+ messages in thread

* Re: [RFC] simple_lmk: Introduce Simple Low Memory Killer for Android
  2019-05-07 16:28                                                                       ` Suren Baghdasaryan
  2019-05-07 16:38                                                                         ` Christian Brauner
@ 2019-05-07 16:53                                                                         ` Sultan Alsawaf
  2019-05-07 20:01                                                                           ` Suren Baghdasaryan
  2019-05-07 18:46                                                                         ` Joel Fernandes
  2 siblings, 1 reply; 113+ messages in thread
From: Sultan Alsawaf @ 2019-05-07 16:53 UTC (permalink / raw)
  To: Suren Baghdasaryan
  Cc: Christian Brauner, Greg Kroah-Hartman, open list:ANDROID DRIVERS,
	Daniel Colascione, Todd Kjos, Kees Cook, Peter Zijlstra,
	Martijn Coenen, LKML, Tim Murray, Michal Hocko, linux-mm,
	Arve Hjønnevåg, Ingo Molnar, Steven Rostedt,
	Oleg Nesterov, Joel Fernandes, Andy Lutomirski, kernel-team

On Tue, May 07, 2019 at 09:28:47AM -0700, Suren Baghdasaryan wrote:
> Hi Sultan,
> Looks like you are posting this patch for devices that do not use
> userspace LMKD solution due to them using older kernels or due to
> their vendors sticking to in-kernel solution. If so, I see a couple of
> logistical issues with this patch. I don't see it being adopted in
> upstream kernel 5.x since it re-implements a deprecated mechanism even
> though vendors still use it. Vendors on the other hand, will not adopt
> it until you show evidence that it works way better than what
> lowmemorykiller driver does now. You would have to provide measurable
> data and explain your tests before they would consider spending time
> on this.

Yes, this is mostly for the devices already produced that are forced to suffer
with poor memory management. I can't even convince vendors to fix kernel
memory leaks, so there's no way I'd be able to convince them of trying this
patch, especially since it seems like you're having trouble convincing vendors
to stop using lowmemorykiller in the first place. And thankfully, convincing
vendors isn't my job :)

> On the implementation side I'm not convinced at all that this would
> work better on all devices and in all circumstances. We had cases when
> a new mechanism would show very good results until one usecase
> completely broke it. Bulk killing of processes that you are doing in
> your patch was a very good example of such a decision which later on
> we had to rethink. That's why baking these policies into kernel is
> very problematic. Another problem I see with the implementation is that
> it ties process killing with the reclaim scan depth. It's very similar
> to how vmpressure works and vmpressure in my experience is very
> unpredictable.

Could you elaborate a bit on why bulk killing isn't good?

> > > I linked in the previous message, Google made a rather large set of
> > > modifications to the supposedly-defunct lowmemorykiller.c not one month ago.
> > > What's going on?
> 
> If you look into that commit, it adds ability to report kill stats. If
> that was a change in how that driver works it would be rejected.

Fair, though it was quite strange seeing something that was supposedly totally
abandoned receiving a large chunk of code for reporting stats.

Thanks,
Sultan

^ permalink raw reply	[flat|nested] 113+ messages in thread

* Re: [RFC] simple_lmk: Introduce Simple Low Memory Killer for Android
  2019-05-07 10:58                                                                     ` Christian Brauner
  2019-05-07 16:28                                                                       ` Suren Baghdasaryan
@ 2019-05-07 17:17                                                                       ` Sultan Alsawaf
  2019-05-07 17:29                                                                         ` Greg Kroah-Hartman
  1 sibling, 1 reply; 113+ messages in thread
From: Sultan Alsawaf @ 2019-05-07 17:17 UTC (permalink / raw)
  To: Christian Brauner
  Cc: Greg Kroah-Hartman, open list:ANDROID DRIVERS, Daniel Colascione,
	Todd Kjos, Kees Cook, Peter Zijlstra, Martijn Coenen, LKML,
	Tim Murray, Michal Hocko, Suren Baghdasaryan, linux-mm,
	Arve Hjønnevåg, Ingo Molnar, Steven Rostedt,
	Oleg Nesterov, Joel Fernandes, Andy Lutomirski, kernel-team

On Tue, May 07, 2019 at 12:58:27PM +0200, Christian Brauner wrote:
> This is work that is ongoing and requires kernel changes to make it
> feasible. One of the things that I have been working on for quite a
> while is the whole file descriptor for processes thing that is important
> for LMKD (Even though I never thought about this use-case when I started
> pitching this.). Joel and Daniel have joined in and are working on
> making LMKD possible.
> What I find odd is that every couple of weeks different solutions to the
> low memory problem are pitched. There is simple_lmk, there is LMKD, and
> there was a patchset that wanted to speed up memory reclaim at process
> kill-time by adding a new flag to the new pidfd_send_signal() syscall.
> That all seems - though related - rather uncoordinated. Now granted,
> coordinated is usually not how kernel development necessarily works but
> it would probably be good to have some sort of direction and from what I
> have seen LMKD seems to be the most coordinated effort. But that might
> just be my impression.

LMKD is just Android's userspace low-memory-killer daemon. It's been around for
a while.

This patch (simple_lmk) is meant to serve as an immediate solution for the
devices that'll never see a single line of PSI code running on them, which
amounts to... well, most Android devices currently in existence. I'm more of a
cowboy who made this patch after waiting a few years for memory management
improvements on Android that never happened. Though it looks like it's going to
happen soon(ish?) for super new devices that'll have the privilege of shipping
with PSI in use.

On Tue, May 07, 2019 at 01:09:21PM +0200, Greg Kroah-Hartman wrote:
> > It's even more odd that although a userspace solution is touted as the proper
> > way to go on LKML, almost no Android OEMs are using it, and even in that commit
> > I linked in the previous message, Google made a rather large set of
> > modifications to the supposedly-defunct lowmemorykiller.c not one month ago.
> > What's going on?
> 
> "almost no"?  Again, Android Go is doing that, right?

I'd check for myself, but I can't seem to find kernel source for an Android Go
device...

This seems more confusing though. Why would the ultra-low-end devices use LMKD
while other devices use the broken lowmemorykiller driver?

> > Qualcomm still uses lowmemorykiller.c [1] on the Snapdragon 845.
> 
> Qualcomm should never be used as an example of a company that has any
> idea of what to do in their kernel :)

Agreed, but nearly all OEMs that use Qualcomm chipsets roll with Qualcomm's
kernel decisions, so Qualcomm has a bit of influence here.

> > If PSI were backported to 4.4, or even 3.18, would it really be used?
> 
> Why wouldn't it, if it worked properly?

For the same mysterious reason that Qualcomm and others cling to
lowmemorykiller, I presume. This is part of what's been confusing me for quite
some time...

> Please see the work that went into PSI and the patches around it.
> There's also a lwn.net article last week about the further work ongoing
> in this area.  With all of that, you should see how in-kernel memory
> killers are NOT the way to go.
> 
> > Regardless, even if PSI were backported, a full-fledged LMKD using it has yet to
> > be made, so it wouldn't be of much use now.
> 
> "LMKD"?  Again, PSI is in the 4.9.y android-common tree, so the
> userspace side should be in AOSP, right?

LMKD as in Android's low-memory-killer daemon. It is in AOSP, but it looks like
it's still a work in progress.

Thanks,
Sultan

^ permalink raw reply	[flat|nested] 113+ messages in thread

* Re: [RFC] simple_lmk: Introduce Simple Low Memory Killer for Android
  2019-05-07 17:17                                                                       ` Sultan Alsawaf
@ 2019-05-07 17:29                                                                         ` Greg Kroah-Hartman
  0 siblings, 0 replies; 113+ messages in thread
From: Greg Kroah-Hartman @ 2019-05-07 17:29 UTC (permalink / raw)
  To: Sultan Alsawaf
  Cc: Christian Brauner, open list:ANDROID DRIVERS, Daniel Colascione,
	kernel-team, Todd Kjos, Kees Cook, Peter Zijlstra, LKML,
	Tim Murray, Michal Hocko, Suren Baghdasaryan, linux-mm,
	Arve Hjønnevåg, Ingo Molnar, Steven Rostedt,
	Oleg Nesterov, Joel Fernandes, Andy Lutomirski, Martijn Coenen

On Tue, May 07, 2019 at 10:17:11AM -0700, Sultan Alsawaf wrote:
> On Tue, May 07, 2019 at 01:09:21PM +0200, Greg Kroah-Hartman wrote:
> > > It's even more odd that although a userspace solution is touted as the proper
> > > way to go on LKML, almost no Android OEMs are using it, and even in that commit
> > > I linked in the previous message, Google made a rather large set of
> > > modifications to the supposedly-defunct lowmemorykiller.c not one month ago.
> > > What's going on?
> > 
> > "almost no"?  Again, Android Go is doing that, right?
> 
> I'd check for myself, but I can't seem to find kernel source for an Android Go
> device...
> 
> This seems more confusing though. Why would the ultra-low-end devices use LMKD
> while other devices use the broken lowmemorykiller driver?

It's probably because the Android Go devices got a lot more "help" from
people at Google than did the other devices you are looking at.  Also,
despite the older kernel version, they are probably running a newer
version of Android userspace, specially tuned just for lower memory
devices.

So those 3.18.y based Android Go devices are newer than the 4.4.y based
"full Android" devices on the market, and even some 4.9.y based devices.

Yes, it is strange :)

> > > Qualcomm still uses lowmemorykiller.c [1] on the Snapdragon 845.
> > 
> > Qualcomm should never be used as an example of a company that has any
> > idea of what to do in their kernel :)
> 
> Agreed, but nearly all OEMs that use Qualcomm chipsets roll with Qualcomm's
> kernel decisions, so Qualcomm has a bit of influence here.

Yes, because almost no OEM wants to mess with their kernel, they just
take QCOM's kernel and run with it.  But don't take that for some sort
of "best design practice" summary at all.

> > > If PSI were backported to 4.4, or even 3.18, would it really be used?
> > 
> > Why wouldn't it, if it worked properly?
> 
> For the same mysterious reason that Qualcomm and others cling to
> lowmemorykiller, I presume. This is part of what's been confusing me for quite
> some time...

QCOM's 4.4.y based kernel work was done 3-4 years ago, if not older.
They didn't know that this was not the "right way" to do things.  The
Google developers have been working for the past few years to do it
correctly, but they cannot go back in time to change old repos, sorry.

Now that I understand you just want to work on your local device, that
makes more sense.  But I think you will have a better result trying to
do a 4.4 backport of PSI combined with the userspace stuff, than to try
to worry about your driver in 5.2 or newer.

Or you can forward-port your kernel to 4.9, or better yet, 4.14.  That
would probably be a much better thing to do overall as 4.4 is really old
now.

Good luck!

greg k-h

^ permalink raw reply	[flat|nested] 113+ messages in thread

* Re: [RFC] simple_lmk: Introduce Simple Low Memory Killer for Android
  2019-05-07 16:28                                                                       ` Suren Baghdasaryan
  2019-05-07 16:38                                                                         ` Christian Brauner
  2019-05-07 16:53                                                                         ` Sultan Alsawaf
@ 2019-05-07 18:46                                                                         ` Joel Fernandes
  2 siblings, 0 replies; 113+ messages in thread
From: Joel Fernandes @ 2019-05-07 18:46 UTC (permalink / raw)
  To: Suren Baghdasaryan
  Cc: Christian Brauner, Sultan Alsawaf, Greg Kroah-Hartman,
	open list:ANDROID DRIVERS, Daniel Colascione, Todd Kjos,
	Kees Cook, Peter Zijlstra, Martijn Coenen, LKML, Tim Murray,
	Michal Hocko, linux-mm, Arve Hjønnevåg, Ingo Molnar,
	Steven Rostedt, Oleg Nesterov, Andy Lutomirski, kernel-team

On Tue, May 07, 2019 at 09:28:47AM -0700, Suren Baghdasaryan wrote:
> From: Christian Brauner <christian@brauner.io>
> Date: Tue, May 7, 2019 at 3:58 AM
> To: Sultan Alsawaf
> Cc: Greg Kroah-Hartman, open list:ANDROID DRIVERS, Daniel Colascione,
> Todd Kjos, Kees Cook, Peter Zijlstra, Martijn Coenen, LKML, Tim
> Murray, Michal Hocko, Suren Baghdasaryan, linux-mm, Arve Hjønnevåg,
> Ingo Molnar, Steven Rostedt, Oleg Nesterov, Joel Fernandes, Andy
> Lutomirski, kernel-team
> 
> > On Tue, May 07, 2019 at 01:12:36AM -0700, Sultan Alsawaf wrote:
> > > On Tue, May 07, 2019 at 09:43:34AM +0200, Greg Kroah-Hartman wrote:
> > > > Given that any "new" android device that gets shipped "soon" should be
> > > > using 4.9.y or newer, is this a real issue?
> > >
> > > It's certainly a real issue for those who can't buy brand new Android devices
> > > without software bugs every six months :)
> > >
> 
> Hi Sultan,
> Looks like you are posting this patch for devices that do not use
> userspace LMKD solution due to them using older kernels or due to
> their vendors sticking to in-kernel solution. If so, I see a couple of
> logistical issues with this patch. I don't see it being adopted in
> upstream kernel 5.x since it re-implements a deprecated mechanism even
> though vendors still use it. Vendors on the other hand, will not adopt
> it until you show evidence that it works way better than what
> lowmemorykiller driver does now. You would have to provide measurable
> data and explain your tests before they would consider spending time
> on this.
> On the implementation side I'm not convinced at all that this would
> work better on all devices and in all circumstances. We had cases when
> a new mechanism would show very good results until one usecase
> completely broke it. Bulk killing of processes that you are doing in
> your patch was a very good example of such a decision which later on
> we had to rethink. That's why baking these policies into kernel is
> very problematic. Another problem I see with the implementation is that
> it ties process killing with the reclaim scan depth. It's very similar
> to how vmpressure works and vmpressure in my experience is very
> unpredictable.

Yeah it does seem conceptually similar, good point.
 
> > > Regardless, even if PSI were backported, a full-fledged LMKD using it has yet to
> > > be made, so it wouldn't be of much use now.
> >
> > This is work that is ongoing and requires kernel changes to make it
> > feasible. One of the things that I have been working on for quite a
> > while is the whole file descriptor for processes thing that is important
> > for LMKD (Even though I never thought about this use-case when I started
> > pitching this.). Joel and Daniel have joined in and are working on
> > making LMKD possible.
> > What I find odd is that every couple of weeks different solutions to the
> > low memory problem are pitched. There is simple_lmk, there is LMKD, and
> > there was a patchset that wanted to speed up memory reclaim at process
> > kill-time by adding a new flag to the new pidfd_send_signal() syscall.
> > That all seems - though related - rather uncoordinated.
> 
> I'm not sure why pidfd_wait and expedited reclaim are seen as
> uncoordinated efforts. All of them are done to improve userspace LMKD.

Christian, pidfd_wait and expedited reclaim are both coordinated efforts and
solve different problems related to LMK. simple_lmk is an entirely different
effort that we already hesitated about when it was first posted, now we
hesitate again due to the issues Suren and others mentioned.

I think it is a better idea for Sultan to spend his time on using/improving
PSI/LMKd than spending it on the simple_lmk. It could also be a good topic to
discuss in the Android track of the Linux plumbers conference.

thanks,

 - Joel


^ permalink raw reply	[flat|nested] 113+ messages in thread

* Re: [RFC] simple_lmk: Introduce Simple Low Memory Killer for Android
  2019-05-07 16:53                                                                         ` Sultan Alsawaf
@ 2019-05-07 20:01                                                                           ` Suren Baghdasaryan
  0 siblings, 0 replies; 113+ messages in thread
From: Suren Baghdasaryan @ 2019-05-07 20:01 UTC (permalink / raw)
  To: Sultan Alsawaf
  Cc: Christian Brauner, Greg Kroah-Hartman, open list:ANDROID DRIVERS,
	Daniel Colascione, Todd Kjos, Kees Cook, Peter Zijlstra,
	Martijn Coenen, LKML, Tim Murray, Michal Hocko, linux-mm,
	Arve Hjønnevåg, Ingo Molnar, Steven Rostedt,
	Oleg Nesterov, Joel Fernandes, Andy Lutomirski, kernel-team

From: Sultan Alsawaf <sultan@kerneltoast.com>
Date: Tue, May 7, 2019 at 9:53 AM
To: Suren Baghdasaryan
Cc: Christian Brauner, Greg Kroah-Hartman, open list:ANDROID DRIVERS,
Daniel Colascione, Todd Kjos, Kees Cook, Peter Zijlstra, Martijn
Coenen, LKML, Tim Murray, Michal Hocko, linux-mm, Arve Hjønnevåg, Ingo
Molnar, Steven Rostedt, Oleg Nesterov, Joel Fernandes, Andy
Lutomirski, kernel-team

> On Tue, May 07, 2019 at 09:28:47AM -0700, Suren Baghdasaryan wrote:
> > Hi Sultan,
> > Looks like you are posting this patch for devices that do not use
> > userspace LMKD solution due to them using older kernels or due to
> > their vendors sticking to in-kernel solution. If so, I see a couple of
> > logistical issues with this patch. I don't see it being adopted in
> > upstream kernel 5.x since it re-implements a deprecated mechanism even
> > though vendors still use it. Vendors on the other hand, will not adopt
> > it until you show evidence that it works way better than what
> > lowmemorykiller driver does now. You would have to provide measurable
> > data and explain your tests before they would consider spending time
> > on this.
>
> Yes, this is mostly for the devices already produced that are forced to suffer
> with poor memory management. I can't even convince vendors to fix kernel
> memory leaks, so there's no way I'd be able to convince them of trying this
> patch, especially since it seems like you're having trouble convincing vendors
> to stop using lowmemorykiller in the first place. And thankfully, convincing
> vendors isn't my job :)
>
> > On the implementation side I'm not convinced at all that this would
> > work better on all devices and in all circumstances. We had cases when
> > a new mechanism would show very good results until one usecase
> > completely broke it. Bulk killing of processes that you are doing in
> > your patch was a very good example of such a decision which later on
> > we had to rethink. That's why baking these policies into kernel is
> > very problematic. Another problem I see with the implementation is that
> > it ties process killing with the reclaim scan depth. It's very similar
> > to how vmpressure works and vmpressure in my experience is very
> > unpredictable.
>
> Could you elaborate a bit on why bulk killing isn't good?

Yes. Several months ago we got reports that LMKD got very aggressive
killing more processes in situations which did not require that many
kills to recover from memory pressure. After investigation we tracked
it to the bulk killing which would kill too many processes during a
memory usage spike. When killing gradually the kills would be avoided,
so we had to get back to a more balanced approach. The conceptual
issue with bulk killing is that you can't cancel it when the device
recovers quicker than you expected.

> > > > I linked in the previous message, Google made a rather large set of
> > > > modifications to the supposedly-defunct lowmemorykiller.c not one month ago.
> > > > What's going on?
> >
> > If you look into that commit, it adds ability to report kill stats. If
> > that was a change in how that driver works it would be rejected.
>
> Fair, though it was quite strange seeing something that was supposedly totally
> abandoned receiving a large chunk of code for reporting stats.
>
> Thanks,
> Sultan

^ permalink raw reply	[flat|nested] 113+ messages in thread

* Re: [RFC] simple_lmk: Introduce Simple Low Memory Killer for Android
  2019-05-07 16:35                                                               ` Sultan Alsawaf
@ 2019-05-09 15:56                                                                 ` Oleg Nesterov
  2019-05-09 18:33                                                                   ` Sultan Alsawaf
  0 siblings, 1 reply; 113+ messages in thread
From: Oleg Nesterov @ 2019-05-09 15:56 UTC (permalink / raw)
  To: Sultan Alsawaf
  Cc: Christian Brauner, Daniel Colascione, Suren Baghdasaryan,
	Steven Rostedt, Tim Murray, Michal Hocko, Greg Kroah-Hartman,
	Arve Hjønnevåg, Todd Kjos, Martijn Coenen, Ingo Molnar,
	Peter Zijlstra, LKML, open list:ANDROID DRIVERS, linux-mm,
	kernel-team, Andy Lutomirski, Serge E. Hallyn, Kees Cook,
	Joel Fernandes

On 05/07, Sultan Alsawaf wrote:
>
> On Tue, May 07, 2019 at 05:31:54PM +0200, Oleg Nesterov wrote:
>
> > Did you test this patch with lockdep enabled?
> >
> > If I read the patch correctly, lockdep should complain. vtsk_is_duplicate()
> > ensures that we do not take the same ->alloc_lock twice or more, but lockdep
> > can't know this.
>
> Yeah, lockdep is fine with this, at least on 4.4.

Impossible ;) I bet lockdep should report the deadlock as soon as find_victims()
calls find_lock_task_mm() when you already have a locked victim.

Nevermind, I guess this code won't run with lockdep enabled...


As for https://github.com/kerneltoast/android_kernel_google_wahoo/commit/afc8c9bf2dbde95941253c168d1adb64cfa2e3ad
Well,

	mmdrop(mm);
	simple_lmk_mm_freed(mm);

looks racy because mmdrop(mm) can free this mm_struct. Yes, simple_lmk_mm_freed()
does not dereference this pointer, but the same memory can be re-allocated as
another ->mm for the new task which can be found by find_victims(), and _in theory_
this all can happen in between, so the "victims[i].mm == mm" can be a false positive.

And this also means that simple_lmk_mm_freed() should clear victims[i].mm when
it detects "victims[i].mm == mm", otherwise we have the same theoretical race,
victims_to_kill is only cleared when the last victim goes away.


Another nit... you can drop tasklist_lock right after the 1st "find_victims" loop.

And it seems that you do not really need to walk the "victims" array twice after that,
you can do everything in a single loop, but this is cosmetic.

Oleg.


^ permalink raw reply	[flat|nested] 113+ messages in thread

* Re: [RFC] simple_lmk: Introduce Simple Low Memory Killer for Android
  2019-05-09 15:56                                                                 ` Oleg Nesterov
@ 2019-05-09 18:33                                                                   ` Sultan Alsawaf
  2019-05-10 15:10                                                                     ` Oleg Nesterov
  0 siblings, 1 reply; 113+ messages in thread
From: Sultan Alsawaf @ 2019-05-09 18:33 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Christian Brauner, Daniel Colascione, Suren Baghdasaryan,
	Steven Rostedt, Tim Murray, Michal Hocko, Greg Kroah-Hartman,
	Arve Hjønnevåg, Todd Kjos, Martijn Coenen, Ingo Molnar,
	Peter Zijlstra, LKML, open list:ANDROID DRIVERS, linux-mm,
	kernel-team, Andy Lutomirski, Serge E. Hallyn, Kees Cook,
	Joel Fernandes

On Thu, May 09, 2019 at 05:56:46PM +0200, Oleg Nesterov wrote:
> Impossible ;) I bet lockdep should report the deadlock as soon as find_victims()
> calls find_lock_task_mm() when you already have a locked victim.

I hope you're not a betting man ;)

With the following configured:
CONFIG_DEBUG_RT_MUTEXES=y
CONFIG_DEBUG_SPINLOCK=y
# CONFIG_DEBUG_SPINLOCK_BITE_ON_BUG is not set
CONFIG_DEBUG_SPINLOCK_PANIC_ON_BUG=y
CONFIG_DEBUG_MUTEXES=y
CONFIG_DEBUG_WW_MUTEX_SLOWPATH=y
CONFIG_DEBUG_LOCK_ALLOC=y
CONFIG_PROVE_LOCKING=y
CONFIG_LOCKDEP=y
CONFIG_LOCK_STAT=y
CONFIG_DEBUG_LOCKDEP=y
CONFIG_DEBUG_ATOMIC_SLEEP=y
# CONFIG_DEBUG_LOCKING_API_SELFTESTS is not set
# CONFIG_LOCK_TORTURE_TEST is not set

And a printk added in vtsk_is_duplicate() to print when a duplicate is detected,
and my phone's memory cut in half to make simple_lmk do something, this is what
I observed:
taimen:/ # dmesg | grep lockdep
[    0.000000] \x09RCU lockdep checking is enabled.
taimen:/ # dmesg | grep simple_lmk
[   23.211091] simple_lmk: Killing android.carrier with adj 906 to free 37420 kiB
[   23.211160] simple_lmk: Killing oadcastreceiver with adj 906 to free 36784 kiB
[   23.248457] simple_lmk: Killing .apps.translate with adj 904 to free 45884 kiB
[   23.248720] simple_lmk: Killing ndroid.settings with adj 904 to free 42868 kiB
[   23.313417] simple_lmk: DUPLICATE VTSK!
[   23.313440] simple_lmk: Killing ndroid.keychain with adj 906 to free 33680 kiB
[   23.313513] simple_lmk: Killing com.whatsapp with adj 904 to free 51436 kiB
[   34.646695] simple_lmk: DUPLICATE VTSK!
[   34.646717] simple_lmk: Killing ndroid.apps.gcs with adj 906 to free 37956 kiB
[   34.646792] simple_lmk: Killing droid.apps.maps with adj 904 to free 63600 kiB
taimen:/ # dmesg | grep lockdep
[    0.000000] \x09RCU lockdep checking is enabled.
taimen:/ # 

> As for https://github.com/kerneltoast/android_kernel_google_wahoo/commit/afc8c9bf2dbde95941253c168d1adb64cfa2e3ad
> Well,
> 
> 	mmdrop(mm);
> 	simple_lmk_mm_freed(mm);
> 
> looks racy because mmdrop(mm) can free this mm_struct. Yes, simple_lmk_mm_freed()
> does not dereference this pointer, but the same memory can be re-allocated as
> another ->mm for the new task which can be found by find_victims(), and _in theory_
> this all can happen in between, so the "victims[i].mm == mm" can be a false positive.
> 
> And this also means that simple_lmk_mm_freed() should clear victims[i].mm when
> it detects "victims[i].mm == mm", otherwise we have the same theoretical race,
> victims_to_kill is only cleared when the last victim goes away.

Fair point. Putting simple_lmk_mm_freed(mm) right before mmdrop(mm), and
sprinkling in a cmpxchg in simple_lmk_mm_freed() should fix that up.

> Another nit... you can drop tasklist_lock right after the 1st "find_victims" loop.

True!

> And it seems that you do not really need to walk the "victims" array twice after that,
> you can do everything in a single loop, but this is cosmetic.

Won't this result in potentially holding the task lock way longer than necessary
for multiple tasks that aren't going to be killed?

Thanks,
Sultan

^ permalink raw reply	[flat|nested] 113+ messages in thread

* Re: [RFC] simple_lmk: Introduce Simple Low Memory Killer for Android
  2019-05-09 18:33                                                                   ` Sultan Alsawaf
@ 2019-05-10 15:10                                                                     ` Oleg Nesterov
  2019-05-13 16:45                                                                       ` Sultan Alsawaf
  0 siblings, 1 reply; 113+ messages in thread
From: Oleg Nesterov @ 2019-05-10 15:10 UTC (permalink / raw)
  To: Sultan Alsawaf
  Cc: Christian Brauner, Daniel Colascione, Suren Baghdasaryan,
	Steven Rostedt, Tim Murray, Michal Hocko, Greg Kroah-Hartman,
	Arve Hjønnevåg, Todd Kjos, Martijn Coenen, Ingo Molnar,
	Peter Zijlstra, LKML, open list:ANDROID DRIVERS, linux-mm,
	kernel-team, Andy Lutomirski, Serge E. Hallyn, Kees Cook,
	Joel Fernandes

On 05/09, Sultan Alsawaf wrote:
>
> On Thu, May 09, 2019 at 05:56:46PM +0200, Oleg Nesterov wrote:
> > Impossible ;) I bet lockdep should report the deadlock as soon as find_victims()
> > calls find_lock_task_mm() when you already have a locked victim.
>
> I hope you're not a betting man ;)

I am starting to think I am ;)

If you have task1 != task2 this code

	task_lock(task1);
	task_lock(task2);

should trigger print_deadlock_bug(), task1->alloc_lock and task2->alloc_lock are
the "same" lock from lockdep pov, held_lock's will have the same hlock_class().

> CONFIG_PROVE_LOCKING=y

OK,

> And a printk added in vtsk_is_duplicate() to print when a duplicate is detected,

in this case find_lock_task_mm() won't be called, and this is what saves us from
the actual deadlock.


> and my phone's memory cut in half to make simple_lmk do something, this is what
> I observed:
> taimen:/ # dmesg | grep lockdep
> [    0.000000] \x09RCU lockdep checking is enabled.

this reports that CONFIG_PROVE_RCU is enabled ;)

> taimen:/ # dmesg | grep simple_lmk
> [   23.211091] simple_lmk: Killing android.carrier with adj 906 to free 37420 kiB
> [   23.211160] simple_lmk: Killing oadcastreceiver with adj 906 to free 36784 kiB

yes, looks like simple_lmk has at least 2 locked victims. And I have no idea why
you do not see anything else in dmesg. Maybe debug_locks_off() was already called.

But see above, "grep lockdep" won't work.  Perhaps you can do
"grep -e WARNING -e BUG -e locking".

Oleg.


^ permalink raw reply	[flat|nested] 113+ messages in thread

* Re: [RFC] simple_lmk: Introduce Simple Low Memory Killer for Android
  2019-05-10 15:10                                                                     ` Oleg Nesterov
@ 2019-05-13 16:45                                                                       ` Sultan Alsawaf
  2019-05-14 16:44                                                                         ` Steven Rostedt
  2019-05-15 14:58                                                                         ` Oleg Nesterov
  0 siblings, 2 replies; 113+ messages in thread
From: Sultan Alsawaf @ 2019-05-13 16:45 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Christian Brauner, Daniel Colascione, Suren Baghdasaryan,
	Steven Rostedt, Tim Murray, Michal Hocko, Greg Kroah-Hartman,
	Arve Hjønnevåg, Todd Kjos, Martijn Coenen, Ingo Molnar,
	Peter Zijlstra, LKML, open list:ANDROID DRIVERS, linux-mm,
	kernel-team, Andy Lutomirski, Serge E. Hallyn, Kees Cook,
	Joel Fernandes

On Fri, May 10, 2019 at 05:10:25PM +0200, Oleg Nesterov wrote:
> I am starting to think I am ;)
> 
> If you have task1 != task2 this code
> 
> 	task_lock(task1);
> 	task_lock(task2);
> 
> should trigger print_deadlock_bug(), task1->alloc_lock and task2->alloc_lock are
> the "same" lock from lockdep pov, held_lock's will have the same hlock_class().

Okay, I've stubbed out debug_locks_off(), and lockdep is now complaining about a
bunch of false positives so it is _really_ enabled this time. I grepped for
lockdep last time to try and find a concise way to show over email that lockdep
didn't complain, but that backfired. Here is better evidence:

taimen:/ # dmesg | grep simple_lmk
[   58.349917] simple_lmk: Killing droid.deskclock with adj 906 to free 47548 KiB
[   58.354748] simple_lmk: Killing .android.dialer with adj 906 to free 36576 KiB
[   58.355030] simple_lmk: Killing rbandroid.sleep with adj 904 to free 50016 KiB
[   58.582833] simple_lmk: Killing oadcastreceiver with adj 904 to free 43044 KiB
[   58.587731] simple_lmk: Killing .apps.wellbeing with adj 902 to free 48128 KiB
[   58.588084] simple_lmk: Killing android.carrier with adj 902 to free 43636 KiB
[   58.671857] simple_lmk: Killing ndroid.keychain with adj 902 to free 39992 KiB
[   58.675622] simple_lmk: Killing gs.intelligence with adj 900 to free 49572 KiB
[   58.675770] simple_lmk: Killing le.modemservice with adj 800 to free 41976 KiB
[   58.762060] simple_lmk: Killing ndroid.settings with adj 700 to free 74708 KiB
[   58.763238] simple_lmk: Killing roid.apps.turbo with adj 700 to free 54660 KiB
[   58.873337] simple_lmk: Killing d.process.acore with adj 700 to free 48540 KiB
[   58.873513] simple_lmk: Killing d.process.media with adj 500 to free 46188 KiB
[   58.873713] simple_lmk: Killing putmethod.latin with adj 200 to free 67304 KiB
[   59.014046] simple_lmk: Killing android.vending with adj 201 to free 54900 KiB
[   59.017623] simple_lmk: Killing rak.mibandtools with adj 201 to free 44552 KiB
[   59.018423] simple_lmk: Killing eport.trebuchet with adj 100 to free 106208 KiB
[   59.223633] simple_lmk: Killing id.printspooler with adj 900 to free 39664 KiB
[   59.223762] simple_lmk: Killing gle.android.gms with adj 100 to free 64176 KiB
[   70.955204] simple_lmk: Killing .apps.translate with adj 906 to free 47564 KiB
[   70.955633] simple_lmk: Killing cloudprint:sync with adj 906 to free 31932 KiB
[   70.955787] simple_lmk: Killing oid.apps.photos with adj 904 to free 50204 KiB
[   71.060789] simple_lmk: Killing ecamera.android with adj 906 to free 32232 KiB
[   71.061074] simple_lmk: Killing webview_service with adj 906 to free 26028 KiB
[   71.061199] simple_lmk: Killing com.whatsapp with adj 904 to free 49484 KiB
[   71.190625] simple_lmk: Killing rbandroid.sleep with adj 906 to free 54724 KiB
[   71.190775] simple_lmk: Killing android.vending with adj 906 to free 39848 KiB
[   71.191303] simple_lmk: Killing m.facebook.orca with adj 904 to free 72296 KiB
[   71.342042] simple_lmk: Killing droid.deskclock with adj 902 to free 49284 KiB
[   71.342240] simple_lmk: Killing .apps.wellbeing with adj 900 to free 47632 KiB
[   71.342529] simple_lmk: Killing le.modemservice with adj 800 to free 33648 KiB
[   71.482391] simple_lmk: Killing d.process.media with adj 800 to free 40676 KiB
[   71.482511] simple_lmk: Killing rdog.challegram with adj 700 to free 71920 KiB
taimen:/ #

The first simple_lmk message appears at 58.349917. And based on the timestamps,
it's clear that simple_lmk called task_lock() for multiple different tasks,
which is the pattern you think should cause lockdep to complain. But here is the
full dmesg starting from that point:

[   58.349917] simple_lmk: Killing droid.deskclock with adj 906 to free 47548 KiB
[   58.354748] simple_lmk: Killing .android.dialer with adj 906 to free 36576 KiB
[   58.355030] simple_lmk: Killing rbandroid.sleep with adj 904 to free 50016 KiB
[   58.432654] binder_alloc: 2284: binder_alloc_buf failed to map pages in userspace, no vma
[   58.432671] binder: 1206:1218 transaction failed 29189/-3, size 76-0 line 3189
[   58.582833] simple_lmk: Killing oadcastreceiver with adj 904 to free 43044 KiB
[   58.587731] simple_lmk: Killing .apps.wellbeing with adj 902 to free 48128 KiB
[   58.588084] simple_lmk: Killing android.carrier with adj 902 to free 43636 KiB
[   58.590785] binder: undelivered transaction 58370, process died.
[   58.671857] simple_lmk: Killing ndroid.keychain with adj 902 to free 39992 KiB
[   58.675622] simple_lmk: Killing gs.intelligence with adj 900 to free 49572 KiB
[   58.675770] simple_lmk: Killing le.modemservice with adj 800 to free 41976 KiB
[   58.736678] binder: undelivered transaction 58814, process died.
[   58.736733] binder: release 3099:3128 transaction 57832 in, still active
[   58.736744] binder: send failed reply for transaction 57832 to 1876:3090
[   58.736761] binder: undelivered TRANSACTION_COMPLETE
[   58.736766] binder: undelivered transaction 58752, process died.
[   58.762060] simple_lmk: Killing ndroid.settings with adj 700 to free 74708 KiB
[   58.763238] simple_lmk: Killing roid.apps.turbo with adj 700 to free 54660 KiB
[   58.863590] binder: release 1876:3089 transaction 58117 out, still active
[   58.863606] binder: undelivered TRANSACTION_COMPLETE
[   58.873337] simple_lmk: Killing d.process.acore with adj 700 to free 48540 KiB
[   58.873513] simple_lmk: Killing d.process.media with adj 500 to free 46188 KiB
[   58.873713] simple_lmk: Killing putmethod.latin with adj 200 to free 67304 KiB
[   59.014046] simple_lmk: Killing android.vending with adj 201 to free 54900 KiB
[   59.017623] simple_lmk: Killing rak.mibandtools with adj 201 to free 44552 KiB
[   59.018423] simple_lmk: Killing eport.trebuchet with adj 100 to free 106208 KiB
[   59.028460] binder: 1206:1206 transaction failed 29189/-22, size 100-0 line 3052
[   59.142592] binder_alloc: 2814: binder_alloc_buf, no vma
[   59.142620] binder: 1206:1218 transaction failed 29189/-3, size 76-0 line 3189
[   59.223633] simple_lmk: Killing id.printspooler with adj 900 to free 39664 KiB
[   59.223762] simple_lmk: Killing gle.android.gms with adj 100 to free 64176 KiB
[   59.540176] binder: undelivered transaction 59447, process died.
[   59.540763] binder: undelivered transaction 59446, process died.
[   59.815404] binder: 1206:3140 transaction failed 29189/0, size 12-0 line 2992
[   59.815418] binder: send failed reply for transaction 58117, target dead
[   60.977609] binder: 2105:2308 transaction failed 29189/-22, size 168-0 line 3052
[   63.040202] FG: fg_get_battery_temp: batt temperature original:350, tuned:309
[   63.040219] lge_battery: bm_watch_work: PRESENT:1, CHG_STAT:1, THM_STAT:2, BAT_TEMP:309, BAT_VOLT:4148182, VOTE_CUR:1000000, SET_CUR:1000000 
[   63.076086] msm-dwc3 a800000.ssusb: Avail curr from USB = 2
[   63.077278] PMI: smblib_handle_switcher_power_ok: Weak charger detected: voting 500mA ICL
[   63.080014] PMI: smblib_handle_switcher_power_ok: Reverse boost detected: voting 0mA to suspend input
[   63.081886] FG: fg_get_battery_temp: batt temperature original:350, tuned:310
[   63.093639] fts_touch 5-0049: [FTS] Received Charger Disconnected Event
[   63.104656] healthd: battery l=100 v=4148 t=31.0 h=2 st=3 c=748 fc=3229000 cc=289 chg=
[   63.122546] healthd: battery l=100 v=4148 t=31.0 h=2 st=3 c=748 fc=3229000 cc=289 chg=
[   63.135620] FG: fg_get_battery_temp: batt temperature original:350, tuned:310
[   63.156383] FG: fg_get_battery_temp: batt temperature original:350, tuned:310
[   63.156897] FG: fg_get_battery_temp: batt temperature original:350, tuned:310
[   63.160481] FG: fg_get_battery_temp: batt temperature original:350, tuned:310
[   63.185029] healthd: battery l=100 v=4148 t=31.0 h=2 st=3 c=748 fc=3229000 cc=289 chg=
[   63.189015] healthd: battery l=100 v=4148 t=31.0 h=2 st=3 c=748 fc=3229000 cc=289 chg=
[   63.212484] lge_battery: bm_check_status: wake_unlocked: present[0] chg_state[2] vbus[0]
[   63.213039] FG: fg_get_battery_temp: batt temperature original:350, tuned:310
[   63.231096] FG: fg_get_battery_temp: batt temperature original:350, tuned:310
[   63.233981] healthd: battery l=100 v=4148 t=31.0 h=2 st=3 c=748 fc=3229000 cc=289 chg=
[   63.234663] msm-dwc3 a800000.ssusb: dwc3_msm_suspend: Calling suspend 1996
[   63.249755] msm-dwc3 a800000.ssusb: DWC3 in low power mode
[   63.250247] healthd: battery l=100 v=4148 t=31.0 h=2 st=3 c=748 fc=3229000 cc=289 chg=
[   63.250430] android_work: sent uevent USB_STATE=DISCONNECTED
[   63.294456] msm-dwc3 a800000.ssusb: Avail curr from USB = 0
[   70.492114] binder: undelivered transaction 86938, process died.
[   70.955204] simple_lmk: Killing .apps.translate with adj 906 to free 47564 KiB
[   70.955633] simple_lmk: Killing cloudprint:sync with adj 906 to free 31932 KiB
[   70.955787] simple_lmk: Killing oid.apps.photos with adj 904 to free 50204 KiB
[   71.060789] simple_lmk: Killing ecamera.android with adj 906 to free 32232 KiB
[   71.061074] simple_lmk: Killing webview_service with adj 906 to free 26028 KiB
[   71.061199] simple_lmk: Killing com.whatsapp with adj 904 to free 49484 KiB
[   71.164996] binder: undelivered transaction 87881, process died.
[   71.190625] simple_lmk: Killing rbandroid.sleep with adj 906 to free 54724 KiB
[   71.190775] simple_lmk: Killing android.vending with adj 906 to free 39848 KiB
[   71.191303] simple_lmk: Killing m.facebook.orca with adj 904 to free 72296 KiB
[   71.342042] simple_lmk: Killing droid.deskclock with adj 902 to free 49284 KiB
[   71.342240] simple_lmk: Killing .apps.wellbeing with adj 900 to free 47632 KiB
[   71.342529] simple_lmk: Killing le.modemservice with adj 800 to free 33648 KiB
[   71.482391] simple_lmk: Killing d.process.media with adj 800 to free 40676 KiB
[   71.482511] simple_lmk: Killing rdog.challegram with adj 700 to free 71920 KiB

No lockdep warnings!

Sultan

^ permalink raw reply	[flat|nested] 113+ messages in thread

* Re: [RFC] simple_lmk: Introduce Simple Low Memory Killer for Android
  2019-05-13 16:45                                                                       ` Sultan Alsawaf
@ 2019-05-14 16:44                                                                         ` Steven Rostedt
  2019-05-14 17:31                                                                           ` Sultan Alsawaf
  2019-05-15 14:58                                                                         ` Oleg Nesterov
  1 sibling, 1 reply; 113+ messages in thread
From: Steven Rostedt @ 2019-05-14 16:44 UTC (permalink / raw)
  To: Sultan Alsawaf
  Cc: Oleg Nesterov, Christian Brauner, Daniel Colascione,
	Suren Baghdasaryan, Tim Murray, Michal Hocko, Greg Kroah-Hartman,
	Arve Hjønnevåg, Todd Kjos, Martijn Coenen, Ingo Molnar,
	Peter Zijlstra, LKML, open list:ANDROID DRIVERS, linux-mm,
	kernel-team, Andy Lutomirski, Serge E. Hallyn, Kees Cook,
	Joel Fernandes

On Mon, 13 May 2019 09:45:55 -0700
Sultan Alsawaf <sultan@kerneltoast.com> wrote:

> On Fri, May 10, 2019 at 05:10:25PM +0200, Oleg Nesterov wrote:
> > I am starting to think I am ;)
> > 
> > If you have task1 != task2 this code
> > 
> > 	task_lock(task1);
> > 	task_lock(task2);
> > 
> > should trigger print_deadlock_bug(), task1->alloc_lock and task2->alloc_lock are
> > the "same" lock from lockdep pov, held_lock's will have the same hlock_class().  

OK, this has gotten my attention.

This thread is quite long, do you have a git repo I can look at, and
also where is the first task_lock() taken before the
find_lock_task_mm()?

-- Steve


^ permalink raw reply	[flat|nested] 113+ messages in thread

* Re: [RFC] simple_lmk: Introduce Simple Low Memory Killer for Android
  2019-05-14 16:44                                                                         ` Steven Rostedt
@ 2019-05-14 17:31                                                                           ` Sultan Alsawaf
  0 siblings, 0 replies; 113+ messages in thread
From: Sultan Alsawaf @ 2019-05-14 17:31 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Oleg Nesterov, Christian Brauner, Daniel Colascione,
	Suren Baghdasaryan, Tim Murray, Michal Hocko, Greg Kroah-Hartman,
	Arve Hjønnevåg, Todd Kjos, Martijn Coenen, Ingo Molnar,
	Peter Zijlstra, LKML, open list:ANDROID DRIVERS, linux-mm,
	kernel-team, Andy Lutomirski, Serge E. Hallyn, Kees Cook,
	Joel Fernandes

On Tue, May 14, 2019 at 12:44:53PM -0400, Steven Rostedt wrote:
> OK, this has gotten my attention.
> 
> This thread is quite long, do you have a git repo I can look at, and
> also where is the first task_lock() taken before the
> find_lock_task_mm()?
> 
> -- Steve

Hi Steve,

This is the git repo I work on: https://github.com/kerneltoast/android_kernel_google_wahoo

With the newest simple_lmk iteration being this commit: https://github.com/kerneltoast/android_kernel_google_wahoo/commit/6b145b8c28b39f7047393169117f72ea7387d91c

This repo is based off the 4.4 kernel that Google ships on the Pixel 2/2XL.

simple_lmk iterates through the entire task list more than once and locks
potential victims using find_lock_task_mm(). It keeps these potential victims
locked across the multiple times that the task list is iterated.

The locking pattern that Oleg said should cause lockdep to complain is that
iterating through the entire task list more than once can lead to locking the
same task that was locked earlier with find_lock_task_mm(), and thus deadlock.
But there is a check in simple_lmk that avoids locking potential victims that
were already found, which avoids the deadlock, but lockdep doesn't know about
the check (which is done with vtsk_is_duplicate()) and should therefore
complain.

Lockdep does not complain though.

Sultan

^ permalink raw reply	[flat|nested] 113+ messages in thread

* Re: [RFC] simple_lmk: Introduce Simple Low Memory Killer for Android
  2019-05-13 16:45                                                                       ` Sultan Alsawaf
  2019-05-14 16:44                                                                         ` Steven Rostedt
@ 2019-05-15 14:58                                                                         ` Oleg Nesterov
  2019-05-15 17:27                                                                           ` Sultan Alsawaf
  1 sibling, 1 reply; 113+ messages in thread
From: Oleg Nesterov @ 2019-05-15 14:58 UTC (permalink / raw)
  To: Sultan Alsawaf
  Cc: Christian Brauner, Daniel Colascione, Suren Baghdasaryan,
	Steven Rostedt, Tim Murray, Michal Hocko, Greg Kroah-Hartman,
	Arve Hjønnevåg, Todd Kjos, Martijn Coenen, Ingo Molnar,
	Peter Zijlstra, LKML, open list:ANDROID DRIVERS, linux-mm,
	kernel-team, Andy Lutomirski, Serge E. Hallyn, Kees Cook,
	Joel Fernandes

On 05/13, Sultan Alsawaf wrote:
>
> On Fri, May 10, 2019 at 05:10:25PM +0200, Oleg Nesterov wrote:
> > I am starting to think I am ;)
> >
> > If you have task1 != task2 this code
> >
> > 	task_lock(task1);
> > 	task_lock(task2);
> >
> > should trigger print_deadlock_bug(), task1->alloc_lock and task2->alloc_lock are
> > the "same" lock from lockdep pov, held_lock's will have the same hlock_class().
>
> Okay, I've stubbed out debug_locks_off(), and lockdep is now complaining about a
> bunch of false positives so it is _really_ enabled this time.

Could you explain in detail what exactly you did and what you see in dmesg?

Just in case, lockdep complains only once, print_circular_bug() does debug_locks_off()
so if it has already reported another false positive, __lock_acquire() will simply
return after that.

Oleg.


^ permalink raw reply	[flat|nested] 113+ messages in thread

* Re: [RFC] simple_lmk: Introduce Simple Low Memory Killer for Android
  2019-05-15 14:58                                                                         ` Oleg Nesterov
@ 2019-05-15 17:27                                                                           ` Sultan Alsawaf
  2019-05-15 18:32                                                                             ` Steven Rostedt
  2019-05-16 13:54                                                                             ` Oleg Nesterov
  0 siblings, 2 replies; 113+ messages in thread
From: Sultan Alsawaf @ 2019-05-15 17:27 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Christian Brauner, Daniel Colascione, Suren Baghdasaryan,
	Steven Rostedt, Tim Murray, Michal Hocko, Greg Kroah-Hartman,
	Arve Hjønnevåg, Todd Kjos, Martijn Coenen, Ingo Molnar,
	Peter Zijlstra, LKML, open list:ANDROID DRIVERS, linux-mm,
	kernel-team, Andy Lutomirski, Serge E. Hallyn, Kees Cook,
	Joel Fernandes

On Wed, May 15, 2019 at 04:58:32PM +0200, Oleg Nesterov wrote:
> Could you explain in detail what exactly you did and what you see in dmesg?
> 
> Just in case, lockdep complains only once, print_circular_bug() does debug_locks_off()
> so if it has already reported another false positive, __lock_acquire() will simply
> return after that.
> 
> Oleg.

This is what I did:
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 774ab79d3ec7..009e7d431a88 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -3078,6 +3078,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
        int class_idx;
        u64 chain_key;

+       BUG_ON(!debug_locks || !prove_locking);
        if (unlikely(!debug_locks))
                return 0;

diff --git a/lib/debug_locks.c b/lib/debug_locks.c
index 124fdf238b3d..4003a18420fb 100644
--- a/lib/debug_locks.c
+++ b/lib/debug_locks.c
@@ -37,6 +37,7 @@ EXPORT_SYMBOL_GPL(debug_locks_silent);
  */
 int debug_locks_off(void)
 {
+       return 0;
        if (debug_locks && __debug_locks_off()) {
                if (!debug_locks_silent) {
                        console_verbose();

And this is my full dmesg, with simple_lmk invoked for the first time near the
bottom:
[    0.000000] Booting Linux on physical CPU 0x0
[    0.000000] Initializing cgroup subsys cpuset
[    0.000000] Initializing cgroup subsys cpu
[    0.000000] Initializing cgroup subsys cpuacct
[    0.000000] Initializing cgroup subsys schedtune
[    0.000000] Linux version 4.4.169-Sultan (sultan@sultan-box) (gcc version 8.1.0 (GCC)) #15 SMP PREEMPT Wed May 15 10:11:03 PDT 2019
[    0.000000] Boot CPU: AArch64 Processor [51af8014]
[    0.000000] Machine: Qualcomm Technologies, Inc. MSM8998 v2.1
[    0.000000] Memory limited to 2048MB
[    0.000000] Reserved memory: reserved region for node 'removed_regions@85800000': base 0x0000000000000000, size 55 MiB
[    0.000000] Reserved memory: reserved region for node 'pil_ipa_gpu_region@95200000': base 0x0000000000000000, size 1 MiB
[    0.000000] Reserved memory: reserved region for node 'pil_slpi_region@94300000': base 0x0000000000000000, size 15 MiB
[    0.000000] Reserved memory: reserved region for node 'pil_mba_region@94100000': base 0x0000000000000000, size 2 MiB
[    0.000000] Reserved memory: reserved region for node 'pil_video_region@93c00000': base 0x0000000000000000, size 5 MiB
[    0.000000] Reserved memory: reserved region for node 'modem_region@8cc00000': base 0x0000000000000000, size 120 MiB
[    0.000000] Reserved memory: reserved region for node 'pil_adsp_region@0x8b200000': base 0x0000000000000000, size 26 MiB
[    0.000000] Reserved memory: reserved region for node 'spss_region@8ab00000': base 0x0000000000000000, size 7 MiB
[    0.000000] Reserved memory: reserved region for node 'alt_ramoops_region@b0e00000': base 0x0000000000000000, size 2 MiB
[    0.000000] Reserved memory: reserved region for node 'ramoops_region@b0000000': base 0x0000000000000000, size 2 MiB
[    0.000000] Reserved memory: reserved region for node 'ramoops_meta_region@affff000': base 0x0000000000000000, size 0 MiB
[    0.000000] Reserved memory: reserved region for node 'easel_mem@96000000': base 0x0000000000000000, size 64 MiB
[    0.000000] Reserved memory: allocated memory for 'linux,cma' node: base 0x0000000000000000, size 32 MiB
[    0.000000] Reserved memory: created CMA memory pool at 0x0000000000000000, size 32 MiB
[    0.000000] Reserved memory: initialized node linux,cma, compatible id shared-dma-pool
[    0.000000] Reserved memory: allocated memory for 'qseecom_region' node: base 0x0000000000000000, size 20 MiB
[    0.000000] Reserved memory: created CMA memory pool at 0x0000000000000000, size 20 MiB
[    0.000000] Reserved memory: initialized node qseecom_region, compatible id shared-dma-pool
[    0.000000] Reserved memory: allocated memory for 'adsp_region' node: base 0x0000000000000000, size 8 MiB
[    0.000000] Reserved memory: created CMA memory pool at 0x0000000000000000, size 8 MiB
[    0.000000] Reserved memory: initialized node adsp_region, compatible id shared-dma-pool
[    0.000000] Reserved memory: allocated memory for 'sp_region' node: base 0x0000000000000000, size 8 MiB
[    0.000000] Reserved memory: created CMA memory pool at 0x0000000000000000, size 8 MiB
[    0.000000] Reserved memory: initialized node sp_region, compatible id shared-dma-pool
[    0.000000] Reserved memory: allocated memory for 'secure_region' node: base 0x0000000000000000, size 92 MiB
[    0.000000] Reserved memory: created CMA memory pool at 0x0000000000000000, size 92 MiB
[    0.000000] Reserved memory: initialized node secure_region, compatible id shared-dma-pool
[    0.000000] Removed memory: created DMA memory pool at 0x0000000000000000, size 55 MiB
[    0.000000] Reserved memory: initialized node removed_regions@85800000, compatible id removed-dma-pool
[    0.000000] Removed memory: created DMA memory pool at 0x0000000000000000, size 7 MiB
[    0.000000] Reserved memory: initialized node spss_region@8ab00000, compatible id removed-dma-pool
[    0.000000] Removed memory: created DMA memory pool at 0x0000000000000000, size 26 MiB
[    0.000000] Reserved memory: initialized node pil_adsp_region@0x8b200000, compatible id removed-dma-pool
[    0.000000] Removed memory: created DMA memory pool at 0x0000000000000000, size 120 MiB
[    0.000000] Reserved memory: initialized node modem_region@8cc00000, compatible id removed-dma-pool
[    0.000000] Removed memory: created DMA memory pool at 0x0000000000000000, size 5 MiB
[    0.000000] Reserved memory: initialized node pil_video_region@93c00000, compatible id removed-dma-pool
[    0.000000] Removed memory: created DMA memory pool at 0x0000000000000000, size 2 MiB
[    0.000000] Reserved memory: initialized node pil_mba_region@94100000, compatible id removed-dma-pool
[    0.000000] Removed memory: created DMA memory pool at 0x0000000000000000, size 15 MiB
[    0.000000] Reserved memory: initialized node pil_slpi_region@94300000, compatible id removed-dma-pool
[    0.000000] Removed memory: created DMA memory pool at 0x0000000000000000, size 1 MiB
[    0.000000] Reserved memory: initialized node pil_ipa_gpu_region@95200000, compatible id removed-dma-pool
[    0.000000] Removed memory: created DMA memory pool at 0x0000000000000000, size 0 MiB
[    0.000000] Reserved memory: initialized node ramoops_meta_region@affff000, compatible id removed-dma-pool
[    0.000000] Removed memory: created DMA memory pool at 0x0000000000000000, size 2 MiB
[    0.000000] Reserved memory: initialized node ramoops_region@b0000000, compatible id removed-dma-pool
[    0.000000] Removed memory: created DMA memory pool at 0x0000000000000000, size 2 MiB
[    0.000000] Reserved memory: initialized node alt_ramoops_region@b0e00000, compatible id removed-dma-pool
[    0.000000] On node 0 totalpages: 464127
[    0.000000] DMA zone: 7252 pages used for memmap
[    0.000000] DMA zone: 0 pages reserved
[    0.000000] DMA zone: 464127 pages, LIFO batch:31
[    0.000000] psci: probing for conduit method from DT.
[    0.000000] psci: PSCIv1.0 detected in firmware.
[    0.000000] psci: Using standard PSCI v0.2 function IDs
[    0.000000] psci: MIGRATE_INFO_TYPE not supported.
[    0.000000] psci: Initializing psci_cpu_init
[    0.000000] psci: Initializing psci_cpu_init
[    0.000000] psci: Initializing psci_cpu_init
[    0.000000] psci: Initializing psci_cpu_init
[    0.000000] psci: Initializing psci_cpu_init
[    0.000000] psci: Initializing psci_cpu_init
[    0.000000] psci: Initializing psci_cpu_init
[    0.000000] PERCPU: Embedded 467 pages/cpu @0000000000000000 s1880984 r0 d31848 u1912832
[    0.000000] pcpu-alloc: s1880984 r0 d31848 u1912832 alloc=467*4096
[    0.000000] pcpu-alloc: [0] 0 [0] 1 [0] 2 [0] 3 [0] 4 [0] 5 [0] 6 [0] 7 
[    0.000000] CPU features: enabling workaround for Kryo2xx Silver erratum 845719
[    0.000000] Built 1 zonelists in Zone order, mobility grouping on.  Total pages: 456875
[    0.000000] Kernel command line: rcupdate.rcu_expedited=1 mem=2G androidboot.hardware=taimen lpm_levels.sleep_disabled=1 user_debug=31 msm_rtb.filter=0 ehci-hcd.park=3 service_locator.enable=1 swiotlb=2048 firmware_class.path=/vendor/firmware loop.max_part=7 raid=noautodetect usbcore.autosuspend=7 androidboot.dtbo_idx=12 buildvariant=user androidboot.revision=rev_10 androidboot.bootreason androidboot.hardware.sku=G011C bootcable.type=400MA androidboot.ddr_size=4096MB androidboot.ddr_info=HYNIX msm_poweroff.download_mode=0 androidboot.ramdump_enable=0 androidboot.hardware.display=MP androidboot.hardware.ufs=128GB,SAMSUNG androidboot.cid=00000000 androidboot.hardware.color=VB androidboot.hardware.variant=GA00137-US androidboot.hardware.mid=2 androidboot.boottime=1BLL:132,1BLE:549,2BLL:29,2BLE:12702,AVB:54,KL:0,KD:1741,ODT:191,SW:10000 androidboot.verifiedbootstate=orange androidboot.keymaster=1 dm="1 vroot none ro 1,0 5159992 verity 1 PARTUUID=73d84a99-00e6-d82f-51d0-c7461b899ae8 PA
 RTUUID
[    0.000000] device-mapper: init: will configure 1 devices
[    0.000000] log_buf_len individual max cpu contribution: 131072 bytes
[    0.000000] log_buf_len total cpu_extra contributions: 917504 bytes
[    0.000000] log_buf_len min size: 131072 bytes
[    0.000000] log_buf_len: 1048576 bytes
[    0.000000] early log buf free: 122244(93%)
[    0.000000] PID hash table entries: 4096 (order: 3, 32768 bytes)
[    0.000000] Dentry cache hash table entries: 262144 (order: 9, 2097152 bytes)
[    0.000000] Inode-cache hash table entries: 131072 (order: 8, 1048576 bytes)
[    0.000000] software IO TLB: mapped [mem 0xf252b000-0xf292b000] (4MB)
[    0.000000] Memory: 1514204K/1856508K available (15870K kernel code, 2071K rwdata, 8788K rodata, 8192K init, 17231K bss, 178464K reserved, 163840K cma-reserved)
[    0.000000] Virtual kernel memory layout:\x0a    modules : 0xffffff8000000000 - 0xffffff8008000000   (   128 MB)\x0a    vmalloc : 0xffffff8008000000 - 0xffffffbdbfff0000   (   246 GB)\x0a      .init : 0x0000000000000000 - 0x0000000000000000   (  8192 KB)\x0a      .text : 0x0000000000000000 - 0x0000000000000000   ( 15872 KB)\x0a    .rodata : 0x0000000000000000 - 0x0000000000000000   ( 10240 KB)\x0a      .data : 0x0000000000000000 - 0x0000000000000000   (  2072 KB)\x0a    vmemmap : 0xffffffbdc0000000 - 0xffffffbfc0000000   (     8 GB maximum)\x0a              0xffffffbe45000000 - 0xffffffbe47000000   (    32 MB actual)\x0a    fixed   : 0xffffffbffe7fd000 - 0xffffffbffec00000   (  4108 KB)\x0a    PCI I/O : 0xffffffbffee00000 - 0xffffffbfffe00000   (    16 MB)\x0a    memory  : 0xffffffe140000000 - 0xffffffe1c0000000   (  2048 MB)
[    0.000000] SLUB: HWalign=64, Order=0-3, MinObjects=0, CPUs=8, Nodes=1
[    0.000000] Running RCU self tests
[    0.000000] Preemptible hierarchical RCU implementation.
[    0.000000] \x09RCU dyntick-idle grace-period acceleration is enabled.
[    0.000000] \x09RCU lockdep checking is enabled.
[    0.000000] NR_IRQS:64 nr_irqs:64 0
[    0.000000] mpm_init_irq_domain(): Cannot find irq controller for qcom,gpio-parent
[    0.000000] MPM 1 irq mapping errored -517
[    0.000000] \x09Offload RCU callbacks from all CPUs
[    0.000000] \x09Offload RCU callbacks from CPUs: 0-7.
[    0.000000] Architected cp15 and mmio timer(s) running at 19.20MHz (virt/virt).
[    0.000000] clocksource: arch_sys_counter: mask: 0xffffffffffffff max_cycles: 0x46d987e47, max_idle_ns: 440795202767 ns
[    0.000003] sched_clock: 56 bits at 19MHz, resolution 52ns, wraps every 4398046511078ns
[    0.000025] clocksource: Switched to clocksource arch_sys_counter
[    0.001905] Lock dependency validator: Copyright (c) 2006 Red Hat, Inc., Ingo Molnar
[    0.001913] ... MAX_LOCKDEP_SUBCLASSES:  8
[    0.001917] ... MAX_LOCK_DEPTH:          48
[    0.001921] ... MAX_LOCKDEP_KEYS:        8191
[    0.001926] ... CLASSHASH_SIZE:          4096
[    0.001930] ... MAX_LOCKDEP_ENTRIES:     32768
[    0.001935] ... MAX_LOCKDEP_CHAINS:      65536
[    0.001940] ... CHAINHASH_SIZE:          32768
[    0.001945] memory used by lock dependency info: 8671 kB
[    0.001949] per task-struct memory footprint: 2688 bytes
[    0.001970] Calibrating delay loop (skipped), value calculated using timer frequency.. 38.40 BogoMIPS (lpj=192000)
[    0.001980] pid_max: default: 32768 minimum: 301
[    0.002190] Security Framework initialized
[    0.002198] SELinux:  Initializing.
[    0.002321] SELinux:  Starting in enforcing mode
[    0.002395] Mount-cache hash table entries: 4096 (order: 3, 32768 bytes)
[    0.002405] Mountpoint-cache hash table entries: 4096 (order: 3, 32768 bytes)
[    0.003810] Initializing cgroup subsys freezer
[    0.005710] sched-energy: Sched-energy-costs installed from DT
[    0.005750] ASID allocator initialised with 65536 entries
[    0.009737] NOHZ: local_softirq_pending 02
[    0.010052] NOHZ: local_softirq_pending 02
[    0.296328] Brought up 8 CPUs
[    0.296341] SMP: Total of 8 processors activated.
[    0.296349] CPU features: detected feature: GIC system register CPU interface
[    0.296356] CPU features: detected feature: 32-bit EL0 Support
[    0.296365] CPU: All CPU(s) started at EL1
[    0.296579] alternatives: patching kernel code
[    0.296864] CPU7: update max cpu_capacity 1024
[    0.300249] CPU7: update max cpu_capacity 1024
[    0.653767] clocksource: jiffies: mask: 0xffffffff max_cycles: 0xffffffff, max_idle_ns: 19112604462750000 ns
[    0.653804] futex hash table entries: 2048 (order: 6, 262144 bytes)
[    0.668499] pinctrl core: initialized pinctrl subsystem
[    0.670775] NET: Registered protocol family 16
[    0.673633] schedtune: init normalization constants...
[    0.673644] schedtune: CLUSTER[0-3]      min_pwr:     0 max_pwr:    66
[    0.673652] schedtune: CPU[0]            min_pwr:     0 max_pwr:   619
[    0.673659] schedtune: CPU[1]            min_pwr:     0 max_pwr:   619
[    0.673666] schedtune: CPU[2]            min_pwr:     0 max_pwr:   619
[    0.673675] schedtune: CPU[3]            min_pwr:     0 max_pwr:   619
[    0.673682] schedtune: CLUSTER[4-7]      min_pwr:     0 max_pwr:   203
[    0.673689] schedtune: CPU[4]            min_pwr:     0 max_pwr:  1683
[    0.673697] schedtune: CPU[5]            min_pwr:     0 max_pwr:  1683
[    0.673705] schedtune: CPU[6]            min_pwr:     0 max_pwr:  1683
[    0.673713] schedtune: CPU[7]            min_pwr:     0 max_pwr:  1683
[    0.673720] schedtune: SYSTEM            min_pwr:     0 max_pwr:  9477
[    0.673727] schedtune: using normalization constants mul: 3130245765 sh1: 1 sh2: 13
[    0.673733] schedtune: verify normalization constants...
[    0.673739] schedtune: max_pwr/2^0: 9477 => norm_pwr:  1024
[    0.673745] schedtune: max_pwr/2^1: 4738 => norm_pwr:   511
[    0.673750] schedtune: max_pwr/2^2: 2369 => norm_pwr:   255
[    0.673756] schedtune: max_pwr/2^3: 1184 => norm_pwr:   127
[    0.673761] schedtune: max_pwr/2^4:  592 => norm_pwr:    63
[    0.673767] schedtune: max_pwr/2^5:  296 => norm_pwr:    31
[    0.673775] schedtune: configured to support 5 boost groups
[    0.675945] cpuidle: using governor menu
[    0.675969] cpuidle: using governor qcom
[    0.676801] vdso: 2 pages (1 code @ 0000000000000000, 1 data @ 0000000000000000)
[    0.676815] vdso32: 2 pages (1 code @ 0000000000000000, 1 data @ 0000000000000000)
[    0.676847] hw-breakpoint: found 6 breakpoint and 4 watchpoint registers.
[    0.678790] DMA: preallocated 256 KiB pool for atomic allocations
[    0.679023] msm_smem_init: unable to create logging context
[    0.679128] msm_smd_init: unable to create SMD logging context
[    0.679134] msm_smd_init: unable to create SMSM logging context
[    0.689885] <CORE> glink_init: unable to create log context
[    0.690275] <CORE> glink_core_register_transport: unable to create log context for [dsps:smd_trans]
[    0.690570] <CORE> glink_core_register_transport: unable to create log context for [lpass:smd_trans]
[    0.690823] <CORE> glink_core_register_transport: unable to create log context for [mpss:smd_trans]
[    0.691083] <CORE> glink_core_register_transport: unable to create log context for [wcnss:smd_trans]
[    0.691336] <CORE> glink_core_register_transport: unable to create log context for [rpm:smd_trans]
[    0.693345] Failed to create IPC log0
[    0.693351] Failed to create IPC log1
[    0.693356] Failed to create IPC log2
[    0.693361] Failed to create IPC log3
[    0.693366] Failed to create IPC log4
[    0.703899] exit: IPA_USB init success!
[    0.728143] unable to find DT imem restart info node
[    0.728171] restart_handler_init failure
[    0.729717] unable to find DT imem DLOAD mode node
[    0.731089] unable to find DT imem EDLOAD mode node
[    0.733041] spmi spmi-0: PMIC arbiter version v3 (0x30000000)
[    0.756677] sps:sps is ready.
[    0.769428] (NULL device *): msm_gsi_probe:2842 failed to create IPC log, continue...
[    0.774662] platform soc:qcom,msm-adsprpc-mem: assigned reserved memory node adsp_region
[    0.776184] msm_rpm_dev_probe: APSS-RPM communication over GLINK
[    0.799741] <CORE> glink_core_register_transport: unable to create log context for [mpss:smem]
[    0.801887] <CORE> glink_core_register_transport: unable to create log context for [lpass:smem]
[    0.803573] <CORE> glink_core_register_transport: unable to create log context for [dsps:smem]
[    0.804895] <CORE> glink_core_register_transport: unable to create log context for [rpm:smem]
[    0.806695] <CORE> glink_core_register_transport: unable to create log context for [spss:mailbox]
[    0.812618] msm_rpm_trans_notify_state: glink config params: transport=(null), edge=rpm, name=rpm_requests
[    0.817405] platform 17300000.qcom,lpass: assigned reserved memory node pil_adsp_region@0x8b200000
[    0.818715] platform 4080000.qcom,mss: assigned reserved memory node modem_region@8cc00000
[    0.825292] msm-thermal soc:qcom,msm-thermal: probe_psm:Failed reading node=/soc/qcom,msm-thermal, key=qcom,pmic-sw-mode-temp. err=-22. KTM continues
[    0.825383] msm_thermal:vdd_restriction_reg_init Defer regulator vdd-dig probe
[    0.825389] msm_thermal:probe_vdd_rstr Err regulator init. err:-517. KTM continues.
[    0.825397] msm-thermal soc:qcom,msm-thermal: probe_vdd_rstr:Failed reading node=/soc/qcom,msm-thermal, key=qcom,max-freq-level. err=-517. KTM continues
[    0.842089] platform 5c00000.qcom,ssc: assigned reserved memory node pil_slpi_region@94300000
[    0.842624] platform cce0000.qcom,venus: assigned reserved memory node pil_video_region@93c00000
[    0.843573] msm_watchdog 17817000.qcom,wdt: wdog absent resource not present
[    0.844074] msm_watchdog 17817000.qcom,wdt: MSM Watchdog Initialized
[    0.844557] platform 1d0101c.qcom,spss: assigned reserved memory node spss_region@8ab00000
[    0.861513] msm_mpm_dev_probe(): Cannot get clk resource for XO: -517
[    0.920274] __of_mpm_init(): MPM driver mapping exists
[    0.936435] irq: no irq domain found for /soc/qcom,mdss_mdp@c900000 !
[    0.944570] i2c-msm-v2 c179000.i2c: probing driver i2c-msm-v2
[    0.944673] i2c-msm-v2 c179000.i2c: error on clk_get(core_clk):-517
[    0.944686] i2c-msm-v2 c179000.i2c: error probe() failed with err:-517
[    0.945749] i2c-msm-v2 c17a000.i2c: probing driver i2c-msm-v2
[    0.945826] i2c-msm-v2 c17a000.i2c: error on clk_get(core_clk):-517
[    0.945837] i2c-msm-v2 c17a000.i2c: error probe() failed with err:-517
[    0.946886] i2c-msm-v2 c1b5000.i2c: probing driver i2c-msm-v2
[    0.946962] i2c-msm-v2 c1b5000.i2c: error on clk_get(core_clk):-517
[    0.946974] i2c-msm-v2 c1b5000.i2c: error probe() failed with err:-517
[    0.947991] i2c-msm-v2 c1b6000.i2c: probing driver i2c-msm-v2
[    0.948068] i2c-msm-v2 c1b6000.i2c: error on clk_get(core_clk):-517
[    0.948079] i2c-msm-v2 c1b6000.i2c: error probe() failed with err:-517
[    0.949321] i2c-msm-v2 c1b7000.i2c: probing driver i2c-msm-v2
[    0.949396] i2c-msm-v2 c1b7000.i2c: error on clk_get(core_clk):-517
[    0.949407] i2c-msm-v2 c1b7000.i2c: error probe() failed with err:-517
[    0.957792] irq: no irq domain found for /soc/qcom,sde_kms@c900000 !
[    0.962256] platform soc:access_ramoops@0: assigned reserved memory node ramoops_meta_region@affff000
[    0.962725] platform soc:access_ramoops@1: assigned reserved memory node alt_ramoops_region@b0e00000
[    0.963184] platform soc:ramoops: assigned reserved memory node ramoops_region@b0000000
[    0.983492] console [pstore0] enabled
[    0.983836] pstore: Registered ramoops as persistent store backend
[    0.984141] ramoops: attached 0x200000@0xb0000000, ecc: 0/0
[    0.985757] bldr_log_init: can't find compatible 'htc,bldr_log'
[    1.065674] msm-thermal soc:qcom,msm-thermal: probe_psm:Failed reading node=/soc/qcom,msm-thermal, key=qcom,pmic-sw-mode-temp. err=-22. KTM continues
[    1.065995] msm_thermal:vdd_restriction_reg_init Defer regulator vdd-gfx probe
[    1.066003] msm_thermal:probe_vdd_rstr Err regulator init. err:-517. KTM continues.
[    1.066013] msm-thermal soc:qcom,msm-thermal: probe_vdd_rstr:Failed reading node=/soc/qcom,msm-thermal, key=qcom,max-freq-level. err=-517. KTM continues
[    1.067408] msm_mpm_dev_probe(): Cannot get clk resource for XO: -517
[    1.068500] i2c-msm-v2 c179000.i2c: probing driver i2c-msm-v2
[    1.068603] i2c-msm-v2 c179000.i2c: error on clk_get(core_clk):-517
[    1.068617] i2c-msm-v2 c179000.i2c: error probe() failed with err:-517
[    1.069308] i2c-msm-v2 c17a000.i2c: probing driver i2c-msm-v2
[    1.069391] i2c-msm-v2 c17a000.i2c: error on clk_get(core_clk):-517
[    1.069404] i2c-msm-v2 c17a000.i2c: error probe() failed with err:-517
[    1.070145] i2c-msm-v2 c1b5000.i2c: probing driver i2c-msm-v2
[    1.070230] i2c-msm-v2 c1b5000.i2c: error on clk_get(core_clk):-517
[    1.070243] i2c-msm-v2 c1b5000.i2c: error probe() failed with err:-517
[    1.070912] i2c-msm-v2 c1b6000.i2c: probing driver i2c-msm-v2
[    1.070995] i2c-msm-v2 c1b6000.i2c: error on clk_get(core_clk):-517
[    1.071008] i2c-msm-v2 c1b6000.i2c: error probe() failed with err:-517
[    1.071910] i2c-msm-v2 c1b7000.i2c: probing driver i2c-msm-v2
[    1.071994] i2c-msm-v2 c1b7000.i2c: error on clk_get(core_clk):-517
[    1.072007] i2c-msm-v2 c1b7000.i2c: error probe() failed with err:-517
[    1.084611] gcc_mss_q6_bimc_axi_clk: status stuck off
[    1.084652] ------------[ cut here ]------------
[    1.084661] WARNING: at ../drivers/clk/msm/clock-local2.c:670
[    1.084667] 
[    1.084676] CPU: 0 PID: 84 Comm: kworker/u16:1 Not tainted 4.4.169-Sultan #15
[    1.084682] Hardware name: Qualcomm Technologies, Inc. MSM8998 v2.1 (DT)
[    1.084700] Workqueue: deferwq deferred_probe_work_func
[    1.084710] task: 0000000000000000 task.stack: 0000000000000000
[    1.084720] PC is at branch_clk_halt_check+0xa4/0x130
[    1.084727] LR is at branch_clk_halt_check+0xa4/0x130
[    1.084735] pc : [<ffffff9f2f3cf9dc>] lr : [<ffffff9f2f3cf9dc>] pstate: 804000c5
[    1.084741] sp : ffffffe1b14af920
[    1.084746] x29: ffffffe1b14af920 x28: 0000000000000000 
[    1.084758] x27: 0000000000000000 x26: 00000000000000a0 
[    1.084770] x25: ffffff9f2fcd29f0 x24: ffffff9f30b4cce0 
[    1.084782] x23: 0000000080000000 x22: 00000000d0000000 
[    1.084794] x21: ffffff8008f8a040 x20: 0000000000000000 
[    1.084806] x19: 0000000000000000 x18: 0000000000000001 
[    1.084818] x17: 0000000000000001 x16: 00000000024636b2 
[    1.084830] x15: ffffff9f30a05d90 x14: 0000000000000010 
[    1.084842] x13: ffffff9fb1925227 x12: ffffff9f31925230 
[    1.084854] x11: 7f7f7f7f7f7f7f7f x10: ffffffe1b14af700 
[    1.084866] x9 : 00000000ffffffd0 x8 : 00000000000000fd 
[    1.084878] x7 : 0000000000000001 x6 : ffffff9f2e90dff8 
[    1.084890] x5 : 0000000000000000 x4 : 0000000000000001 
[    1.084902] x3 : 0000000000000002 x2 : 403a4a6d7a81e143 
[    1.084914] x1 : 0000000000000000 x0 : 0000000000000029 
[    1.084926] \x0aPC: 0xffffff9f2f3cf99c:
[    1.084933] f99c  d28218e0 7100069f 54000260 6a16003f 54000280 97de0f68 71000673 54ffff01
[    1.084969] f9bc  b4000518 b140071f 540004c8 f9400b01 aa1903e2 f0004c00 912ca000 97d66e11
[    1.085004] f9dc  d4210000 12800da0 a9425bf5 a94363f7 14000007 12040c21 6b17003f 54fffdc1
[    1.085039] f9fc  a9425bf5 52800000 a94363f7 a94153f3 f94023f9 a8c57bfd d65f03c0 d2989580
[    1.085076] \x0aLR: 0xffffff9f2f3cf99c:
[    1.085082] f99c  d28218e0 7100069f 54000260 6a16003f 54000280 97de0f68 71000673 54ffff01
[    1.085118] f9bc  b4000518 b140071f 540004c8 f9400b01 aa1903e2 f0004c00 912ca000 97d66e11
[    1.085154] f9dc  d4210000 12800da0 a9425bf5 a94363f7 14000007 12040c21 6b17003f 54fffdc1
[    1.085189] f9fc  a9425bf5 52800000 a94363f7 a94153f3 f94023f9 a8c57bfd d65f03c0 d2989580
[    1.085225] \x0aSP: 0xffffffe1b14af8e0:
[    1.085232] f8e0  2f3cf9dc ffffff9f b14af920 ffffffe1 2f3cf9dc ffffff9f 804000c5 00000000
[    1.085267] f900  2fcd29f0 ffffff9f 00001ffe 00000000 ffffffff ffffffff 30ede370 ffffff9f
[    1.085301] f920  b14af970 ffffffe1 2f3d0698 ffffff9f 30b4ccd8 ffffff9f 30b44608 ffffff9f
[    1.085336] f940  30b4cce0 ffffff9f 00000040 00000000 00000000 00000000 2fd52830 ffffff9f
[    1.085373] 
[    1.085402] ---[ end trace 24ec051baadc3c82 ]---
[    1.085409] Call trace:
[    1.085416] Exception stack(0xffffffe1b14af730 to 0xffffffe1b14af860)
[    1.085424] f720:                                   0000000000000000 0000008000000000
[    1.085433] f740: 00000000834df000 ffffff9f2f3cf9dc 00000000804000c5 0000000000000029
[    1.085442] f760: ffffff9f31924cb0 0000000030a22a30 ffffffe1b14af780 ffffff9f2e90e584
[    1.085450] f780: ffffffe1b14af820 ffffff9f2e90e8a8 ffffff9f30a05d90 ffffff9f2e90e8b0
[    1.085460] f7a0: ffffff9f2fd52b28 00000000d0000000 0000000080000000 ffffff9f30b4cce0
[    1.085468] f7c0: ffffff9f2fcd29f0 00000000000000a0 0000000000000000 0000000000000000
[    1.085477] f7e0: ffffff9f31925680 403a4a6d7a81e143 0000000000000029 0000000000000000
[    1.085486] f800: 403a4a6d7a81e143 0000000000000002 0000000000000001 0000000000000000
[    1.085495] f820: ffffff9f2e90dff8 0000000000000001 00000000000000fd 00000000ffffffd0
[    1.085505] f840: ffffffe1b14af700 7f7f7f7f7f7f7f7f ffffff9f31925230 ffffff9fb1925227
[    1.085514] [<ffffff9f2f3cf9dc>] branch_clk_halt_check+0xa4/0x130
[    1.085523] [<ffffff9f2f3d0698>] branch_clk_enable+0x80/0xf8
[    1.085531] [<ffffff9f2f3cc6f0>] clk_enable+0xe8/0x148
[    1.085539] [<ffffff9f2f3cd204>] __handoff_clk+0x254/0x358
[    1.085547] [<ffffff9f2f3cd678>] msm_clock_register+0x138/0x280
[    1.085554] [<ffffff9f2f3cd830>] of_msm_clock_register+0x70/0x98
[    1.085565] [<ffffff9f2f3d5ea0>] msm_gcc_8998_probe+0x220/0x398
[    1.085573] [<ffffff9f2ed921fc>] platform_drv_probe+0x3c/0xb0
[    1.085580] [<ffffff9f2ed90a78>] driver_probe_device+0x1b8/0x2f8
[    1.085589] [<ffffff9f2ed90cc0>] __device_attach_driver+0x70/0xb0
[    1.085598] [<ffffff9f2ed8f0d8>] bus_for_each_drv+0x78/0xc8
[    1.085605] [<ffffff9f2ed90738>] __device_attach+0xd0/0x138
[    1.085613] [<ffffff9f2ed90d38>] device_initial_probe+0x10/0x18
[    1.085621] [<ffffff9f2ed8f3a0>] bus_probe_device+0x98/0xa0
[    1.085628] [<ffffff9f2ed9050c>] deferred_probe_work_func+0x74/0xd0
[    1.085638] [<ffffff9f2e8c3e38>] process_one_work+0x250/0x458
[    1.085645] [<ffffff9f2e8c4174>] worker_thread+0x134/0x4e0
[    1.085654] [<ffffff9f2e8cb218>] kthread+0x100/0x108
[    1.085661] [<ffffff9f2e883260>] ret_from_fork+0x10/0x30
[    1.085677] failed to enable always-on clock gcc_mss_q6_bimc_axi_clk
[    1.086318] qcom,gcc-8998 100000.qcom,gcc: Registered GCC clocks
[    1.092962] qcom,mmsscc-8998 c8c0000.qcom,mmsscc: Registered MMSS clocks.
[    1.095349] qcom,gpucc-8998 5065000.qcom,gpucc: Registered GPU clocks (barring gfx3d clocks)
[    1.100823] msm-thermal soc:qcom,msm-thermal: probe_psm:Failed reading node=/soc/qcom,msm-thermal, key=qcom,pmic-sw-mode-temp. err=-22. KTM continues
[    1.101081] msm_thermal:vdd_restriction_reg_init Defer regulator vdd-gfx probe
[    1.101090] msm_thermal:probe_vdd_rstr Err regulator init. err:-517. KTM continues.
[    1.101099] msm-thermal soc:qcom,msm-thermal: probe_vdd_rstr:Failed reading node=/soc/qcom,msm-thermal, key=qcom,max-freq-level. err=-517. KTM continues
[    1.101664] cprh_kbss_read_fuse_data: apc0_pwrcl_corner: speed bin = 2
[    1.101675] cprh_msm8998_kbss_read_fuse_data: apc0_pwrcl_corner: CPR fusing revision = 3
[    1.101931] cprh_kbss_calculate_open_loop_voltages: apc0_pwrcl_corner: fused   LowSVS: open-loop= 598000 uV
[    1.101939] cprh_kbss_calculate_open_loop_voltages: apc0_pwrcl_corner: fused      SVS: open-loop= 656000 uV
[    1.101947] cprh_kbss_calculate_open_loop_voltages: apc0_pwrcl_corner: fused      NOM: open-loop= 768000 uV
[    1.101956] cprh_kbss_calculate_open_loop_voltages: apc0_pwrcl_corner: fused TURBO_L1: open-loop= 876000 uV
[    1.102122] cprh_kbss_calculate_target_quotients: apc0_pwrcl_corner: fused   LowSVS: quot[11]= 572
[    1.102132] cprh_kbss_calculate_target_quotients: apc0_pwrcl_corner: fused      SVS: quot[11]= 714, quot_offset[11]= 140
[    1.102141] cprh_kbss_calculate_target_quotients: apc0_pwrcl_corner: fused      NOM: quot[11]=1018, quot_offset[11]= 300
[    1.102151] cprh_kbss_calculate_target_quotients: apc0_pwrcl_corner: fused TURBO_L1: quot[11]=1287, quot_offset[11]= 265
[    1.103022] cpr3_regulator_init_ctrl: apc0: Default CPR mode = full HW closed-loop
[    1.103667] cpr3_regulator_debugfs_ctrl_add: apc0: cpr3-regulator debugfs base directory creation failed
[    1.104151] cprh_kbss_read_fuse_data: apc1_perfcl_corner: speed bin = 2
[    1.104161] cprh_msm8998_kbss_read_fuse_data: apc1_perfcl_corner: CPR fusing revision = 3
[    1.104445] cprh_kbss_calculate_open_loop_voltages: apc1_perfcl_corner: fused   LowSVS: open-loop= 626000 uV
[    1.104454] cprh_kbss_calculate_open_loop_voltages: apc1_perfcl_corner: fused      SVS: open-loop= 626000 uV
[    1.104462] cprh_kbss_calculate_open_loop_voltages: apc1_perfcl_corner: fused      NOM: open-loop= 748000 uV
[    1.104470] cprh_kbss_calculate_open_loop_voltages: apc1_perfcl_corner: fused TURBO_L1: open-loop= 966000 uV
[    1.104499] cprh_kbss_calculate_open_loop_voltages: apc1_perfcl_corner: fuse corner 1 voltage=626000 uV < fuse corner 0 voltage=634000 uV; overriding: fuse corner 1 voltage=634000
[    1.104680] cprh_kbss_calculate_target_quotients: apc1_perfcl_corner: fused   LowSVS: quot[11]=1034
[    1.104690] cprh_kbss_calculate_target_quotients: apc1_perfcl_corner: fused      SVS: quot[11]=1034, quot_offset[11]=   0
[    1.104698] cprh_kbss_calculate_target_quotients: apc1_perfcl_corner: fused      NOM: quot[11]=1387, quot_offset[11]= 350
[    1.104707] cprh_kbss_calculate_target_quotients: apc1_perfcl_corner: fused TURBO_L1: quot[11]=1891, quot_offset[11]= 500
[    1.105433] cpr3_regulator_init_ctrl: apc1: Default CPR mode = full HW closed-loop
[    1.105882] cpr3_regulator_debugfs_ctrl_add: apc1: cpr3-regulator debugfs base directory creation failed
[    1.106538] cpr3_msm8996_mmss_read_fuse_data: gfx_corner: CPR fusing revision = 2
[    1.106549] cpr3_msm8996_mmss_read_fuse_data: gfx_corner: CPR limitation = none
[    1.107305] cpr3_msm8996_mmss_calculate_open_loop_voltages: gfx_corner: fuse_corner[0] open-loop= 586000 uV
[    1.107313] cpr3_msm8996_mmss_calculate_open_loop_voltages: gfx_corner: fuse_corner[1] open-loop= 658000 uV
[    1.107321] cpr3_msm8996_mmss_calculate_open_loop_voltages: gfx_corner: fuse_corner[2] open-loop= 792000 uV
[    1.107329] cpr3_msm8996_mmss_calculate_open_loop_voltages: gfx_corner: fuse_corner[3] open-loop= 944000 uV
[    1.107754] cpr3_regulator_init_ctrl: gfx: Default CPR mode = closed-loop
[    1.108999] cpr3_regulator_debugfs_ctrl_add: gfx: cpr3-regulator debugfs base directory creation failed
[    1.121105] i2c-msm-v2 c179000.i2c: probing driver i2c-msm-v2
[    1.121312] AXI: msm_bus_scale_register_client(): msm_bus_scale_register_client: Bus driver not ready.
[    1.121321] i2c-msm-v2 c179000.i2c: msm_bus_scale_register_client(mstr-id:86):0 (not a problem)
[    1.122599] AXI: msm_bus_scale_register_client(): msm_bus_scale_register_client: Bus driver not ready.
[    1.123991] i2c-msm-v2 c17a000.i2c: probing driver i2c-msm-v2
[    1.124176] AXI: msm_bus_scale_register_client(): msm_bus_scale_register_client: Bus driver not ready.
[    1.124185] i2c-msm-v2 c17a000.i2c: msm_bus_scale_register_client(mstr-id:86):0 (not a problem)
[    1.125138] AXI: msm_bus_scale_register_client(): msm_bus_scale_register_client: Bus driver not ready.
[    1.126202] i2c-msm-v2 c1b5000.i2c: probing driver i2c-msm-v2
[    1.126388] AXI: msm_bus_scale_register_client(): msm_bus_scale_register_client: Bus driver not ready.
[    1.126398] i2c-msm-v2 c1b5000.i2c: msm_bus_scale_register_client(mstr-id:84):0 (not a problem)
[    1.127397] AXI: msm_bus_scale_register_client(): msm_bus_scale_register_client: Bus driver not ready.
[    1.128949] i2c-msm-v2 c1b6000.i2c: probing driver i2c-msm-v2
[    1.129135] AXI: msm_bus_scale_register_client(): msm_bus_scale_register_client: Bus driver not ready.
[    1.129144] i2c-msm-v2 c1b6000.i2c: msm_bus_scale_register_client(mstr-id:84):0 (not a problem)
[    1.130489] AXI: msm_bus_scale_register_client(): msm_bus_scale_register_client: Bus driver not ready.
[    1.132002] i2c-msm-v2 c1b7000.i2c: probing driver i2c-msm-v2
[    1.132195] AXI: msm_bus_scale_register_client(): msm_bus_scale_register_client: Bus driver not ready.
[    1.132204] i2c-msm-v2 c1b7000.i2c: msm_bus_scale_register_client(mstr-id:84):0 (not a problem)
[    1.133364] AXI: msm_bus_scale_register_client(): msm_bus_scale_register_client: Bus driver not ready.
[    1.136064] gfx3d_clk_src: set OPP pair(257000000 Hz: 648000 uV) on 5000000.qcom,kgsl-3d0
[    1.136134] gfx3d_clk_src: set OPP pair(710000000 Hz: 916000 uV) on 5000000.qcom,kgsl-3d0
[    1.136319] possible reason: unannotated irqs-off.
[    1.136345] ------------[ cut here ]------------
[    1.136352] WARNING: at ../drivers/regulator/core.c:2113
[    1.136359] 
[    1.136369] CPU: 0 PID: 84 Comm: kworker/u16:1 Tainted: G        W       4.4.169-Sultan #15
[    1.136376] Hardware name: Qualcomm Technologies, Inc. MSM8998 v2.1 (DT)
[    1.136393] Workqueue: deferwq deferred_probe_work_func
[    1.136404] task: 0000000000000000 task.stack: 0000000000000000
[    1.136415] PC is at regulator_enable+0x114/0x1c8
[    1.136424] LR is at regulator_enable+0x100/0x1c8
[    1.136432] pc : [<ffffff9f2ece6544>] lr : [<ffffff9f2ece6530>] pstate: 60400045
[    1.136438] sp : ffffffe1b14af8b0
[    1.136445] x29: ffffffe1b14af8b0 x28: 0000000000000000 
[    1.136460] x27: 0000000000000000 x26: 0000000000000010 
[    1.136476] x25: 0000000000000000 x24: 0000000000000000 
[    1.136492] x23: ffffffe1b1077698 x22: ffffffe1b1bda0e8 
[    1.136507] x21: ffffffe1b1066080 x20: 0000000000000000 
[    1.136523] x19: ffffffe1b1bda000 x18: 0000000000000000 
[    1.136538] x17: 0000000000000003 x16: 00000000072d9597 
[    1.136554] x15: ffffff9f30a05d90 x14: 0000000000000008 
[    1.136569] x13: 0000000000001c52 x12: 0000000000001c4b 
[    1.136584] x11: ffffff9f30c0cb00 x10: ffffffe1b105a958 
[    1.136600] x9 : ffffff9f30c0c000 x8 : ffffff9f313598e8 
[    1.136615] x7 : 0000000000000038 x6 : 0000000000000005 
[    1.136631] x5 : 0000000000000040 x4 : 00000042828e1000 
[    1.136646] x3 : 0000000000000004 x2 : 0000000000000007 
[    1.136661] x1 : 0000000000000000 x0 : ffffff9f30c04892 
[    1.136676] \x0aPC: 0xffffff9f2ece6504:
[    1.136683] 6504  2a1403e0 a94153f3 a8c37bfd d65f03c0 2a1403e0 a94153f3 a9425bf5 a8c37bfd
[    1.136730] 6524  d65f03c0 91056260 97f06859 35fffaa0 d000f8e0 91224800 39401001 35fffa21
[    1.136776] 6544  d4210000 52800021 39001001 17ffffcd aa1303e0 97fffe4c b9400e60 35fffc00
[    1.136822] 6564  17ffffce aa1303e0 97fff627 7100001f 2a0003f4 3a561804 54fff980 37f80060
[    1.136870] \x0aLR: 0xffffff9f2ece64f0:
[    1.136877] 64f0  b9402aa1 11000421 b9002aa1 942ae5cf a9425bf5 2a1403e0 a94153f3 a8c37bfd
[    1.136923] 6510  d65f03c0 2a1403e0 a94153f3 a9425bf5 a8c37bfd d65f03c0 91056260 97f06859
[    1.136970] 6530  35fffaa0 d000f8e0 91224800 39401001 35fffa21 d4210000 52800021 39001001
[    1.137017] 6550  17ffffcd aa1303e0 97fffe4c b9400e60 35fffc00 17ffffce aa1303e0 97fff627
[    1.137063] \x0aSP: 0xffffffe1b14af870:
[    1.137071] f870  2ece6530 ffffff9f b14af8b0 ffffffe1 2ece6544 ffffff9f 60400045 00000000
[    1.137118] f890  b1bda000 ffffffe1 00000000 00000000 ffffffff ffffffff 7a81e143 403a4a6d
[    1.137164] f8b0  b14af8e0 ffffffe1 2ecf8e18 ffffff9f 00000000 00000000 b1055e18 ffffffe1
[    1.137212] f8d0  b1b9ec18 ffffffe1 b1b9ec78 ffffffe1 b14af910 ffffffe1 2ece507c ffffff9f
[    1.137259] 
[    1.137267] ---[ end trace 24ec051baadc3c83 ]---
[    1.137274] Call trace:
[    1.137284] Exception stack(0xffffffe1b14af6c0 to 0xffffffe1b14af7f0)
[    1.137294] f6c0: ffffffe1b1bda000 0000008000000000 00000000834df000 ffffff9f2ece6544
[    1.137305] f6e0: 0000000060400045 0000000000000040 ffffffe1b14b0000 ffffffe1b1bda158
[    1.137316] f700: 0000000000000001 000000000004d7c0 ffffff9f31344ba8 0000000000000040
[    1.137326] f720: ffffffe1b14af730 ffffff9f2e88a160 ffffffe1b14af780 ffffff9f2e8ff8c4
[    1.137337] f740: 0000000000000007 0000000000000007 ffffffe1b14b0000 0000000000000038
[    1.137346] f760: ffffffe1b14af790 ffffff9f2e88d248 ffffffe1b14af7b0 403a4a6d7a81e143
[    1.137357] f780: ffffff9f30c04892 0000000000000000 0000000000000007 0000000000000004
[    1.137367] f7a0: 00000042828e1000 0000000000000040 0000000000000005 0000000000000038
[    1.137377] f7c0: ffffff9f313598e8 ffffff9f30c0c000 ffffffe1b105a958 ffffff9f30c0cb00
[    1.137386] f7e0: 0000000000001c4b 0000000000001c52
[    1.137396] [<ffffff9f2ece6544>] regulator_enable+0x114/0x1c8
[    1.137406] [<ffffff9f2ecf8e18>] cpr3_regulator_enable+0x68/0x150
[    1.137415] [<ffffff9f2ece507c>] _regulator_do_enable+0xdc/0x160
[    1.137424] [<ffffff9f2ece64c4>] regulator_enable+0x94/0x1c8
[    1.137436] [<ffffff9f2f3cbb00>] update_vdd+0x278/0x310
[    1.137445] [<ffffff9f2f3cc384>] vote_vdd_level+0x4c/0x90
[    1.137454] [<ffffff9f2f3cd754>] msm_clock_register+0x214/0x280
[    1.137463] [<ffffff9f2f3cd830>] of_msm_clock_register+0x70/0x98
[    1.137473] [<ffffff9f2f3d65e0>] msm_gfxcc_8998_probe+0x198/0x4b0
[    1.137482] [<ffffff9f2ed921fc>] platform_drv_probe+0x3c/0xb0
[    1.137489] [<ffffff9f2ed90a78>] driver_probe_device+0x1b8/0x2f8
[    1.137498] [<ffffff9f2ed90cc0>] __device_attach_driver+0x70/0xb0
[    1.137509] [<ffffff9f2ed8f0d8>] bus_for_each_drv+0x78/0xc8
[    1.137517] [<ffffff9f2ed90738>] __device_attach+0xd0/0x138
[    1.137525] [<ffffff9f2ed90d38>] device_initial_probe+0x10/0x18
[    1.137535] [<ffffff9f2ed8f3a0>] bus_probe_device+0x98/0xa0
[    1.137543] [<ffffff9f2ed9050c>] deferred_probe_work_func+0x74/0xd0
[    1.137555] [<ffffff9f2e8c3e38>] process_one_work+0x250/0x458
[    1.137564] [<ffffff9f2e8c4174>] worker_thread+0x134/0x4e0
[    1.137573] [<ffffff9f2e8cb218>] kthread+0x100/0x108
[    1.137583] [<ffffff9f2e883260>] ret_from_fork+0x10/0x30
[    1.137591] possible reason: unannotated irqs-on.
[    1.137614] ------------[ cut here ]------------
[    1.137620] WARNING: at ../drivers/regulator/core.c:709
[    1.137626] 
[    1.137634] CPU: 0 PID: 84 Comm: kworker/u16:1 Tainted: G        W       4.4.169-Sultan #15
[    1.137642] Hardware name: Qualcomm Technologies, Inc. MSM8998 v2.1 (DT)
[    1.137650] Workqueue: deferwq deferred_probe_work_func
[    1.137661] task: 0000000000000000 task.stack: 0000000000000000
[    1.137670] PC is at drms_uA_update+0x218/0x368
[    1.137678] LR is at drms_uA_update+0x204/0x368
[    1.137685] pc : [<ffffff9f2ece60a0>] lr : [<ffffff9f2ece608c>] pstate: 60400045
[    1.137692] sp : ffffffe1b14af860
[    1.137699] x29: ffffffe1b14af860 x28: 0000000000000000 
[    1.137713] x27: 0000000000000000 x26: 0000000000000010 
[    1.137728] x25: 0000000000000000 x24: 0000000000000000 
[    1.137743] x23: ffffffe1b1077698 x22: ffffffe1b1bda0e8 
[    1.137758] x21: ffffff9f30a05d90 x20: ffffffe1b1bda000 
[    1.137773] x19: ffffffe1b1bda000 x18: 0000000000000000 
[    1.137788] x17: 0000000000000003 x16: 00000000072d9597 
[    1.137803] x15: ffffff9f30a05d90 x14: 0000000000000010 
[    1.137818] x13: ffffff9fb1925227 x12: ffffff9f3192522f 
[    1.137833] x11: ffffff9f31924000 x10: ffffffe1b14af550 
[    1.137848] x9 : 00000000ffffffd0 x8 : 00000000000001d5 
[    1.137863] x7 : 0000000000000001 x6 : ffffff9f2e90dff8 
[    1.137878] x5 : 0000000000000000 x4 : 0000000000000001 
[    1.137894] x3 : 0000000000000004 x2 : 0000000000000007 
[    1.137909] x1 : 0000000000000000 x0 : ffffff9f30c04892 
[    1.137925] \x0aPC: 0xffffff9f2ece6060:
[    1.137932] 6060  f0007b60 91346000 97f2146d a94363f7 17ffffbc f9405800 b5fff360 128002b6
[    1.137979] 6080  17ffffb8 91056280 97f06982 35fff180 d000f8e0 91224800 39400801 35fff101
[    1.138025] 60a0  d4210000 52800021 39000801 17ffff84 f9405800 b5fff180 52800016 17ffffa9
[    1.138072] 60c0  52800016 a94363f7 17ffffa6 a90363f7 97eefbb2 f9431e80 b4000240 f9400001
[    1.138119] \x0aLR: 0xffffff9f2ece604c:
[    1.138126] 604c  f9431e80 b4000b60 f9400001 b4000b21 b94047e2 f0007b60 91346000 97f2146d
[    1.138173] 606c  a94363f7 17ffffbc f9405800 b5fff360 128002b6 17ffffb8 91056280 97f06982
[    1.138219] 608c  35fff180 d000f8e0 91224800 39400801 35fff101 d4210000 52800021 39000801
[    1.138266] 60ac  17ffff84 f9405800 b5fff180 52800016 17ffffa9 52800016 a94363f7 17ffffa6
[    1.138313] \x0aSP: 0xffffffe1b14af820:
[    1.138320] f820  2ece608c ffffff9f b14af860 ffffffe1 2ece60a0 ffffff9f 60400045 00000000
[    1.138367] f840  b1bda000 ffffffe1 b1bda000 ffffffe1 ffffffff ffffffff 00000000 00000000
[    1.138413] f860  b14af8b0 ffffffe1 2ece655c ffffff9f b1bda000 ffffffe1 00000000 00000000
[    1.138460] f880  b1066080 ffffffe1 b1bda0e8 ffffffe1 b1bda000 ffffffe1 00000000 00000000
[    1.138507] 
[    1.138513] ---[ end trace 24ec051baadc3c84 ]---
[    1.138519] Call trace:
[    1.138527] Exception stack(0xffffffe1b14af670 to 0xffffffe1b14af7a0)
[    1.138536] f660:                                   ffffffe1b1bda000 0000008000000000
[    1.138546] f680: 00000000834df000 ffffff9f2ece60a0 0000000060400045 ffffffe1b1bda0e8
[    1.138557] f6a0: ffffffe1b1077698 0000000000000000 0000000000000000 0000000000000010
[    1.138567] f6c0: 0000000000000000 0000000000000000 ffffff9f31925680 0000000000000000
[    1.138577] f6e0: ffffff9f2e90e8a8 ffffff9f00000080 ffffffe1b14af810 ffffffe1b14af810
[    1.138587] f700: ffffffe1b14af7d0 00000000ffffffc8 ffffffe1b14af740 ffffff9f2e90e8bc
[    1.138597] f720: ffffffe1b14af810 403a4a6d7a81e143 ffffff9f30c04892 0000000000000000
[    1.138607] f740: 0000000000000007 0000000000000004 0000000000000001 0000000000000000
[    1.138617] f760: ffffff9f2e90dff8 0000000000000001 00000000000001d5 00000000ffffffd0
[    1.138627] f780: ffffffe1b14af550 ffffff9f31924000 ffffff9f3192522f ffffff9fb1925227
[    1.138636] [<ffffff9f2ece60a0>] drms_uA_update+0x218/0x368
[    1.138645] [<ffffff9f2ece655c>] regulator_enable+0x12c/0x1c8
[    1.138654] [<ffffff9f2ecf8e18>] cpr3_regulator_enable+0x68/0x150
[    1.138664] [<ffffff9f2ece507c>] _regulator_do_enable+0xdc/0x160
[    1.138672] [<ffffff9f2ece64c4>] regulator_enable+0x94/0x1c8
[    1.138682] [<ffffff9f2f3cbb00>] update_vdd+0x278/0x310
[    1.138691] [<ffffff9f2f3cc384>] vote_vdd_level+0x4c/0x90
[    1.138699] [<ffffff9f2f3cd754>] msm_clock_register+0x214/0x280
[    1.138707] [<ffffff9f2f3cd830>] of_msm_clock_register+0x70/0x98
[    1.138717] [<ffffff9f2f3d65e0>] msm_gfxcc_8998_probe+0x198/0x4b0
[    1.138727] [<ffffff9f2ed921fc>] platform_drv_probe+0x3c/0xb0
[    1.138735] [<ffffff9f2ed90a78>] driver_probe_device+0x1b8/0x2f8
[    1.138743] [<ffffff9f2ed90cc0>] __device_attach_driver+0x70/0xb0
[    1.138753] [<ffffff9f2ed8f0d8>] bus_for_each_drv+0x78/0xc8
[    1.138761] [<ffffff9f2ed90738>] __device_attach+0xd0/0x138
[    1.138770] [<ffffff9f2ed90d38>] device_initial_probe+0x10/0x18
[    1.138779] [<ffffff9f2ed8f3a0>] bus_probe_device+0x98/0xa0
[    1.138787] [<ffffff9f2ed9050c>] deferred_probe_work_func+0x74/0xd0
[    1.138796] [<ffffff9f2e8c3e38>] process_one_work+0x250/0x458
[    1.138805] [<ffffff9f2e8c4174>] worker_thread+0x134/0x4e0
[    1.138813] [<ffffff9f2e8cb218>] kthread+0x100/0x108
[    1.138821] [<ffffff9f2e883260>] ret_from_fork+0x10/0x30
[    1.141957] qcom,gfxcc-8998 5065000.qcom,gfxcc: Completed registering all GPU clocks
[    1.145211] cpu-clock-osm 179c0000.qcom,cpu-clock-8998: using pwrcl speed bin 0 and pvs_ver 0
[    1.145256] cpu-clock-osm 179c0000.qcom,cpu-clock-8998: using perfcl speed bin 2 and pvs_ver 0
[    1.153216] add_opp: Set OPP pair (300000000 Hz, 636000 uv) on cpu0
[    1.153372] add_opp: Set OPP pair (1900800000 Hz, 880000 uv) on cpu0
[    1.153417] add_opp: Set OPP pair (300000000 Hz, 636000 uv) on cpu1
[    1.153568] add_opp: Set OPP pair (1900800000 Hz, 880000 uv) on cpu1
[    1.153612] add_opp: Set OPP pair (300000000 Hz, 636000 uv) on cpu2
[    1.153758] add_opp: Set OPP pair (1900800000 Hz, 880000 uv) on cpu2
[    1.153813] add_opp: Set OPP pair (300000000 Hz, 636000 uv) on cpu3
[    1.153962] add_opp: Set OPP pair (1900800000 Hz, 880000 uv) on cpu3
[    1.154005] add_opp: Set OPP pair (300000000 Hz, 632000 uv) on cpu4
[    1.154215] add_opp: Set OPP pair (2457600000 Hz, 992000 uv) on cpu4
[    1.154256] add_opp: Set OPP pair (300000000 Hz, 632000 uv) on cpu5
[    1.154472] add_opp: Set OPP pair (2457600000 Hz, 992000 uv) on cpu5
[    1.154514] add_opp: Set OPP pair (300000000 Hz, 632000 uv) on cpu6
[    1.154725] add_opp: Set OPP pair (2457600000 Hz, 992000 uv) on cpu6
[    1.154766] add_opp: Set OPP pair (300000000 Hz, 632000 uv) on cpu7
[    1.154976] add_opp: Set OPP pair (2457600000 Hz, 992000 uv) on cpu7
[    1.154985] populate_debugfs_dir: osm debugfs base directory creation failed
[    1.154992] populate_debugfs_dir: osm debugfs base directory creation failed
[    1.156539] cpu_clock_osm_driver_probe: OSM driver inited
[    1.156801] msm-thermal soc:qcom,msm-thermal: probe_psm:Failed reading node=/soc/qcom,msm-thermal, key=qcom,pmic-sw-mode-temp. err=-22. KTM continues
[    1.157141] msm-thermal soc:qcom,msm-thermal: probe_ocr:Failed reading node=/soc/qcom,msm-thermal, key=qcom,pmic-opt-curr-temp err:-22. KTM continues
[    1.157155] msm-thermal soc:qcom,msm-thermal: probe_vdd_mx:Failed reading node=/soc/qcom,msm-thermal, key=qcom,mx-restriction-temp. KTM continues
[    1.157817] msm-thermal soc:qcom,msm-thermal: probe_freq_mitigation:Failed reading node=/soc/qcom,msm-thermal, key=qcom,limit-temp. err=-22. KTM continues
[    1.157833] msm-thermal soc:qcom,msm-thermal: msm_thermal:Failed reading node=/soc/qcom,msm-thermal, key=qcom,rpm-phase-resource-type err=-22. KTM continues
[    1.157850] msm-thermal soc:qcom,msm-thermal: msm_thermal:Failed reading node=/soc/qcom,msm-thermal, key=qcom,gfx-sensor-id. err=-22. KTM continues
[    1.248867] socinfo_print: v0.11, id=292, ver=2.1, raw_id=94, raw_ver=2, hw_plat=8, hw_plat_ver=10\x0a accessory_chip=0, hw_plat_subtype=0, pmic_model=65556, pmic_die_revision=131072 foundry_id=3 serial_number=813649624 num_pmics=3
[    1.248887] msm_bus_fabric_init_driver
[    1.317390] msm_bus_dev_init_qos: Skipping QOS init for 1
[    1.317609] msm-bus-type fab-mnoc: Error: Failed to get regulator clk-camss-ahb-no-rate:-517
[    1.317741] msm-bus-type fab-mnoc: Error: Failed to get regulator clk-camss-ahb-no-rate:-517
[    1.317884] fab-mnoc supply clk-mdss-ahb-no-rate not found, using dummy regulator
[    1.318156] fab-mnoc supply clk-mdss-axi-no-rate not found, using dummy regulator
[    1.318382] msm-bus-type fab-mnoc: Error: Failed to get regulator clk-camss-ahb-no-rate:-517
[    1.318516] msm-bus-type fab-mnoc: Error: Failed to get regulator clk-camss-ahb-no-rate:-517
[    1.318641] msm-bus-type fab-mnoc: Error: Failed to get regulator clk-camss-ahb-no-rate:-517
[    1.318768] msm-bus-type fab-mnoc: Error: Failed to get regulator clk-camss-ahb-no-rate:-517
[    1.318893] msm-bus-type fab-mnoc: Error: Failed to get regulator clk-camss-ahb-no-rate:-517
[    1.319004] msm_bus_dev_init_qos: Skipping QOS init for 727
[    1.327029] SCSI subsystem initialized
[    1.329470] usbcore: registered new interface driver usbfs
[    1.329587] usbcore: registered new interface driver hub
[    1.330080] usbcore: registered new device driver usb
[    1.331151] soc:usb_nop_phy supply vcc not found, using dummy regulator
[    1.332977] qcom,qpnp-power-on 800f000.qcom,spmi:qcom,pm8998@0:qcom,power-on@800: PMIC@SID0 Power-on reason: Triggered from Hard Reset and 'warm' boot
[    1.333012] qcom,qpnp-power-on 800f000.qcom,spmi:qcom,pm8998@0:qcom,power-on@800: PMIC@SID0: Power-off reason: Triggered from PS_HOLD (PS_HOLD/MSM controlled shutdown)
[    1.333797] input: qpnp_pon as /devices/virtual/input/input0
[    1.334931] qcom,qpnp-power-on 800f000.qcom,spmi:qcom,pm8998@0:qcom,power-on@800: qcom,report-key:true
[    1.335070] qcom,qpnp-power-on 800f000.qcom,spmi:qcom,pmi8998@2:qcom,power-on@800: No PON config. specified
[    1.335129] qcom,qpnp-power-on 800f000.qcom,spmi:qcom,pmi8998@2:qcom,power-on@800: PMIC@SID2 Power-on reason: Triggered from PON1 (secondary PMIC) and 'warm' boot
[    1.335162] qcom,qpnp-power-on 800f000.qcom,spmi:qcom,pmi8998@2:qcom,power-on@800: PMIC@SID2: Power-off reason: Triggered from GP1 (Keypad_Reset1)
[    1.335235] qcom,qpnp-power-on 800f000.qcom,spmi:qcom,pmi8998@2:qcom,power-on@800: qcom,report-key:false
[    1.337033] media: Linux media interface: v0.10
[    1.337151] Linux video capture interface: v2.00
[    1.347988] EDAC MC: Ver: 3.0.0
[    1.362613] cpufreq: driver msm up and running
[    1.364360] platform soc:qcom,ion:qcom,ion-heap@22: assigned reserved memory node adsp_region
[    1.364979] platform soc:qcom,ion:qcom,ion-heap@27: assigned reserved memory node qseecom_region
[    1.365479] platform soc:qcom,ion:qcom,ion-heap@13: assigned reserved memory node sp_region
[    1.365972] platform soc:qcom,ion:qcom,ion-heap@10: assigned reserved memory node secure_region
[    1.367661] ION heap system created
[    1.368080] ION heap adsp created at 0x0000000000000000 with size 800000
[    1.368091] ION heap qsecom created at 0x0000000000000000 with size 1400000
[    1.368101] ION heap spss created at 0x0000000000000000 with size 800000
[    1.368111] ION heap secure_display created at 0x0000000000000000 with size 5c00000
[    1.368121] ION heap secure_heap created
[    1.383312] ION heap easel_mem created at 0x0000000000000000 with size 4000000
[    1.384877] PMIC@SID0: PM8998 v2.0 options: 0, 0, 0, 0
[    1.385077] PMIC@SID2: PMI8998 v2.1 options: 0, 0, 0, 0
[    1.385294] PMIC@SID4: PM8005 v2.0 options: 0, 0, 0, 0
[    1.389327] ipa ipa_smmu_wlan_cb_probe:5137 could not alloc iommu domain
[    1.389889] IPA smmu_info.s1_bypass=1 smmu_info.fast_map=0
[    1.392195] ipa ipa_smmu_wlan_cb_probe:5137 could not alloc iommu domain
[    1.405953] mdss_pll_probe: MDSS pll label = MDSS DSI 0 PLL
[    1.405967] mdss_pll_probe: mdss_pll_probe: label=MDSS DSI 0 PLL PLL SSC enabled
[    1.420188] dp_pll_lock_status: dp_pll_lock_status: C_READY status is not high. Status=0
[    1.421460] mdss_pll_probe: MDSS pll label = MDSS DSI 1 PLL
[    1.421473] mdss_pll_probe: mdss_pll_probe: label=MDSS DSI 1 PLL PLL SSC enabled
[    1.424810] mdss_pll_probe: MDSS pll label = MDSS DP PLL
[    1.426363] mdss_pll_probe: MDSS pll label = MDSS HDMI PLL
[    1.451589] arm-smmu cd00000.arm,smmu-mmss: found 20 context interrupt(s) but have 17 context banks. assuming 17 context interrupts.
[    1.462272] iommu: Adding device soc:usb_audio_qmi_dev to group 0
[    1.462679] iommu: Adding device 1c00000.qcom,pcie to group 1
[    1.463030] iommu: Adding device 18800000.qcom,icnss to group 2
[    1.463592] iommu: Adding device soc:iommu_test_device to group 3
[    1.466095] iommu: Adding device soc:qcom,msm-audio-ion to group 4
[    1.467231] iommu: Adding device c900000.qcom,sde_kms to group 5
[    1.469187] iommu: Adding device 1e00000.qcom,ipa:ipa_smmu_ap to group 6
[    1.469221] iommu: Adding device 1e00000.qcom,ipa:ipa_smmu_wlan to group 7
[    1.469253] iommu: Adding device 1e00000.qcom,ipa:ipa_smmu_uc to group 8
[    1.470590] Advanced Linux Sound Architecture Driver Initialized.
[    1.471643] Bluetooth: 0000000000000000
[    1.471663] NET: Registered protocol family 31
[    1.471668] Bluetooth: 0000000000000000
[    1.471700] Bluetooth: 0000000000000000
[    1.471710] Bluetooth: 0000000000000000
[    1.471741] Bluetooth: 0000000000000000
[    1.472610] NetLabel: Initializing
[    1.472615] NetLabel:  domain hash size = 128
[    1.472617] cfg80211: World regulatory domain updated:
[    1.472620] cfg80211:  DFS Master region: unset
[    1.472620] cfg80211:   (start_freq - end_freq @ bandwidth), (max_antenna_gain, max_eirp), (dfs_cac_time)
[    1.472621] NetLabel:  protocols = UNLABELED CIPSOv4
[    1.472630] cfg80211:   (2402000 KHz - 2472000 KHz @ 40000 KHz), (N/A, 2000 mBm), (N/A)
[    1.472634] cfg80211:   (2457000 KHz - 2482000 KHz @ 40000 KHz), (N/A, 2000 mBm), (N/A)
[    1.472638] cfg80211:   (2474000 KHz - 2494000 KHz @ 20000 KHz), (N/A, 2000 mBm), (N/A)
[    1.472642] cfg80211:   (5170000 KHz - 5250000 KHz @ 80000 KHz), (N/A, 2000 mBm), (N/A)
[    1.472645] cfg80211:   (5250000 KHz - 5330000 KHz @ 80000 KHz), (N/A, 2000 mBm), (N/A)
[    1.472649] cfg80211:   (5490000 KHz - 5710000 KHz @ 80000 KHz), (N/A, 2000 mBm), (N/A)
[    1.472652] cfg80211:   (5735000 KHz - 5835000 KHz @ 80000 KHz), (N/A, 2000 mBm), (N/A)
[    1.472656] cfg80211:   (57240000 KHz - 63720000 KHz @ 2160000 KHz), (N/A, 0 mBm), (N/A)
[    1.472788] NetLabel:  unlabeled traffic allowed by default
[    1.472900] pcie:pcie_init.
[    1.472905] pcie_init: unable to create IPC log context for pcie0-short
[    1.472908] pcie_init: unable to create IPC log context for pcie0-long
[    1.472912] pcie_init: unable to create IPC log context for pcie0-dump
[    1.472916] pcie_init: unable to create IPC log context for pcie1-short
[    1.472919] pcie_init: unable to create IPC log context for pcie1-long
[    1.472922] pcie_init: unable to create IPC log context for pcie1-dump
[    1.472926] pcie_init: unable to create IPC log context for pcie2-short
[    1.472929] pcie_init: unable to create IPC log context for pcie2-long
[    1.472932] pcie_init: unable to create IPC log context for pcie2-dump
[    1.473672] 1c00000.qcom,pcie supply vreg-3.3 not found, using dummy regulator
[    1.473920] 1c00000.qcom,pcie supply gdsc-smmu not found, using dummy regulator
[    1.474695] msm_pcie_get_resources: PCIe: RC0 can't get tcsr resource.
[    1.474721] msm_pcie_probe: PCIe: RC0 could not get pinctrl sleep state
[    1.477480] qpnp_labibb_regulator_probe: LAB/IBB registered successfully, lab_vreg enable=0 ibb_vreg enable=0 swire_control=0
[    1.480736] ipa ipa3_pre_init:4316 failed to create IPC log, continue...
[    1.482118] ipa ipa3_uc_state_check:302 uC interface not initialized
[    1.485312] ipa ipa3_set_resorce_groups_min_max_limits:3412 skip configuring ipa_rx_hps_clients from HLOS
[    1.486829] ipa ipa3_uc_state_check:302 uC interface not initialized
[    1.490286] clocksource: Switched to clocksource arch_sys_counter
[    1.492128] BUG: key 0000000000000000 not in .data!
[    1.492141] BUG: key 0000000000000000 not in .data!
[    1.492152] BUG: key 0000000000000000 not in .data!
[    1.492228] BUG: key 0000000000000000 not in .data!
[    1.492238] BUG: key 0000000000000000 not in .data!
[    1.492248] BUG: key 0000000000000000 not in .data!
[    1.496388] NET: Registered protocol family 2
[    1.497223] TCP established hash table entries: 16384 (order: 5, 131072 bytes)
[    1.497307] TCP bind hash table entries: 16384 (order: 8, 1310720 bytes)
[    1.497835] TCP: Hash tables configured (established 16384 bind 16384)
[    1.497939] UDP hash table entries: 1024 (order: 5, 196608 bytes)
[    1.498029] UDP-Lite hash table entries: 1024 (order: 5, 196608 bytes)
[    1.498418] NET: Registered protocol family 1
[    1.498462] PCI: CLS 0 bytes, default 64
[    1.500383] hw perfevents: enabled with armv8_pmuv3 PMU driver, 7 counters available
[    1.521157] Initialise system trusted keyring
[    1.521413] vmscan: error setting kswapd cpu affinity mask
[    1.541313] VFS: Disk quotas dquot_6.6.0
[    1.541499] VFS: Dquot-cache hash table entries: 512 (order 0, 4096 bytes)
[    1.543491] Registering sdcardfs 0.1
[    1.544141] fuse init (API version 7.23)
[    1.544551] SELinux:  Registering netfilter hooks
[    1.545415] pfk_ecryptfs [pfk_ecryptfs_init]: PFK ecryptfs inited successfully
[    1.545421] pfk_ext4 [pfk_ext4_init]: PFK EXT4 inited successfully
[    1.545454] pfk [pfk_init]: Driver initialized successfully
[    1.550579] Key type asymmetric registered
[    1.550616] Asymmetric key parser 'x509' registered
[    1.550669] io scheduler noop registered
[    1.550853] io scheduler cfq registered (default)
[    1.562465] msm_dss_get_res_byname: 'vbif_nrt_phys' resource not found
[    1.562479] mdss_mdp_probe+0x1e4/0x1390->msm_dss_ioremap_byname: 'vbif_nrt_phys' msm_dss_get_res_byname failed
[    1.565076] No change in context(0==0), skip
[    1.567178] mdss_mdp_pipe_addr_setup: type:0 ftchid:-1 xinid:0 num:0 rect:0 ndx:0x1 prio:0
[    1.567186] mdss_mdp_pipe_addr_setup: type:0 ftchid:-1 xinid:4 num:1 rect:0 ndx:0x2 prio:1
[    1.567193] mdss_mdp_pipe_addr_setup: type:0 ftchid:-1 xinid:8 num:2 rect:0 ndx:0x4 prio:2
[    1.567199] mdss_mdp_pipe_addr_setup: type:0 ftchid:-1 xinid:12 num:8 rect:0 ndx:0x100 prio:3
[    1.567235] mdss_mdp_pipe_addr_setup: type:2 ftchid:-1 xinid:1 num:6 rect:0 ndx:0x40 prio:4
[    1.567243] mdss_mdp_pipe_addr_setup: type:2 ftchid:-1 xinid:1 num:6 rect:1 ndx:0x40 prio:4
[    1.567248] mdss_mdp_pipe_addr_setup: type:2 ftchid:-1 xinid:5 num:7 rect:0 ndx:0x80 prio:5
[    1.567260] mdss_mdp_pipe_addr_setup: type:2 ftchid:-1 xinid:5 num:7 rect:1 ndx:0x80 prio:5
[    1.567265] mdss_mdp_pipe_addr_setup: type:2 ftchid:-1 xinid:9 num:12 rect:0 ndx:0x1000 prio:6
[    1.567277] mdss_mdp_pipe_addr_setup: type:2 ftchid:-1 xinid:9 num:12 rect:1 ndx:0x1000 prio:6
[    1.567283] mdss_mdp_pipe_addr_setup: type:2 ftchid:-1 xinid:13 num:13 rect:0 ndx:0x2000 prio:7
[    1.567293] mdss_mdp_pipe_addr_setup: type:2 ftchid:-1 xinid:13 num:13 rect:1 ndx:0x2000 prio:7
[    1.567310] mdss_mdp_pipe_addr_setup: type:3 ftchid:-1 xinid:2 num:10 rect:0 ndx:0x400 prio:0
[    1.567316] mdss_mdp_pipe_addr_setup: type:3 ftchid:-1 xinid:10 num:11 rect:0 ndx:0x800 prio:1
[    1.567327] mdss_mdp_parse_dt_handler: Error from prop qcom,mdss-pipe-sw-reset-off : u32 array read
[    1.567403] mdss_mdp_parse_dt_handler: Error from prop qcom,mdss-ib-factor-overlap : u32 array read
[    1.568253] iommu: Adding device c900000.qcom,mdss_mdp:qcom,smmu_mdp_unsec_cb to group 9
[    1.568648] iommu: Adding device c900000.qcom,mdss_mdp:qcom,smmu_mdp_sec_cb to group 10
[    1.568921] mdss_mdp_probe: mdss version = 0x30000001, bootloader display is on, num 1, intf_sel=0x00000100
[    1.571429] mdss_smmu_probe: iommu v2 domain[0] mapping and clk register successful!
[    1.571665] mdss_smmu_probe: iommu v2 domain[2] mapping and clk register successful!
[    1.574961] mdss_dsi_ctrl_probe: DSI Ctrl name = MDSS DSI CTRL->0
[    1.575523] mdss_dsi_find_panel_of_node: cmdline:0:qcom,mdss_dsi_sw43402_dsc_qhd_cmd:1:none:cfg:single_dsi panel_name:qcom,mdss_dsi_sw43402_dsc_qhd_cmd
[    1.575735] mdss_dsi_panel_init: Panel Name = SW43402 cmd mode dsc dsi panel
[    1.575855] mdss_dsi_panel_timing_from_dt: found new timing "qcom,mdss_dsi_sw43402_dsc_qhd_cmd" (0000000000000000)
[    1.575876] mdss_dsi_panel_get_dsc_cfg_np: cannot find dsc config node:
[    1.575931] mdss_dsi_parse_topology_config: cfg_node name config0 lm_split:720x720 pp_split:no
[    1.576027] mdss_dsi_parse_partial_update_caps: partial_update_enabled=0
[    1.576035] mdss_dsi_parse_panel_features: ulps feature enabled
[    1.576043] mdss_dsi_parse_panel_features: ulps during suspend feature disabled
[    1.576051] mdss_dsi_parse_dms_config: dynamic switch feature enabled: 0
[    1.576118] mdss_dsi_set_refresh_rate_range:2599, Unable to read min refresh rate
[    1.576127] mdss_dsi_set_refresh_rate_range:2614, Unable to read max refresh rate
[    1.576132] dyn_fps: min = 60, max = 60
[    1.576377] c994000.qcom,mdss_dsi_ctrl0 supply wqhd-vddio not found, using dummy regulator
[    1.576485] mdss_dsi_parse_ctrl_params:4417 Unable to read qcom,display-id, data=0000000000000000,len=20
[    1.576520] mdss_dsi_parse_gpio_params:4455: ERR_FG gpio not specified
[    1.576549] mdss_dsi_parse_gpio_params: bklt_en gpio not specified
[    1.576622] mdss_dsi_parse_gpio_params: lcd vcl gpio not specified
[    1.576701] msm_dss_get_res_byname: 'dsi_phy_regulator' resource not found
[    1.576720] mdss_dsi_retrieve_ctrl_resources+0xc4/0x230->msm_dss_ioremap_byname: 'dsi_phy_regulator' msm_dss_get_res_byname failed
[    1.576727] mdss_dsi_retrieve_ctrl_resources: ctrl_base=0000000000000000 ctrl_size=400 phy_base=0000000000000000 phy_size=7c0
[    1.577026] dsi_panel_device_register: Continuous splash enabled
[    1.577274] Unable to find fb node for device: c994000.qcom,mdss_dsi_ctrl0
[    1.577858] mdss_register_panel: adding framebuffer device c994000.qcom,mdss_dsi_ctrl0
[    1.585730] request disp ERR_DETECT irq
[    1.586155] mdss_dsi_ctrl_probe: Dsi Ctrl->0 initialized, DSI rev:0x20000000, PHY rev:0x3
[    1.586705] mdss_dsi_status_init: DSI status check interval:3000
[    1.589853] mdss_register_panel: adding framebuffer device soc:qcom,mdss_wb_panel
[    1.591230] mdss_fb_probe: fb0: split_mode:1 left:720 right:720
[    1.591733] mdss_panel_debugfs_init: Debugfs create dir failed with error: -19
[    1.591739] mdss_fb_register: FrameBuffer[0] 1440x2880 registered successfully!
[    1.591847] mdss_dsi_debugfs_setup: debugfs_create_dir dsi fail, error -19
[    1.591857] mdss_dsi_debugfs_init: Error in initilizing dsi ctrl debugfs
[    1.593140] mdss_mdp_splash_parse_dt: mem reservation for splash screen fb not present
[    1.593148] mdss_mdp_splash_parse_dt: no rsvd mem found in DT for splash screen
[    1.593154] mdss_mdp_splash_init: splash memory reserve failed
[    1.593212] mdss_fb_probe: fb1: split_mode:0 left:0 right:0
[    1.593399] mdss_fb_register: FrameBuffer[1] 640x480 registered successfully!
[    1.593733] mdss_mdp_splash_parse_dt: splash mem child node is not present
[    1.596600] glink_loopback_server_init: unable to create log context
[    1.598334] msm_smp2p_init: unable to create log context
[    1.600519] qmi_log_init: Unable to create QMI IPC logging for Req/Resp
[    1.600527] logging for Indications: Unable to create QMI IPC qmi_log_init
[    1.602842] spcom [spcom_init]: spcom driver Ver 1.0 23-Nov-2015.
[    1.603374] spcom [spcom_probe]: Driver Initialization ok.
[    1.604121] apr_init: Unable to create ipc log context
[    1.604178] audio_notifer_reg_service: service SSR_MODEM is in use
[    1.615319] memshare_child soc:qcom,memshare:qcom,client_1: for memshare_GPS segments only will be dumped.
[    1.615901] memshare_child soc:qcom,memshare:qcom,client_2: for memshare_FTM segments only will be dumped.
[    1.616322] memshare_child soc:qcom,memshare:qcom,client_3: for memshare_DIAG segments only will be dumped.
[    1.616458] In memshare_probe, Memshare probe success
[    1.617972] subsys-pil-tz soc:qcom,ipa_fws@1e08000: for ipa_fws segments only will be dumped.
[    1.620319] subsys-pil-tz cce0000.qcom,venus: for venus segments only will be dumped.
[    1.620894] subsys-pil-tz 1d0101c.qcom,spss: for spss segments only will be dumped.
[    1.621672] subsys-pil-tz soc:qcom,kgsl-hyp: for a540_zap segments only will be dumped.
[    1.623958] pil-q6v5-mss 4080000.qcom,mss: No pas_id found.
[    1.625380] msm-dcc 10b3000.dcc: DCC XPU is not specified
[    1.625862] msm-dcc 10b3000.dcc: DCC REG dump setup failed
[    1.625878] msm-dcc 10b3000.dcc: DCC SRAM dump setup failed
[    1.626369] icnss: Unable to create log context
[    1.626374] icnss: Unable to create log long context
[    1.630011] icnss 18800000.qcom,icnss: for wcss_msa0 segments only will be dumped.
[    1.630608] icnss: Unable to create debugfs -19
[    1.630798] icnss: Platform driver probed successfully
[    1.634052] msm_rpmstats_create_sysfs: Cannot find module_kset
[    1.636892] qiib_driver_data_init: unable to create logging context
[    1.640425] msm_serial_hs: Cannot create debugfs dir
[    1.641933] uart_tx_gpio is not available
[    1.641940] uart_rx_gpio is not available
[    1.641945] uart_cts_gpio is not available
[    1.641951] uart_rfr_gpio is not available
[    1.642491] msm_serial_hs c171000.uart: msm_hs_probe: error creating logging context
[    1.643172] sps: BAM device 0x0000000000000000 is not registered yet.
[    1.643271] sps_register_bam_device : unable to create IPC Logging 0 for bam 0x0000000000000000
[    1.643280] sps_register_bam_device : unable to create IPC Logging 1 for bam 0x0000000000000000
[    1.643287] sps_register_bam_device : unable to create IPC Logging 2 for bam 0x0000000000000000
[    1.643294] sps_register_bam_device : unable to create IPC Logging 3 for bam 0x0000000000000000
[    1.643300] sps_register_bam_device : unable to create IPC Logging 4 for bam 0x0000000000000000
[    1.643307] sps:BAM 0x0000000000000000 is registered.
[    1.643522] msm_serial_hs c171000.uart: msm_hs_probe: error creating tx logging context
[    1.643529] msm_serial_hs c171000.uart: msm_hs_probe: error creating rx logging context
[    1.643535] msm_serial_hs c171000.uart: msm_hs_probe: error creating usr logging context
[    1.646234] msm_serial_debugfs_init(): Cannot create loopback.0 debug entry
[    1.646364] c171000.uart: ttyHS0 at MMIO 0xc171000 (irq = 361, base_baud = 460800) is a MSM HS UART
[    1.648294] msm_serial_hs module loaded
[    1.648315] smd_tty_log_init: Unable to create IPC log
[    1.651923] random: nonblocking pool is initialized
[    1.652729] diag: Failed to create IPC logging context
[    1.669856] gdsc_gpu_gx: supplied by gfx_corner
[    1.672285] iommu: Adding device 5040000.qcom,kgsl-iommu:gfx3d_user to group 11
[    1.672843] iommu: Adding device 5040000.qcom,kgsl-iommu:gfx3d_secure to group 12
[    1.697458] brd: module loaded
[    1.706526] loop: module loaded
[    1.707753] zram: Added device: zram0
[    1.709109] QSEECOM: qseecom_probe: qseecom.qsee_version = 0x1001000
[    1.709143] QSEECOM: qseecom_retrieve_ce_data: Device does not support PFE
[    1.709156] QSEECOM: qseecom_probe: no-clock-support=0x1
[    1.709167] QSEECOM: qseecom_probe: qseecom.qsee_reentrancy_support = 2
[    1.709870] QSEECOM: qseecom_probe: qseecom.whitelist_support = 1
[    1.715936] thermal thermal_zone0: failed to read out thermal zone (-5)
[    1.716179] thermal thermal_zone1: failed to read out thermal zone (-5)
[    1.716377] thermal thermal_zone2: failed to read out thermal zone (-5)
[    1.716611] thermal thermal_zone3: failed to read out thermal zone (-5)
[    1.716622] mnh_thermal soc:mnh_thermal: mnh_thermal_probe: initialized
[    1.717031] misc easelcomm-client: registered at misc device minor 60
[    1.718755] fpc1020 soc:fp_fpc1020: found pin control fpc1020_reset_reset
[    1.718765] fpc1020 soc:fp_fpc1020: found pin control fpc1020_reset_active
[    1.718773] fpc1020 soc:fp_fpc1020: found pin control fpc1020_irq_active
[    1.718820] fpc1020 soc:fp_fpc1020: Selected 'fpc1020_reset_reset'
[    1.718843] fpc1020 soc:fp_fpc1020: Selected 'fpc1020_irq_active'
[    1.719407] fpc1020 soc:fp_fpc1020: Selected 'fpc1020_reset_active'
[    1.719682] fpc1020 soc:fp_fpc1020: Selected 'fpc1020_reset_reset'
[    1.724856] fpc1020 soc:fp_fpc1020: Selected 'fpc1020_reset_active'
[    1.730005] fpc1020 soc:fp_fpc1020: IRQ after reset 1
[    1.730013] fpc1020 soc:fp_fpc1020: fpc1020_probe: ok
[    1.730334] fpc1020_init OK
[    1.732818] misc access-metadata: registered 'metadata' 10:59, (1000@0x0000000000000000)
[    1.733057] misc access-ramoops: registered 'ramoops' 10:58, (200000@0x0000000000000000)
[    1.734211] i2c-msm-v2 c1b5000.i2c: msm_bus_scale_register_client(mstr-id:84):0xe (ok)
[    1.738585] prom_parse: Bad cell count for /soc/i2c@c1b5000/qcom,smb138x@8
[    1.738602] prom_parse: Bad cell count for /soc/i2c@c1b5000/qcom,smb138x@8
[    1.741441] PMIC@SID0: (null) v2.1 options: 0, 0, 0, 0
[    1.741480] prom_parse: Bad cell count for /soc/i2c@c1b5000/qcom,smb138x@8
[    1.741801] prom_parse: Bad cell count for /soc/i2c@c1b5000/qcom,smb138x@8
[    1.742278] prom_parse: Bad cell count for /soc/i2c@c1b5000/qcom,smb138x@8
[    1.742294] prom_parse: Bad cell count for /soc/i2c@c1b5000/qcom,smb138x@8
[    1.742615] I2C PMIC: i2c_pmic_probe: I2C PMIC probe successful
[    1.745623] nq-nci 8-0028: nqx_probe: probing NFCC NQxxx exited successfully
[    1.747572] qce 1de0000.qcedev: Qualcomm Crypto 5.3.4 device found @0x1de0000
[    1.747584] qce 1de0000.qcedev: CE device = 0x0\x0a, IO base, CE = 0x0000000000000000\x0a, Consumer (IN) PIPE 2,    Producer (OUT) PIPE 3\x0aIO base BAM = 0x0000000000000000\x0aBAM IRQ 62\x0aEngines Availability = 0x2011053
[    1.747667] sps_register_bam_device : unable to create IPC Logging 0 for bam 0x0000000000000000
[    1.747676] sps_register_bam_device : unable to create IPC Logging 1 for bam 0x0000000000000000
[    1.747686] sps_register_bam_device : unable to create IPC Logging 2 for bam 0x0000000000000000
[    1.747692] sps_register_bam_device : unable to create IPC Logging 3 for bam 0x0000000000000000
[    1.747701] sps_register_bam_device : unable to create IPC Logging 4 for bam 0x0000000000000000
[    1.747709] sps:BAM 0x0000000000000000 is registered.
[    1.748055] sps:BAM 0x0000000000000000 (va:0x0000000000000000) enabled: ver:0x27, number of pipes:16
[    1.748424] QCE50: qce_sps_init:  Qualcomm MSM CE-BAM at 0x0000000001dc4000 irq 62
[    1.762806] qcrypto 1de0000.qcrypto: Qualcomm Crypto 5.3.4 device found @0x1de0000
[    1.762819] qcrypto 1de0000.qcrypto: CE device = 0x0\x0a, IO base, CE = 0x0000000000000000\x0a, Consumer (IN) PIPE 4,    Producer (OUT) PIPE 5\x0aIO base BAM = 0x0000000000000000\x0aBAM IRQ 62\x0aEngines Availability = 0x2011053
[    1.763288] QCE50: qce_sps_init:  Qualcomm MSM CE-BAM at 0x0000000001dc4000 irq 62
[    1.774824] qcrypto 1de0000.qcrypto: qcrypto-ecb-aes
[    1.775135] qcrypto 1de0000.qcrypto: qcrypto-cbc-aes
[    1.775302] qcrypto 1de0000.qcrypto: qcrypto-ctr-aes
[    1.775474] qcrypto 1de0000.qcrypto: qcrypto-ecb-des
[    1.775605] qcrypto 1de0000.qcrypto: qcrypto-cbc-des
[    1.775749] qcrypto 1de0000.qcrypto: qcrypto-ecb-3des
[    1.775892] qcrypto 1de0000.qcrypto: qcrypto-cbc-3des
[    1.776031] qcrypto 1de0000.qcrypto: qcrypto-xts-aes
[    1.776172] qcrypto 1de0000.qcrypto: qcrypto-sha1
[    1.776308] qcrypto 1de0000.qcrypto: qcrypto-sha256
[    1.776448] qcrypto 1de0000.qcrypto: qcrypto-aead-hmac-sha1-cbc-aes
[    1.776584] qcrypto 1de0000.qcrypto: qcrypto-aead-hmac-sha1-cbc-des
[    1.776722] qcrypto 1de0000.qcrypto: qcrypto-aead-hmac-sha1-cbc-3des
[    1.776860] qcrypto 1de0000.qcrypto: qcrypto-aead-hmac-sha256-cbc-aes
[    1.776997] qcrypto 1de0000.qcrypto: qcrypto-aead-hmac-sha256-cbc-des
[    1.777142] qcrypto 1de0000.qcrypto: qcrypto-aead-hmac-sha256-cbc-3des
[    1.777276] qcrypto 1de0000.qcrypto: qcrypto-hmac-sha1
[    1.777412] qcrypto 1de0000.qcrypto: qcrypto-hmac-sha256
[    1.777544] qcrypto 1de0000.qcrypto: qcrypto-aes-ccm
[    1.777681] qcrypto 1de0000.qcrypto: qcrypto-rfc4309-aes-ccm
[    1.785009] qcom_ice_get_pdevice: found ice device 0000000000000000
[    1.785026] qcom_ice_get_pdevice: matching platform device 0000000000000000
[    1.785777] scm_call failed: func id 0x42000c02, ret: -2, syscall returns: 0xfffffffffffffffc, 0x0, 0x0
[    1.786854] ufshcd-qcom 1da4000.ufshc: ufs_qcom_parse_reg_info: Unable to find qcom,vddp-ref-clk-supply regulator, assuming enabled
[    1.788943] scsi host0: ufshcd
[    1.794260] qcom_ice 1db0000.ufsice: QC ICE 3.0.65 device found @0x0000000000000000
[    1.819424] pn81a spi1.0: pn81a_probe: device tree set '8-0028' as eSE power controller
[    1.819624] pn81a spi1.0: pn81a_probe: eSE is configured
[    1.822198] sps: BAM device 0x0000000000000000 is not registered yet.
[    1.822222] sps_register_bam_device : unable to create IPC Logging 0 for bam 0x0000000000000000
[    1.822233] sps_register_bam_device : unable to create IPC Logging 1 for bam 0x0000000000000000
[    1.822242] sps_register_bam_device : unable to create IPC Logging 2 for bam 0x0000000000000000
[    1.822248] sps_register_bam_device : unable to create IPC Logging 3 for bam 0x0000000000000000
[    1.822257] sps_register_bam_device : unable to create IPC Logging 4 for bam 0x0000000000000000
[    1.822264] sps:BAM 0x0000000000000000 is registered.
[    1.824020] sps:BAM 0x0000000000000000 (va:0x0000000000000000) enabled: ver:0x19, number of pipes:18
[    1.825938] Ethernet Channel Bonding Driver: v3.7.1 (April 27, 2011)
[    1.828506] ufshcd-qcom 1da4000.ufshc: ufshcd_print_pwr_info:[RX, TX]: gear=[1, 1], lane[1, 1], pwr[SLOWAUTO_MODE, SLOWAUTO_MODE], rate = 0
[    1.830103] tun: Universal TUN/TAP device driver, 1.6
[    1.830113] tun: (C) 1999-2004 Max Krasnyansky <maxk@qualcomm.com>
[    1.830255] PPP generic driver version 2.4.2
[    1.830498] PPP BSD Compression module registered
[    1.830511] PPP Deflate Compression module registered
[    1.830560] PPP MPPE Compression module registered
[    1.830577] NET: Registered protocol family 24
[    1.830709] wcnss_pre_alloc_init: Failed to create debugfs dir
[    1.830720] CLD80211: Initializing
[    1.830858] usbcore: registered new interface driver rtl8150
[    1.830910] usbcore: registered new interface driver r8152
[    1.830960] usbcore: registered new interface driver asix
[    1.831011] usbcore: registered new interface driver ax88179_178a
[    1.831061] usbcore: registered new interface driver cdc_ether
[    1.831260] usbcore: registered new interface driver cdc_ncm
[    1.833447] msm_sharedmem: sharedmem_register_qmi: qmi init successful
[    1.833456] msm_sharedmem: debugfs_init: Failed to create debug_fs directory
[    1.841981] msm_sharedmem: msm_sharedmem_probe: Device created for client 'rmtfs'
[    1.847173] branch_clk_set_rate: Cannot scale gcc_rx1_usb2_clkref_clk clock while HW gating is enabled. Use corresponding hw_ctl_clk to scale it
[    1.847333] pm8998_l24: supplied by pm8998_l12
[    1.851487] msm-dwc3 a800000.ssusb: unable to get dbm device
[    1.854353] usbcore: registered new interface driver usb-storage
[    1.857322] qpnp-pdphy 800f000.qcom,spmi:qcom,pmi8998@2:qcom,usb-pdphy@1700: usbpd_create failed: -517
[    1.859833] usbcore: registered new interface driver xpad
[    1.861301] [Touch] touch_get_device_type = [1]
[    1.861310] [Touch] touch_device_init, sw49408 start
[    1.862532] fts_touch 5-0049: Failed to get tspid gpio
[    1.862542] fts_touch 5-0049: Failed to get tspid2 gpio
[    1.862618] fts_touch 5-0049: irq_gpio = 125
[    1.862646] fts_touch 5-0049: client->irq = 499
[    1.862655] fts_touch 5-0049: Failed to get grip_area property
[    1.862665] fts_touch 5-0049: Failed to get vdd_gpio gpio
[    1.862672] fts_touch 5-0049: Failed to get vio_gpio gpio
[    1.862997] fts_touch 5-0049: FTS Support Hover Event
[    1.863030] fts_touch 5-0049: switch_gpio = 75
[    1.863155] fts_touch 5-0049: fts_power_ctrl: on
[    1.885109] i2c-msm-v2 c179000.i2c: msm_bus_scale_register_client(mstr-id:86):0x12 (ok)
[    1.886204] fts_touch 5-0049: FTS Enable WBCRC
[    1.923913] ufshcd-qcom 1da4000.ufshc: ufshcd_print_pwr_info:[RX, TX]: gear=[3, 3], lane[2, 2], pwr[FAST MODE, FAST MODE], rate = 2
[    1.931416] fts_touch 5-0049: get_pure_autotune_status: PureAutotune : 1 (E)
[    1.931424] fts_touch 5-0049: IC Firmware Version: 0x2064 [v0.44] IC Config Version: 0x0228 IC Main Version: 0x002C AFE Version: 0x07
[    1.931429] fts_touch 5-0049: product id: [51 59 02]
[    1.931434] fts_touch 5-0049: Chip revision: 0, fpc: 5, t_sensor: 1, site: 1, inspector_no: 69
[    1.931439] fts_touch 5-0049: date : 17.09.17 03:23:15
[    1.931844] fts_touch 5-0049: FTS Chip ID : 36 70
[    1.937106] scsi 0:0:0:49488: Well-known LUN    SAMSUNG  KLUDG8V1EE-B0C1  0400 PQ: 0 ANSI: 6
[    1.949521] scsi 0:0:0:49456: Well-known LUN    SAMSUNG  KLUDG8V1EE-B0C1  0400 PQ: 0 ANSI: 6
[    1.961104] ufshcd-qcom 1da4000.ufshc: Change queue_depth to 16
[    1.961281] scsi 0:0:0:49476: Well-known LUN    SAMSUNG  KLUDG8V1EE-B0C1  0400 PQ: 0 ANSI: 6
[    1.963145] ufshcd-qcom 1da4000.ufshc: Change queue_depth to 16
[    1.963351] scsi 0:0:0:0: Direct-Access     SAMSUNG  KLUDG8V1EE-B0C1  0400 PQ: 0 ANSI: 6
[    1.964769] ufshcd-qcom 1da4000.ufshc: Change queue_depth to 16
[    1.964976] scsi 0:0:0:1: Direct-Access     SAMSUNG  KLUDG8V1EE-B0C1  0400 PQ: 0 ANSI: 6
[    1.966020] ufshcd-qcom 1da4000.ufshc: Change queue_depth to 16
[    1.966201] scsi 0:0:0:2: Direct-Access     SAMSUNG  KLUDG8V1EE-B0C1  0400 PQ: 0 ANSI: 6
[    1.967228] ufshcd-qcom 1da4000.ufshc: Change queue_depth to 16
[    1.967407] scsi 0:0:0:3: Direct-Access     SAMSUNG  KLUDG8V1EE-B0C1  0400 PQ: 0 ANSI: 6
[    1.968432] ufshcd-qcom 1da4000.ufshc: Change queue_depth to 16
[    1.968611] scsi 0:0:0:4: Direct-Access     SAMSUNG  KLUDG8V1EE-B0C1  0400 PQ: 0 ANSI: 6
[    1.969650] ufshcd-qcom 1da4000.ufshc: Change queue_depth to 16
[    1.969832] scsi 0:0:0:5: Direct-Access     SAMSUNG  KLUDG8V1EE-B0C1  0400 PQ: 0 ANSI: 6
[    1.970869] ufshcd-qcom 1da4000.ufshc: Change queue_depth to 16
[    1.971057] scsi 0:0:0:6: Direct-Access     SAMSUNG  KLUDG8V1EE-B0C1  0400 PQ: 0 ANSI: 6
[    1.971117] fts_touch 5-0049: FTS Initialized
[    1.971333] input: touchscreen as /devices/soc/c179000.i2c/i2c-5/5-0049/input/input1
[    1.971348] fts_touch 5-0049: fts_input_open
[    1.971366] fts_touch 5-0049: fts_start_device already power on
[    1.971371] fts_touch 5-0049: FTS cmd after wakeup : h0
[    1.971966] fts_touch 5-0049: installing direct irq on GPIO 125
[    1.972508] power_supply touch: Not all required supplies found, defer probe
[    1.972758] drv2624 6-005a: drv2624_i2c_probe enter
[    1.972904] drv2624 6-005a: Looking up ti,irq-gpio property in node /soc/i2c@c17a000/drv2624@5a failed -2
[    1.973325] fts_touch 5-0049: [FTS] Received Force Cal Event [ 0 ]
[    1.974951] sd 0:0:0:0: [sda] Write Protect is off
[    1.974957] sd 0:0:0:0: [sda] Mode Sense: 00 32 00 10
[    1.975392] sd 0:0:0:1: [sdb] Write Protect is off
[    1.975398] sd 0:0:0:1: [sdb] Mode Sense: 00 32 00 10
[    1.977035] drv2624 6-005a: drv2624_i2c_probe, ID status (0x3)
[    1.977518] i2c-msm-v2 c17a000.i2c: msm_bus_scale_register_client(mstr-id:86):0x13 (ok)
[    1.978714] drv2624 6-005a: dev_init_platform_data, LRA = 155, drive_time=0x1b
[    1.982407] sda: sda1 sda2 sda3 sda4 sda5 sda6 sda7 sda8 sda9 sda10 sda11 sda12 sda13
[    1.982518] sd 0:0:0:2: [sdc] Write Protect is off
[    1.982525] sd 0:0:0:2: [sdc] Mode Sense: 00 32 00 10
[    1.982678] sdb: sdb1
[    1.985818] sd 0:0:0:3: [sdd] Write Protect is off
[    1.985824] sd 0:0:0:3: [sdd] Mode Sense: 00 32 00 10
[    1.986743] sdc: sdc1
[    1.989211] sd 0:0:0:4: [sde] Write Protect is off
[    1.989217] sd 0:0:0:4: [sde] Mode Sense: 00 32 00 10
[    1.990699] sdd: sdd1 sdd2
[    1.991423] drv2624 6-005a: drv2624 probe succeeded
[    1.991856] sd 0:0:0:5: [sdf] Write Protect is off
[    1.991862] sd 0:0:0:5: [sdf] Mode Sense: 00 32 00 10
[    1.992897] sd 0:0:0:6: [sdg] Write Protect is off
[    1.992903] sd 0:0:0:6: [sdg] Mode Sense: 00 32 00 10
[    1.993271] [LASER] stmvl53l0_init: Enter
[    1.993276] [LASER] stmvl53l0_init_i2c: Enter
[    1.993323] [LASER] stmvl53l0_init_i2c: End with rc:0
[    1.993327] [LASER] stmvl53l0_init: End
[    1.994968] [LASER] stmvl53l0_probe: Enter++
[    1.994976] [LASER] Laser_parse_dt: calib_file = /persist/ldaf_cal
[    1.994980] [LASER] Laser_parse_dt: pwdn_gpio = 0
[    1.995009] [LASER] Laser_parse_dt: pwdn_gpio = 39
[    1.995283] [LASER] Laser_parse_dt: laser_irq_gpio = 62
[    1.995350] [LASER] Laser_parse_dt: Laser_parse_dt: sensor cali_size = 0
[    1.995411] [LASER] stmvl53l0_read_calibration: Could not read calibration from /persist/ldaf_cal
[    1.995415] [LASER] stmvl53l0_parse_vdd: Enter++
[    1.995507] 9-0029 supply vdd not found, using dummy regulator
[    1.995523] [LASER] stmvl53l0_parse_vdd: End--
[    1.995526] [LASER] Laser_pinctrl_init: Enter++
[    1.995664] qcom,qpnp-rtc 800f000.qcom,spmi:qcom,pm8998@0:qcom,pm8998_rtc: rtc core: registered qpnp_rtc as rtc0
[    1.996016] sdf: sdf1 sdf2 sdf3
[    1.996397] [LASER] Laser_pinctrl_init: End--
[    1.996402] [LASER] stmvl53l0_setup: Enter++
[    1.996423] [LASER] stmvl53l0_setup: register_irq: 259
[    1.996721] fts_touch 5-0049: [FTS] Received Basic Autotune Protection Event [ 0 ]
[    1.996727] fts_touch 5-0049: [FTS] Received Force Cal Done Event
[    1.996816] sde: sde1 sde2 sde3 sde4 sde5 sde6 sde7 sde8 sde9 sde10 sde11 sde12 sde13 sde14 sde15 sde16 sde17 sde18 sde19 sde20 sde21 sde22 sde23 sde24 sde25 sde26 sde27 sde28 sde29 sde30 sde31 sde32 sde33 sde34 sde35 sde36 sde37 sde38 sde39 sde40 sde41
[    1.997311] sdg: sdg1
[    1.997865] i2c /dev entries driver
[    1.998434] [LASER] stmvl53l0_setup: interrupt is hooked
[    1.999814] input: STM VL53L0 proximity sensor as /devices/virtual/input/input2
[    2.006608] [LASER] stmvl53l0_setup: Misc device registration name:9-0029
[    2.008450] i2c-msm-v2 c1b7000.i2c: msm_bus_scale_register_client(mstr-id:84):0x14 (ok)
[    2.009577] iommu: Adding device soc:qcom,cam_smmu:msm_cam_smmu_cb1 to group 13
[    2.009905] CAM-SMMU cam_smmu_populate_sids:2326 __debug cnt = 8, cb->name: :vfe sid [0] = 3072\x0a,
[    2.009912] CAM-SMMU cam_smmu_populate_sids:2326 __debug cnt = 8, cb->name: :vfe sid [1] = 3073\x0a,
[    2.009918] CAM-SMMU cam_smmu_populate_sids:2326 __debug cnt = 8, cb->name: :vfe sid [2] = 3074\x0a,
[    2.009922] CAM-SMMU cam_smmu_populate_sids:2326 __debug cnt = 8, cb->name: :vfe sid [3] = 3075\x0a,
[    2.010149] iommu: Adding device soc:qcom,cam_smmu:msm_cam_smmu_cb2 to group 14
[    2.010379] [LASER] stmvl53l0_setupAPIFunctions: read REVISION_ID: 0x10\x0a API_VERSION: 1.1.20.2
[    2.010384] CAM-SMMU cam_smmu_populate_sids:2326 __debug cnt = 2, cb->name: :cpp sid [0] = 2560\x0a,
[    2.010385] [LASER] stmvl53l0_setupAPIFunctions: to setup API cut 1.1
[    2.010389] [LASER] stmvl53l0_init_client: Enter
[    2.010391] [LASER] stmvl53l0_init_client: Call of VL53L0_DataInit
[    2.010569] iommu: Adding device soc:qcom,cam_smmu:msm_cam_smmu_cb3 to group 15
[    2.010807] CAM-SMMU cam_smmu_populate_sids:2326 __debug cnt = 2, cb->name: :camera_fd sid [0] = 2561\x0a,
[    2.010978] iommu: Adding device soc:qcom,cam_smmu:msm_cam_smmu_cb4 to group 16
[    2.011207] CAM-SMMU cam_smmu_populate_sids:2326 __debug cnt = 2, cb->name: :jpeg_enc0 sid [0] = 2048\x0a,
[    2.011376] iommu: Adding device soc:qcom,cam_smmu:msm_cam_smmu_cb5 to group 17
[    2.011605] CAM-SMMU cam_smmu_populate_sids:2326 __debug cnt = 2, cb->name: :jpeg_dma sid [0] = 2049\x0a,
[    2.020238] CAM-SOC msm_camera_get_reg_base:864 err: mem resource csiphy_clk_mux not found
[    2.020247] csiphy_probe: no mem resource?
[    2.020769] [LASER] stmvl53l0_init_client: Call of VL53L0_StaticInit
[    2.020871] CAM-SOC msm_camera_get_reg_base:864 err: mem resource csiphy_clk_mux not found
[    2.020876] csiphy_probe: no mem resource?
[    2.021481] CAM-SOC msm_camera_get_reg_base:864 err: mem resource csiphy_clk_mux not found
[    2.021485] csiphy_probe: no mem resource?
[    2.025925] msm_actuator_platform_probe:1949 msm_actuator_platform_probe: No/Error Actuator GPIOs
[    2.027191] msm_eeprom_platform_probe failed 1712
[    2.028688] msm_eeprom_platform_probe failed 1782
[    2.029979] msm_camera_pinctrl_init:1265 Getting pinctrl handle failed
[    2.029983] msm_ois_platform_probe:1253 ERR:msm_ois_platform_probe: Error in reading OIS pinctrl
[    2.030128] qcom,ois: probe of ca0c000.qcom,cci:qcom,ois@20 failed with error -22
[    2.036228] gdsc_cpp: supplied by gdsc_camss_top
[    2.040253] MSM-CPP cpp_init_hardware:1138 CPP HW Version: 0x60010000
[    2.040267] MSM-CPP cpp_init_hardware:1156 stream_cnt:0
[    2.040581] MSM-CPP cpp_release_hardware:1219 cpp hw release done
[    2.042024] CAM-SOC msm_camera_get_reg_base:864 err: mem resource vfe_fuse not found
[    2.042029] CAM-SOC msm_camera_get_res_size:907 err: mem resource vfe_fuse not found
[    2.042134] gdsc_vfe0: supplied by gdsc_camss_top
[    2.043372] CAM-SOC msm_camera_get_reg_base:864 err: mem resource vfe_fuse not found
[    2.043377] CAM-SOC msm_camera_get_res_size:907 err: mem resource vfe_fuse not found
[    2.043478] gdsc_vfe1: supplied by gdsc_camss_top
[    2.050365] __msm_jpeg_init:1537] Jpeg Device id 0
[    2.056231] msm_fd_hw_set_dt_parms_by_name: Error property does not exist
[    2.056925] msm_vidc:  err: Failed to create debugfs for msm_vidc
[    2.059820] msm_vidc:  err: Failed to create debugfs for msm_vidc
[    2.061859] iommu: Adding device cc00000.qcom,vidc:non_secure_cb to group 18
[    2.068721] iommu: Adding device cc00000.qcom,vidc:firmware_cb to group 19
[    2.069344] iommu: Adding device cc00000.qcom,vidc:secure_bitstream_cb to group 20
[    2.072291] iommu: Adding device cc00000.qcom,vidc:secure_pixel_cb to group 21
[    2.073790] iommu: Adding device cc00000.qcom,vidc:secure_non_pixel_cb to group 22
[    2.078749] c880000.qcom,vmem supply vdd not found, using dummy regulator
[    2.080974] [LASER] stmvl53l0_read_calibration: Could not read calibration from /persist/ldaf_cal
[    2.080983] [LASER] stmvl53l0_init_client: failed: no calibration data
[    2.080991] [LASER] stmvl53l0_setup: support ver. 1.1.20.2(1.0.5.1) enabled
[    2.080998] [LASER] stmvl53l0_setup: End--
[    2.081988] [LASER] stmvl53l0_probe: Success--
[    2.083261] msm_vidc_vmem: Up and running with 4 banks of memory from [mem size 0x1e4eafc8c9]
[    2.083271] Failed to create '<debugfs>/vmem'
[    2.085975] sde_mdp_parse_dt_prop_len: <SDEROT_INFO> prop qcom,mdss-rot-xin-id : doesn't exist in device tree
[    2.087106] iommu: Adding device c900000.qcom,mdss_rotator:qcom,smmu_rot_unsec_cb to group 23
[    2.087743] iommu: Adding device c900000.qcom,mdss_rotator:qcom,smmu_rot_sec_cb to group 24
[    2.089575] No change in context(0==0), skip
[    2.091642] sde_rotator c900000.qcom,mdss_rotator: <SDEROT_INFO> SDE v4l2 rotator probe success
[    2.093989] sde_smmu_probe: <SDEROT_INFO> iommu v2 domain[0] mapping and clk register successful!
[    2.094435] sde_smmu_probe: <SDEROT_INFO> iommu v2 domain[1] mapping and clk register successful!
[    2.103486] thermal thermal_zone4: failed to read out thermal zone (-19)
[    2.104483] thermal thermal_zone5: failed to read out thermal zone (-19)
[    2.105808] PMI: smblib_get_prop_usb_port_temp: Couldn't get USB thermal zone rc=-19
[    2.106484] PMI: smblib_get_prop_usb_port_temp: Couldn't get USB thermal zone rc=-19
[    2.106492] PMI: port_overheat_work: Couldn't get USB port temp rc=-19
[    2.117350] QPNP SMB2 probed successfully usb:present=1 type=0 batt:present = 1 health = 1 charge = 3
[    2.119062] PMI: smblib_get_prop_usb_port_temp: Couldn't get USB thermal zone rc=-19
[    2.122676] lge_battery: bm_init: Couldn't get pl_psy
[    2.122685] lge_battery: lge_battery_probe: bm_init fail
[    2.123074] qpnp_adc_get_devicetree_data: Loaded custom map for usb_port_temp
[    2.123788] qpnp_vadc_read: Error reading vadc_hc channel 21
[    2.123799] thermal thermal_zone6: failed to read out thermal zone (-517)
[    2.156758] tsens_debugfs_init: Error creating TSENS directory
[    2.156785] lmh_interface:lmh_mon_init_driver Error creating debugfs dir:lmh_monitor. err:-19
[    2.159780] msm_lmh_dcvs:lmh_activate_trip lmh_activate_trip: disable not supported
[    2.161262] msm_lmh_dcvs:lmh_activate_trip lmh_activate_trip: disable not supported
[    2.161977] md: linear personality registered for level -1
[    2.162294] device-mapper: uevent: version 1.0.3
[    2.162708] device-mapper: ioctl: 4.34.0-ioctl (2015-10-28) initialised: dm-devel@redhat.com
[    2.162974] device-mapper: verity-avb: AVB error handler initialized with vbmeta device: PARTUUID=cd3ccfa6-70cc-66d2-7499-8e657eb9b121
[    2.162992] device-mapper: req-crypt: dm-req-crypt successfully initalized.\x0a
[    2.164177] bt_power_populate_dt_pinfo: bt-reset-gpio not provided in device tree
[    2.170575] ledtrig-cpu: registered to indicate activity on CPUs
[    2.170599] power_supply touch: touch: Found supply : dc
[    2.170613] power_supply touch: touch: Found supply : usb
[    2.170625] power_supply touch: touch: Found supply : main
[    2.170637] power_supply touch: touch: Found supply : pc_port
[    2.170648] power_supply touch: touch: Found supply : battery
[    2.170839] hidraw: raw HID events driver (C) Jiri Kosina
[    2.175478] usbcore: registered new interface driver usbhid
[    2.175486] usbhid: USB HID core driver
[    2.175834] ashmem: initialized
[    2.179224] qpnp_coincell_charger_show_state: enabled=Y, voltage=3200 mV, resistance=800 ohm
[    2.180841] fts_touch 5-0049: [FTS] Received Charger Connected Event
[    2.184593] bimc-bwmon 1008000.qcom,cpu-bwmon: BW HWmon governor registered.
[    2.189563] devfreq soc:qcom,cpubw: Couldn't update frequency transition information.
[    2.190210] devfreq soc:qcom,mincpubw: Couldn't update frequency transition information.
[    2.190845] devfreq soc:qcom,memlat-cpu0: Couldn't update frequency transition information.
[    2.191310] devfreq soc:qcom,memlat-cpu4: Couldn't update frequency transition information.
[    2.194502] probe: Failed to create IPC log context
[    2.203106] [smem]htc_radio_smem_init.
[    2.204314] usbcore: registered new interface driver snd-usb-audio
[    2.213660] tas2557s 7-004d: tas2557_i2c_probe enter
[    2.220549] bcm15602 9-0008: bcm15602_resetb_irq_handler: completing reset
[    2.220975] bcm15602 9-0008: Part: 0x5602, Rev: 2, Vendor Rev: 0x21
[    2.221174] bcm15602 9-0008: Last reboot reason: normal
[    2.270596] tas2557s 7-004d: PG2.1 found
[    2.270913] tas2557s 7-004d: tas2557_register_codec, enter
[    2.271087] tas2557s 7-004d: tas2557_register_misc, leave
[    2.271093] tas2557s 7-004d: tiload_driver_init
[    2.271116] tas2557s 7-004d: allocated Major Number: 227
[    2.271391] tas2557s 7-004d: Registered TiLoad driver, Major number: 227
[    2.300347] msm-dai-tdm soc:qcom,msm-dai-tdm-pri-rx: msm_dai_tdm_q6_probe: dev_name: soc:qcom,msm-dai-tdm-pri-rx group_id: 0x9100
[    2.300828] msm-dai-q6-tdm soc:qcom,msm-dai-tdm-pri-rx:qcom,msm-dai-q6-tdm-pri-rx-0: msm_dai_q6_tdm_dev_probe: dev_name: soc:qcom,msm-dai-tdm-pri-rx:qcom,msm-dai-q6-tdm-pri-rx-0 dev_id: 0x9000
[    2.300840] msm-dai-q6-tdm soc:qcom,msm-dai-tdm-pri-rx:qcom,msm-dai-q6-tdm-pri-rx-0: msm_dai_q6_tdm_dev_probe: Custom tdm header not supported
[    2.300897] msm-dai-tdm soc:qcom,msm-dai-tdm-pri-tx: msm_dai_tdm_q6_probe: dev_name: soc:qcom,msm-dai-tdm-pri-tx group_id: 0x9101
[    2.301352] msm-dai-q6-tdm soc:qcom,msm-dai-tdm-pri-tx:qcom,msm-dai-q6-tdm-pri-tx-0: msm_dai_q6_tdm_dev_probe: dev_name: soc:qcom,msm-dai-tdm-pri-tx:qcom,msm-dai-q6-tdm-pri-tx-0 dev_id: 0x9001
[    2.301362] msm-dai-q6-tdm soc:qcom,msm-dai-tdm-pri-tx:qcom,msm-dai-q6-tdm-pri-tx-0: msm_dai_q6_tdm_dev_probe: Custom tdm header not supported
[    2.301413] msm-dai-tdm soc:qcom,msm-dai-tdm-sec-rx: msm_dai_tdm_q6_probe: dev_name: soc:qcom,msm-dai-tdm-sec-rx group_id: 0x9110
[    2.301875] msm-dai-q6-tdm soc:qcom,msm-dai-tdm-sec-rx:qcom,msm-dai-q6-tdm-sec-rx-0: msm_dai_q6_tdm_dev_probe: dev_name: soc:qcom,msm-dai-tdm-sec-rx:qcom,msm-dai-q6-tdm-sec-rx-0 dev_id: 0x9010
[    2.301886] msm-dai-q6-tdm soc:qcom,msm-dai-tdm-sec-rx:qcom,msm-dai-q6-tdm-sec-rx-0: msm_dai_q6_tdm_dev_probe: Custom tdm header not supported
[    2.301941] msm-dai-tdm soc:qcom,msm-dai-tdm-sec-tx: msm_dai_tdm_q6_probe: dev_name: soc:qcom,msm-dai-tdm-sec-tx group_id: 0x9111
[    2.302394] msm-dai-q6-tdm soc:qcom,msm-dai-tdm-sec-tx:qcom,msm-dai-q6-tdm-sec-tx-0: msm_dai_q6_tdm_dev_probe: dev_name: soc:qcom,msm-dai-tdm-sec-tx:qcom,msm-dai-q6-tdm-sec-tx-0 dev_id: 0x9011
[    2.302406] msm-dai-q6-tdm soc:qcom,msm-dai-tdm-sec-tx:qcom,msm-dai-q6-tdm-sec-tx-0: msm_dai_q6_tdm_dev_probe: Custom tdm header not supported
[    2.302456] msm-dai-tdm soc:qcom,msm-dai-tdm-tert-rx: msm_dai_tdm_q6_probe: dev_name: soc:qcom,msm-dai-tdm-tert-rx group_id: 0x9120
[    2.302908] msm-dai-q6-tdm soc:qcom,msm-dai-tdm-tert-rx:qcom,msm-dai-q6-tdm-tert-rx-0: msm_dai_q6_tdm_dev_probe: dev_name: soc:qcom,msm-dai-tdm-tert-rx:qcom,msm-dai-q6-tdm-tert-rx-0 dev_id: 0x9020
[    2.302919] msm-dai-q6-tdm soc:qcom,msm-dai-tdm-tert-rx:qcom,msm-dai-q6-tdm-tert-rx-0: msm_dai_q6_tdm_dev_probe: Custom tdm header not supported
[    2.302968] msm-dai-tdm soc:qcom,msm-dai-tdm-tert-tx: msm_dai_tdm_q6_probe: dev_name: soc:qcom,msm-dai-tdm-tert-tx group_id: 0x9121
[    2.303421] msm-dai-q6-tdm soc:qcom,msm-dai-tdm-tert-tx:qcom,msm-dai-q6-tdm-tert-tx-0: msm_dai_q6_tdm_dev_probe: dev_name: soc:qcom,msm-dai-tdm-tert-tx:qcom,msm-dai-q6-tdm-tert-tx-0 dev_id: 0x9021
[    2.303433] msm-dai-q6-tdm soc:qcom,msm-dai-tdm-tert-tx:qcom,msm-dai-q6-tdm-tert-tx-0: msm_dai_q6_tdm_dev_probe: Custom tdm header not supported
[    2.303485] msm-dai-tdm soc:qcom,msm-dai-tdm-quat-rx: msm_dai_tdm_q6_probe: dev_name: soc:qcom,msm-dai-tdm-quat-rx group_id: 0x9130
[    2.303937] msm-dai-q6-tdm soc:qcom,msm-dai-tdm-quat-rx:qcom,msm-dai-q6-tdm-quat-rx-0: msm_dai_q6_tdm_dev_probe: dev_name: soc:qcom,msm-dai-tdm-quat-rx:qcom,msm-dai-q6-tdm-quat-rx-0 dev_id: 0x9030
[    2.303949] msm-dai-q6-tdm soc:qcom,msm-dai-tdm-quat-rx:qcom,msm-dai-q6-tdm-quat-rx-0: msm_dai_q6_tdm_dev_probe: Custom tdm header not supported
[    2.304001] msm-dai-tdm soc:qcom,msm-dai-tdm-quat-tx: msm_dai_tdm_q6_probe: dev_name: soc:qcom,msm-dai-tdm-quat-tx group_id: 0x9131
[    2.304451] msm-dai-q6-tdm soc:qcom,msm-dai-tdm-quat-tx:qcom,msm-dai-q6-tdm-quat-tx-0: msm_dai_q6_tdm_dev_probe: dev_name: soc:qcom,msm-dai-tdm-quat-tx:qcom,msm-dai-q6-tdm-quat-tx-0 dev_id: 0x9031
[    2.304464] msm-dai-q6-tdm soc:qcom,msm-dai-tdm-quat-tx:qcom,msm-dai-q6-tdm-quat-tx-0: msm_dai_q6_tdm_dev_probe: Custom tdm header not supported
[    2.326178] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    2.326766] GACT probability NOT on
[    2.326824] Mirror/redirect action on
[    2.326849] u32 classifier
[    2.326852] Actions configured
[    2.326967] Netfilter messages via NETLINK v0.30.
[    2.327115] nf_conntrack version 0.5.0 (16384 buckets, 65536 max)
[    2.327639] ctnetlink v0.93: registering with nfnetlink.
[    2.328748] xt_time: kernel timezone is -0000
[    2.328792] wireguard: WireGuard 0.0.20190406 loaded. See www.wireguard.com for information.
[    2.328796] wireguard: Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
[    2.328843] IPv4 over IPsec tunneling driver
[    2.329806] ip_tables: (C) 2000-2006 Netfilter Core Team
[    2.330536] arp_tables: (C) 2002 David S. Miller
[    2.330704] Initializing XFRM netlink socket
[    2.331589] NET: Registered protocol family 10
[    2.333881] mip6: Mobile IPv6
[    2.333913] ip6_tables: (C) 2000-2006 Netfilter Core Team
[    2.334924] sit: IPv6 over IPv4 tunneling driver
[    2.336624] NET: Registered protocol family 17
[    2.336652] NET: Registered protocol family 15
[    2.336784] bridge: automatic filtering via arp/ip/ip6tables has been deprecated. Update your scripts to load br_netfilter if you need this.
[    2.336794] Ebtables v2.0 registered
[    2.336942] l2tp_core: L2TP core driver, V2.0
[    2.336958] l2tp_ppp: PPPoL2TP kernel driver, V2.0
[    2.336964] l2tp_ip: L2TP IP encapsulation support (L2TPv3)
[    2.336988] l2tp_netlink: L2TP netlink interface
[    2.337022] l2tp_eth: L2TP ethernet pseudowire support (L2TPv3)
[    2.337027] l2tp_ip6: L2TP IP encapsulation support for IPv6 (L2TPv3)
[    2.338286] NET: Registered protocol family 27
[    2.338688] IPC_RTR: ipc_router_create_log_ctx: Unable to create IPC logging for [local_IPCRTR]
[    2.351713] subsys-pil-tz 17300000.qcom,lpass: for adsp segments only will be dumped.
[    2.354232] subsys-pil-tz 5c00000.qcom,ssc: for slpi segments only will be dumped.
[    2.355400] pil-q6v5-mss 4080000.qcom,mss: No pas_id found.
[    2.356112] platform 4080000.qcom,mss:qcom,mba-mem@0: assigned reserved memory node pil_mba_region@94100000
[    2.357133] pil-q6v5-mss 4080000.qcom,mss: for modem segments only will be dumped.
[    2.362700] sps_register_bam_device : unable to create IPC Logging 0 for bam 0x0000000000000000
[    2.362706] sps_register_bam_device : unable to create IPC Logging 1 for bam 0x0000000000000000
[    2.362711] sps_register_bam_device : unable to create IPC Logging 2 for bam 0x0000000000000000
[    2.362714] sps_register_bam_device : unable to create IPC Logging 3 for bam 0x0000000000000000
[    2.362719] sps_register_bam_device : unable to create IPC Logging 4 for bam 0x0000000000000000
[    2.362723] sps:BAM 0x0000000000000000 is registered.
[    2.363766] Invalid index Defaulting curr to 0
[    2.365640] qpnp-pdphy 800f000.qcom,spmi:qcom,pmi8998@2:qcom,usb-pdphy@1700: usbpd_create failed: -517
[    2.375139] thermal thermal_zone37: failed to read out thermal zone (-19)
[    2.379249] qcom,fg-gen3 800f000.qcom,spmi:qcom,pmi8998@2:qpnp,fg: Start WAR to set SP_SAT_CC_CLR_AUTO_BIT
[    2.379578] qcom,fg-gen3 800f000.qcom,spmi:qcom,pmi8998@2:qpnp,fg: WAR: 0x13 reg value: 0xbf
[    2.379584] qcom,fg-gen3 800f000.qcom,spmi:qcom,pmi8998@2:qpnp,fg: WAR: Bit 3 has been set, no WAR needed
[    2.379686] FG: comp_temp_by_chg_current: failed to get POWER_SUPPLY_PROP_CURRENT_NOW rc = -61
[    2.379690] FG: fg_get_battery_temp: batt temperature original:430, tuned:375
[    2.379695] FG: fg_gen3_probe: battery SOC:80 voltage: 4128407uV temp: 375 id: 0KOhms
[    2.389680] SMB138X: smb138x_parallel_get_prop: parallel power supply get prop 4 not supported
[    2.390262] FG: fg_psy_get_property: unsupported property 4
[    2.390384] FG: fg_get_battery_temp: batt temperature original:430, tuned:375
[    2.390630] SMB138X: smb138x_probe: SMB138X probed successfully mode=1
[    2.391955] lge_battery: bm_init: Battery id is zero, deferring probe!
[    2.391960] lge_battery: lge_battery_probe: bm_init fail
[    2.393146] SMB138X: smb138x_get_prop_charger_temp: Couldnt read chg temp at 1th iteration rc = -61
[    2.393185] FG: fg_get_time_to_full: battery profile is not loaded
[    2.394579] SMB138X: smb138x_get_prop_connector_health: Couldn't read connector temperature rc=-61
[    2.394602] SMB138X: smb138x_parallel_get_prop: parallel power supply get prop 4 not supported
[    2.396923] SMB138X: smb138x_get_prop_charger_temp: Couldnt read chg temp at 1th iteration rc = -61
[    2.398173] SMB138X: smb138x_get_prop_connector_health: Couldn't read connector temperature rc=-61
[    2.401823] FG: fg_get_battery_temp: batt temperature original:430, tuned:375
[    2.403399] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    2.403783] ipa ipa3_smp2p_probe:5521 failed to enable irq wake
[    2.404948] FG: fg_get_battery_temp: batt temperature original:430, tuned:375
[    2.407220] Invalid index Defaulting curr to 0
[    2.411119] lge_battery: bm_init: Battery id is zero, deferring probe!
[    2.411126] lge_battery: lge_battery_probe: bm_init fail
[    2.419511] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    2.421952] msm-dwc3 a800000.ssusb: charger detection in progress
[    2.422235] lge_battery: bm_init: Battery id is zero, deferring probe!
[    2.422239] lge_battery: lge_battery_probe: bm_init fail
[    2.430487] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    2.431179] lge_battery: bm_init: Battery id is zero, deferring probe!
[    2.431184] lge_battery: lge_battery_probe: bm_init fail
[    2.432649] EDAC DEVICE0: Giving out device to module soc:arm64-cpu-erp controller cache: DEV soc:arm64-cpu-erp (POLLED)
[    2.433397] ARM64 CPU ERP: Could not find <cci-irq> IRQ property. Proceeding anyway.
[    2.440007] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    2.441176] Registered cp15_barrier emulation handler
[    2.441192] Registered setend emulation handler
[    2.441783] registered taskstats version 1
[    2.441792] Loading compiled-in X.509 certificates
[    2.444571] Loaded X.509 cert 'Easel: 2d9cb8fb66a52266cb3b00b3e3db335fadf908e4'
[    2.445225] msm_smem 86000000.qcom,smem: for smem segments only will be dumped.
[    2.445540] spss_utils [spss_init]: spss-utils driver Ver 1.2 13-Jan-2017.
[    2.446104] spss_utils [spss_probe]: Initialization completed ok, firmware_name [spss2p].
[    2.448187] fastrpc soc:qcom,msm-adsprpc-mem: for adsp_rh segments only will be dumped.
[    2.448618] iommu: Adding device soc:qcom,msm_fastrpc:qcom,msm_fastrpc_cpz_cb1 to group 25
[    2.450818] iommu: Adding device soc:qcom,msm_fastrpc:qcom,msm_fastrpc_compute_cb1 to group 26
[    2.451875] iommu: Adding device soc:qcom,msm_fastrpc:qcom,msm_fastrpc_compute_cb2 to group 27
[    2.452818] iommu: Adding device soc:qcom,msm_fastrpc:qcom,msm_fastrpc_compute_cb3 to group 28
[    2.453712] iommu: Adding device soc:qcom,msm_fastrpc:qcom,msm_fastrpc_compute_cb4 to group 29
[    2.454598] iommu: Adding device soc:qcom,msm_fastrpc:qcom,msm_fastrpc_compute_cb6 to group 30
[    2.455488] iommu: Adding device soc:qcom,msm_fastrpc:qcom,msm_fastrpc_compute_cb7 to group 31
[    2.456370] iommu: Adding device soc:qcom,msm_fastrpc:qcom,msm_fastrpc_compute_cb8 to group 32
[    2.458607] lge_battery: bm_init: Battery id is zero, deferring probe!
[    2.458665] lge_battery: lge_battery_probe: bm_init fail
[    2.469047] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    2.470423] ngd_msm_ctrl 171c0000.slim: error creating ipc_logging context
[    2.471416] lge_battery: bm_init: Battery id is zero, deferring probe!
[    2.471422] lge_battery: lge_battery_probe: bm_init fail
[    2.479748] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    2.480650] lge_battery: bm_init: Battery id is zero, deferring probe!
[    2.480655] lge_battery: lge_battery_probe: bm_init fail
[    2.488784] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    2.489479] lge_battery: bm_init: Battery id is zero, deferring probe!
[    2.489484] lge_battery: lge_battery_probe: bm_init fail
[    2.490923] ngd_msm_ctrl 17240000.slim: error creating ipc_logging context
[    2.498115] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    2.498850] lge_battery: bm_init: Battery id is zero, deferring probe!
[    2.498854] lge_battery: lge_battery_probe: bm_init fail
[    2.507280] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    2.507562] RNDIS_IPA module is loaded.
[    2.508133] lge_battery: bm_init: Battery id is zero, deferring probe!
[    2.508139] lge_battery: lge_battery_probe: bm_init fail
[    2.513538] msm_pcie_enable: PCIe: Assert the reset of endpoint of RC0.
[    2.516850] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    2.517658] lge_battery: bm_init: Battery id is zero, deferring probe!
[    2.517663] lge_battery: lge_battery_probe: bm_init fail
[    2.520530] msm_pcie_enable: PCIe RC0 PHY is ready!
[    2.521557] msm_pcie_enable: PCIe: Release the reset of endpoint of RC0.
[    2.521893] misc mnh_sm: mnh_sm_ready_irq_handler: mnh device is ready to boot
[    2.526281] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    2.527041] lge_battery: bm_init: Battery id is zero, deferring probe!
[    2.527045] lge_battery: lge_battery_probe: bm_init fail
[    2.535601] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    2.536301] lge_battery: bm_init: Battery id is zero, deferring probe!
[    2.536305] lge_battery: lge_battery_probe: bm_init fail
[    2.537969] msm_pcie_enable: PCIe RC0 link initialized
[    2.538088] PCI host bridge /soc/qcom,pcie@01c00000 ranges:
[    2.538100] No bus range found for /soc/qcom,pcie@01c00000, using [bus 00-ff]
[    2.538266] IO 0x1b030000..0x1b04ffff -> 0x1b030000
[    2.538430] MEM 0x1b100000..0x1bffffff -> 0x1b100000
[    2.539235] pci-msm 1c00000.qcom,pcie: PCI host bridge to bus 0000:00
[    2.539246] pci_bus 0000:00: root bus resource [bus 00-ff]
[    2.539252] pci_bus 0000:00: root bus resource [io  0x0000-0x1ffff] (bus address [0x1b030000-0x1b04ffff])
[    2.539256] pci_bus 0000:00: root bus resource [mem 0x00000000-0x00efffff]
[    2.539494] pci 0000:00:00.0: [17cb:0105] type 01 class 0x060400
[    2.539581] pci 0000:00:00.0: reg 0x10: [mem 0x00000000-0x00000fff 64bit]
[    2.539962] pci 0000:00:00.0: PME# supported from D0 D3hot D3cold
[    2.540568] iommu: Adding device 0000:00:00.0 to group 33
[    2.540861] pci 0000:00:00.0: bridge configuration invalid ([bus 00-00]), reconfiguring
[    2.541325] pci 0000:01:00.0: [8086:3140] type 00 class 0x000000
[    2.541535] pci 0000:01:00.0: reg 0x10: [mem 0x00000000-0x00000fff 64bit pref]
[    2.541602] pci 0000:01:00.0: reg 0x18: [mem 0x00000000-0x007fffff 64bit]
[    2.541668] pci 0000:01:00.0: reg 0x20: [mem 0x00000000-0x003fffff 64bit pref]
[    2.541885] pci 0000:01:00.0: setting pcie class
[    2.542112] pci 0000:01:00.0: supports D1 D2
[    2.542116] pci 0000:01:00.0: PME# supported from D0 D1 D3hot D3cold
[    2.542756] iommu: Adding device 0000:01:00.0 to group 34
[    2.542786] pci_bus 0000:01: busn_res: [bus 00-fe] end is updated to 01
[    2.542851] pci 0000:00:00.0: BAR 8: assigned [mem 0x00000000-0x007fffff]
[    2.542858] pci 0000:00:00.0: BAR 9: assigned [mem 0x00000000-0x005fffff 64bit pref]
[    2.542864] pci 0000:00:00.0: BAR 0: assigned [mem 0x00000000-0x00000fff 64bit]
[    2.542901] pci 0000:01:00.0: BAR 2: assigned [mem 0x00000000-0x007fffff 64bit]
[    2.543012] pci 0000:01:00.0: BAR 4: assigned [mem 0x00000000-0x003fffff 64bit pref]
[    2.543069] pci 0000:01:00.0: BAR 0: assigned [mem 0x00000000-0x00000fff 64bit pref]
[    2.543124] pci 0000:00:00.0: PCI bridge to [bus 00]
[    2.543143] pci 0000:00:00.0:   bridge window [mem 0x00000000-0x007fffff]
[    2.543157] pci 0000:00:00.0:   bridge window [mem 0x00000000-0x005fffff 64bit pref]
[    2.543706] pci 0000:00:00.0: enabling device (0000 -> 0002)
[    2.543733] mnh_pci 0000:01:00.0: enabling device (0000 -> 0002)
[    2.544786] mnh_pci 0000:01:00.0: vector :10 , msi_num:10, irq:755
[    2.544820] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    2.545643] lge_battery: bm_init: Battery id is zero, deferring probe!
[    2.545647] lge_battery: lge_battery_probe: bm_init fail
[    2.545914] mnh_pci 0000:01:00.0: request irq:756
[    2.546065] mnh_pci 0000:01:00.0: request irq:757
[    2.546248] mnh_pci 0000:01:00.0: request irq:758
[    2.546399] mnh_pci 0000:01:00.0: request irq:759
[    2.546621] mnh_pci 0000:01:00.0: request irq:760
[    2.546778] mnh_pci 0000:01:00.0: request irq:763
[    2.546941] mnh_pci 0000:01:00.0: request irq:764
[    2.550930] mnh_pci 0000:01:00.0: attached to IOMMU
[    2.554080] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    2.554842] lge_battery: bm_init: Battery id is zero, deferring probe!
[    2.554846] lge_battery: lge_battery_probe: bm_init fail
[    2.563297] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    2.563985] lge_battery: bm_init: Battery id is zero, deferring probe!
[    2.563989] lge_battery: lge_battery_probe: bm_init fail
[    2.572265] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    2.572958] lge_battery: bm_init: Battery id is zero, deferring probe!
[    2.572963] lge_battery: lge_battery_probe: bm_init fail
[    2.580616] msm_pcie_disable: PCIe: Assert the reset of endpoint of RC0.
[    2.580763] misc mnh_sm: mnh_sm_ready_irq_handler: mnh device is ready to suspend
[    2.581264] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    2.582224] lge_battery: bm_init: Battery id is zero, deferring probe!
[    2.582228] lge_battery: lge_battery_probe: bm_init fail
[    2.584758] misc mnh_sm: MNH SM initialized successfully
[    2.585426] [KEY] Target does not use pinctrl
[    2.585446] [KEY] gpio_keys_setup_key, error=-524, debounce(15, 15)
[    2.585503] [KEY] keycode = 115, gpio = 126, irq = 323
[    2.585730] input: gpio-keys as /devices/soc/soc:gpio_keys/input/input3
[    2.588581] qcom,qpnp-rtc 800f000.qcom,spmi:qcom,pm8998@0:qcom,pm8998_rtc: setting system clock to 1970-01-14 16:51:06 UTC (1183866)
[    2.590661] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    2.591527] lge_battery: bm_init: Battery id is zero, deferring probe!
[    2.591531] lge_battery: lge_battery_probe: bm_init fail
[    2.591808] msm_thermal:create_thermal_debugfs Error creating debugfs dir:msm_thermal. err:-19
[    2.592781] lmh_lite:lmh_parse_sensor Registering sensor:[GLM_soc]
[    2.592842] lmh_interface:lmh_create_debugfs_nodes Error creating debugfs file:hw_trace_enable. err:-19
[    2.594813] lmh_lite:lmh_parse_sensor Registering sensor:[LLM_cp1-]
[    2.595574] lmh_lite:lmh_parse_sensor Registering sensor:[LLM_cp0-]
[    2.596182] soc:qcom,lmh supply vdd-apss not found, using dummy regulator
[    2.596566] lmh_lite:lmh_get_sensor_devicetree Error getting ODCM thresh. err:-22
[    2.600570] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    2.600793] lmh_interface:lmh_debug_register Error creating debugfs dir:debug. err:-19
[    2.600798] lmh_lite:lmh_debug_init Error registering debug ops. err:-19
[    2.600802] lmh_lite:lmh_probe LMH debug init failed. err:-19
[    2.601434] lge_battery: bm_init: Battery id is zero, deferring probe!
[    2.601438] lge_battery: lge_battery_probe: bm_init fail
[    2.602541] parse_cpu_levels: idx 1 503
[    2.602545] parse_cpu_levels: idx 2 1744
[    2.602600] parse_cpu_levels: idx 2 2222
[    2.604178] parse_cpu_levels: idx 1 1301
[    2.604182] parse_cpu_levels: idx 2 1820
[    2.604186] parse_cpu_levels: idx 2 1999
[    2.604218] calculate_residency: residency < 0 for LPM
[    2.609897] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    2.610756] lge_battery: bm_init: Battery id is zero, deferring probe!
[    2.610761] lge_battery: lge_battery_probe: bm_init fail
[    2.619179] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    2.619963] lge_battery: bm_init: Battery id is zero, deferring probe!
[    2.619968] lge_battery: lge_battery_probe: bm_init fail
[    2.627122] rmnet_ipa3 started initialization
[    2.628422] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    2.629377] lge_battery: bm_init: Battery id is zero, deferring probe!
[    2.629382] lge_battery: lge_battery_probe: bm_init fail
[    2.629861] qcom,cc-debug-8998 162000.qcom,debugcc: Registered debug mux
[    2.632490] gfx_mem_acc_corner: disabling
[    2.632594] apc0_pwrcl_corner: disabling
[    2.632600] apc1_perfcl_corner: disabling
[    2.633874] regulator_proxy_consumer_remove_all: removing regulator proxy consumer requests
[    2.635085] clock_late_init: Removing enables held for handed-off clocks
[    2.639064] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    2.639916] lge_battery: bm_init: Battery id is zero, deferring probe!
[    2.639920] lge_battery: lge_battery_probe: bm_init fail
[    2.648281] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    2.649093] lge_battery: bm_init: Battery id is zero, deferring probe!
[    2.649098] lge_battery: lge_battery_probe: bm_init fail
[    2.657488] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    2.658305] lge_battery: bm_init: Battery id is zero, deferring probe!
[    2.658310] lge_battery: lge_battery_probe: bm_init fail
[    2.663996] ALSA device list:
[    2.664003] No soundcards found.
[    2.664173] Warning: unable to open an initial console.
[    2.666786] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    2.667565] md: Skipping autodetection of RAID arrays. (raid=autodetect will force)
[    2.667571] device-mapper: init: attempting early device configuration.
[    2.668270] lge_battery: bm_init: Battery id is zero, deferring probe!
[    2.668275] lge_battery: lge_battery_probe: bm_init fail
[    2.668785] device-mapper: init: adding target '0 5159992 verity 1 PARTUUID=73d84a99-00e6-d82f-51d0-c7461b899ae8 PARTUUID=73d84a99-00e6-d82f-51d0-c7461b899ae8 4096 4096 644999 644999 sha1 8038bb57ef981f2aa8aba7f7b76b99cbb135c492 ca8152e6806d191b165d3d5fa67fb275d0d450d00ef73d03c4510d2cbc4f49f4 10 restart_on_corruption ignore_zero_blocks use_fec_from_device PARTUUID=73d84a99-00e6-d82f-51d0-c7461b899ae8 fec_roots 2 fec_blocks 650080 fec_start 650080'
[    2.675273] device-mapper: init: dm-0 is ready
[    2.676855] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    2.677742] lge_battery: bm_init: Battery id is zero, deferring probe!
[    2.677747] lge_battery: lge_battery_probe: bm_init fail
[    2.686132] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    2.686983] lge_battery: bm_init: Battery id is zero, deferring probe!
[    2.686988] lge_battery: lge_battery_probe: bm_init fail
[    2.695390] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    2.696289] lge_battery: bm_init: Battery id is zero, deferring probe!
[    2.696294] lge_battery: lge_battery_probe: bm_init fail
[    2.704750] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    2.705592] lge_battery: bm_init: Battery id is zero, deferring probe!
[    2.705597] lge_battery: lge_battery_probe: bm_init fail
[    2.714099] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    2.714987] lge_battery: bm_init: Battery id is zero, deferring probe!
[    2.714992] lge_battery: lge_battery_probe: bm_init fail
[    2.723669] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    2.724560] lge_battery: bm_init: Battery id is zero, deferring probe!
[    2.724565] lge_battery: lge_battery_probe: bm_init fail
[    2.734820] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    2.735956] lge_battery: bm_init: Battery id is zero, deferring probe!
[    2.735961] lge_battery: lge_battery_probe: bm_init fail
[    2.746529] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    2.747571] lge_battery: bm_init: Battery id is zero, deferring probe!
[    2.747576] lge_battery: lge_battery_probe: bm_init fail
[    2.757862] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    2.758847] lge_battery: bm_init: Battery id is zero, deferring probe!
[    2.758853] lge_battery: lge_battery_probe: bm_init fail
[    2.769026] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    2.770001] lge_battery: bm_init: Battery id is zero, deferring probe!
[    2.770007] lge_battery: lge_battery_probe: bm_init fail
[    2.780322] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    2.781375] lge_battery: bm_init: Battery id is zero, deferring probe!
[    2.781381] lge_battery: lge_battery_probe: bm_init fail
[    2.791588] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    2.792619] lge_battery: bm_init: Battery id is zero, deferring probe!
[    2.792625] lge_battery: lge_battery_probe: bm_init fail
[    2.801229] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    2.802086] lge_battery: bm_init: Battery id is zero, deferring probe!
[    2.802092] lge_battery: lge_battery_probe: bm_init fail
[    2.810625] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    2.811410] lge_battery: bm_init: Battery id is zero, deferring probe!
[    2.811415] lge_battery: lge_battery_probe: bm_init fail
[    2.819778] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    2.820655] lge_battery: bm_init: Battery id is zero, deferring probe!
[    2.820662] lge_battery: lge_battery_probe: bm_init fail
[    2.828969] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    2.829706] lge_battery: bm_init: Battery id is zero, deferring probe!
[    2.829710] lge_battery: lge_battery_probe: bm_init fail
[    2.837959] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    2.838763] lge_battery: bm_init: Battery id is zero, deferring probe!
[    2.838767] lge_battery: lge_battery_probe: bm_init fail
[    2.840385] of_batterydata_get_best_profile: lge_blt35_tocad_3620mah found
[    2.842963] device-mapper: verity-fec: 8:7: FEC 0: corrected 16 errors
[    2.844262] EXT4-fs (dm-0): couldn't mount as ext3 due to feature incompatibilities
[    2.853689] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    2.862171] lge_battery: bm_vote_fcc_update: vote id[0], set cur[3550000]
[    2.862297] lge_battery: bm_check_status: wake_locked: present[1] chg_state[1] vbus[0]
[    2.862308] lge_battery: lge_battery_probe: Battery manager driver probe success!
[    2.864833] device-mapper: verity-fec: 8:7: FEC 0: corrected 16 errors
[    2.865017] EXT4-fs (dm-0): couldn't mount as ext2 due to feature incompatibilities
[    2.870939] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    2.879907] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    2.883990] device-mapper: verity-fec: 8:7: FEC 0: corrected 16 errors
[    2.888910] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    2.889631] EXT4-fs (dm-0): mounted filesystem without journal. Opts: (null)
[    2.889917] VFS: Mounted root (ext4 filesystem) readonly on device 252:0.
[    2.893193] Freeing unused kernel memory: 8192K
[    2.898865] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    2.908218] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    2.917244] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    2.926607] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    2.927025] init: init first stage started!
[    2.928099] init: Using Android DT directory /proc/device-tree/firmware/android/
[    2.934673] init: [libfs_mgr]fs_mgr_read_fstab_default(): failed to find device default fstab
[    2.936895] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    2.946940] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    2.956901] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    2.966391] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    2.975470] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    2.984394] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    2.998043] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    3.007052] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    3.013427] EXT4-fs (sda4): mounted filesystem with ordered data mode. Opts: barrier=1
[    3.013530] init: [libfs_mgr]__mount(source=/dev/block/platform/soc/1da4000.ufshc/by-name/persist,target=/persist,type=ext4)=0: Success
[    3.015916] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    3.016483] EXT4-fs (sde14): mounted filesystem without journal. Opts: barrier=1
[    3.016536] init: [libfs_mgr]__mount(source=/dev/block/platform/soc/1da4000.ufshc/by-name/vendor_a,target=/vendor,type=ext4)=0: Success
[    3.017893] init: Skipped setting INIT_AVB_VERSION (not in recovery mode)
[    3.018048] init: Loading SELinux policy
[    3.026893] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    3.031528] SELinux: 8192 avtab hash slots, 22282 rules.
[    3.036052] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    3.042674] SELinux: 8192 avtab hash slots, 22282 rules.
[    3.042686] SELinux:  1 users, 4 roles, 1563 types, 0 bools, 1 sens, 1024 cats
[    3.042691] SELinux:  93 classes, 22282 rules
[    3.044821] SELinux:  Completing initialization.
[    3.044824] SELinux:  Setting up existing superblocks.
[    3.044905] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    3.053895] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    3.063043] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    3.071960] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    3.074338] selinux: SELinux: Loaded policy from /vendor/etc/selinux/precompiled_sepolicy\x0a
[    3.080884] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    3.081521] selinux: SELinux: Loaded file_contexts\x0a
[    3.088127] init: init second stage started!
[    3.089940] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    3.099018] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    3.106694] init: Using Android DT directory /proc/device-tree/firmware/android/
[    3.110253] selinux: SELinux: Loaded file_contexts\x0a
[    3.110272] init: Running restorecon...
[    3.111017] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    3.114572] init: waitid failed: No child processes
[    3.119971] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    3.120234] init: Couldn't load property file '/odm/default.prop': open() failed: No such file or directory: No such file or directory
[    3.122867] init: Created socket '/dev/socket/property_service', mode 666, user 0, group 0
[    3.124113] init: Forked subcontext for 'u:r:vendor_init:s0' with pid 577
[    3.124798] init: Forked subcontext for 'u:r:vendor_init:s0' with pid 578
[    3.126208] init: Parsing file /init.rc...
[    3.129131] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    3.130620] init: Added '/init.environ.rc' to import list
[    3.130666] init: Added '/init.usb.rc' to import list
[    3.130694] init: Added '/init.taimen.rc' to import list
[    3.130737] init: Added '/vendor/etc/init/hw/init.taimen.rc' to import list
[    3.130754] init: Added '/init.usb.configfs.rc' to import list
[    3.130778] init: Added '/init.zygote64_32.rc' to import list
[    3.132941] init: Parsing file /init.environ.rc...
[    3.133627] init: Parsing file /init.usb.rc...
[    3.134707] init: Parsing file /init.taimen.rc...
[    3.134734] init: Unable to read config file '/init.taimen.rc': open() failed: No such file or directory
[    3.134748] init: /init.rc: 9: Could not import file '/init.taimen.rc': No such file or directory
[    3.136032] init: Parsing file /vendor/etc/init/hw/init.taimen.rc...
[    3.136758] init: Added '/vendor/etc/init/hw/init.taimen.usb.rc' to import list
[    3.137707] init: /vendor/etc/init/hw/init.taimen.rc: 745: Unable to decode GID for 'qcom_diag': getpwnam failed: No such file or directory
[    3.138153] init: Parsing file /vendor/etc/init/hw/init.taimen.usb.rc...
[    3.138395] init: Added '/vendor/etc/init/hw/init.wahoo.usb.rc' to import list
[    3.138654] init: Parsing file /vendor/etc/init/hw/init.wahoo.usb.rc...
[    3.138728] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    3.138990] init: Parsing file /init.usb.configfs.rc...
[    3.139615] init: Parsing file /init.zygote64_32.rc...
[    3.140172] init: Parsing directory /system/etc/init...
[    3.141054] init: Parsing file /system/etc/init/android.hidl.allocator@1.0-service.rc...
[    3.141405] init: Parsing file /system/etc/init/atrace.rc...
[    3.141989] init: Parsing file /system/etc/init/audioserver.rc...
[    3.142379] init: Parsing file /system/etc/init/blank_screen.rc...
[    3.148091] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    3.149328] init: Parsing file /system/etc/init/bootanim.rc...
[    3.150255] init: Parsing file /system/etc/init/bootstat.rc...
[    3.150735] init: Parsing file /system/etc/init/bufferhubd.rc...
[    3.151138] init: Parsing file /system/etc/init/cameraserver.rc...
[    3.157983] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    3.158095] init: Parsing file /system/etc/init/cppreopts.rc...
[    3.158579] init: Parsing file /system/etc/init/drmserver.rc...
[    3.158987] init: Parsing file /system/etc/init/dumpstate.rc...
[    3.159303] init: Parsing file /system/etc/init/gatekeeperd.rc...
[    3.159604] init: Parsing file /system/etc/init/healthd.rc...
[    3.159917] init: Parsing file /system/etc/init/hwservicemanager.rc...
[    3.160305] init: Parsing file /system/etc/init/incidentd.rc...
[    3.160709] init: Parsing file /system/etc/init/installd.rc...
[    3.161334] init: Parsing file /system/etc/init/keystore.rc...
[    3.161647] init: Parsing file /system/etc/init/lmkd.rc...
[    3.161924] init: Parsing file /system/etc/init/logd.rc...
[    3.162235] init: Parsing file /system/etc/init/mdnsd.rc...
[    3.162564] init: Parsing file /system/etc/init/mediadrmserver.rc...
[    3.164093] init: Parsing file /system/etc/init/mediaextractor.rc...
[    3.164470] init: Parsing file /system/etc/init/mediametrics.rc...
[    3.165469] init: Parsing file /system/etc/init/mediaserver.rc...
[    3.167648] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    3.168389] init: Parsing file /system/etc/init/mtpd.rc...
[    3.170262] init: Parsing file /system/etc/init/netd.rc...
[    3.170624] init: Parsing file /system/etc/init/otapreopt.rc...
[    3.170965] init: Parsing file /system/etc/init/perfetto.rc...
[    3.171376] init: Parsing file /system/etc/init/performanced.rc...
[    3.171787] init: Parsing file /system/etc/init/racoon.rc...
[    3.172608] init: Parsing file /system/etc/init/recovery-persist.rc...
[    3.172980] init: Parsing file /system/etc/init/recovery-refresh.rc...
[    3.173310] init: Parsing file /system/etc/init/servicemanager.rc...
[    3.173660] init: Parsing file /system/etc/init/statsd.rc...
[    3.173974] init: Parsing file /system/etc/init/storaged.rc...
[    3.174299] init: Parsing file /system/etc/init/surfaceflinger.rc...
[    3.174630] init: Parsing file /system/etc/init/thermalservice.rc...
[    3.174959] init: Parsing file /system/etc/init/tombstoned.rc...
[    3.175304] init: Parsing file /system/etc/init/uncrypt.rc...
[    3.175675] init: Parsing file /system/etc/init/update_engine.rc...
[    3.176022] init: Parsing file /system/etc/init/update_verifier.rc...
[    3.176337] init: Parsing file /system/etc/init/usbd.rc...
[    3.176680] init: Parsing file /system/etc/init/vdc.rc...
[    3.176757] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    3.176994] init: Parsing file /system/etc/init/virtual_touchpad.rc...
[    3.177363] init: Parsing file /system/etc/init/vold.rc...
[    3.178296] init: Parsing file /system/etc/init/vr_hwc.rc...
[    3.178606] init: Parsing file /system/etc/init/wait_for_keymaster.rc...
[    3.178912] init: Parsing file /system/etc/init/wifi-events.rc...
[    3.179288] init: Parsing file /system/etc/init/wificond.rc...
[    3.179658] init: Parsing file /product/etc/init...
[    3.179683] init: Unable to read config file '/product/etc/init': open() failed: No such file or directory
[    3.179706] init: Parsing file /odm/etc/init...
[    3.179726] init: Unable to read config file '/odm/etc/init': open() failed: No such file or directory
[    3.179747] init: Parsing directory /vendor/etc/init...
[    3.179905] init: Parsing file /vendor/etc/init/android.hardware.audio@2.0-service.rc...
[    3.180677] init: Parsing file /vendor/etc/init/android.hardware.biometrics.fingerprint@2.1-service.fpc.rc...
[    3.181163] init: Parsing file /vendor/etc/init/android.hardware.bluetooth@1.0-service-qti.rc...
[    3.181781] init: Parsing file /vendor/etc/init/android.hardware.boot@1.0-service.rc...
[    3.182596] init: Parsing file /vendor/etc/init/android.hardware.camera.provider@2.4-service.rc...
[    3.183240] init: Parsing file /vendor/etc/init/android.hardware.cas@1.0-service.rc...
[    3.183818] init: Parsing file /vendor/etc/init/android.hardware.configstore@1.1-service.rc...
[    3.184273] init: Parsing file /vendor/etc/init/android.hardware.contexthub@1.0-service.rc...
[    3.184732] init: Parsing file /vendor/etc/init/android.hardware.drm@1.0-service.rc...
[    3.185291] init: Parsing file /vendor/etc/init/android.hardware.drm@1.1-service.clearkey.rc...
[    3.185746] init: Parsing file /vendor/etc/init/android.hardware.drm@1.1-service.widevine.rc...
[    3.186333] init: Parsing file /vendor/etc/init/android.hardware.dumpstate@1.0-service.wahoo.rc...
[    3.186551] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    3.187057] init: Parsing file /vendor/etc/init/android.hardware.gatekeeper@1.0-service-qti.rc...
[    3.187566] init: Parsing file /vendor/etc/init/android.hardware.gnss@1.0-service-qti.rc...
[    3.188008] init: Parsing file /vendor/etc/init/android.hardware.graphics.allocator@2.0-service.rc...
[    3.188506] init: Parsing file /vendor/etc/init/android.hardware.graphics.composer@2.1-service.rc...
[    3.189058] init: Parsing file /vendor/etc/init/android.hardware.health@2.0-service.wahoo.rc...
[    3.189501] init: Parsing file /vendor/etc/init/android.hardware.keymaster@3.0-service-qti.rc...
[    3.189967] init: Parsing file /vendor/etc/init/android.hardware.light@2.0-service.rc...
[    3.190651] init: Parsing file /vendor/etc/init/android.hardware.media.omx@1.0-service.rc...
[    3.191213] init: Parsing file /vendor/etc/init/android.hardware.memtrack@1.0-service.rc...
[    3.191679] init: Parsing file /vendor/etc/init/android.hardware.nfc@1.1-service.rc...
[    3.192143] init: Parsing file /vendor/etc/init/android.hardware.oemlock@1.0-service.rc...
[    3.192643] init: Parsing file /vendor/etc/init/android.hardware.power@1.2-service.wahoo-libperfmgr.rc...
[    3.193134] init: Parsing file /vendor/etc/init/android.hardware.sensors@1.0-service.rc...
[    3.193611] init: Parsing file /vendor/etc/init/android.hardware.usb@1.1-service.wahoo.rc...
[    3.194114] init: Parsing file /vendor/etc/init/android.hardware.vibrator@1.2-service.wahoo.rc...
[    3.194583] init: Parsing file /vendor/etc/init/android.hardware.vr@1.0-service.wahoo.rc...
[    3.195099] init: Parsing file /vendor/etc/init/android.hardware.wifi@1.0-service.rc...
[    3.195602] init: Parsing file /vendor/etc/init/esed.rc...
[    3.195808] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    3.196117] init: Parsing file /vendor/etc/init/hostapd.android.rc...
[    3.196820] init: Parsing file /vendor/etc/init/init-taimen.rc...
[    3.197380] init: Parsing file /vendor/etc/init/init.taimen.diag.rc...
[    3.197889] init: Parsing file /vendor/etc/init/init.taimen.logging.rc...
[    3.198391] init: Parsing file /vendor/etc/init/rild.rc...
[    3.198863] init: Parsing file /vendor/etc/init/vndservicemanager.rc...
[    3.199547] init: processing action (early-init) from (/init.rc:14)
[    3.204993] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    3.205182] init: starting service 'ueventd'...
[    3.211028] init: processing action (early-init) from (/vendor/etc/init/hw/init.taimen.rc:37)
[    3.214679] ueventd: ueventd started!
[    3.217823] init: starting service 'vendor.insmod_sh'...
[    3.219046] init: processing action (wait_for_coldboot_done) from (<Builtin Action>:0)
[    3.219307] selinux: SELinux: Loaded file_contexts\x0a
[    3.219551] ueventd: Parsing file /ueventd.rc...
[    3.219689] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    3.223807] ueventd: Parsing file /vendor/ueventd.rc...
[    3.224237] ueventd: Parsing file /odm/ueventd.rc...
[    3.224266] ueventd: Unable to read config file '/odm/ueventd.rc': open() failed: No such file or directory
[    3.224375] ueventd: Parsing file /ueventd.taimen.rc...
[    3.224394] ueventd: Unable to read config file '/ueventd.taimen.rc': open() failed: No such file or directory
[    3.230465] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    3.242169] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    3.253051] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    3.262108] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    3.271343] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    3.280022] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    3.281217] device-mapper: verity-fec: 8:7: FEC 1155072: corrected 8 errors
[    3.292263] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    3.301741] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    3.313570] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    3.322856] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    3.331775] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    3.341441] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    3.350348] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    3.359705] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    3.368532] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    3.377554] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    3.386450] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    3.395325] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    3.404204] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    3.413008] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    3.422056] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    3.430972] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    3.439764] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    3.448618] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    3.457555] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    3.466346] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    3.468120] FG: fg_get_battery_temp: batt temperature original:430, tuned:375
[    3.473262] FG: fg_get_battery_temp: batt temperature original:430, tuned:375
[    3.475668] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    3.484770] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    3.493669] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    3.502534] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    3.511195] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    3.519752] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    3.528348] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    3.537037] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    3.545716] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    3.554367] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    3.562948] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    3.571533] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    3.580108] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    3.588735] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    3.597354] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    3.603249] FG: fg_get_battery_temp: batt temperature original:430, tuned:375
[    3.606074] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    3.608886] FG: fg_get_battery_temp: batt temperature original:430, tuned:375
[    3.614807] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    3.623411] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    3.631997] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    3.640564] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    3.649118] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    3.657705] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    3.666292] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    3.674887] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    3.683579] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    3.692148] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    3.700722] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    3.709267] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    3.717847] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    3.726433] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    3.735007] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    3.743595] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    3.752173] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    3.754820] FG: fg_get_battery_temp: batt temperature original:430, tuned:375
[    3.757099] SMB138X: smb138x_parallel_get_prop: parallel power supply get prop 4 not supported
[    3.760788] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    3.762580] FG: fg_get_battery_temp: batt temperature original:430, tuned:375
[    3.770198] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    3.780550] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    3.789615] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    3.801913] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    3.805254] msm-dwc3 a800000.ssusb: dwc3_msm_resume: exiting lpm
[    3.808422] ueventd: firmware: loading 'drv2624.bin' for '/devices/soc/c17a000.i2c/i2c-6/6-005a/firmware/drv2624.bin'
[    3.811102] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    3.821434] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    3.829747] ueventd: loading /devices/soc/c17a000.i2c/i2c-6/6-005a/firmware/drv2624.bin took 21ms
[    3.830335] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    3.832109] drv2624 6-005a: drv2624_firmware_load, firmware good
[    3.833376] msm-dwc3 a800000.ssusb: DWC3 exited from low power mode
[    3.840715] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    3.849896] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    3.857360] SMB138X: smb138x_parallel_get_prop: parallel power supply get prop 4 not supported
[    3.863652] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    3.872978] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    3.882481] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    3.891532] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    3.902514] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    3.911287] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    3.919866] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    3.928498] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    3.937132] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    3.945729] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    3.954345] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    3.962984] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    3.971693] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    3.980305] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    3.988880] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    3.997803] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    4.006448] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    4.015056] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    4.023648] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    4.032345] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    4.040943] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    4.049510] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    4.060738] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    4.061776] ueventd: firmware: loading 'tas2557s_PG21_uCDSP.bin' for '/devices/soc/c1b5000.i2c/i2c-7/7-004d/firmware/tas2557s_PG21_uCDSP.bin'
[    4.067026] ueventd: loading /devices/soc/c1b5000.i2c/i2c-7/7-004d/firmware/tas2557s_PG21_uCDSP.bin took 5ms
[    4.067643] tas2557s 7-004d: tas2557_fw_ready:
[    4.071371] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    4.075996] ueventd: firmware: loading 'tas2557_cal.bin' for '/devices/soc/c1b5000.i2c/i2c-7/7-004d/firmware/tas2557_cal.bin'
[    4.080563] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    4.091251] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    4.102698] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    4.112077] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    4.122015] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    4.131055] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    4.140239] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    4.149620] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    4.159526] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    4.168366] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    4.177766] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    4.186872] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    4.196002] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    4.204895] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    4.213823] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    4.222752] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    4.231571] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    4.240480] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    4.249343] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    4.258810] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    4.267810] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    4.276643] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    4.285517] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    4.294490] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    4.303399] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    4.312251] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    4.321071] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    4.330123] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    4.353035] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    4.368299] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    4.377812] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    4.386525] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    4.395515] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    4.417491] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    4.426354] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    4.435362] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    4.444244] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    4.453075] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    4.470338] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    4.479189] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    4.488314] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    4.497230] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    4.515515] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    4.524490] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    4.533473] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    4.543513] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    4.554472] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    4.566321] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    4.577354] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    4.586732] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    4.595850] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    4.607675] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    4.613916] IPC_RTR: msm_ipc_router_smd_driver_register Already driver registered IPCRTR
[    4.613929] IPC_RTR: msm_ipc_router_smd_driver_register Already driver registered IPCRTR
[    4.616856] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    4.625915] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    4.625920] ueventd: Coldboot took 1.395 seconds
[    4.629489] init: wait for '/dev/.coldboot_done' took 1410ms
[    4.629608] init: Command 'wait_for_coldboot_done' action=wait_for_coldboot_done (<Builtin Action>:0) took 1410ms and succeeded
[    4.631156] init: processing action (MixHwrngIntoLinuxRng) from (<Builtin Action>:0)
[    4.632470] init: Mixed 512 bytes from /dev/hw_random into /dev/urandom
[    4.632519] init: processing action (SetMmapRndBits) from (<Builtin Action>:0)
[    4.634106] init: Service 'vendor.insmod_sh' (pid 581) exited with status 0
[    4.634261] init: processing action (SetKptrRestrict) from (<Builtin Action>:0)
[    4.634661] init: processing action (keychord_init) from (<Builtin Action>:0)
[    4.634899] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    4.635025] keychord: using input dev qpnp_pon for fevent
[    4.635035] keychord: using input dev gpio-keys for fevent
[    4.635241] init: processing action (console_init) from (<Builtin Action>:0)
[    4.635334] init: processing action (init) from (/init.rc:41)
[    4.635992] init: Command 'copy /default.prop /dev/urandom' action=init (/init.rc:46) took 0ms and failed: Could not read input file '/default.prop': open() failed: Too many symbolic links encountered
[    4.643964] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    4.653349] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    4.662424] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    4.672038] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    4.681171] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    4.682110] init: Command 'mount cgroup2 cg2_bpf /dev/cg2_bpf nodev noexec nosuid' action=init (/init.rc:225) took 0ms and failed: mount() failed: No such device
[    4.689609] Registered swp emulation handler
[    4.690202] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    4.692790] init: processing action (init) from (/init.environ.rc:2)
[    4.692855] init: processing action (init) from (/vendor/etc/init/hw/init.taimen.rc:43)
[    4.694038] init: Command 'write /sys/devices/soc/${ro.boot.bootdevice}/clkscale_enable 0' action=init (/vendor/etc/init/hw/init.taimen.rc:45) took 1ms and failed: Unable to write to file '/sys/devices/soc/1da4000.ufshc/clkscale_enable': open() failed: Permission denied
[    4.695156] init: wait for '/dev/block/platform/soc/1da4000.ufshc' took 0ms
[    4.699351] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    4.699802] init: starting service 'vendor.qseecomd'...
[    4.704284] init: Command 'symlink /sdcard /storage/sdcard0' action=init (/vendor/etc/init/hw/init.taimen.rc:59) took 2ms and failed: symlink() failed: Read-only file system
[    4.705266] msm_thermal:store_cc_enabled Core control disabled
[    4.708774] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    4.718370] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    4.727607] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    4.737223] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    4.747194] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    4.756682] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    4.757174] capability: warning: `qseecomd' uses 32-bit capabilities (legacy support in use)
[    4.763424] scm_call failed: func id 0x72000206, ret: -1, syscall returns: 0x0, 0x0, 0x0
[    4.765312] scm_call failed: func id 0x72000206, ret: -1, syscall returns: 0x0, 0x0, 0x0
[    4.765981] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    4.766888] init: processing action (ro.boot.revision=* && init) from (/vendor/etc/init/init-taimen.rc:31)
[    4.767082] init: processing action (StartBoringSslSelfTest) from (<Builtin Action>:0)
[    4.767750] init: processing action (MixHwrngIntoLinuxRng) from (<Builtin Action>:0)
[    4.768117] init: Mixed 512 bytes from /dev/hw_random into /dev/urandom
[    4.768153] init: processing action (late-init) from (/init.rc:278)
[    4.768185] init: processing action (late-init) from (/system/etc/init/atrace.rc:3)
[    4.773376] init: processing action (queue_property_triggers) from (<Builtin Action>:0)
[    4.773421] scm_call failed: func id 0x72000206, ret: -1, syscall returns: 0x0, 0x0, 0x0
[    4.773429] init: processing action (fs) from (/vendor/etc/init/hw/init.taimen.rc:162)
[    4.774374] init: starting service 'hwservicemanager'...
[    4.775436] scm_call failed: func id 0x72000206, ret: -1, syscall returns: 0x0, 0x0, 0x0
[    4.775990] init: Untracked pid 611 exited with status 0
[    4.779153] scm_call failed: func id 0x72000206, ret: -1, syscall returns: 0x0, 0x0, 0x0
[    4.783361] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    4.784381] init: [libfs_mgr]__mount(source=/dev/block/platform/soc/1da4000.ufshc/by-name/modem_a,target=/firmware,type=vfat)=0: Success
[    4.785642] init: Parsing file /product/etc/init...
[    4.785679] init: Unable to read config file '/product/etc/init': open() failed: No such file or directory
[    4.785702] init: Parsing file /odm/etc/init...
[    4.785722] init: Unable to read config file '/odm/etc/init': open() failed: No such file or directory
[    4.794949] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    4.803092] init: processing action (fs) from (/system/etc/init/logd.rc:18)
[    4.803583] init: processing action (fs) from (/system/etc/init/wifi-events.rc:17)
[    4.803645] init: processing action (post-fs) from (/init.rc:311)
[    4.804862] init: Couldn't load property file '/odm/build.prop': open() failed: No such file or directory: No such file or directory
[    4.806773] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    4.807843] init: Couldn't load property file '/factory/factory.prop': open() failed: No such file or directory: No such file or directory
[    4.809899] init: /recovery not specified in fstab
[    4.810283] init: starting service 'logd'...
[    4.811705] init: starting service 'servicemanager'...
[    4.813228] init: Created socket '/dev/socket/logd', mode 666, user 1036, group 1036
[    4.813943] init: starting service 'vndservicemanager'...
[    4.814188] init: Created socket '/dev/socket/logdr', mode 666, user 1036, group 1036
[    4.817783] init: Created socket '/dev/socket/logdw', mode 222, user 1036, group 1036
[    4.818360] init: Command 'chown system cache /cache' action=post-fs (/init.rc:338) took 0ms and failed: lchown() failed: Read-only file system
[    4.818373] init: Opened file '/proc/kmsg', flags 0
[    4.818459] init: Command 'chmod 0770 /cache' action=post-fs (/init.rc:339) took 0ms and failed: fchmodat() failed: Read-only file system
[    4.818482] init: Opened file '/dev/kmsg', flags 1
[    4.819147] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    4.821561] selinux: SELinux:  Could not stat /metadata: No such file or directory.\x0a
[    4.823457] init: processing action (post-fs) from (/vendor/etc/init/hw/init.taimen.rc:221)
[    4.825712] init: start_waiting_for_property("sys.listeners.registered", "true"): already set
[    4.825758] init: processing action (post-fs) from (/system/etc/init/recovery-refresh.rc:1)
[    4.826597] init: starting service 'exec 1 (/system/bin/recovery-refresh)'...
[    4.829108] init: processing action (late-fs) from (/init.rc:376)
[    4.831667] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    4.835275] init: starting service 'vendor.boot-hal-1-0'...
[    4.837778] init: starting service 'vendor.gatekeeper-1-0'...
[    4.840421] init: starting service 'vendor.keymaster-3-0'...
[    4.841474] init: processing action (late-fs) from (/vendor/etc/init/hw/init.taimen.rc:198)
[    4.842226] init: starting service 'vendor.devstart_sh'...
[    4.843825] init: starting service 'surfaceflinger'...
[    4.844540] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    4.846335] init: starting service 'bootanim'...
[    4.850276] init: starting service 'vendor.hwcomposer-2-1'...
[    4.852674] init: Created socket '/dev/socket/pdx/system/vr/display/client', mode 666, user 1000, group 1003
[    4.854113] init: starting service 'vendor.configstore-hal'...
[    4.854574] init: Created socket '/dev/socket/pdx/system/vr/display/manager', mode 666, user 1000, group 1003
[    4.859957] init: Service 'exec 1 (/system/bin/recovery-refresh)' (pid 620) exited with status 254
[    4.861128] init: starting service 'vendor.gralloc-2-0'...
[    4.864629] init: Created socket '/dev/socket/pdx/system/vr/display/vsync', mode 666, user 1000, group 1003
[    4.865125] logd.auditd: start
[    4.872262] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    4.889298] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    4.894382] adsp-loader soc:qcom,msm-adsp-loader: adsp_loader_do: scheduling work to load ADSP fw
[    4.897792] init: Wait for property took 32ms
[    4.901709] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    4.905418] init: [libfs_mgr]superblock s_max_mnt_count:65535,/dev/block/platform/soc/1da4000.ufshc/by-name/userdata
[    4.913044] subsys-pil-tz 17300000.qcom,lpass: adsp: loading from 0x0000000000000000 to 0x0000000000000000
[    4.913678] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    4.914875] EXT4-fs (sda13): Ignoring removed nomblk_io_submit option
[    4.917094] sensors-ssc soc:qcom,msm-ssc-sensors: slpi_loader_do: scheduling work to load SLPI fw
[    4.928653] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    4.929699] subsys-restart: __subsystem_get(): Changing subsys fw_name to slpi_v2
[    4.941099] subsys-pil-tz 5c00000.qcom,ssc: slpi: loading from 0x0000000000000000 to 0x0000000000000000
[    4.950746] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    4.955730] EXT4-fs (sda13): mounted filesystem with ordered data mode. Opts: errors=remount-ro,nomblk_io_submit
[    4.959582] init: [libfs_mgr]check_fs(): mount(/dev/block/platform/soc/1da4000.ufshc/by-name/userdata,/data,ext4)=0: Success
[    4.967567] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    4.981284] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    4.993645] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    5.003098] init: [libfs_mgr]check_fs(): unmount(/data) succeeded
[    5.003277] init: [libfs_mgr]Running /system/bin/e2fsck on /dev/block/platform/soc/1da4000.ufshc/by-name/userdata
[    5.005716] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    5.016133] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    5.026116] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    5.035230] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    5.044359] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    5.057558] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    5.068834] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    5.078143] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    5.094163] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    5.097461] mdss_fb_blank_sub: mdss_fb_open+0xb0/0x1b0 mode:0
[    5.105254] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    5.107196] e2fsck: e2fsck 1.43.3 (04-Sep-2016)\x0a
[    5.107233] e2fsck: /dev/block/platform/soc/1da4000.ufshc/by-name/userdata: clean, 127296/7380992 files, 14896284/29518843 blocks\x0a
[    5.114295] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    5.115020] init: [libfs_mgr]superblock s_max_mnt_count:65535,/dev/block/platform/soc/1da4000.ufshc/by-name/userdata
[    5.123709] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    5.132710] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    5.141819] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    5.150965] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    5.159910] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    5.165082] EXT4-fs (sda13): mounted filesystem with ordered data mode. Opts: errors=panic,barrier=1,noauto_da_alloc
[    5.165493] init: [libfs_mgr]__mount(source=/dev/block/platform/soc/1da4000.ufshc/by-name/userdata,target=/data,type=ext4)=0: Success
[    5.165733] init: [libfs_mgr]/data is file encrypted
[    5.168238] init: Keyring created with id 896236821 in process 1
[    5.168576] init: Command 'mount_all /vendor/etc/fstab.${ro.hardware} --late' action=late-fs (/vendor/etc/init/hw/init.taimen.rc:213) took 270ms and succeeded
[    5.169095] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    5.176176] init: starting service 'vendor.time_daemon'...
[    5.177590] init: processing action (late-fs) from (/vendor/etc/init/init-taimen.rc:1)
[    5.178372] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    5.185998] init: processing action (post-fs-data) from (/init.rc:384)
[    5.188805] init: starting service 'vold'...
[    5.190753] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    5.195066] init: starting service 'exec 2 (/system/bin/vdc --wait cryptfs enablefilecrypto)'...
[    5.196220] init: SVC_EXEC pid 656 (uid 0 gid 0+0 context default) started; waiting...
[    5.199862] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    5.218956] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    5.235240] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    5.248558] init: Service 'vendor.devstart_sh' (pid 624) exited with status 0
[    5.249263] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    5.259946] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    5.268964] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    5.277916] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    5.286975] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    5.295972] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    5.305294] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    5.314506] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    5.322135] subsys-pil-tz 5c00000.qcom,ssc: slpi: Brought out of reset
[    5.323594] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    5.324589] vdc: Waited 70ms for vold
[    5.337326] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    5.351708] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    5.362585] subsys-pil-tz 5c00000.qcom,ssc: Subsystem error monitoring/handling services are up
[    5.364279] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    5.364660] subsys-pil-tz 5c00000.qcom,ssc: slpi: Power/Clock ready interrupt received
[    5.370140] sensors-ssc soc:qcom,msm-ssc-sensors: slpi_load_fw: SLPI image is loaded
[    5.382565] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    5.382830] IPC_RTR: ipc_router_create_log_ctx: Unable to create IPC logging for [dsps_IPCRTR]
[    5.409957] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    5.425560] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    5.437734] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    5.446890] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    5.459933] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    5.472260] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    5.482600] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    5.491542] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    5.492622] subsys-pil-tz 17300000.qcom,lpass: adsp: Brought out of reset
[    5.500134] subsys-pil-tz soc:qcom,kgsl-hyp: a540_zap: loading from 0x0000000000000000 to 0x0000000000000000
[    5.500811] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    5.502338] apr_tal_link_state_cb: edge[lpass] link state[0]
[    5.505913] subsys-pil-tz 17300000.qcom,lpass: Subsystem error monitoring/handling services are up
[    5.507550] subsys-pil-tz 17300000.qcom,lpass: adsp: Power/Clock ready interrupt received
[    5.509976] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: platform (null) not registered
[    5.516240] subsys-pil-tz soc:qcom,kgsl-hyp: a540_zap: Brought out of reset
[    5.520586] IPC_RTR: ipc_router_create_log_ctx: Unable to create IPC logging for [lpass_IPCRTR]
[    5.532185] devfreq soc:qcom,kgsl-busmon: Couldn't update frequency transition information.
[    5.532409] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: CODEC DAI tavil_vifeedback not registered
[    5.541371] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: CODEC DAI tavil_vifeedback not registered
[    5.549351] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: CODEC DAI tavil_vifeedback not registered
[    5.553617] init: Service 'exec 2 (/system/bin/vdc --wait cryptfs enablefilecrypto)' (pid 656) exited with status 0 waiting took 0.357000 seconds
[    5.557775] init: Setting policy on /data/bootchart
[    5.570265] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: CODEC DAI tavil_vifeedback not registered
[    5.582173] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: CODEC DAI tavil_vifeedback not registered
[    5.593500] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: CODEC DAI tavil_vifeedback not registered
[    5.604528] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: CODEC DAI tavil_vifeedback not registered
[    5.606023] init: Policy for /data/bootchart set to 21ca43de98cea7bf modes 127/126
[    5.606053] init: Command 'mkdir /data/bootchart 0755 shell shell' action=post-fs-data (/init.rc:397) took 52ms and succeeded
[    5.607252] init: Received control message 'interface_start' for 'android.hardware.graphics.composer@2.1::IComposer/default' from pid: 613 (/system/bin/hwservicemanager)
[    5.607273] init: Could not find service hosting interface android.hardware.graphics.composer@2.1::IComposer/default
[    5.607588] init: Received control message 'interface_start' for 'android.hardware.graphics.composer@2.1::IComposer/default' from pid: 613 (/system/bin/hwservicemanager)
[    5.607603] init: Could not find service hosting interface android.hardware.graphics.composer@2.1::IComposer/default
[    5.611238] sysmon-qmi: sysmon_clnt_svc_arrive: Connection established between QMI handle and slpi's SSCTL service
[    5.615405] diag: In diag_send_feature_mask_update, control channel is not open, p: 3, 0000000000000000
[    5.616835] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: CODEC DAI tavil_vifeedback not registered
[    5.620906] possible reason: unannotated irqs-off.
[    5.620935] ------------[ cut here ]------------
[    5.620942] WARNING: at ../drivers/regulator/core.c:2221
[    5.620947] 
[    5.620955] CPU: 0 PID: 6 Comm: kworker/u16:0 Tainted: G        W       4.4.169-Sultan #15
[    5.620960] Hardware name: Qualcomm Technologies, Inc. MSM8998 v2.1 (DT)
[    5.620976] Workqueue: kgsl-workqueue kgsl_idle_check
[    5.620985] task: 0000000000000000 task.stack: 0000000000000000
[    5.620996] PC is at _regulator_disable+0x74/0x190
[    5.621003] LR is at _regulator_disable+0x60/0x190
[    5.621008] pc : [<ffffff9f2ece6264>] lr : [<ffffff9f2ece6250>] pstate: 60400145
[    5.621013] sp : ffffffe155bdba30
[    5.621018] x29: ffffffe155bdba30 x28: 0000000000000000 
[    5.621029] x27: 0000000000000000 x26: ffffff9f30c04000 
[    5.621041] x25: 0000000000000000 x24: ffffffe155b7a000 
[    5.621051] x23: ffffff9f30a948c0 x22: ffffffe1b1bda0e8 
[    5.621063] x21: ffffffe1b1bda000 x20: ffffffe1b1055e18 
[    5.621074] x19: ffffffe1b1bda000 x18: 0000000000000001 
[    5.621085] x17: 00000000fa83b2da x16: 000000000000000c 
[    5.621096] x15: 000000000009c3ff x14: 0000000000000006 
[    5.621107] x13: 0000000000005603 x12: 00000000000055fe 
[    5.621118] x11: ffffff9f30c0cb00 x10: ffffffe1b105a958 
[    5.621129] x9 : ffffff9f30c0c000 x8 : ffffff9f313598e8 
[    5.621140] x7 : 0000000000000038 x6 : 0000000000000003 
[    5.621151] x5 : 0000000000000040 x4 : 00000042828e1000 
[    5.621161] x3 : 0000000000000004 x2 : 0000000000000005 
[    5.621172] x1 : 0000000000000000 x0 : ffffff9f30c04892 
[    5.621184] \x0aPC: 0xffffff9f2ece6224:
[    5.621190] 6224  b9402821 37200581 51000400 52800014 b9000e60 2a1403e0 a94153f3 a8c37bfd
[    5.621226] 6244  d65f03c0 91056260 97f06911 35fffde0 d000f8e0 91224800 39400c01 35fffd61
[    5.621261] 6264  d4210000 52800021 39000c01 17ffffe7 52800014 b4fffe01 3941c020 3707fdc0
[    5.621296] 6284  b9402820 36180240 f90013f5 91010275 d2800002 d2808001 aa1503e0 97ef9a9c
[    5.621332] \x0aLR: 0xffffff9f2ece6210:
[    5.621337] 6210  f9431e61 34000700 7100041f 540002c0 b4000061 b9402821 37200581 51000400
[    5.621373] 6230  52800014 b9000e60 2a1403e0 a94153f3 a8c37bfd d65f03c0 91056260 97f06911
[    5.621407] 6250  35fffde0 d000f8e0 91224800 39400c01 35fffd61 d4210000 52800021 39000c01
[    5.621442] 6270  17ffffe7 52800014 b4fffe01 3941c020 3707fdc0 b9402820 36180240 f90013f5
[    5.621477] \x0aSP: 0xffffffe155bdb9f0:
[    5.621483] b9f0  2ece6250 ffffff9f 55bdba30 ffffffe1 2ece6264 ffffff9f 60400145 00000000
[    5.621517] ba10  b1bda000 ffffffe1 b1055e18 ffffffe1 ffffffff ffffffff 2e8ffc98 ffffff9f
[    5.621551] ba30  55bdba60 ffffffe1 2ece63cc ffffff9f b1066080 ffffffe1 b1055e18 ffffffe1
[    5.621585] ba50  30a05d90 ffffff9f 7a81e143 403a4a6d 55bdba90 ffffffe1 2ecf8bf8 ffffff9f
[    5.621620] 
[    5.621626] ---[ end trace 24ec051baadc3c86 ]---
[    5.621632] Call trace:
[    5.621638] Exception stack(0xffffffe155bdb840 to 0xffffffe155bdb970)
[    5.621646] b840: ffffffe1b1bda000 0000008000000000 00000000834df000 ffffff9f2ece6264
[    5.621654] b860: 0000000060400145 ffffff9f30c0cb00 00000000000055fe 0000000000005603
[    5.621661] b880: 0000000000000006 000000000009c3ff 000000000000000c 00000000fa83b2da
[    5.621669] b8a0: 0000000000000001 ffffffe1b1bda158 0000000000000000 0000000000000000
[    5.621676] b8c0: 0000000000000000 0000000000000001 0000000000000000 ffffff9f2ece63c4
[    5.621684] b8e0: ffffffe155bdb930 ffffff9f2e8ff8c4 0000000000000005 403a4a6d7a81e143
[    5.621691] b900: ffffff9f30c04892 0000000000000000 0000000000000005 0000000000000004
[    5.621699] b920: 00000042828e1000 0000000000000040 0000000000000003 0000000000000038
[    5.621706] b940: ffffff9f313598e8 ffffff9f30c0c000 ffffffe1b105a958 ffffff9f30c0cb00
[    5.621713] b960: 00000000000055fe 0000000000005603
[    5.621721] [<ffffff9f2ece6264>] _regulator_disable+0x74/0x190
[    5.621727] [<ffffff9f2ece63cc>] regulator_disable+0x4c/0xb0
[    5.621735] [<ffffff9f2ecf8bf8>] cpr3_regulator_disable+0x70/0x228
[    5.621742] [<ffffff9f2ece53dc>] _regulator_do_disable+0x8c/0xb8
[    5.621749] [<ffffff9f2ece62b4>] _regulator_disable+0xc4/0x190
[    5.621756] [<ffffff9f2ece63cc>] regulator_disable+0x4c/0xb0
[    5.621763] [<ffffff9f2ece63f4>] regulator_disable+0x74/0xb0
[    5.621771] [<ffffff9f2ed83fbc>] adreno_regulator_disable_poll+0x3c/0x128
[    5.621778] [<ffffff9f2ed5dacc>] kgsl_pwrctrl_pwrrail+0xc4/0x180
[    5.621785] [<ffffff9f2ed5e63c>] kgsl_pwrctrl_disable+0x54/0x70
[    5.621792] [<ffffff9f2ed5f09c>] _slumber+0x11c/0x200
[    5.621798] [<ffffff9f2ed5fc74>] kgsl_pwrctrl_change_state+0x20c/0x3b8
[    5.621805] [<ffffff9f2ed5febc>] kgsl_idle_check+0x9c/0xd8
[    5.621814] [<ffffff9f2e8c3e38>] process_one_work+0x250/0x458
[    5.621821] [<ffffff9f2e8c4174>] worker_thread+0x134/0x4e0
[    5.621828] [<ffffff9f2e8cb218>] kthread+0x100/0x108
[    5.621836] [<ffffff9f2e883260>] ret_from_fork+0x10/0x30
[    5.627305] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: CODEC DAI tavil_vifeedback not registered
[    5.641267] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: CODEC DAI tavil_vifeedback not registered
[    5.655542] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: CODEC DAI tavil_vifeedback not registered
[    5.657532] init: Setting policy on /data/misc
[    5.657961] init: Found policy 21ca43de98cea7bf at /data/misc which matches expected value
[    5.666538] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: CODEC DAI tavil_vifeedback not registered
[    5.671571] sysmon-qmi: sysmon_clnt_svc_arrive: Connection established between QMI handle and adsp's SSCTL service
[    5.677708] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: CODEC DAI tavil_vifeedback not registered
[    5.683958] diag: In diag_send_feature_mask_update, control channel is not open, p: 1, 0000000000000000
[    5.688638] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: CODEC DAI tavil_vifeedback not registered
[    5.699557] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: CODEC DAI tavil_vifeedback not registered
[    5.710745] sps_register_bam_device : unable to create IPC Logging 0 for bam 0x0000000000000000
[    5.710760] sps_register_bam_device : unable to create IPC Logging 1 for bam 0x0000000000000000
[    5.710769] sps_register_bam_device : unable to create IPC Logging 2 for bam 0x0000000000000000
[    5.710775] sps_register_bam_device : unable to create IPC Logging 3 for bam 0x0000000000000000
[    5.710782] sps_register_bam_device : unable to create IPC Logging 4 for bam 0x0000000000000000
[    5.710810] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: CODEC DAI tavil_vifeedback not registered
[    5.711170] sps:BAM 0x0000000000000000 (va:0x0000000000000000) enabled: ver:0x19, number of pipes:19
[    5.711179] sps:BAM 0x0000000000000000 is registered.
[    5.715421] init: Setting policy on /data/local
[    5.715849] sps_register_bam_device : unable to create IPC Logging 0 for bam 0x0000000000000000
[    5.715860] sps_register_bam_device : unable to create IPC Logging 1 for bam 0x0000000000000000
[    5.715869] sps_register_bam_device : unable to create IPC Logging 2 for bam 0x0000000000000000
[    5.715874] sps_register_bam_device : unable to create IPC Logging 3 for bam 0x0000000000000000
[    5.715881] sps_register_bam_device : unable to create IPC Logging 4 for bam 0x0000000000000000
[    5.716018] sps:BAM 0x0000000000000000 (va:0x0000000000000000) enabled: ver:0x19, number of pipes:31
[    5.716028] sps:BAM 0x0000000000000000 is registered.
[    5.716368] init: Found policy 21ca43de98cea7bf at /data/local which matches expected value
[    5.721908] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: CODEC DAI tavil_vifeedback not registered
[    5.732961] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: CODEC DAI tavil_vifeedback not registered
[    5.736909] slimbus sb-1: of_slim: invalid E-addr
[    5.743971] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: CODEC DAI tavil_vifeedback not registered
[    5.744088] init: Setting policy on /data/vendor
[    5.744597] init: Found policy 21ca43de98cea7bf at /data/vendor which matches expected value
[    5.744979] init: Not setting policy on /data/vendor_ce
[    5.746239] init: Not setting policy on /data/vendor_de
[    5.749326] wcd-slim tasha-slim-pgd: Platform data from device tree
[    5.749507] wcd-slim tasha-slim-pgd: msm_cdc_dt_parse_vreg_info: cdc-vdd-buck: vol=[1800000 1800000]uV, curr=[650000]uA, ond 0
[    5.749534] wcd-slim tasha-slim-pgd: msm_cdc_dt_parse_vreg_info: cdc-buck-sido: vol=[1800000 1800000]uV, curr=[250000]uA, ond 0
[    5.749559] wcd-slim tasha-slim-pgd: msm_cdc_dt_parse_vreg_info: cdc-vdd-tx-h: vol=[1800000 1800000]uV, curr=[25000]uA, ond 0
[    5.749584] wcd-slim tasha-slim-pgd: msm_cdc_dt_parse_vreg_info: cdc-vdd-rx-h: vol=[1800000 1800000]uV, curr=[25000]uA, ond 0
[    5.749676] wcd-slim tasha-slim-pgd: msm_cdc_dt_parse_vreg_info: cdc-vddpx-1: vol=[1800000 1800000]uV, curr=[10000]uA, ond 0
[    5.750672] wcd-slim tasha-slim-pgd: wcd9xxx_slim_probe: probing for wcd type: 2, name: tasha-slim-pgd
[    5.751252] init: Not setting policy on /data/data
[    5.751830] init: Setting policy on /data/app-private
[    5.753874] init: Policy for /data/app-private set to 21ca43de98cea7bf modes 127/126
[    5.754921] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: CODEC DAI tavil_vifeedback not registered
[    5.756321] init: Setting policy on /data/app-ephemeral
[    5.757018] init: Policy for /data/app-ephemeral set to 21ca43de98cea7bf modes 127/126
[    5.758051] init: Setting policy on /data/app-asec
[    5.759393] init: Policy for /data/app-asec set to 21ca43de98cea7bf modes 127/126
[    5.760747] init: Setting policy on /data/app-lib
[    5.762426] init: Policy for /data/app-lib set to 21ca43de98cea7bf modes 127/126
[    5.764365] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: CODEC DAI tavil_vifeedback not registered
[    5.764371] init: Setting policy on /data/app
[    5.765912] init: Found policy 21ca43de98cea7bf at /data/app which matches expected value
[    5.766756] init: Setting policy on /data/property
[    5.767737] init: Found policy 21ca43de98cea7bf at /data/property which matches expected value
[    5.768600] init: Setting policy on /data/tombstones
[    5.769311] init: Found policy 21ca43de98cea7bf at /data/tombstones which matches expected value
[    5.772883] init: Setting policy on /data/dalvik-cache
[    5.773543] init: Found policy 21ca43de98cea7bf at /data/dalvik-cache which matches expected value
[    5.774382] init: Setting policy on /data/ota
[    5.775112] init: Policy for /data/ota set to 21ca43de98cea7bf modes 127/126
[    5.775249] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: CODEC DAI tavil_vifeedback not registered
[    5.775660] init: Setting policy on /data/ota_package
[    5.776303] init: Policy for /data/ota_package set to 21ca43de98cea7bf modes 127/126
[    5.776828] init: Setting policy on /data/resource-cache
[    5.777544] init: Found policy 21ca43de98cea7bf at /data/resource-cache which matches expected value
[    5.778088] init: Not setting policy on /data/lost+found
[    5.778917] init: Setting policy on /data/drm
[    5.779625] init: Found policy 21ca43de98cea7bf at /data/drm which matches expected value
[    5.780160] init: Setting policy on /data/mediadrm
[    5.783915] init: Policy for /data/mediadrm set to 21ca43de98cea7bf modes 127/126
[    5.784498] init: Setting policy on /data/anr
[    5.784580] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: CODEC DAI tavil_vifeedback not registered
[    5.785601] init: Found policy 21ca43de98cea7bf at /data/anr which matches expected value
[    5.787418] init: Setting policy on /data/nfc
[    5.788151] init: Found policy 21ca43de98cea7bf at /data/nfc which matches expected value
[    5.790418] init: Setting policy on /data/backup
[    5.791194] init: Found policy 21ca43de98cea7bf at /data/backup which matches expected value
[    5.791717] init: Setting policy on /data/ss
[    5.792363] init: Policy for /data/ss set to 21ca43de98cea7bf modes 127/126
[    5.792811] init: Setting policy on /data/system
[    5.793235] init: Found policy 21ca43de98cea7bf at /data/system which matches expected value
[    5.794306] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: CODEC DAI tavil_vifeedback not registered
[    5.796657] init: Not setting policy on /data/system_de
[    5.798344] init: Not setting policy on /data/system_ce
[    5.799058] init: Not setting policy on /data/misc_de
[    5.800809] init: Not setting policy on /data/misc_ce
[    5.801214] init: Not setting policy on /data/user
[    5.802924] init: Not setting policy on /data/user_de
[    5.804007] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: CODEC DAI tavil_vifeedback not registered
[    5.805224] init: Not setting policy on /data/media
[    5.805984] init: Setting policy on /data/media/obb
[    5.806625] init: Found policy 21ca43de98cea7bf at /data/media/obb which matches expected value
[    5.807160] init: Setting policy on /data/cache
[    5.807781] init: Found policy 21ca43de98cea7bf at /data/cache which matches expected value
[    5.810148] init: starting service 'exec 3 (/system/bin/vdc --wait cryptfs init_user0)'...
[    5.811362] init: SVC_EXEC pid 705 (uid 0 gid 0+0 context default) started; waiting...
[    5.814251] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: CODEC DAI tavil_vifeedback not registered
[    5.819044] mdss_fb_blank_sub: mdss_fb_blank+0x78/0x178 mode:0
[    5.819284] init: Received control message 'start' for 'bootanim' from pid: 625 (/system/bin/surfaceflinger)
[    5.823304] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: CODEC DAI tavil_vifeedback not registered
[    5.831662] fts_touch 5-0049: fts_resume power state : 0
[    5.831677] fts_touch 5-0049: fts_resume: calling resume from active state, skipping
[    5.833697] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: CODEC DAI tavil_vifeedback not registered
[    5.835955] vdc: Waited 0ms for vold
[    5.836191] lge_battery: bm_vote_fcc_update: vote id[1], set cur[1000000]
[    5.842516] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: CODEC DAI tavil_vifeedback not registered
[    5.850359] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: CODEC DAI tavil_vifeedback not registered
[    5.858220] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: CODEC DAI tavil_vifeedback not registered
[    5.866037] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: CODEC DAI tavil_vifeedback not registered
[    5.873874] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: CODEC DAI tavil_vifeedback not registered
[    5.882522] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: CODEC DAI tavil_vifeedback not registered
[    5.889313] init: Service 'exec 3 (/system/bin/vdc --wait cryptfs init_user0)' (pid 705) exited with status 0 waiting took 0.078000 seconds
[    5.890243] selinux: SELinux: Skipping restorecon_recursive(/data)\x0a
[    5.890385] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: CODEC DAI tavil_vifeedback not registered
[    5.890774] init: starting service 'exec 4 (/system/bin/tzdatacheck /system/usr/share/zoneinfo /data/misc/zoneinfo)'...
[    5.893020] init: SVC_EXEC pid 713 (uid 1000 gid 1000+0 context default) started; waiting...
[    5.898381] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: CODEC DAI tavil_vifeedback not registered
[    5.906309] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: CODEC DAI tavil_vifeedback not registered
[    5.914030] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: CODEC DAI tavil_vifeedback not registered
[    5.915792] init: Service 'exec 4 (/system/bin/tzdatacheck /system/usr/share/zoneinfo /data/misc/zoneinfo)' (pid 713) exited with status 0 waiting took 0.024000 seconds
[    5.916067] init: processing action (post-fs-data) from (/init.usb.rc:6)
[    5.920263] wcd-slim tasha-slim-pgd: wcd9xxx_slim_probe: failed to get slimbus tasha-slim-pgd logical address: -6
[    5.920592] init: Setting policy on /data/adb
[    5.921799] init: Found policy 21ca43de98cea7bf at /data/adb which matches expected value
[    5.921815] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: CODEC DAI tavil_vifeedback not registered
[    5.921837] init: processing action (post-fs-data) from (/vendor/etc/init/hw/init.taimen.rc:250)
[    5.922718] init: starting service 'netd'...
[    5.924915] wcd-slim tavil-slim-pgd: Platform data from device tree
[    5.924948] wcd-slim tavil-slim-pgd: msm_cdc_dt_parse_vreg_info: cdc-vdd-buck: vol=[1800000 1800000]uV, curr=[650000]uA, ond 0
[    5.925055] wcd-slim tavil-slim-pgd: msm_cdc_dt_parse_vreg_info: cdc-buck-sido: vol=[1800000 1800000]uV, curr=[250000]uA, ond 0
[    5.925072] wcd-slim tavil-slim-pgd: msm_cdc_dt_parse_vreg_info: cdc-vdd-tx-h: vol=[1800000 1800000]uV, curr=[25000]uA, ond 0
[    5.925085] wcd-slim tavil-slim-pgd: msm_cdc_dt_parse_vreg_info: cdc-vdd-rx-h: vol=[1800000 1800000]uV, curr=[25000]uA, ond 0
[    5.925098] wcd-slim tavil-slim-pgd: msm_cdc_dt_parse_vreg_info: cdc-vddpx-1: vol=[1800000 1800000]uV, curr=[10000]uA, ond 0
[    5.925199] wcd-slim tavil-slim-pgd: wcd9xxx_slim_probe: probing for wcd type: 4, name: tavil-slim-pgd
[    5.927060] init: start_waiting_for_property("sys.time.set", "true"): already set
[    5.927095] init: processing action (post-fs-data) from (/system/etc/init/bootstat.rc:7)
[    5.927774] init: Created socket '/dev/socket/netd', mode 660, user 0, group 1000
[    5.928893] init: Created socket '/dev/socket/dnsproxyd', mode 660, user 0, group 3003
[    5.929875] init: Created socket '/dev/socket/mdns', mode 660, user 0, group 1000
[    5.930950] init: Created socket '/dev/socket/fwmarkd', mode 660, user 0, group 3003
[    5.931226] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: CODEC DAI tavil_vifeedback not registered
[    5.932753] init: processing action (post-fs-data) from (/system/etc/init/incidentd.rc:21)
[    5.934291] init: processing action (post-fs-data) from (/system/etc/init/otapreopt.rc:3)
[    5.934749] init: starting service 'exec 5 (/system/bin/otapreopt_slot)'...
[    5.936446] init: SVC_EXEC pid 715 (uid 0 gid 0+0 context default) started; waiting...
[    5.939243] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: CODEC DAI tavil_vifeedback not registered
[    5.947026] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: CODEC DAI tavil_vifeedback not registered
[    5.954733] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: CODEC DAI tavil_vifeedback not registered
[    5.962520] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: CODEC DAI tavil_vifeedback not registered
[    5.970321] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: CODEC DAI tavil_vifeedback not registered
[    5.978164] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: CODEC DAI tavil_vifeedback not registered
[    5.980308] slimbus:1 laddr:0xcf, EAPC:0x1:0x50
[    5.981426] wcd-slim tavil-slim-pgd: wcd9xxx_slim_device_up: slim device up, dev_up = 1
[    5.981506] slimbus:1 laddr:0xce, EAPC:0x0:0x50
[    5.986485] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: CODEC DAI tavil_vifeedback not registered
[    5.988008] wcd-slim tavil-slim-pgd: wcd934x_get_codec_info: wcd9xxx chip id major 0x108, minor 0x1
[    5.988026] wcd9xxx_core_res_init: num_irqs = 32, num_irq_regs = 4
[    5.995601] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: CODEC DAI tavil_vifeedback not registered
[    6.005230] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: CODEC DAI tavil_vifeedback not registered
[    6.013261] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: CODEC DAI tavil_vifeedback not registered
[    6.015542] init: Service 'exec 5 (/system/bin/otapreopt_slot)' (pid 715) exited with status 0 waiting took 0.079000 seconds
[    6.018149] selinux: SELinux: Skipping restorecon_recursive(/data/dalvik-cache/arm)\x0a
[    6.018427] selinux: SELinux: Skipping restorecon_recursive(/data/dalvik-cache/arm64)\x0a
[    6.018591] selinux: SELinux:  Could not stat /data/dalvik-cache/mips: No such file or directory.\x0a
[    6.018694] selinux: SELinux:  Could not stat /data/dalvik-cache/mips64: No such file or directory.\x0a
[    6.018784] selinux: SELinux:  Could not stat /data/dalvik-cache/x86: No such file or directory.\x0a
[    6.018931] selinux: SELinux:  Could not stat /data/dalvik-cache/x86_64: No such file or directory.\x0a
[    6.018996] init: processing action (post-fs-data) from (/system/etc/init/recovery-persist.rc:1)
[    6.020184] init: starting service 'exec 6 (/system/bin/recovery-persist)'...
[    6.021254] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: CODEC DAI tavil_vifeedback not registered
[    6.021285] init: processing action (post-fs-data) from (/system/etc/init/statsd.rc:25)
[    6.023126] init: processing action (post-fs-data) from (/vendor/etc/init/esed.rc:6)
[    6.025648] init: starting service 'vendor.ese_load'...
[    6.028282] init: processing action (post-fs-data) from (/vendor/etc/init/hostapd.android.rc:9)
[    6.029402] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: CODEC DAI tavil_vifeedback not registered
[    6.032932] init: processing action (post-fs-data) from (/vendor/etc/init/init.taimen.logging.rc:1)
[    6.035480] init: processing action (ro.crypto.state=encrypted && ro.crypto.type=file && zygote-start) from (/init.rc:574)
[    6.035969] init: starting service 'update_verifier_nonencrypted'...
[    6.037262] init: SVC_EXEC pid 725 (uid 0 gid 2001+1 context default) started; waiting...
[    6.037434] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: CODEC DAI tavil_vifeedback not registered
[    6.045425] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: CODEC DAI tavil_vifeedback not registered
[    6.053724] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: CODEC DAI tavil_vifeedback not registered
[    6.054858] init: Service 'exec 6 (/system/bin/recovery-persist)' (pid 723) exited with status 0
[    6.061714] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: CODEC DAI tavil_vifeedback not registered
[    6.063127] update_verifier: Started with arg 1: nonencrypted
[    6.069858] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: CODEC DAI tavil_vifeedback not registered
[    6.071419] update_verifier: Booting slot 0: isSlotMarkedSuccessful=1
[    6.071438] update_verifier: Leaving update_verifier.
[    6.073361] init: Service 'update_verifier_nonencrypted' (pid 725) exited with status 0 waiting took 0.036000 seconds
[    6.074623] init: starting service 'zygote'...
[    6.077253] init: starting service 'zygote_secondary'...
[    6.079441] init: Created socket '/dev/socket/zygote', mode 660, user 0, group 1000
[    6.083957] init: processing action (zygote-start) from (/vendor/etc/init/hw/init.taimen.rc:268)
[    6.084139] tavil_codec tavil_codec: tavil_soc_codec_probe()
[    6.087300] init: Setting policy on /data/tombstones
[    6.087479] init: Found policy 21ca43de98cea7bf at /data/tombstones which matches expected value
[    6.090439] init: Created socket '/dev/socket/zygote_secondary', mode 660, user 0, group 1000
[    6.096652] init: Not setting policy on /data/media
[    6.111656] init: Setting policy on /data/hostapd
[    6.112415] init: Policy for /data/hostapd set to 21ca43de98cea7bf modes 127/126
[    6.112885] tavil_codec tavil_codec: tavil_dsd_init: DSD unsupported for this codec version
[    6.113343] wlan: Loading driver v5.2.1.1K ()
[    6.153568] msm-pcm-routing soc:qcom,msm-pcm-routing: ASoC: no dapm match for MI2S_TX --> MI2S_TX --> MultiMedia10 Mixer
[    6.153585] msm-pcm-routing soc:qcom,msm-pcm-routing: ASoC: Failed to add route MI2S_TX -> MI2S_TX -> MultiMedia10 Mixer
[    6.153921] msm-pcm-routing soc:qcom,msm-pcm-routing: ASoC: no dapm match for AUX_PCM_TX --> AUX_PCM_TX --> MultiMedia10 Mixer
[    6.153930] msm-pcm-routing soc:qcom,msm-pcm-routing: ASoC: Failed to add route AUX_PCM_TX -> AUX_PCM_TX -> MultiMedia10 Mixer
[    6.154029] msm-pcm-routing soc:qcom,msm-pcm-routing: ASoC: no dapm match for SEC_AUX_PCM_TX --> SEC_AUX_PCM_TX --> MultiMedia10 Mixer
[    6.154036] msm-pcm-routing soc:qcom,msm-pcm-routing: ASoC: Failed to add route SEC_AUX_PCM_TX -> SEC_AUX_PCM_TX -> MultiMedia10 Mixer
[    6.154136] msm-pcm-routing soc:qcom,msm-pcm-routing: ASoC: no dapm match for TERT_AUX_PCM_TX --> TERT_AUX_PCM_TX --> MultiMedia10 Mixer
[    6.154143] msm-pcm-routing soc:qcom,msm-pcm-routing: ASoC: Failed to add route TERT_AUX_PCM_TX -> TERT_AUX_PCM_TX -> MultiMedia10 Mixer
[    6.154242] msm-pcm-routing soc:qcom,msm-pcm-routing: ASoC: no dapm match for QUAT_AUX_PCM_TX --> QUAT_AUX_PCM_TX --> MultiMedia10 Mixer
[    6.154250] msm-pcm-routing soc:qcom,msm-pcm-routing: ASoC: Failed to add route QUAT_AUX_PCM_TX -> QUAT_AUX_PCM_TX -> MultiMedia10 Mixer
[    6.154512] msm-pcm-routing soc:qcom,msm-pcm-routing: ASoC: no dapm match for TERT_MI2S_TX --> TERT_MI2S_TX --> MultiMedia10 Mixer
[    6.154520] msm-pcm-routing soc:qcom,msm-pcm-routing: ASoC: Failed to add route TERT_MI2S_TX -> TERT_MI2S_TX -> MultiMedia10 Mixer
[    6.154607] msm-pcm-routing soc:qcom,msm-pcm-routing: ASoC: no dapm match for INT2_MI2S_TX --> INT2_MI2S_TX --> MultiMedia10 Mixer
[    6.154615] msm-pcm-routing soc:qcom,msm-pcm-routing: ASoC: Failed to add route INT2_MI2S_TX -> INT2_MI2S_TX -> MultiMedia10 Mixer
[    6.154703] msm-pcm-routing soc:qcom,msm-pcm-routing: ASoC: no dapm match for INT3_MI2S_TX --> INT3_MI2S_TX --> MultiMedia10 Mixer
[    6.154710] msm-pcm-routing soc:qcom,msm-pcm-routing: ASoC: Failed to add route INT3_MI2S_TX -> INT3_MI2S_TX -> MultiMedia10 Mixer
[    6.154726] sps:BAM 0x0000000000000000 (va:0x0000000000000000) enabled: ver:0x19, number of pipes:18
[    6.166632] wlan_hdd_state wlan major(225) initialized
[    6.166641] wlan: driver loaded
[    6.166791] init: Command 'write /sys/kernel/boot_wlan/boot_wlan 1' action=zygote-start (/vendor/etc/init/hw/init.taimen.rc:308) took 54ms and succeeded
[    6.170531] wcd-dsp-mgr soc:qcom,wcd-dsp-mgr: for wdsp segments only will be dumped.
[    6.178817] init: Setting policy on /data/connectivity
[    6.181540] init: Policy for /data/connectivity set to 21ca43de98cea7bf modes 127/126
[    6.182481] init: Setting policy on /data/dpm
[    6.183059] init: Found policy 21ca43de98cea7bf at /data/dpm which matches expected value
[    6.183288] wcd-dsp-mgr soc:qcom,wcd-dsp-mgr: bound tavil_codec (ops wcd_ctrl_component_ops)
[    6.183328] wcd-spi-v2 spi2.0: wcd_spi_component_bind: Failed debugfs init
[    6.183407] wcd-dsp-mgr soc:qcom,wcd-dsp-mgr: bound spi2.0 (ops wcd_spi_component_ops)
[    6.183421] wcd-dsp-mgr soc:qcom,wcd-dsp-mgr: bound soc:qcom,glink-spi-xprt-wdsp (ops glink_component_ops)
[    6.226427] init: processing action (load_persist_props_action) from (/init.rc:268)
[    6.228356] init: starting service 'logd-reinit'...
[    6.229527] init: processing action (firmware_mounts_complete) from (/init.rc:274)
[    6.229646] init: processing action (early-boot) from (/vendor/etc/init/hw/init.taimen.rc:403)
[    6.229836] init: start_waiting_for_property("sys.qcom.devup", "1"): already set
[    6.229948] init: start_waiting_for_property("sys.all.modules.ready", "1"): already set
[    6.240171] init: processing action (early-boot) from (/vendor/etc/init/hw/init.wahoo.usb.rc:17)
[    6.241025] init: Command 'mount configfs none /config' action=early-boot (/vendor/etc/init/hw/init.wahoo.usb.rc:18) took 0ms and failed: mount() failed: Device or resource busy
[    6.249026] logd.daemon: reinit
[    6.252691] init: Service 'logd-reinit' (pid 753) exited with status 0
[    6.258277] Mass Storage Function, version: 2009/09/11
[    6.258289] LUN: removable file: (no medium)
[    6.285803] ueventd: firmware: could not find firmware for tas2557_cal.bin
[    6.285916] ueventd: loading /devices/soc/c1b5000.i2c/i2c-7/7-004d/firmware/tas2557_cal.bin took 2210ms
[    6.292602] file system registered
[    6.329213] pn81a spi1.0: ese_open: NFC controller found
[    6.330572] nq-nci 8-0028: setting ese_gpio high
[    6.354155] tas2557s 7-004d: Couldn't load tas2557_cal.bin
[    6.354169] tas2557s 7-004d: FW Size       = 51609
[    6.354174] tas2557s 7-004d: Checksum      = 0x35E64F0F
[    6.354178] tas2557s 7-004d: PPC Version   = 0x5C000
[    6.354182] tas2557s 7-004d: FW  Version    = 0x1010000
[    6.354185] tas2557s 7-004d: Driver Version= 0x0300
[    6.354190] tas2557s 7-004d: Timestamp     = 1501470973
[    6.354193] tas2557s 7-004d: DDC Name      = tas2557s_PG21_uCDSP
[    6.354197] tas2557s 7-004d: Description   = TI SmartAmp
[    6.356206] tas2557s 7-004d: find default configuration 0
[    6.380787] f_cdev_alloc: port_name:at_usb0 (0000000000000000) portno:(0)
[    6.381062] init: Command 'mkdir /config/usb_gadget/g1/functions/cser.dun.0' action=early-boot (/vendor/etc/init/hw/init.wahoo.usb.rc:35) took 87ms and succeeded
[    6.385280] f_cdev_alloc: port_name:at_usb1 (0000000000000000) portno:(1)
[    6.409766] init: Service 'vendor.ese_load' (pid 724) exited with status 1
[    6.426484] init: processing action (early-boot) from (/system/etc/init/installd.rc:5)
[    6.429562] tas2557s 7-004d: load program 0 (Tuning Mode)
[    6.429572] tas2557s 7-004d: TAS2557 load data: Tuning Mode Program, Blocks = 3, Block Type = 13
[    6.435780] init: processing action (boot) from (/init.rc:581)
[    6.446559] msm_audrx_init: dev_namesoc:qcom,msm-dai-q6:qcom,msm-dai-q6-sb-0-rx
[    6.454127] init: starting service 'vendor.thermal-engine'...
[    6.455692] apr_tal_notify_state: Channel state[0]
[    6.458377] init: Created socket '/dev/socket/thermal-send-client', mode 666, user 1000, group 1000
[    6.459689] init: Created socket '/dev/socket/thermal-recv-client', mode 660, user 1000, group 1000
[    6.460850] init: Created socket '/dev/socket/thermal-recv-passive-client', mode 666, user 1000, group 1000
[    6.472062] init: starting service 'hidl_memory'...
[    6.477193] init: starting service 'healthd'...
[    6.483202] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: no source widget found for Handset 2nd Mic
[    6.483214] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: Failed to add route Handset 2nd Mic -> direct -> MIC BIAS3
[    6.483238] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: no source widget found for Handset 3rd Mic
[    6.483242] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: Failed to add route Handset 3rd Mic -> direct -> MIC BIAS4
[    6.483980] init: starting service 'vr_hwc'...
[    6.484622] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: no sink widget found for SpkrLeft IN
[    6.484630] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: Failed to add route SPK1 OUT -> direct -> SpkrLeft IN
[    6.484653] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: no sink widget found for SpkrRight IN
[    6.484657] msm8998-asoc-snd 1711a000.sound-tavil: ASoC: Failed to add route SPK2 OUT -> direct -> SpkrRight IN
[    6.499593] init: starting service 'vendor.audio-hal-2-0'...
[    6.507358] init: starting service 'vendor.bluetooth-1-0'...
[    6.509879] init: starting service 'vendor.camera-provider-2-4'...
[    6.511825] init: couldn't write 769 to /dev/cpuset/camera-daemon/tasks: No such file or directory
[    6.516229] init: starting service 'vendor.cas-hal-1-0'...
[    6.529099] init: starting service 'vendor.contexthub-hal-1-0'...
[    6.532989] init: starting service 'vendor.drm-hal-1-0'...
[    6.534252] healthd: Unknown power supply type 'Wipower'
[    6.534460] healthd: Unknown power supply type 'BMS'
[    6.534916] healthd: Unknown power supply type 'Main'
[    6.541156] healthd: Unknown power supply type 'Parallel'
[    6.541451] init: starting service 'vendor.drm-clearkey-hal-1-1'...
[    6.552128] init: starting service 'vendor.drm-widevine-hal-1-1'...
[    6.556470] init: starting service 'vendor.dumpstate-1-0'...
[    6.562587] init: starting service 'vendor.gnss_service'...
[    6.569747] init: starting service 'vendor.health-hal-2-0'...
[    6.571930] init: Opened file '/dev/kmsg', flags 1
[    6.577099] init: starting service 'vendor.light-hal-2-0'...
[    6.580506] init: starting service 'vendor.memtrack-hal-1-0'...
[    6.586892] FG: fg_get_battery_temp: batt temperature original:430, tuned:375
[    6.593895] healthd: battery l=80 v=3902 t=37.5 h=2 st=2 c=964 fc=3229000 cc=291 chg=u
[    6.600657] init: starting service 'nfc_hal_service'...
[    6.606266] init: starting service 'vendor.oemlock_hal'...
[    6.621319] init: starting service 'vendor.oemlock_bridge'...
[    6.625758] init: starting service 'vendor.power-hal-1-2'...
[    6.627212] init: Created socket '/dev/socket/oemlock', mode 666, user 1000, group 1000
[    6.639612] init: starting service 'vendor.sensors-hal-1-0'...
[    6.695014] init: starting service 'vendor.usb-hal-1-1'...
[    6.706354] init: starting service 'vendor.vibrator-1-2'...
[    6.732996] init: starting service 'vendor.vr-wahoo-1-0'...
[    6.733563] healthd: Unknown power supply type 'Wipower'
[    6.736283] healthd: Unknown power supply type 'BMS'
[    6.737147] healthd: Unknown power supply type 'Main'
[    6.737771] healthd: Unknown power supply type 'Parallel'
[    6.742545] init: starting service 'vendor.wifi_hal_legacy'...
[    6.763140] init: starting service 'vendor.esed'...
[    6.788839] init: Command 'class_start hal' action=boot (/init.rc:676) took 335ms and succeeded
[    6.789934] init: starting service 'vendor.msm_irqbalance'...
[    6.801072] init: starting service 'vendor.per_mgr'...
[    6.804493] init: Created socket '/dev/socket/msm_irqbalance', mode 660, user 0, group 1000
[    6.809652] FG: fg_get_battery_temp: batt temperature original:430, tuned:374
[    6.811996] init: starting service 'vendor.sensors'...
[    6.816625] init: starting service 'vendor.irsc_util'...
[    6.827072] init: starting service 'vendor.rmt_storage'...
[    6.831451] init: starting service 'vendor.tftp_server'...
[    6.837967] init: starting service 'vendor.pd_mapper'...
[    6.848454] msm8998-asoc-snd 1711a000.sound-tavil: Sound card msm8998-tavil-taimen-snd-card registered
[    6.850798] healthd: battery l=80 v=3902 t=37.4 h=2 st=2 c=964 fc=3229000 cc=291 chg=u
[    6.872790] init: starting service 'audioserver'...
[    6.874276] init: starting service 'bufferhubd'...
[    6.895786] init: starting service 'lmkd'...
[    6.898938] init: Created socket '/dev/socket/pdx/system/buffer_hub/client', mode 660, user 1000, group 1000
[    6.905251] nq-nci 8-0028: setting ese_gpio high
[    6.909738] init: Created socket '/dev/socket/lmkd', mode 660, user 1000, group 1000
[    6.913009] init: starting service 'performanced'...
[    6.934374] init: starting service 'thermalservice'...
[    6.957461] init: starting service 'virtual_touchpad'...
[    6.965854] init: Created socket '/dev/socket/pdx/system/performance/client', mode 666, user 1000, group 1000
[    6.967514] init: Command 'class_start core' action=boot (/init.rc:678) took 178ms and succeeded
[    6.967548] init: processing action (boot) from (/init.usb.rc:21)
[    6.967783] init: processing action (persist.sys.usb.config=* && boot) from (/init.usb.rc:102)
[    6.967811] init: processing action (boot) from (/vendor/etc/init/hw/init.taimen.rc:445)
[    7.029880] init: Service 'vendor.irsc_util' (pid 819) exited with status 0
[    7.041562] servloc: init_service_locator: Service locator initialized
[    7.042400] servloc: service_locator_svc_arrive: Connection established with the Service locator
[    7.072901] servloc: service_locator_send_msg: No matching domains found
[    7.072919] error locating audio-PD
[    7.072924] audio-PDs matched:0
[    7.074749] servloc: service_locator_send_msg: No matching domains found
[    7.074761] audio_pdr_locator_callback: Service avs/audio returned invalid total domains 0
[    7.074781] audio_notifer_reg_service: service SSR_ADSP is in use
[    7.075113] servloc: service_locator_send_msg: No matching domains found
[    7.075125] error locating audio-PD
[    7.075130] audio-PDs matched:0
[    7.086769] init: processing action (boot) from (/vendor/etc/init/hw/init.wahoo.usb.rc:57)
[    7.093059] init: processing action (boot) from (/system/etc/init/bootstat.rc:61)
[    7.093099] init: processing action (boot) from (/system/etc/init/dumpstate.rc:1)
[    7.093594] init: processing action (boot) from (/vendor/etc/init/android.hardware.dumpstate@1.0-service.wahoo.rc:7)
[    7.095134] init: processing action (boot) from (/vendor/etc/init/android.hardware.usb@1.1-service.wahoo.rc:6)
[    7.115702] init: processing action (boot) from (/vendor/etc/init/init-taimen.rc:27)
[    7.116283] init: processing action (enable_property_trigger) from (<Builtin Action>:0)
[    7.116539] init: processing action (security.perf_harden=1) from (/init.rc:735)
[    7.116926] init: processing action (sys.listeners.registered=true) from (/vendor/etc/init/hw/init.taimen.rc:246)
[    7.119008] init: starting service 'vendor.ipastart_sh'...
[    7.122172] init: processing action (init.svc.zygote=running) from (/vendor/etc/init/hw/init.taimen.rc:520)
[    7.122877] init: starting service 'vendor.folio_daemon'...
[    7.124535] init: processing action (persist.sys.ssr.restart_level=*) from (/vendor/etc/init/hw/init.taimen.rc:541)
[    7.125614] init: starting service 'vendor.ssr_setup'...
[    7.132552] init: processing action (init.svc.vendor.per_mgr=running) from (/vendor/etc/init/hw/init.taimen.rc:628)
[    7.136086] init: starting service 'vendor.per_proxy'...
[    7.139769] init: processing action (persist.traced.enable=1) from (/system/etc/init/perfetto.rc:48)
[    7.144735] init: starting service 'traced'...
[    7.158122] ipa ipa3_uc_state_check:302 uC interface not initialized
[    7.160758] init: Created socket '/dev/socket/traced_consumer', mode 666, user 0, group 0
[    7.161953] init: Created socket '/dev/socket/traced_producer', mode 666, user 0, group 0
[    7.169986] init: starting service 'traced_probes'...
[    7.180455] init: processing action (ro.boot.slot_suffix=*) from (/system/etc/init/update_engine.rc:8)
[    7.180492] init: processing action (sys.all.modules.ready=1) from (/vendor/etc/init/init-taimen.rc:37)
[    7.183775] fts_touch 5-0049: [fw_update] try:1
[    7.183794] fts_touch 5-0049: fts_fw_update : firmware name : ftm4_fw.ftb
[    7.185166] init: Opened file '/dev/kmsg', flags 1
[    7.198127] fts_touch 5-0049: fts_fw_compare: bin_fw_ver_addr_1 = 0x0001CA7C , bin_fw_ver_addr_2 = 0x0001CA7D
[    7.198552] fts_touch 5-0049: fts_fw_compare : binary[0.44.0] device[0.44.0] -> update: 0
[    7.198560] fts_touch 5-0049: fts_fw_update : skip fw_upgrade(ic_fw_ver == bin_fw_ver)
[    7.203649] init: processing action (nonencrypted) from (/init.rc:680)
[    7.203740] init: Could not start service 'flash_recovery' as part of class 'main': Cannot find '/system/bin/install-recovery.sh': No such file or directory
[    7.205203] init: starting service 'vendor.adsprpcd'...
[    7.207205] init: starting service 'vendor.imsqmidaemon'...
[    7.211636] init: Could not start service 'vendor.qmuxd' as part of class 'main': Cannot find '/vendor/bin/qmuxd': No such file or directory
[    7.213610] init: Created socket '/dev/socket/ims_qmid', mode 660, user 1000, group 1001
[    7.216121] init: starting service 'vendor.cnd'...
[    7.219076] init: starting service 'vendor.netmgrd'...
[    7.232199] init: starting service 'vendor.port-bridge'...
[    7.242720] subsys-restart: __subsystem_get(): Changing subsys fw_name to modem
[    7.256116] 'opened /dev/adsprpc-smd c 226 0'
[    7.256864] init: starting service 'vendor.ipacm'...
[    7.282444] init: starting service 'vendor.qti'...
[    7.287844] subsys-pil-tz soc:qcom,ipa_fws@1e08000: ipa_fws: loading from 0x0000000000000000 to 0x0000000000000000
[    7.298273] init: starting service 'cameraserver'...
[    7.299800] init: starting service 'drm'...
[    7.307048] init: couldn't write 917 to /dev/cpuset/camera-daemon/tasks: No such file or directory
[    7.320116] init: starting service 'incidentd'...
[    7.321694] init: starting service 'installd'...
[    7.323120] init: starting service 'keystore'...
[    7.326643] init: starting service 'mediadrm'...
[    7.335573] ueventd: firmware: loading 'modem.mdt' for '/devices/soc/4080000.qcom,mss/firmware/modem.mdt'
[    7.340872] init: starting service 'mediaextractor'...
[    7.344465] init: starting service 'mediametrics'...
[    7.347706] init: starting service 'media'...
[    7.361774] init: starting service 'statsd'...
[    7.363515] init: starting service 'storaged'...
[    7.365148] init: starting service 'wificond'...
[    7.371889] init: Created socket '/dev/socket/statsdw', mode 222, user 1066, group 1066
[    7.371960] init: starting service 'vendor.media.omx'...
[    7.373369] init: Failed to open file '/d/mmc0/mmc0:0001/ext_csd': No such file or directory
[    7.380874] ueventd: loading /devices/soc/4080000.qcom,mss/firmware/modem.mdt took 54ms
[    7.388890] init: starting service 'vendor.ril-daemon'...
[    7.420415] init: Command 'class_start main' action=nonencrypted (/init.rc:681) took 216ms and succeeded
[    7.421072] init: Service 'vendor.ssr_setup' (pid 877) exited with status 0
[    7.421918] init: starting service 'vendor.init-elabel-sh'...
[    7.431092] pil-q6v5-mss 4080000.qcom,mss: modem: loading from 0x0000000000000000 to 0x0000000000000000
[    7.490903] init: starting service 'vendor.init-radio-sh'...
[    7.497170] init: starting service 'vendor.cnss-daemon'...
[    7.504949] init: starting service 'vendor.loc_launcher'...
[    7.514766] ueventd: firmware: loading 'mba.mbn' for '/devices/soc/4080000.qcom,mss/firmware/mba.mbn'
[    7.540769] ueventd: loading /devices/soc/4080000.qcom,mss/firmware/mba.mbn took 27ms
[    7.548533] subsys-pil-tz soc:qcom,ipa_fws@1e08000: ipa_fws: Brought out of reset
[    7.553333] init: starting service 'vendor.chre'...
[    7.555193] ueventd: firmware: loading 'msadp' for '/devices/soc/4080000.qcom,mss/firmware/msadp'
[    7.555943] ueventd: firmware: could not find firmware for msadp
[    7.557332] ueventd: loading /devices/soc/4080000.qcom,mss/firmware/msadp took 2ms
[    7.561146] init: Created socket '/dev/socket/chre', mode 660, user 0, group 1000
[    7.561653] pil-q6v5-mss 4080000.qcom,mss: Debug policy not present - msadp. Continue.
[    7.561686] pil-q6v5-mss 4080000.qcom,mss: Loading MBA and DP (if present) from 0x0000000000000000 to 0x0000000000000000 size 100000
[    7.571309] selinux: SELinux: Could not get canonical path for /sys/devices/soc/4080000.qcom,mss/firmware/msadp restorecon: No such file or directory.\x0a
[    7.571398] ueventd: selinux_android_restorecon(/sys/devices/soc/4080000.qcom,mss/firmware/msadp) failed: No such file or directory
[    7.603362] tas2557s 7-004d: TAS2557 load data: Tuning Mode Program, Blocks = 3, Block Type = 1
[    7.603478] init: starting service 'gatekeeperd'...
[    7.604852] msm_thermal:set_enabled enabled = 0
[    7.605021] init: starting service 'tombstoned'...
[    7.606802] init: starting service 'update_engine'...
[    7.608377] init: starting service 'usbd'...
[    7.609555] msm_thermal:store_cpus_offlined "thermal-engine"(PID:922) request cpus offlined mask 0
[    7.609654] ipa ipa3_uc_state_check:302 uC interface not initialized
[    7.626256] init: starting service 'fps_hal'...
[    7.627499] init: Command 'class_start late_start' action=nonencrypted (/init.rc:682) took 205ms and succeeded
[    7.630814] init: Created socket '/dev/socket/tombstoned_crash', mode 666, user 1000, group 1000
[    7.631624] init: Created socket '/dev/socket/tombstoned_intercept', mode 666, user 1000, group 1000
[    7.631908] tas2557s 7-004d: TAS2557 load data: Tuning Mode Program, Blocks = 3, Block Type = 8
[    7.632221] init: Created socket '/dev/socket/tombstoned_java_trace', mode 666, user 1000, group 1000
[    7.635957] pil-q6v5-mss 4080000.qcom,mss: MBA boot done
[    7.667916] ipa ipa3_uc_state_check:302 uC interface not initialized
[    7.675773] init: processing action (init.svc.mediadrm=running) from (/vendor/etc/init/android.hardware.drm@1.1-service.widevine.rc:1)
[    7.680553] init: Service 'vendor.ipastart_sh' (pid 872) exited with status 0
[    7.681038] init: Service 'vendor.init-elabel-sh' (pid 950) exited with status 0
[    7.682128] init: starting service 'vendor.move_data_sh'...
[    7.688409] ueventd: firmware: loading 'modem.b02' for '/devices/soc/4080000.qcom,mss/firmware/modem.b02'
[    7.693739] msm_qti_pp_get_rms_value_control, back not active to query rms be_idx:3
[    7.694034] ueventd: loading /devices/soc/4080000.qcom,mss/firmware/modem.b02 took 5ms
[    7.698922] msm_voice_sound_focus_get: Error getting Sound Focus Params, err=-22
[    7.699050] msm_voice_source_tracking_get: Error getting Source Tracking Params, err=-22
[    7.699148] msm_audio_get_copp_idx_from_port_id: Invalid FE, exiting
[    7.699153] msm_audio_sound_focus_get: Could not get copp idx for port_id=16385
[    7.699248] msm_audio_get_copp_idx_from_port_id: Invalid FE, exiting
[    7.699253] msm_audio_source_tracking_get: Could not get copp idx for port_id=16385
[    7.699357] msm_voice_sound_focus_get: Error getting Sound Focus Params, err=-22
[    7.699463] msm_voice_source_tracking_get: Error getting Source Tracking Params, err=-22
[    7.699560] msm_audio_get_copp_idx_from_port_id: Invalid FE, exiting
[    7.699564] msm_audio_sound_focus_get: Could not get copp idx for port_id=4101
[    7.699664] msm_audio_get_copp_idx_from_port_id: Invalid FE, exiting
[    7.699668] msm_audio_source_tracking_get: Could not get copp idx for port_id=4101
[    7.699775] msm_voice_sound_focus_get: Error getting Sound Focus Params, err=-22
[    7.699883] msm_voice_source_tracking_get: Error getting Source Tracking Params, err=-22
[    7.699981] msm_audio_get_copp_idx_from_port_id: Invalid FE, exiting
[    7.699986] msm_audio_sound_focus_get: Could not get copp idx for port_id=4149
[    7.701066] 'opened /dev/sdsprpc-smd c 226 2'
[    7.719115] tas2557s 7-004d: TAS2557 load data: MusicPlayback, Blocks = 4, Block Type = 4
[    7.725479] tas2557s 7-004d: TAS2557 load data: MusicPlayback, Blocks = 4, Block Type = 11
[    7.730693] ueventd: firmware: loading 'modem.b03' for '/devices/soc/4080000.qcom,mss/firmware/modem.b03'
[    7.734448] tas2557s 7-004d: TAS2557 load data: MusicPlayback, Blocks = 4, Block Type = 3
[    7.739983] msm_audio_get_copp_idx_from_port_id: Invalid FE, exiting
[    7.739995] msm_audio_source_tracking_get: Could not get copp idx for port_id=4149
[    7.748847] core_get_license_status: cmdrsp_license_result.result = 0x15 for module 0x131ff
[    7.749552] msm-ext-disp-audio-codec-rx soc:qcom,msm_ext_disp:qcom,msm-ext-disp-audio-codec-rx: msm_ext_disp_audio_type_get: codec_data, get_audio_edid_blk() or get_intf_id is NULL
[    7.753553] ueventd: loading /devices/soc/4080000.qcom,mss/firmware/modem.b03 took 23ms
[    7.796235] ueventd: firmware: loading 'modem.b04' for '/devices/soc/4080000.qcom,mss/firmware/modem.b04'
[    7.818349] gsi soc:qcom,msm_gsi: gsi_register_device:766 GSI irq is wake enabled 43
[    7.851197] ueventd: loading /devices/soc/4080000.qcom,mss/firmware/modem.b04 took 55ms
[    7.892973] ueventd: firmware: loading 'modem.b05' for '/devices/soc/4080000.qcom,mss/firmware/modem.b05'
[    7.893065] init: Service 'vendor.move_data_sh' (pid 1012) exited with status 0
[    7.893830] init: processing action (vendor.ims.QMI_DAEMON_STATUS=1) from (/vendor/etc/init/hw/init.taimen.rc:723)
[    7.897000] msm_pm_qos_add_request: add request
[    7.908188] ueventd: loading /devices/soc/4080000.qcom,mss/firmware/modem.b05 took 15ms
[    7.908808] input: uinput-folio as /devices/virtual/input/input4
[    7.913783] ueventd: firmware: loading 'modem.b06' for '/devices/soc/4080000.qcom,mss/firmware/modem.b06'
[    7.919133] ueventd: loading /devices/soc/4080000.qcom,mss/firmware/modem.b06 took 5ms
[    7.920683] init: starting service 'vendor.imsdatadaemon'...
[    7.925260] init: Received control message 'start' for 'adbd' from pid: 993 (/system/bin/usbd)
[    7.925466] init: starting service 'adbd'...
[    7.925485] QSEECOM: qseecom_load_app: App (fpctzappfingerprint) does'nt exist, loading apps for first time
[    7.930365] msm_sensor_fill_eeprom_subdevid_by_name:216 Eeprom userspace probe for onsemi_lc898123f40xc
[    7.942708] init: Created socket '/dev/socket/ims_datad', mode 660, user 1000, group 1001
[    7.942804] init: Service 'usbd' (pid 993) exited with status 0
[    7.946254] init: Created socket '/dev/socket/adbd', mode 660, user 1000, group 1000
[    7.956194] ueventd: firmware: loading 'modem.b07' for '/devices/soc/4080000.qcom,mss/firmware/modem.b07'
[    7.967963] ueventd: loading /devices/soc/4080000.qcom,mss/firmware/modem.b07 took 11ms
[    7.975263] ueventd: firmware: loading 'modem.b08' for '/devices/soc/4080000.qcom,mss/firmware/modem.b08'
[    7.989628] ueventd: loading /devices/soc/4080000.qcom,mss/firmware/modem.b08 took 14ms
[    8.006956] ueventd: firmware: loading 'modem.b09' for '/devices/soc/4080000.qcom,mss/firmware/modem.b09'
[    8.011901] init: Received control message 'interface_start' for 'android.hardware.camera.provider@2.4::ICameraProvider/legacy/0' from pid: 613 (/system/bin/hwservicemanager)
[    8.011965] init: Could not find service hosting interface android.hardware.camera.provider@2.4::ICameraProvider/legacy/0
[    8.012378] init: Received control message 'interface_start' for 'android.hardware.camera.provider@2.4::ICameraProvider/legacy/0' from pid: 613 (/system/bin/hwservicemanager)
[    8.012408] init: Could not find service hosting interface android.hardware.camera.provider@2.4::ICameraProvider/legacy/0
[    8.016851] read descriptors
[    8.016930] read strings
[    8.025978] msm_cci_init:1427: hw_version = 0x10060000
[    8.032653] msm_sensor_power_up: [05-15 17:11:40.559] imx362
[    8.032664] imx362 probe succeeded
[    8.039861] tas2557s 7-004d: TAS2557 load data: MusicPlayback, Blocks = 4, Block Type = 10
[    8.052094] ueventd: loading /devices/soc/4080000.qcom,mss/firmware/modem.b09 took 45ms
[    8.054782] msm_sensor_power_down: [05-15 17:11:40.579] imx362
[    8.056564] QSEECOM: qseecom_load_app: App with id 196611 (fpctzappfingerprint) now loaded
[    8.058523] ueventd: firmware: loading 'modem.b10' for '/devices/soc/4080000.qcom,mss/firmware/modem.b10'
[    8.061701] ueventd: loading /devices/soc/4080000.qcom,mss/firmware/modem.b10 took 3ms
[    8.077761] ueventd: firmware: loading 'modem.b11' for '/devices/soc/4080000.qcom,mss/firmware/modem.b11'
[    8.088865] input: uinput-fpc as /devices/virtual/input/input5
[    8.091563] ueventd: loading /devices/soc/4080000.qcom,mss/firmware/modem.b11 took 13ms
[    8.091607] msm_sensor_fill_eeprom_subdevid_by_name:216 Eeprom userspace probe for primax_g802l
[    8.093855] ueventd: firmware: loading 'modem.b12' for '/devices/soc/4080000.qcom,mss/firmware/modem.b12'
[    8.115735] msm_cci_init:1427: hw_version = 0x10060000
[    8.117942] msm_sensor_power_up: [05-15 17:11:40.639] imx179
[    8.117954] imx179 probe succeeded
[    8.124477] msm_sensor_power_down: [05-15 17:11:40.649] imx179
[    8.141526] msm_csid_init: CSID_VERSION = 0x30050000
[    8.165666] msm_csid_irq CSID0_IRQ_STATUS_ADDR = 0x800
[    8.190941] msm_csid_init: CSID_VERSION = 0x30050000
[    8.213182] msm_csid_irq CSID2_IRQ_STATUS_ADDR = 0x800
[    8.222796] ueventd: loading /devices/soc/4080000.qcom,mss/firmware/modem.b12 took 129ms
[    8.223911] rmnet_ipa3 started initialization
[    8.225221] ueventd: firmware: loading 'modem.b13' for '/devices/soc/4080000.qcom,mss/firmware/modem.b13'
[    8.228155] IPA SSR support = True
[    8.228172] IPA ipa-loaduC = True
[    8.228179] IPA SG support = True
[    8.228187] IPA Napi Enable = False
[    8.228194] using default for wan-rx-desc-size = 256
[    8.229513] ueventd: loading /devices/soc/4080000.qcom,mss/firmware/modem.b13 took 4ms
[    8.232434] IPA driver initialization was successful.
[    8.239204] ueventd: firmware: loading 'modem.b14' for '/devices/soc/4080000.qcom,mss/firmware/modem.b14'
[    8.240530] ueventd: loading /devices/soc/4080000.qcom,mss/firmware/modem.b14 took 1ms
[    8.244175] ueventd: firmware: loading 'modem.b16' for '/devices/soc/4080000.qcom,mss/firmware/modem.b16'
[    8.260353] init: Service 'vendor.init-radio-sh' (pid 953) exited with status 0
[    8.273067] ueventd: loading /devices/soc/4080000.qcom,mss/firmware/modem.b16 took 29ms
[    8.280861] ueventd: firmware: loading 'modem.b17' for '/devices/soc/4080000.qcom,mss/firmware/modem.b17'
[    8.294585] tas2557s 7-004d: tas2557_get_Cali_prm_r0, no calibration data
[    8.294755] tas2557s 7-004d: tas2557_get_Cali_prm_r0, no calibration data
[    8.294911] tas2557s 7-004d: tas2557_calibration_get = 0
[    8.305546] msm_dai_q6_ext_disp_drift_get:  afe port not started. status_mask = 0
[    8.305989] msm_dai_q6_ext_disp_drift_get:  afe port not started. status_mask = 0
[    8.308550] msm_pcm_volume_ctl_get substream runtime not found
[    8.308681] msm_pcm_compress_ctl_get substream runtime not found
[    8.308920] msm_adsp_stream_callback_get: ASM Stream PP Event Data Unavailable
[    8.310125] msm_pcm_volume_ctl_get substream runtime not found
[    8.310372] msm_adsp_stream_callback_get: ASM Stream PP Event Data Unavailable
[    8.310676] msm_pcm_volume_ctl_get substream runtime not found
[    8.310802] msm_pcm_volume_ctl_get substream runtime not found
[    8.314602] msm_compr_audio_effects_config_get: stream or effects inactive
[    8.314883] msm_adsp_stream_callback_get: ASM Stream PP Event Data Unavailable
[    8.317137] msm_pcm_volume_ctl_get substream runtime not found
[    8.317285] msm_pcm_compress_ctl_get substream runtime not found
[    8.317564] msm_adsp_stream_callback_get: ASM Stream PP Event Data Unavailable
[    8.318449] msm_compr_audio_effects_config_get: stream or effects inactive
[    8.318707] msm_adsp_stream_callback_get: ASM Stream PP Event Data Unavailable
[    8.322839] msm_pcm_volume_ctl_get substream runtime not found
[    8.323117] msm_adsp_stream_callback_get: ASM Stream PP Event Data Unavailable
[    8.323388] msm_pcm_volume_ctl_get substream runtime not found
[    8.323516] msm_pcm_volume_ctl_get substream runtime not found
[    8.328940] msm_pcm_volume_ctl_get substream runtime not found
[    8.329192] msm_adsp_stream_callback_get: ASM Stream PP Event Data Unavailable
[    8.329584] msm_compr_audio_effects_config_get: stream or effects inactive
[    8.329837] msm_adsp_stream_callback_get: ASM Stream PP Event Data Unavailable
[    8.331304] msm_compr_audio_effects_config_get: stream or effects inactive
[    8.331561] msm_adsp_stream_callback_get: ASM Stream PP Event Data Unavailable
[    8.332990] msm_compr_audio_effects_config_get: stream or effects inactive
[    8.333241] msm_adsp_stream_callback_get: ASM Stream PP Event Data Unavailable
[    8.334624] msm_compr_audio_effects_config_get: stream or effects inactive
[    8.334919] msm_adsp_stream_callback_get: ASM Stream PP Event Data Unavailable
[    8.336432] msm_compr_audio_effects_config_get: stream or effects inactive
[    8.336705] msm_adsp_stream_callback_get: ASM Stream PP Event Data Unavailable
[    8.338121] msm_compr_audio_effects_config_get: stream or effects inactive
[    8.338362] msm_adsp_stream_callback_get: ASM Stream PP Event Data Unavailable
[    8.339490] msm_pcm_volume_ctl_get substream or runtime not found
[    8.379857] msm-dwc3 a800000.ssusb: Avail curr from USB = 100
[    8.433521] ueventd: loading /devices/soc/4080000.qcom,mss/firmware/modem.b17 took 153ms
[    8.457421] ipa ipa3_uc_state_check:307 uC is not loaded
[    8.486711] ueventd: firmware: loading 'modem.b18' for '/devices/soc/4080000.qcom,mss/firmware/modem.b18'
[    8.497111] rmnet_ipa completed initialization
[    8.511739] ueventd: loading /devices/soc/4080000.qcom,mss/firmware/modem.b18 took 25ms
[    8.552596] msm_cci_init:1427: hw_version = 0x10060000
[    8.586624] android_work: sent uevent USB_STATE=CONNECTED
[    8.588447] android_work: sent uevent USB_STATE=DISCONNECTED
[    8.626066] ueventd: firmware: loading 'modem.b20' for '/devices/soc/4080000.qcom,mss/firmware/modem.b20'
[    8.628904] ueventd: loading /devices/soc/4080000.qcom,mss/firmware/modem.b20 took 3ms
[    8.631048] ueventd: firmware: loading 'modem.b21' for '/devices/soc/4080000.qcom,mss/firmware/modem.b21'
[    8.638667] ueventd: loading /devices/soc/4080000.qcom,mss/firmware/modem.b21 took 7ms
[    8.640839] ueventd: firmware: loading 'modem.b22' for '/devices/soc/4080000.qcom,mss/firmware/modem.b22'
[    8.678856] SMB138X: smb138x_parallel_get_prop: parallel power supply get prop 4 not supported
[    8.720555] ueventd: loading /devices/soc/4080000.qcom,mss/firmware/modem.b22 took 79ms
[    8.724716] ueventd: firmware: loading 'modem.b23' for '/devices/soc/4080000.qcom,mss/firmware/modem.b23'
[    8.729439] ueventd: loading /devices/soc/4080000.qcom,mss/firmware/modem.b23 took 5ms
[    8.738168] ueventd: firmware: loading 'modem.b24' for '/devices/soc/4080000.qcom,mss/firmware/modem.b24'
[    8.746275] ueventd: loading /devices/soc/4080000.qcom,mss/firmware/modem.b24 took 9ms
[    8.754897] ueventd: firmware: loading 'modem.b25' for '/devices/soc/4080000.qcom,mss/firmware/modem.b25'
[    8.759650] ueventd: loading /devices/soc/4080000.qcom,mss/firmware/modem.b25 took 4ms
[    8.762951] ueventd: firmware: loading 'modem.b26' for '/devices/soc/4080000.qcom,mss/firmware/modem.b26'
[    8.764035] ueventd: loading /devices/soc/4080000.qcom,mss/firmware/modem.b26 took 1ms
[    8.765476] ueventd: firmware: loading 'modem.b27' for '/devices/soc/4080000.qcom,mss/firmware/modem.b27'
[    8.766531] ueventd: loading /devices/soc/4080000.qcom,mss/firmware/modem.b27 took 1ms
[    8.783912] android_work: sent uevent USB_STATE=CONNECTED
[    8.790245] configfs-gadget gadget: high-speed config #1: b
[    8.792946] msm-dwc3 a800000.ssusb: Avail curr from USB = 500
[    8.794426] android_work: sent uevent USB_STATE=CONFIGURED
[    8.894599] msm_qti_pp_get_rms_value_control, back not active to query rms be_idx:3
[    8.899745] msm_voice_sound_focus_get: Error getting Sound Focus Params, err=-22
[    8.899867] msm_voice_source_tracking_get: Error getting Source Tracking Params, err=-22
[    8.899966] msm_audio_get_copp_idx_from_port_id: Invalid FE, exiting
[    8.899971] msm_audio_sound_focus_get: Could not get copp idx for port_id=16385
[    8.900120] msm_audio_get_copp_idx_from_port_id: Invalid FE, exiting
[    8.900125] msm_audio_source_tracking_get: Could not get copp idx for port_id=16385
[    8.900235] msm_voice_sound_focus_get: Error getting Sound Focus Params, err=-22
[    8.900346] msm_voice_source_tracking_get: Error getting Source Tracking Params, err=-22
[    8.900445] msm_audio_get_copp_idx_from_port_id: Invalid FE, exiting
[    8.900448] msm_audio_sound_focus_get: Could not get copp idx for port_id=4101
[    8.900558] msm_audio_get_copp_idx_from_port_id: Invalid FE, exiting
[    8.900562] msm_audio_source_tracking_get: Could not get copp idx for port_id=4101
[    8.900669] msm_voice_sound_focus_get: Error getting Sound Focus Params, err=-22
[    8.900784] msm_voice_source_tracking_get: Error getting Source Tracking Params, err=-22
[    8.900884] msm_audio_get_copp_idx_from_port_id: Invalid FE, exiting
[    8.900888] msm_audio_sound_focus_get: Could not get copp idx for port_id=4149
[    8.900987] msm_audio_get_copp_idx_from_port_id: Invalid FE, exiting
[    8.900991] msm_audio_source_tracking_get: Could not get copp idx for port_id=4149
[    8.905688] core_get_license_status: cmdrsp_license_result.result = 0x15 for module 0x131ff
[    8.906485] msm-ext-disp-audio-codec-rx soc:qcom,msm_ext_disp:qcom,msm-ext-disp-audio-codec-rx: msm_ext_disp_audio_type_get: codec_data, get_audio_edid_blk() or get_intf_id is NULL
[    8.908340] tas2557s 7-004d: tas2557_get_Cali_prm_r0, no calibration data
[    8.908450] tas2557s 7-004d: tas2557_get_Cali_prm_r0, no calibration data
[    8.908554] tas2557s 7-004d: tas2557_calibration_get = 0
[    8.916307] msm_dai_q6_ext_disp_drift_get:  afe port not started. status_mask = 0
[    8.916678] msm_dai_q6_ext_disp_drift_get:  afe port not started. status_mask = 0
[    8.918563] msm_pcm_volume_ctl_get substream runtime not found
[    8.918670] msm_pcm_compress_ctl_get substream runtime not found
[    8.918875] msm_adsp_stream_callback_get: ASM Stream PP Event Data Unavailable
[    8.919621] msm_pcm_volume_ctl_get substream runtime not found
[    8.919829] msm_adsp_stream_callback_get: ASM Stream PP Event Data Unavailable
[    8.920116] msm_pcm_volume_ctl_get substream runtime not found
[    8.920224] msm_pcm_volume_ctl_get substream runtime not found
[    8.921266] msm_compr_audio_effects_config_get: stream or effects inactive
[    8.921477] msm_adsp_stream_callback_get: ASM Stream PP Event Data Unavailable
[    8.922866] msm_pcm_volume_ctl_get substream runtime not found
[    8.922978] msm_pcm_compress_ctl_get substream runtime not found
[    8.923202] msm_adsp_stream_callback_get: ASM Stream PP Event Data Unavailable
[    8.923759] msm_compr_audio_effects_config_get: stream or effects inactive
[    8.923969] msm_adsp_stream_callback_get: ASM Stream PP Event Data Unavailable
[    8.925344] msm_pcm_volume_ctl_get substream runtime not found
[    8.925557] msm_adsp_stream_callback_get: ASM Stream PP Event Data Unavailable
[    8.925775] msm_pcm_volume_ctl_get substream runtime not found
[    8.925886] msm_pcm_volume_ctl_get substream runtime not found
[    8.929189] msm_pcm_volume_ctl_get substream runtime not found
[    8.929411] msm_adsp_stream_callback_get: ASM Stream PP Event Data Unavailable
[    8.929699] msm_compr_audio_effects_config_get: stream or effects inactive
[    8.929911] msm_adsp_stream_callback_get: ASM Stream PP Event Data Unavailable
[    8.931063] msm_compr_audio_effects_config_get: stream or effects inactive
[    8.931278] msm_adsp_stream_callback_get: ASM Stream PP Event Data Unavailable
[    8.932333] msm_compr_audio_effects_config_get: stream or effects inactive
[    8.932551] msm_adsp_stream_callback_get: ASM Stream PP Event Data Unavailable
[    8.933493] msm_compr_audio_effects_config_get: stream or effects inactive
[    8.933702] msm_adsp_stream_callback_get: ASM Stream PP Event Data Unavailable
[    8.934654] msm_compr_audio_effects_config_get: stream or effects inactive
[    8.934873] msm_adsp_stream_callback_get: ASM Stream PP Event Data Unavailable
[    8.935844] msm_compr_audio_effects_config_get: stream or effects inactive
[    8.936057] msm_adsp_stream_callback_get: ASM Stream PP Event Data Unavailable
[    8.936825] msm_pcm_volume_ctl_get substream or runtime not found
[    8.982166] FG: fg_get_battery_temp: batt temperature original:440, tuned:377
[    8.983892] healthd: battery l=80 v=3879 t=37.7 h=2 st=2 c=1060 fc=3229000 cc=291 chg=u
[    8.985963] FG: fg_get_battery_temp: batt temperature original:440, tuned:377
[    8.989150] healthd: battery l=80 v=3879 t=37.7 h=2 st=2 c=1060 fc=3229000 cc=291 chg=u
[    9.060356] pil-q6v5-mss 4080000.qcom,mss: modem: Brought out of reset
[    9.084371] FG: fg_get_battery_temp: batt temperature original:440, tuned:377
[    9.084450] SMB138X: smb138x_parallel_get_prop: parallel power supply get prop 4 not supported
[    9.086177] healthd: battery l=80 v=3879 t=37.7 h=2 st=2 c=1060 fc=3229000 cc=291 chg=u
[    9.087510] wcdcal_hwdep_ioctl_shared: incorrect firmware size 154 for vbat
[    9.087893] FG: fg_get_battery_temp: batt temperature original:440, tuned:377
[    9.089265] healthd: battery l=80 v=3879 t=37.7 h=2 st=2 c=1060 fc=3229000 cc=291 chg=u
[    9.187359] apr_tal_link_state_cb: edge[mpss] link state[0]
[    9.193568] pil-q6v5-mss 4080000.qcom,mss: Subsystem error monitoring/handling services are up
[    9.195829] pil-q6v5-mss 4080000.qcom,mss: modem: Power/Clock ready interrupt received
[    9.197839] IPC_RTR: ipc_router_create_log_ctx: Unable to create IPC logging for [mpss_IPCRTR]
[    9.199253] ipa-wan ipa3_ssr_notifier_cb:2548 ipa3_ssr_notifier_cb:2548 IPA received MPSS AFTER_POWERUP
[    9.199263] ipa-wan ipa3_ssr_notifier_cb:2553 IPA AFTER_POWERUP handling is complete
[    9.333005] sysmon-qmi: sysmon_clnt_svc_arrive: Connection established between QMI handle and modem's SSCTL service
[    9.333077] service-notifier: root_service_service_arrive: Connection established between QMI handle and 180 service
[    9.454131] FG: fg_get_battery_temp: batt temperature original:440, tuned:377
[    9.455653] healthd: battery l=80 v=3879 t=37.7 h=2 st=2 c=1060 fc=3229000 cc=291 chg=u
[    9.456048] FG: fg_get_battery_temp: batt temperature original:440, tuned:377
[    9.457640] healthd: battery l=80 v=3879 t=37.7 h=2 st=2 c=1060 fc=3229000 cc=291 chg=u
[    9.519561] diag: In diag_send_feature_mask_update, control channel is not open, p: 0, 0000000000000000
[    9.604174] ipa-wan ipa3_handle_indication_req:152 not send indication
[    9.606775] Sending QMI_IPA_INIT_MODEM_DRIVER_REQ_V01
[    9.623332] ipa ipa3_uc_state_check:307 uC is not loaded
[    9.628977] QMI_IPA_INIT_MODEM_DRIVER_REQ_V01 response received
[    9.630781] IPC_RTR: process_new_server_msg: Server 00001002 create rejected, version = 0
[    9.647469] ipa ipa3_uc_state_check:307 uC is not loaded
[    9.671160] ipa ipa3_uc_state_check:307 uC is not loaded
[    9.697898] ipa ipa3_uc_wdi_event_log_info_handler:370 WDI stats ofst=0x47130
[    9.697914] ipa ipa3_uc_ntn_event_log_info_handler:39 NTN feature missing 0x9
[    9.698091] ipa ipa3_uc_state_check:307 uC is not loaded
[    9.709563] ipa ipa3_uc_state_check:307 uC is not loaded
[    9.967484] msm_cci_init:1427: hw_version = 0x10060000
[   10.007513] IPC_RTR: msm_ipc_router_send_to: permission failure for cnss-daemon
[   10.007525] IPC_RTR: msm_ipc_router_sendmsg: Send_to failure -1
[   10.067726] init: processing action (vendor.ims.DATA_DAEMON_STATUS=1) from (/vendor/etc/init/hw/init.taimen.rc:738)
[   10.068687] init: starting service 'vendor.ims_rtp_daemon'...
[   10.073772] init: starting service 'vendor.imsrcsservice'...
[   10.366901] MSM-CPP cpp_init_hardware:1138 CPP HW Version: 0x60010000
[   10.366931] MSM-CPP cpp_init_hardware:1156 stream_cnt:0
[   10.384500] service-notifier: root_service_service_ind_cb: Indication received from msm/modem/wlan_pd, state: 0x1fffffff, trans-id: 1
[   10.412802] service-notifier: send_ind_ack: Indication ACKed for transid 1, service msm/modem/wlan_pd, instance 180!
[   10.412825] icnss: QMI Server Connected: state: 0x981
[   10.430789] MSM-CPP cpp_release_hardware:1219 cpp hw release done
[   10.518797] IPC_RTR: process_new_server_msg: Server 00001003 create rejected, version = 0
[   10.616238] ip_local_port_range: prefer different parity for start/end values.
[   10.910386] misc mnh_sm: mnh_sm_set_state: request state 0
[   10.913173] misc mnh_sm: mnh_sm_set_state: finished state 0
[   10.929572] misc mnh_sm: mnh_sm_set_state: request state 0
[   10.929593] misc mnh_sm: mnh_sm_set_state: finished state 0
[   10.937747] msm_pm_qos_update_request: update request 100
[   10.942761] msm_pm_qos_update_request: update request -1
[   10.942791] ipa ipa3_assign_policy:3328 get close-by 8192
[   10.942797] ipa ipa3_assign_policy:3334 set rx_buff_sz 7808
[   10.942801] ipa ipa3_assign_policy:3356 set aggr_limit 6
[   10.949757] msm_csid_init: CSID_VERSION = 0x30050000
[   10.952790] msm_csid_irq CSID0_IRQ_STATUS_ADDR = 0x800
[   10.978907] ipa-wan ipa3_wwan_ioctl:1699 dev(rmnet_data0) register to IPA
[   11.001188] MSM-CPP cpp_init_hardware:1138 CPP HW Version: 0x60010000
[   11.001213] MSM-CPP cpp_init_hardware:1156 stream_cnt:0
[   11.002393] init: Received control message 'interface_start' for 'android.frameworks.sensorservice@1.0::ISensorManager/default' from pid: 613 (/system/bin/hwservicemanager)
[   11.002425] init: Could not find service hosting interface android.frameworks.sensorservice@1.0::ISensorManager/default
[   11.002731] init: Received control message 'interface_start' for 'android.frameworks.sensorservice@1.0::ISensorManager/default' from pid: 613 (/system/bin/hwservicemanager)
[   11.002752] init: Could not find service hosting interface android.frameworks.sensorservice@1.0::ISensorManager/default
[   11.031047] msm_cci_init:1427: hw_version = 0x10060000
[   11.039684] msm_sensor_power_up: [05-15 17:11:43.559] imx362
[   11.108040] msm_stopGyroThread:251 [OISDBG] msm_stopGyroThread:E
[   11.108056] msm_stopGyroThread:261 [OISDBG] invalid timer state = 0
[   11.108061] msm_stopGyroThread:262 [OISDBG] msm_stopGyroThread:X
[   11.120242] MSM-CPP cpp_release_hardware:1219 cpp hw release done
[   11.128778] [OISFW]:msm_sensor_checkfw 
[   11.128794] [OISFW]:checkFWUpdate 1. sid = 26
[   11.236651] [OISFW]:checkHWFWversion
[   11.236823] [OISFW]:checkHWFWversion 0x8000 =  0x090e0819.
[   11.236828] [OISFW]:checkHWFWversion FW_version =  0x19.
[   11.236832] [OISFW]checkHWFWversion: No need to update.
[   11.236997] [OISFW]:checkFWUpdate 0x8000 =  0x090e0819
[   11.237007] [OISFW]:checkFWUpdate 2. sid = 26
[   11.237015] [OISFW]:checkFWUpdate rc = 0
[   11.237150] [VCMFW]: msm_sensor_checkvcmfw:E sid = 26
[   11.237157] [VCMFW]: msm_sensor_checkvcmfw:E addr_type = 2
[   11.237163] [VCMFW]:checkVCMFWUpdate :E
[   11.237169] [VCMFW]:checkVCMFWUpdate addr_type = 1
[   11.237175] [VCMFW]:checkVCMFWUpdate sid = 114
[   11.250125] [VCMFW]:checkVCMFWUpdate UlReadVal =  0x60710600
[   11.250613] [VCMFW]checkVCMFWUpdate: No need to update AF FW
[   11.250621] [VCMFW]:checkVCMFWUpdate :X
[   11.250625] [VCMFW]: msm_sensor_checkvcmfw:X sid = 26
[   11.250629] [VCMFW]: msm_sensor_checkvcmfw:X addr_type = 2
[   11.250722] msm_sensor_power_down: [05-15 17:11:43.779] imx362
[   11.278098] msm_pm_qos_update_request: update request 100
[   11.278126] msm_pm_qos_update_request: update request -1
[   11.302734] msm_csid_init: CSID_VERSION = 0x30050000
[   11.303491] msm_csid_irq CSID2_IRQ_STATUS_ADDR = 0x800
[   11.338415] MSM-CPP cpp_init_hardware:1138 CPP HW Version: 0x60010000
[   11.338453] MSM-CPP cpp_init_hardware:1156 stream_cnt:0
[   11.352924] msm_cci_init:1427: hw_version = 0x10060000
[   11.353273] msm_sensor_power_up: [05-15 17:11:43.879] imx179
[   11.371511] msm_sensor_power_down: [05-15 17:11:43.899] imx179
[   11.383847] MSM-CPP cpp_release_hardware:1219 cpp hw release done
[   11.393137] ipa-wan ipa3_wwan_ioctl:1699 dev(rmnet_data1) register to IPA
[   11.695732] ipa-wan ipa3_wwan_ioctl:1699 dev(rmnet_data2) register to IPA
[   11.788497] FG: fg_get_battery_temp: batt temperature original:440, tuned:377
[   11.792756] healthd: battery l=79 v=3913 t=37.7 h=2 st=2 c=864 fc=3229000 cc=291 chg=u
[   11.801108] FG: fg_get_battery_temp: batt temperature original:440, tuned:377
[   11.804102] healthd: battery l=79 v=3913 t=37.7 h=2 st=2 c=864 fc=3229000 cc=291 chg=u
[   11.832693] FG: fg_get_battery_temp: batt temperature original:440, tuned:378
[   11.834319] healthd: battery l=79 v=3913 t=37.8 h=2 st=2 c=864 fc=3229000 cc=291 chg=u
[   11.922192] init: processing action (sys.uidcpupower=*) from (/vendor/etc/init/hw/init.taimen.rc:805)
[   12.010693] init: processing action (sys.sysctl.extra_free_kbytes=*) from (/init.rc:725)
[   12.029141] ipa-wan ipa3_wwan_ioctl:1699 dev(rmnet_data3) register to IPA
[   12.359993] ipa-wan ipa3_wwan_ioctl:1699 dev(rmnet_data4) register to IPA
[   12.637283] ipa-wan ipa3_wwan_ioctl:1699 dev(rmnet_data5) register to IPA
[   12.698319] read descriptors
[   12.698340] read strings
[   12.699374] read descriptors
[   12.699390] read strings
[   12.715173] acc_open
[   12.715193] acc_release
[   12.823433] ipa-wan ipa3_wwan_ioctl:1699 dev(rmnet_data6) register to IPA
[   12.887236] nq-nci 8-0028: setting ese_gpio high
[   12.889543] ipa-wan ipa3_wwan_ioctl:1699 dev(rmnet_data7) register to IPA
[   13.527786] NOHZ: local_softirq_pending 08
[   13.530232] HTB: quantum of class 10001 is big. Consider r2q change.
[   13.574044] HTB: quantum of class 10010 is big. Consider r2q change.
[   13.579338] sdcardfs version 2.0
[   13.579357] sdcardfs: dev_name -> /data/media
[   13.579363] sdcardfs: options -> fsuid=1023,fsgid=1023,multiuser,derive_gid,default_normal,mask=6,userid=0,gid=1015
[   13.579370] sdcardfs: mnt -> 0000000000000000
[   13.585350] sdcardfs: mounted on top of /data/media type ext4
[   13.588665] Remount options were mask=23,gid=9997 for vfsmnt 0000000000000000.
[   13.588691] sdcardfs : options - debug:1
[   13.588697] sdcardfs : options - gid:9997
[   13.588702] sdcardfs : options - mask:23
[   13.590547] Remount options were mask=7,gid=9997 for vfsmnt 0000000000000000.
[   13.590564] sdcardfs : options - debug:1
[   13.590571] sdcardfs : options - gid:9997
[   13.590576] sdcardfs : options - mask:7
[   13.604285] SMB138X: smb138x_parallel_get_prop: parallel power supply get prop 4 not supported
[   13.774210] qpnp_led.red: b:00 on:50 off:0
[   13.774441] qpnp_led.green: b:00 on:50 off:0
[   13.774483] qpnp_led.blue: b:00 on:50 off:0
[   13.774665] qpnp_led.red: b:00 on:50 off:0
[   13.774707] qpnp_led.green: b:00 on:50 off:0
[   13.774746] qpnp_led.blue: b:00 on:50 off:0
[   13.774888] qpnp_led.red: b:00 on:50 off:0
[   13.774930] qpnp_led.green: b:00 on:50 off:0
[   13.774969] qpnp_led.blue: b:00 on:50 off:0
[   13.985185] msm_vidc: info: Opening video instance: 0000000000000000, 1
[   13.988260] SMB138X: smb138x_parallel_get_prop: parallel power supply get prop 4 not supported
[   13.993380] FG: fg_get_battery_temp: batt temperature original:440, tuned:378
[   13.994745] healthd: battery l=79 v=3946 t=37.8 h=2 st=2 c=684 fc=3229000 cc=291 chg=u
[   13.994857] FG: fg_get_battery_temp: batt temperature original:440, tuned:379
[   13.996885] healthd: battery l=79 v=3946 t=37.9 h=2 st=2 c=684 fc=3229000 cc=291 chg=u
[   14.046064] NOHZ: local_softirq_pending 08
[   14.046467] NOHZ: local_softirq_pending 08
[   14.058276] subsys-restart: __subsystem_get(): Changing subsys fw_name to venus
[   14.074621] subsys-pil-tz cce0000.qcom,venus: venus: loading from 0x0000000000000000 to 0x0000000000000000
[   14.176554] subsys-pil-tz cce0000.qcom,venus: venus: Brought out of reset
[   14.213741] icnss: WLAN FW is ready: 0xd87
[   14.226676] wlan: probing driver v5.2.1.1K
[   14.227805] msm_vidc:  err: Failed to create debugfs for msm_vidc
[   14.231092] ipa ipa3_uc_reg_rdyCB:1719 bad parm. inout=0000000000000000 
[   14.328673] ipa ipa3_uc_reg_rdyCB:1719 bad parm. inout=0000000000000000 
[   14.337155] ipa ipa3_uc_reg_rdyCB:1719 bad parm. inout=0000000000000000 
[   14.341166] msm_vidc: info: Closed video instance: 0000000000000000
[   14.345637] msm_vidc: info: Opening video instance: 0000000000000000, 1
[   14.345832] msm_vidc:  err: Failed to create debugfs for msm_vidc
[   14.355517] FG: fg_get_battery_temp: batt temperature original:440, tuned:379
[   14.357716] FG: fg_get_battery_temp: batt temperature original:440, tuned:379
[   14.369963] healthd: battery l=79 v=3934 t=37.9 h=2 st=2 c=763 fc=3229000 cc=291 chg=u
[   14.371809] healthd: battery l=79 v=3934 t=37.9 h=2 st=2 c=763 fc=3229000 cc=291 chg=u
[   14.372846] msm_vidc: info: Closed video instance: 0000000000000000
[   14.391365] IPC_RTR: process_new_server_msg: Server 00001003 create rejected, version = 0
[   14.407467] msm_vidc: info: Opening video instance: 0000000000000000, 1
[   14.407671] msm_vidc:  err: Failed to create debugfs for msm_vidc
[   14.420776] msm_vidc: info: Closed video instance: 0000000000000000
[   14.432074] msm_vidc: info: Opening video instance: 0000000000000000, 1
[   14.432268] msm_vidc:  err: Failed to create debugfs for msm_vidc
[   14.451766] msm_vidc: info: Closed video instance: 0000000000000000
[   14.457285] msm_vidc: info: Opening video instance: 0000000000000000, 1
[   14.457490] msm_vidc:  err: Failed to create debugfs for msm_vidc
[   14.495661] msm_vidc: info: Closed video instance: 0000000000000000
[   14.529406] msm_vidc: info: Opening video instance: 0000000000000000, 1
[   14.529844] msm_vidc:  err: Failed to create debugfs for msm_vidc
[   14.575461] msm_vidc: info: Closed video instance: 0000000000000000
[   14.596488] msm_vidc: info: Opening video instance: 0000000000000000, 1
[   14.596703] msm_vidc:  err: Failed to create debugfs for msm_vidc
[   14.623473] msm_vidc: info: Closed video instance: 0000000000000000
[   14.652615] msm_vidc: info: Opening video instance: 0000000000000000, 1
[   14.652813] msm_vidc:  err: Failed to create debugfs for msm_vidc
[   14.677832] msm_vidc: info: Closed video instance: 0000000000000000
[   14.697715] msm_vidc: info: Opening video instance: 0000000000000000, 1
[   14.697914] msm_vidc:  err: Failed to create debugfs for msm_vidc
[   14.738424] msm_vidc: info: Closed video instance: 0000000000000000
[   14.753634] msm_vidc: info: Opening video instance: 0000000000000000, 0
[   14.754110] msm_vidc:  err: Failed to create debugfs for msm_vidc
[   14.997275] Host SW:5.2.1.1K, FW:2.2.1.738.0, HW:WCN3990_V2.1
[   15.033031] cfg80211: Regulatory domain changed to country: US
[   15.033042] cfg80211:  DFS Master region: FCC
[   15.033047] cfg80211:   (start_freq - end_freq @ bandwidth), (max_antenna_gain, max_eirp), (dfs_cac_time)
[   15.033053] cfg80211:   (2402000 KHz - 2472000 KHz @ 40000 KHz), (N/A, 3000 mBm), (N/A)
[   15.033058] cfg80211:   (5170000 KHz - 5250000 KHz @ 80000 KHz, 160000 KHz AUTO), (N/A, 3000 mBm), (N/A)
[   15.033063] cfg80211:   (5250000 KHz - 5330000 KHz @ 80000 KHz, 160000 KHz AUTO), (N/A, 2400 mBm), (0 s)
[   15.033067] cfg80211:   (5490000 KHz - 5730000 KHz @ 160000 KHz), (N/A, 2400 mBm), (0 s)
[   15.033070] cfg80211:   (5735000 KHz - 5835000 KHz @ 80000 KHz), (N/A, 3000 mBm), (N/A)
[   15.033074] cfg80211:   (57240000 KHz - 70200000 KHz @ 2160000 KHz), (N/A, 4000 mBm), (N/A)
[   15.123059] msm_vidc: info: Closed video instance: 0000000000000000
[   15.146681] msm_vidc: info: Opening video instance: 0000000000000000, 0
[   15.147508] msm_vidc:  err: Failed to create debugfs for msm_vidc
[   15.158782] msm_vidc: info: Closed video instance: 0000000000000000
[   15.173095] msm_vidc: info: Opening video instance: 0000000000000000, 0
[   15.173679] msm_vidc:  err: Failed to create debugfs for msm_vidc
[   15.188327] msm_vidc: info: Closed video instance: 0000000000000000
[   15.193282] msm_vidc: info: Opening video instance: 0000000000000000, 0
[   15.193736] msm_vidc:  err: Failed to create debugfs for msm_vidc
[   15.209831] msm_vidc: info: Closed video instance: 0000000000000000
[   15.221971] msm_vidc: info: Opening video instance: 0000000000000000, 0
[   15.222412] msm_vidc:  err: Failed to create debugfs for msm_vidc
[   15.231796] msm_vidc: info: Closed video instance: 0000000000000000
[   16.338597] init: Service 'bootanim' (pid 626) exited with status 0
[   16.421700] init: processing action (sys.boot_completed=1) from (/init.rc:719)
[   16.421743] init: processing action (sys.boot_completed=1) from (/vendor/etc/init/hw/init.taimen.rc:573)
[   16.423079] init: starting service 'vendor.power_sh'...
[   16.425055] init: Command 'write /sys/devices/soc/${ro.boot.bootdevice}/clkscale_enable 1' action=sys.boot_completed=1 (/vendor/etc/init/hw/init.taimen.rc:578) took 0ms and failed: Unable to write to file '/sys/devices/soc/1da4000.ufshc/clkscale_enable': open() failed: Permission denied
[   16.457650] init: processing action (sys.boot_completed=1 && sys.logbootcomplete=1) from (/system/etc/init/bootstat.rc:70)
[   16.458209] init: starting service 'exec 7 (/system/bin/bootstat --set_system_boot_reason --record_boot_complete --record_boot_reason --record_time_since_factory_reset -l)'...
[   16.459479] init: processing action (sys.boot_completed=1 && sys.wifitracing.started=0) from (/system/etc/init/wifi-events.rc:20)
[   16.459928] selinux: SELinux: Could not get canonical path for /sys/kernel/debug/tracing/instances/wifi restorecon: No such file or directory.\x0a
[   16.461842] init: processing action (sys.boot_completed=1) from (/vendor/etc/init/init-taimen.rc:16)
[   16.464308] init: processing action (persist.sys.zram_enabled=1 && sys-boot-completed-set) from (/vendor/etc/init/hw/init.taimen.rc:625)
[   16.476497] possible reason: unannotated irqs-off.
[   16.476516] ------------[ cut here ]------------
[   16.476521] WARNING: at ../fs/kernfs/dir.c:29
[   16.476525] 
[   16.476530] CPU: 7 PID: 1 Comm: init Tainted: G        W       4.4.169-Sultan #15
[   16.476534] Hardware name: Qualcomm Technologies, Inc. MSM8998 v2.1 (DT)
[   16.476539] task: 0000000000000000 task.stack: 0000000000000000
[   16.476549] PC is at kernfs_active+0x48/0x68
[   16.476553] LR is at kernfs_active+0x44/0x68
[   16.476556] pc : [<ffffff9f2ea40c38>] lr : [<ffffff9f2ea40c34>] pstate: 60400145
[   16.476560] sp : ffffffe155b6f9c0
[   16.476563] x29: ffffffe155b6f9c0 x28: 0000000000000000 
[   16.476571] x27: 0000000000000000 x26: ffffff9f2fbf1c58 
[   16.476578] x25: ffffff9f30a2a000 x24: ffffff9f30a2ae98 
[   16.476585] x23: ffffffe1600b4080 x22: ffffffe1b0db3bd0 
[   16.476592] x21: ffffffe1b0db3bd0 x20: ffffffe143c5b150 
[   16.476599] x19: ffffffe1b0db3bd0 x18: 0000000000000006 
[   16.476605] x17: 000000000000003f x16: 0000000000000000 
[   16.476612] x15: ffffff9f30a05d90 x14: 0000000000000006 
[   16.476619] x13: 000000000000065a x12: 0000000000000655 
[   16.476626] x11: ffffff9f30c0cb00 x10: ffffff9f30a2af18 
[   16.476633] x9 : ffffff9f30c0c000 x8 : ffffff9f31346e88 
[   16.476639] x7 : 00000042835a6000 x6 : 0000000000001ac2 
[   16.476648] x5 : 0000000000000040 x4 : 00000042835a6000 
[   16.476657] x3 : 0000000000000004 x2 : 0000000000000005 
[   16.476664] x1 : ffffff9f30a31cd8 x0 : 0000000000000000 
[   16.476673] \x0aPC: 0xffffff9f2ea40bf8:
[   16.476677] 0bf8  910003fd f9000bf3 aa0003f3 b9442020 350000e0 b9400660 f9400bf3 2a2003e0
[   16.476702] 0c18  a8c27bfd 531f7c00 d65f03c0 b000ff80 9131a000 9101c000 97fafe98 35fffec0
[   16.476724] 0c38  d4210000 b9400660 f9400bf3 2a2003e0 a8c27bfd 531f7c00 d65f03c0 d503201f
[   16.476751] 0c58  a9bc7bfd aa0103e4 52801801 910003fd 72a04801 a90153f3 a9025bf5 aa0003f6
[   16.476774] \x0aLR: 0xffffff9f2ea40bf4:
[   16.476777] 0bf4  f0010001 910003fd f9000bf3 aa0003f3 b9442020 350000e0 b9400660 f9400bf3
[   16.476802] 0c14  2a2003e0 a8c27bfd 531f7c00 d65f03c0 b000ff80 9131a000 9101c000 97fafe98
[   16.476824] 0c34  35fffec0 d4210000 b9400660 f9400bf3 2a2003e0 a8c27bfd 531f7c00 d65f03c0
[   16.476845] 0c54  d503201f a9bc7bfd aa0103e4 52801801 910003fd 72a04801 a90153f3 a9025bf5
[   16.476868] \x0aSP: 0xffffffe155b6f980:
[   16.476871] f980  2ea40c34 ffffff9f 55b6f9c0 ffffffe1 2ea40c38 ffffff9f 60400145 00000000
[   16.476892] f9a0  b0db3bd0 ffffffe1 43c5b150 ffffffe1 00000000 00000080 2ea42204 ffffff9f
[   16.476914] f9c0  55b6f9e0 ffffffe1 2ea42404 ffffff9f 30a31c68 ffffff9f 7a81e143 403a4a6d
[   16.476935] f9e0  55b6fa10 ffffffe1 2ea42504 ffffff9f 43c5b150 ffffffe1 600b4080 ffffffe1
[   16.476957] 
[   16.476961] ---[ end trace 24ec051baadc3c87 ]---
[   16.476964] Call trace:
[   16.476969] Exception stack(0xffffffe155b6f7d0 to 0xffffffe155b6f900)
[   16.476973] f7c0:                                   ffffffe1b0db3bd0 0000008000000000
[   16.476978] f7e0: 00000000834df000 ffffff9f2ea40c38 0000000060400145 0000000000000000
[   16.476982] f800: 0000000000000001 0000000000000000 ffffff9f2ea423ac 0000000000000140
[   16.476987] f820: ffffffe155b88000 ffffff9f30a31cd8 ffffffe155b6f870 0000000000000000
[   16.476991] f840: ffffff9f31344ba8 ffffff9f3196eb28 ffffffe100000005 ffffff9f2e88a160
[   16.476995] f860: ffffffe155b6f8b0 ffffff9f2e8ff8c4 0000000000000005 0000000000000007
[   16.477000] f880: ffffffe155b88000 403a4a6d7a81e143 0000000000000000 ffffff9f30a31cd8
[   16.477004] f8a0: 0000000000000005 0000000000000004 00000042835a6000 0000000000000040
[   16.477009] f8c0: 0000000000001ac2 00000042835a6000 ffffff9f31346e88 ffffff9f30c0c000
[   16.477013] f8e0: ffffff9f30a2af18 ffffff9f30c0cb00 0000000000000655 000000000000065a
[   16.477018] [<ffffff9f2ea40c38>] kernfs_active+0x48/0x68
[   16.477023] [<ffffff9f2ea42404>] kernfs_add_one+0x84/0x138
[   16.477027] [<ffffff9f2ea42504>] kernfs_create_dir_ns+0x4c/0x88
[   16.477033] [<ffffff9f2ea44b70>] sysfs_create_dir_ns+0x40/0xb0
[   16.477038] [<ffffff9f2eb59048>] kobject_add_internal+0x90/0x290
[   16.477043] [<ffffff9f2eb592d4>] kobject_init_and_add+0x8c/0xd0
[   16.477047] [<ffffff9f2e9b1548>] sysfs_slab_add+0x1a0/0x220
[   16.477051] [<ffffff9f2e9b1c4c>] __kmem_cache_create+0x16c/0x2e0
[   16.477056] [<ffffff9f2e98d9c0>] kmem_cache_create+0x150/0x240
[   16.477060] [<ffffff9f2e9b7004>] zs_create_pool+0xcc/0x340
[   16.477066] [<ffffff9f2edb3f98>] disksize_store+0x70/0x1b0
[   16.477072] [<ffffff9f2ed8b590>] dev_attr_store+0x18/0x28
[   16.477075] [<ffffff9f2ea441e8>] sysfs_kf_write+0x48/0x70
[   16.477079] [<ffffff9f2ea43404>] kernfs_fop_write+0xcc/0x1b0
[   16.477084] [<ffffff9f2e9bb5b8>] __vfs_write+0x30/0x100
[   16.477088] [<ffffff9f2e9bb898>] vfs_write+0x128/0x1b0
[   16.477092] [<ffffff9f2e9bba3c>] SyS_write+0x5c/0xc0
[   16.477098] [<ffffff9f2e8832b0>] el0_svc_naked+0x24/0x28
[   16.489505] NOHZ: local_softirq_pending 08
[   16.493182] zram0: detected capacity change from 0 to 536870912
[   16.545457] logd: logdr: UID=1000 GID=1007 PID=2248 n tail=0 logMask=80 pid=0 start=0ns timeout=0ns
[   16.562679] mkswap: Swapspace size: 524284k, UUID=114efeb2-3cec-41c2-adef-d1f4d292c454
[   16.576440] Adding 524284k swap on /dev/block/zram0.  Priority:-1 extents:1 across:524284k SS
[   16.577733] init: Command 'swapon_all /vendor/etc/fstab.${ro.hardware}' action=persist.sys.zram_enabled=1 && sys-boot-completed-set (/vendor/etc/init/hw/init.taimen.rc:626) took 113ms and succeeded
[   16.579704] init: processing action (sys.post_boot.parsed=1) from (/vendor/etc/init/hw/init.taimen.rc:567)
[   16.603678] init: processing action (ro.boot.bootreason=*) from (/system/etc/init/bootstat.rc:4)
[   16.608791] init: Service 'exec 7 (/system/bin/bootstat --set_system_boot_reason --record_boot_complete --record_boot_reason --record_time_since_factory_reset -l)' (pid 2248) exited with status 0
[   16.624798] init: Service 'vendor.power_sh' (pid 2245) exited with status 0
[   18.536422] SMB138X: smb138x_parallel_get_prop: parallel power supply get prop 4 not supported
[   18.914957] SMB138X: smb138x_parallel_get_prop: parallel power supply get prop 4 not supported
[   18.918365] FG: fg_get_battery_temp: batt temperature original:430, tuned:375
[   18.918871] FG: fg_get_battery_temp: batt temperature original:430, tuned:375
[   18.922283] healthd: battery l=79 v=4093 t=37.5 h=2 st=2 c=5 fc=3229000 cc=291 chg=u
[   18.922462] healthd: battery l=79 v=4093 t=37.5 h=2 st=2 c=5 fc=3229000 cc=291 chg=u
[   19.295247] FG: fg_get_battery_temp: batt temperature original:430, tuned:375
[   19.295457] FG: fg_get_battery_temp: batt temperature original:430, tuned:375
[   19.299542] healthd: battery l=79 v=4093 t=37.5 h=2 st=2 c=5 fc=3229000 cc=291 chg=u
[   19.299846] healthd: battery l=79 v=4093 t=37.5 h=2 st=2 c=5 fc=3229000 cc=291 chg=u
[   19.484390] ------------[ cut here ]------------
[   19.484419] WARNING: at ../kernel/sched/sched.h:1394
[   19.484425] 
[   19.484434] CPU: 0 PID: 0 Comm: swapper/0 Tainted: G        W       4.4.169-Sultan #15
[   19.484440] Hardware name: Qualcomm Technologies, Inc. MSM8998 v2.1 (DT)
[   19.484447] task: 0000000000000000 task.stack: 0000000000000000
[   19.484462] PC is at select_energy_cpu_brute+0x7d0/0x928
[   19.484468] LR is at select_energy_cpu_brute+0x408/0x928
[   19.484475] pc : [<ffffff9f2e8e82c0>] lr : [<ffffff9f2e8e7ef8>] pstate: 404001c5
[   19.484479] sp : ffffff9f30a03700
[   19.484485] x29: ffffff9f30a03700 x28: 0000000000000002 
[   19.484497] x27: 0000000000000002 x26: ffffff9f30a05e8c 
[   19.484508] x25: ffffff9f30a06158 x24: ffffff9f2f808000 
[   19.484519] x23: ffffffe1b309a800 x22: ffffff9f30a05f34 
[   19.484530] x21: 000000000000004a x20: 0000000000000045 
[   19.484540] x19: ffffffe1b1489880 x18: 0000000000000000 
[   19.484551] x17: 0000000000000000 x16: 0000000000000000 
[   19.484561] x15: 0000000000000000 x14: 000000000000000a 
[   19.484572] x13: 0000000000002e9f x12: ffffff9f30a05e88 
[   19.484583] x11: 0000000000000000 x10: 0000000000000005 
[   19.484593] x9 : 000000000000001a x8 : 0000000000000000 
[   19.484604] x7 : ffffff9f2e8fa32c x6 : 000000001edff655 
[   19.484615] x5 : 000000000000000c x4 : 00000042828e1000 
[   19.484626] x3 : 0000000000000009 x2 : ffffff9f3024a000 
[   19.484636] x1 : 0000000000000000 x0 : 0000000000000000 
[   19.484647] \x0aPC: 0xffffff9f2e8e8280:
[   19.484653] 8280  912c6000 52806be1 39000e83 940070a1 17fffe40 9400c1e7 35ffcac0 52800023
[   19.484687] 82a0  d00097c2 b00097e0 91154042 911b4000 52831f61 39001a83 94007096 17fffe4d
[   19.484718] 82c0  d4210000 17ffff0e b94137e0 340003f5 90011921 39692021 34000881 f0011902
[   19.484749] 82e0  91160042 7100001f 5a80a401 a9459045 4b050084 6b01009f 5400074d 7100001f
[   19.484782] \x0aLR: 0xffffff9f2e8e7eb8:
[   19.484788] 7eb8  b9400042 d376d421 9ac20821 17ffffa8 f9404fe2 b94097e1 eb02001f 1a9b2021
[   19.484818] 7ed8  9a802040 b90097e1 f9004fe0 17ffff87 b940afe0 7100001f 54fff08c 9400c2cf
[   19.484849] 7ef8  34001e40 b940c3e2 b94b12e0 37f805e2 f9403fe1 b9403021 7100003f b940cbe1
[   19.484880] 7f18  7a411000 1a81b000 b900cbe0 1a82b360 b900c3e0 17ffff75 eb0102bf 9a959020
[   19.484911] \x0aSP: 0xffffff9f30a036c0:
[   19.484917] 36c0  2e8e7ef8 ffffff9f 30a03700 ffffff9f 2e8e82c0 ffffff9f 404001c5 00000000
[   19.484947] 36e0  30a03700 ffffff9f 2e8e7db8 ffffff9f ffffffff ffffffff 2e8e7ef8 ffffff9f
[   19.484978] 3700  30a038a0 ffffff9f 2e8e9184 ffffff9f 00000000 00000000 00000000 00000000
[   19.485009] 3720  00000000 00000000 0000004f 00000000 00000004 00000000 30a138d0 ffffff9f
[   19.485040] 
[   19.485046] ---[ end trace 24ec051baadc3c88 ]---
[   19.485052] Call trace:
[   19.485061] Exception stack(0xffffff9f30a03510 to 0xffffff9f30a03640)
[   19.485068] 3500:                                   ffffffe1b1489880 0000008000000000
[   19.485077] 3520: 0000000082408000 ffffff9f2e8e82c0 00000000404001c5 ffffff9f2e901f78
[   19.485086] 3540: ffffff9f30a14288 ffffff9f30a43000 ffffff9f30a05d90 ffffff9f30a14090
[   19.485095] 3560: ffffff9f31925000 ffffff9f2e901dbc ffffff9f30a03670 ffffff9f2e904024
[   19.485103] 3580: ffffff9f30a234c8 0000000000000000 0000000000000000 0000000000000002
[   19.485112] 35a0: 0000000000000000 0000000000000000 ffffff9f2e8fa32c 00000000000001c0
[   19.485120] 35c0: ffffff9f30a138d0 403a4a6d7a81e143 0000000000000000 0000000000000000
[   19.485128] 35e0: ffffff9f3024a000 0000000000000009 00000042828e1000 000000000000000c
[   19.485137] 3600: 000000001edff655 ffffff9f2e8fa32c 0000000000000000 000000000000001a
[   19.485145] 3620: 0000000000000005 0000000000000000 ffffff9f30a05e88 0000000000002e9f
[   19.485155] [<ffffff9f2e8e82c0>] select_energy_cpu_brute+0x7d0/0x928
[   19.485162] [<ffffff9f2e8e9184>] select_task_rq_fair+0xd6c/0xe28
[   19.485171] [<ffffff9f2e8d4760>] try_to_wake_up+0x180/0x3c8
[   19.485177] [<ffffff9f2e8d540c>] wake_up_process+0x14/0x20
[   19.485186] [<ffffff9f2e8cb3ac>] insert_kthread_work+0x8c/0xa0
[   19.485192] [<ffffff9f2e8cbe48>] queue_kthread_work+0x98/0xc0
[   19.485203] [<ffffff9f2ec9a27c>] queue_cmd+0x84/0xe8
[   19.485211] [<ffffff9f2ec9cf04>] tx_data+0x41c/0x4d8
[   19.485218] [<ffffff9f2ec9cff8>] tx+0x18/0x20
[   19.485227] [<ffffff9f2ec8d618>] glink_tx_common+0x798/0xb30
[   19.485235] [<ffffff9f2ec8f1e4>] glink_tx+0x24/0x30
[   19.485242] [<ffffff9f2eca72e4>] msm_rpm_glink_send_buffer+0x6c/0xf0
[   19.485250] [<ffffff9f2ecaa0cc>] msm_rpm_enter_sleep+0x7c/0x1c8
[   19.485260] [<ffffff9f2f108694>] cluster_prepare+0x414/0xb00
[   19.485267] [<ffffff9f2f1088cc>] cluster_prepare+0x64c/0xb00
[   19.485275] [<ffffff9f2f1096ac>] lpm_cpuidle_enter+0xd4/0x360
[   19.485283] [<ffffff9f2f105094>] cpuidle_enter_state+0x12c/0x258
[   19.485291] [<ffffff9f2f1051f8>] cpuidle_enter+0x18/0x20
[   19.485301] [<ffffff9f2e8f4c78>] call_cpuidle+0x20/0x58
[   19.485308] [<ffffff9f2e8f4de8>] cpu_startup_entry+0x138/0x278
[   19.485319] [<ffffff9f2f79b20c>] rest_init+0x148/0x158
[   19.485329] [<ffffff9f30200bd8>] start_kernel+0x3d8/0x400
[   19.485337] [<ffffff9f302001c4>] __primary_switched+0x30/0x8c
[   19.485350] ------------[ cut here ]------------
[   19.485356] WARNING: at ../kernel/sched/sched.h:1394
[   19.485361] 
[   19.485367] CPU: 0 PID: 0 Comm: swapper/0 Tainted: G        W       4.4.169-Sultan #15
[   19.485372] Hardware name: Qualcomm Technologies, Inc. MSM8998 v2.1 (DT)
[   19.485378] task: 0000000000000000 task.stack: 0000000000000000
[   19.485385] PC is at select_energy_cpu_brute+0x7d0/0x928
[   19.485390] LR is at select_energy_cpu_brute+0x408/0x928
[   19.485396] pc : [<ffffff9f2e8e82c0>] lr : [<ffffff9f2e8e7ef8>] pstate: 404001c5
[   19.485401] sp : ffffff9f30a03700
[   19.485406] x29: ffffff9f30a03700 x28: 0000000000000003 
[   19.485417] x27: 0000000000000003 x26: ffffff9f30a05e8c 
[   19.485428] x25: ffffff9f30a06158 x24: ffffff9f2f808000 
[   19.485438] x23: ffffffe1b326d800 x22: ffffff9f30a05f34 
[   19.485449] x21: 000000000000004a x20: 0000000000000045 
[   19.485460] x19: ffffffe1b1489880 x18: 0000000000000000 
[   19.485470] x17: 0000000000000000 x16: 0000000000000000 
[   19.485481] x15: 0000000000000000 x14: 000000000000000a 
[   19.485491] x13: 0000000000002e9f x12: ffffff9f30a05e88 
[   19.485502] x11: 0000000000000000 x10: 0000000000000005 
[   19.485513] x9 : 000000000000001a x8 : 0000000000000000 
[   19.485524] x7 : ffffff9f2e8fa32c x6 : 000000001edff655 
[   19.485535] x5 : 000000000000000c x4 : 00000042828e1000 
[   19.485546] x3 : 0000000000000009 x2 : ffffff9f3024a000 
[   19.485557] x1 : 0000000000000000 x0 : 0000000000000000 
[   19.485568] \x0aPC: 0xffffff9f2e8e8280:
[   19.485575] 8280  912c6000 52806be1 39000e83 940070a1 17fffe40 9400c1e7 35ffcac0 52800023
[   19.485608] 82a0  d00097c2 b00097e0 91154042 911b4000 52831f61 39001a83 94007096 17fffe4d
[   19.485640] 82c0  d4210000 17ffff0e b94137e0 340003f5 90011921 39692021 34000881 f0011902
[   19.485672] 82e0  91160042 7100001f 5a80a401 a9459045 4b050084 6b01009f 5400074d 7100001f
[   19.485703] \x0aLR: 0xffffff9f2e8e7eb8:
[   19.485709] 7eb8  b9400042 d376d421 9ac20821 17ffffa8 f9404fe2 b94097e1 eb02001f 1a9b2021
[   19.485740] 7ed8  9a802040 b90097e1 f9004fe0 17ffff87 b940afe0 7100001f 54fff08c 9400c2cf
[   19.485771] 7ef8  34001e40 b940c3e2 b94b12e0 37f805e2 f9403fe1 b9403021 7100003f b940cbe1
[   19.485803] 7f18  7a411000 1a81b000 b900cbe0 1a82b360 b900c3e0 17ffff75 eb0102bf 9a959020
[   19.485835] \x0aSP: 0xffffff9f30a036c0:
[   19.485840] 36c0  2e8e7ef8 ffffff9f 30a03700 ffffff9f 2e8e82c0 ffffff9f 404001c5 00000000
[   19.485872] 36e0  30a03700 ffffff9f 2e8e7db8 ffffff9f ffffffff ffffffff 2e8e7ef8 ffffff9f
[   19.485903] 3700  30a038a0 ffffff9f 2e8e9184 ffffff9f 00000000 00000000 00000000 00000000
[   19.485934] 3720  00000000 00000000 0000004f 00000000 00000004 00000000 30a138d0 ffffff9f
[   19.485965] 
[   19.485971] ---[ end trace 24ec051baadc3c89 ]---
[   19.485975] Call trace:
[   19.485981] Exception stack(0xffffff9f30a03510 to 0xffffff9f30a03640)
[   19.485988] 3500:                                   ffffffe1b1489880 0000008000000000
[   19.485996] 3520: 00000000834df000 ffffff9f2e8e82c0 00000000404001c5 ffffff9f2e901f78
[   19.486004] 3540: ffffff9f30a14288 ffffff9f30a43000 ffffff9f30a05d90 ffffff9f30a14090
[   19.486013] 3560: ffffff9f31925000 ffffff9f2e901dbc ffffff9f30a03670 ffffff9f2e904024
[   19.486021] 3580: ffffff9f30a234c8 0000000000000000 0000000000000000 0000000000000002
[   19.486029] 35a0: 0000000000000000 0000000000000000 ffffff9f2e8fa32c 00000000000001c0
[   19.486038] 35c0: ffffff9f30a138d0 403a4a6d7a81e143 0000000000000000 0000000000000000
[   19.486046] 35e0: ffffff9f3024a000 0000000000000009 00000042828e1000 000000000000000c
[   19.486055] 3600: 000000001edff655 ffffff9f2e8fa32c 0000000000000000 000000000000001a
[   19.486063] 3620: 0000000000000005 0000000000000000 ffffff9f30a05e88 0000000000002e9f
[   19.486072] [<ffffff9f2e8e82c0>] select_energy_cpu_brute+0x7d0/0x928
[   19.486079] [<ffffff9f2e8e9184>] select_task_rq_fair+0xd6c/0xe28
[   19.486086] [<ffffff9f2e8d4760>] try_to_wake_up+0x180/0x3c8
[   19.486093] [<ffffff9f2e8d540c>] wake_up_process+0x14/0x20
[   19.486100] [<ffffff9f2e8cb3ac>] insert_kthread_work+0x8c/0xa0
[   19.486107] [<ffffff9f2e8cbe48>] queue_kthread_work+0x98/0xc0
[   19.486115] [<ffffff9f2ec9a27c>] queue_cmd+0x84/0xe8
[   19.486122] [<ffffff9f2ec9cf04>] tx_data+0x41c/0x4d8
[   19.486128] [<ffffff9f2ec9cff8>] tx+0x18/0x20
[   19.486135] [<ffffff9f2ec8d618>] glink_tx_common+0x798/0xb30
[   19.486143] [<ffffff9f2ec8f1e4>] glink_tx+0x24/0x30
[   19.486149] [<ffffff9f2eca72e4>] msm_rpm_glink_send_buffer+0x6c/0xf0
[   19.486155] [<ffffff9f2ecaa0cc>] msm_rpm_enter_sleep+0x7c/0x1c8
[   19.486162] [<ffffff9f2f108694>] cluster_prepare+0x414/0xb00
[   19.486169] [<ffffff9f2f1088cc>] cluster_prepare+0x64c/0xb00
[   19.486176] [<ffffff9f2f1096ac>] lpm_cpuidle_enter+0xd4/0x360
[   19.486184] [<ffffff9f2f105094>] cpuidle_enter_state+0x12c/0x258
[   19.486190] [<ffffff9f2f1051f8>] cpuidle_enter+0x18/0x20
[   19.486197] [<ffffff9f2e8f4c78>] call_cpuidle+0x20/0x58
[   19.486205] [<ffffff9f2e8f4de8>] cpu_startup_entry+0x138/0x278
[   19.486212] [<ffffff9f2f79b20c>] rest_init+0x148/0x158
[   19.486220] [<ffffff9f30200bd8>] start_kernel+0x3d8/0x400
[   19.486227] [<ffffff9f302001c4>] __primary_switched+0x30/0x8c
[   19.708541] ------------[ cut here ]------------
[   19.708557] WARNING: at ../kernel/sched/sched.h:1394
[   19.708563] 
[   19.708571] CPU: 7 PID: 0 Comm: swapper/7 Tainted: G        W       4.4.169-Sultan #15
[   19.708575] Hardware name: Qualcomm Technologies, Inc. MSM8998 v2.1 (DT)
[   19.708582] task: 0000000000000000 task.stack: 0000000000000000
[   19.708595] PC is at select_energy_cpu_brute+0x7d0/0x928
[   19.708601] LR is at select_energy_cpu_brute+0x408/0x928
[   19.708606] pc : [<ffffff9f2e8e82c0>] lr : [<ffffff9f2e8e7ef8>] pstate: 404001c5
[   19.708611] sp : ffffffe148f2b760
[   19.708616] x29: ffffffe148f2b760 x28: 0000000000000002 
[   19.708626] x27: 0000000000000002 x26: ffffff9f30a05e8c 
[   19.708636] x25: ffffff9f30a06158 x24: ffffff9f2f808000 
[   19.708645] x23: ffffffe1b309a800 x22: ffffff9f30a05f34 
[   19.708654] x21: 000000000000004e x20: 000000000000004e 
[   19.708662] x19: ffffffe1b1489880 x18: 0000000000000001 
[   19.708671] x17: 00000000fa83b2da x16: 0000000000036ad1 
[   19.708680] x15: 0000000000036ad1 x14: 0000000000353a3a 
[   19.708689] x13: 0000000000000400 x12: ffffff9f30a05e88 
[   19.708698] x11: 0000000000000000 x10: 0000000000000005 
[   19.708706] x9 : 000000000000001a x8 : 0000000000000000 
[   19.708715] x7 : ffffff9f2e8fa32c x6 : 000000001edff655 
[   19.708724] x5 : 000000000000000c x4 : 00000042835a6000 
[   19.708733] x3 : 0000000000000009 x2 : ffffff9f3024a000 
[   19.708741] x1 : 0000000000000000 x0 : 0000000000000000 
[   19.708751] \x0aPC: 0xffffff9f2e8e8280:
[   19.708756] 8280  912c6000 52806be1 39000e83 940070a1 17fffe40 9400c1e7 35ffcac0 52800023
[   19.708785] 82a0  d00097c2 b00097e0 91154042 911b4000 52831f61 39001a83 94007096 17fffe4d
[   19.708813] 82c0  d4210000 17ffff0e b94137e0 340003f5 90011921 39692021 34000881 f0011902
[   19.708841] 82e0  91160042 7100001f 5a80a401 a9459045 4b050084 6b01009f 5400074d 7100001f
[   19.708869] \x0aLR: 0xffffff9f2e8e7eb8:
[   19.708873] 7eb8  b9400042 d376d421 9ac20821 17ffffa8 f9404fe2 b94097e1 eb02001f 1a9b2021
[   19.708902] 7ed8  9a802040 b90097e1 f9004fe0 17ffff87 b940afe0 7100001f 54fff08c 9400c2cf
[   19.708929] 7ef8  34001e40 b940c3e2 b94b12e0 37f805e2 f9403fe1 b9403021 7100003f b940cbe1
[   19.708957] 7f18  7a411000 1a81b000 b900cbe0 1a82b360 b900c3e0 17ffff75 eb0102bf 9a959020
[   19.708987] \x0aSP: 0xffffffe148f2b720:
[   19.708992] b720  2e8e7ef8 ffffff9f 48f2b760 ffffffe1 2e8e82c0 ffffff9f 404001c5 00000000
[   19.709020] b740  48f2b760 ffffffe1 2e8e7db8 ffffff9f ffffffff ffffffff 2e8e7ef8 ffffff9f
[   19.709048] b760  48f2b900 ffffffe1 2e8e9184 ffffff9f 00000000 00000000 00000000 00000000
[   19.709075] b780  00000000 00000000 0000002c 00000000 00000004 00000000 48f16200 ffffffe1
[   19.709103] 
[   19.709108] ---[ end trace 24ec051baadc3c8a ]---
[   19.709112] Call trace:
[   19.709118] Exception stack(0xffffffe148f2b570 to 0xffffffe148f2b6a0)
[   19.709123] b560:                                   ffffffe1b1489880 0000008000000000
[   19.709130] b580: 00000000834df000 ffffff9f2e8e82c0 00000000404001c5 ffffff9f2e901f78
[   19.709137] b5a0: ffffffe148f16bb8 ffffff9f30a43000 ffffff9f30a05d90 ffffffe148f169c0
[   19.709144] b5c0: ffffff9f31925000 ffffff9f2e901dbc ffffffe148f2b6d0 ffffff9f2e904024
[   19.709149] b5e0: ffffff9f30a234c8 0000000000000000 0000000000000000 0000000000000002
[   19.709155] b600: 0000000000000000 0000000000000000 ffffff9f2e8fa32c 00000000000001c0
[   19.709162] b620: ffffffe148f16200 403a4a6d7a81e143 0000000000000000 0000000000000000
[   19.709167] b640: ffffff9f3024a000 0000000000000009 00000042835a6000 000000000000000c
[   19.709173] b660: 000000001edff655 ffffff9f2e8fa32c 0000000000000000 000000000000001a
[   19.709178] b680: 0000000000000005 0000000000000000 ffffff9f30a05e88 0000000000000400
[   19.709185] [<ffffff9f2e8e82c0>] select_energy_cpu_brute+0x7d0/0x928
[   19.709191] [<ffffff9f2e8e9184>] select_task_rq_fair+0xd6c/0xe28
[   19.709198] [<ffffff9f2e8d4760>] try_to_wake_up+0x180/0x3c8
[   19.709204] [<ffffff9f2e8d540c>] wake_up_process+0x14/0x20
[   19.709212] [<ffffff9f2e8cb3ac>] insert_kthread_work+0x8c/0xa0
[   19.709217] [<ffffff9f2e8cbe48>] queue_kthread_work+0x98/0xc0
[   19.709224] [<ffffff9f2ec9a27c>] queue_cmd+0x84/0xe8
[   19.709229] [<ffffff9f2ec9cf04>] tx_data+0x41c/0x4d8
[   19.709234] [<ffffff9f2ec9cff8>] tx+0x18/0x20
[   19.709242] [<ffffff9f2ec8d618>] glink_tx_common+0x798/0xb30
[   19.709247] [<ffffff9f2ec8f1e4>] glink_tx+0x24/0x30
[   19.709253] [<ffffff9f2eca72e4>] msm_rpm_glink_send_buffer+0x6c/0xf0
[   19.709258] [<ffffff9f2ecaa0cc>] msm_rpm_enter_sleep+0x7c/0x1c8
[   19.709266] [<ffffff9f2f108694>] cluster_prepare+0x414/0xb00
[   19.709271] [<ffffff9f2f1088cc>] cluster_prepare+0x64c/0xb00
[   19.709277] [<ffffff9f2f1096ac>] lpm_cpuidle_enter+0xd4/0x360
[   19.709282] [<ffffff9f2f105094>] cpuidle_enter_state+0x12c/0x258
[   19.709287] [<ffffff9f2f1051f8>] cpuidle_enter+0x18/0x20
[   19.709294] [<ffffff9f2e8f4c78>] call_cpuidle+0x20/0x58
[   19.709299] [<ffffff9f2e8f4de8>] cpu_startup_entry+0x138/0x278
[   19.709306] [<ffffff9f2e88ecdc>] secondary_start_kernel+0x10c/0x128
[   19.709310] [<0000000080fa603c>] 0x80fa603c
[   19.709316] ------------[ cut here ]------------
[   19.709321] WARNING: at ../kernel/sched/sched.h:1394
[   19.709325] 
[   19.709330] CPU: 7 PID: 0 Comm: swapper/7 Tainted: G        W       4.4.169-Sultan #15
[   19.709335] Hardware name: Qualcomm Technologies, Inc. MSM8998 v2.1 (DT)
[   19.709339] task: 0000000000000000 task.stack: 0000000000000000
[   19.709344] PC is at select_energy_cpu_brute+0x7d0/0x928
[   19.709349] LR is at select_energy_cpu_brute+0x408/0x928
[   19.709353] pc : [<ffffff9f2e8e82c0>] lr : [<ffffff9f2e8e7ef8>] pstate: 404001c5
[   19.709357] sp : ffffffe148f2b760
[   19.709361] x29: ffffffe148f2b760 x28: 0000000000000003 
[   19.709370] x27: 0000000000000003 x26: ffffff9f30a05e8c 
[   19.709379] x25: ffffff9f30a06158 x24: ffffff9f2f808000 
[   19.709387] x23: ffffffe1b326d800 x22: ffffff9f30a05f34 
[   19.709396] x21: 0000000000000050 x20: 000000000000004e 
[   19.709405] x19: ffffffe1b1489880 x18: 0000000000000001 
[   19.709414] x17: 00000000fa83b2da x16: 0000000000036ad1 
[   19.709422] x15: 0000000000036ad1 x14: 0000000000353a3a 
[   19.709431] x13: 0000000000000400 x12: ffffff9f30a05e88 
[   19.709440] x11: 0000000000000000 x10: 0000000000000005 
[   19.709449] x9 : 000000000000001a x8 : 0000000000000000 
[   19.709457] x7 : ffffff9f2e8fa32c x6 : 000000001edff655 
[   19.709466] x5 : 000000000000000c x4 : 00000042835a6000 
[   19.709475] x3 : 0000000000000009 x2 : ffffff9f3024a000 
[   19.709483] x1 : 0000000000000000 x0 : 0000000000000000 
[   19.709492] \x0aPC: 0xffffff9f2e8e8280:
[   19.709497] 8280  912c6000 52806be1 39000e83 940070a1 17fffe40 9400c1e7 35ffcac0 52800023
[   19.709525] 82a0  d00097c2 b00097e0 91154042 911b4000 52831f61 39001a83 94007096 17fffe4d
[   19.709553] 82c0  d4210000 17ffff0e b94137e0 340003f5 90011921 39692021 34000881 f0011902
[   19.709581] 82e0  91160042 7100001f 5a80a401 a9459045 4b050084 6b01009f 5400074d 7100001f
[   19.709609] \x0aLR: 0xffffff9f2e8e7eb8:
[   19.709613] 7eb8  b9400042 d376d421 9ac20821 17ffffa8 f9404fe2 b94097e1 eb02001f 1a9b2021
[   19.709641] 7ed8  9a802040 b90097e1 f9004fe0 17ffff87 b940afe0 7100001f 54fff08c 9400c2cf
[   19.709668] 7ef8  34001e40 b940c3e2 b94b12e0 37f805e2 f9403fe1 b9403021 7100003f b940cbe1
[   19.709696] 7f18  7a411000 1a81b000 b900cbe0 1a82b360 b900c3e0 17ffff75 eb0102bf 9a959020
[   19.709724] \x0aSP: 0xffffffe148f2b720:
[   19.709729] b720  2e8e7ef8 ffffff9f 48f2b760 ffffffe1 2e8e82c0 ffffff9f 404001c5 00000000
[   19.709757] b740  48f2b760 ffffffe1 2e8e7db8 ffffff9f ffffffff ffffffff 2e8e7ef8 ffffff9f
[   19.709785] b760  48f2b900 ffffffe1 2e8e9184 ffffff9f 00000000 00000000 00000000 00000000
[   19.709813] b780  00000000 00000000 0000002c 00000000 00000004 00000000 48f16200 ffffffe1
[   19.709841] 
[   19.709845] ---[ end trace 24ec051baadc3c8b ]---
[   19.709848] Call trace:
[   19.709853] Exception stack(0xffffffe148f2b570 to 0xffffffe148f2b6a0)
[   19.709858] b560:                                   ffffffe1b1489880 0000008000000000
[   19.709864] b580: 00000000834df000 ffffff9f2e8e82c0 00000000404001c5 ffffff9f2e901f78
[   19.709871] b5a0: ffffffe148f16bb8 ffffff9f30a43000 ffffff9f30a05d90 ffffffe148f169c0
[   19.709876] b5c0: ffffff9f31925000 ffffff9f2e901dbc ffffffe148f2b6d0 ffffff9f2e904024
[   19.709882] b5e0: ffffff9f30a234c8 0000000000000000 0000000000000000 0000000000000002
[   19.709887] b600: 0000000000000000 0000000000000000 ffffff9f2e8fa32c 00000000000001c0
[   19.709894] b620: ffffffe148f16200 403a4a6d7a81e143 0000000000000000 0000000000000000
[   19.709900] b640: ffffff9f3024a000 0000000000000009 00000042835a6000 000000000000000c
[   19.709907] b660: 000000001edff655 ffffff9f2e8fa32c 0000000000000000 000000000000001a
[   19.709913] b680: 0000000000000005 0000000000000000 ffffff9f30a05e88 0000000000000400
[   19.709919] [<ffffff9f2e8e82c0>] select_energy_cpu_brute+0x7d0/0x928
[   19.709924] [<ffffff9f2e8e9184>] select_task_rq_fair+0xd6c/0xe28
[   19.709929] [<ffffff9f2e8d4760>] try_to_wake_up+0x180/0x3c8
[   19.709934] [<ffffff9f2e8d540c>] wake_up_process+0x14/0x20
[   19.709940] [<ffffff9f2e8cb3ac>] insert_kthread_work+0x8c/0xa0
[   19.709945] [<ffffff9f2e8cbe48>] queue_kthread_work+0x98/0xc0
[   19.709950] [<ffffff9f2ec9a27c>] queue_cmd+0x84/0xe8
[   19.709955] [<ffffff9f2ec9cf04>] tx_data+0x41c/0x4d8
[   19.709959] [<ffffff9f2ec9cff8>] tx+0x18/0x20
[   19.709964] [<ffffff9f2ec8d618>] glink_tx_common+0x798/0xb30
[   19.709969] [<ffffff9f2ec8f1e4>] glink_tx+0x24/0x30
[   19.709974] [<ffffff9f2eca72e4>] msm_rpm_glink_send_buffer+0x6c/0xf0
[   19.709979] [<ffffff9f2ecaa0cc>] msm_rpm_enter_sleep+0x7c/0x1c8
[   19.709984] [<ffffff9f2f108694>] cluster_prepare+0x414/0xb00
[   19.709989] [<ffffff9f2f1088cc>] cluster_prepare+0x64c/0xb00
[   19.709994] [<ffffff9f2f1096ac>] lpm_cpuidle_enter+0xd4/0x360
[   19.709998] [<ffffff9f2f105094>] cpuidle_enter_state+0x12c/0x258
[   19.710003] [<ffffff9f2f1051f8>] cpuidle_enter+0x18/0x20
[   19.710009] [<ffffff9f2e8f4c78>] call_cpuidle+0x20/0x58
[   19.710014] [<ffffff9f2e8f4de8>] cpu_startup_entry+0x138/0x278
[   19.710020] [<ffffff9f2e88ecdc>] secondary_start_kernel+0x10c/0x128
[   19.710024] [<0000000080fa603c>] 0x80fa603c
[   19.902436] ------------[ cut here ]------------
[   19.902458] WARNING: at ../kernel/sched/sched.h:1394
[   19.902463] 
[   19.902471] CPU: 6 PID: 0 Comm: swapper/6 Tainted: G        W       4.4.169-Sultan #15
[   19.902476] Hardware name: Qualcomm Technologies, Inc. MSM8998 v2.1 (DT)
[   19.902482] task: 0000000000000000 task.stack: 0000000000000000
[   19.902495] PC is at select_energy_cpu_brute+0x7d0/0x928
[   19.902501] LR is at select_energy_cpu_brute+0x408/0x928
[   19.902506] pc : [<ffffff9f2e8e82c0>] lr : [<ffffff9f2e8e7ef8>] pstate: 404001c5
[   19.902511] sp : ffffffe148f27760
[   19.902515] x29: ffffffe148f27760 x28: 0000000000000002 
[   19.902525] x27: 0000000000000002 x26: ffffff9f30a05e8c 
[   19.902534] x25: ffffff9f30a06158 x24: ffffff9f2f808000 
[   19.902543] x23: ffffffe1b309a800 x22: ffffff9f30a05f34 
[   19.902552] x21: 0000000000000064 x20: 0000000000000064 
[   19.902561] x19: ffffffe1b1489880 x18: ffffffe1ab25fb70 
[   19.902570] x17: 0000000000000000 x16: 0000000000000000 
[   19.902579] x15: 0000000000000000 x14: ffffffe148f14980 
[   19.902587] x13: 0000000000000400 x12: 0000000000000002 
[   19.902596] x11: 0000000000000003 x10: ffffffe1b37e59fc 
[   19.902605] x9 : 000000000000001a x8 : 0000000000000000 
[   19.902614] x7 : ffffff9f2e8fa32c x6 : 000000001edff655 
[   19.902622] x5 : 000000000000000c x4 : 00000042833d3000 
[   19.902631] x3 : 0000000000000009 x2 : ffffff9f3024a000 
[   19.902640] x1 : 0000000000000000 x0 : 0000000000000000 
[   19.902649] \x0aPC: 0xffffff9f2e8e8280:
[   19.902654] 8280  912c6000 52806be1 39000e83 940070a1 17fffe40 9400c1e7 35ffcac0 52800023
[   19.902685] 82a0  d00097c2 b00097e0 91154042 911b4000 52831f61 39001a83 94007096 17fffe4d
[   19.902714] 82c0  d4210000 17ffff0e b94137e0 340003f5 90011921 39692021 34000881 f0011902
[   19.902741] 82e0  91160042 7100001f 5a80a401 a9459045 4b050084 6b01009f 5400074d 7100001f
[   19.902770] \x0aLR: 0xffffff9f2e8e7eb8:
[   19.902774] 7eb8  b9400042 d376d421 9ac20821 17ffffa8 f9404fe2 b94097e1 eb02001f 1a9b2021
[   19.902802] 7ed8  9a802040 b90097e1 f9004fe0 17ffff87 b940afe0 7100001f 54fff08c 9400c2cf
[   19.902830] 7ef8  34001e40 b940c3e2 b94b12e0 37f805e2 f9403fe1 b9403021 7100003f b940cbe1
[   19.902857] 7f18  7a411000 1a81b000 b900cbe0 1a82b360 b900c3e0 17ffff75 eb0102bf 9a959020
[   19.902887] \x0aSP: 0xffffffe148f27720:
[   19.902892] 7720  2e8e7ef8 ffffff9f 48f27760 ffffffe1 2e8e82c0 ffffff9f 404001c5 00000000
[   19.902920] 7740  48f27760 ffffffe1 2e8e7db8 ffffff9f ffffffff ffffffff 2e8e7ef8 ffffff9f
[   19.902948] 7760  48f27900 ffffffe1 2e8e9184 ffffff9f 00000000 00000000 00000000 00000000
[   19.902976] 7780  00000000 00000000 00000027 00000000 00000004 00000000 48f14980 ffffffe1
[   19.903004] 
[   19.903008] ---[ end trace 24ec051baadc3c8c ]---
[   19.903013] Call trace:
[   19.903018] Exception stack(0xffffffe148f27570 to 0xffffffe148f276a0)
[   19.903024] 7560:                                   ffffffe1b1489880 0000008000000000
[   19.903031] 7580: 0000000082408000 ffffff9f2e8e82c0 00000000404001c5 ffffff9f2e901f78
[   19.903037] 75a0: ffffffe148f15338 ffffff9f30a43000 ffffff9f30a05d90 ffffffe148f15140
[   19.903044] 75c0: ffffff9f31925000 ffffff9f2e901dbc ffffffe148f276d0 ffffff9f2e904024
[   19.903049] 75e0: ffffff9f30a234c8 0000000000000000 0000000000000000 0000000000000002
[   19.903055] 7600: 0000000000000000 0000000000000000 ffffff9f2e8fa32c 00000000000001c0
[   19.903061] 7620: ffffffe148f14980 403a4a6d7a81e143 0000000000000000 0000000000000000
[   19.903066] 7640: ffffff9f3024a000 0000000000000009 00000042833d3000 000000000000000c
[   19.903072] 7660: 000000001edff655 ffffff9f2e8fa32c 0000000000000000 000000000000001a
[   19.903079] 7680: ffffffe1b37e59fc 0000000000000003 0000000000000002 0000000000000400
[   19.903085] [<ffffff9f2e8e82c0>] select_energy_cpu_brute+0x7d0/0x928
[   19.903091] [<ffffff9f2e8e9184>] select_task_rq_fair+0xd6c/0xe28
[   19.903099] [<ffffff9f2e8d4760>] try_to_wake_up+0x180/0x3c8
[   19.903105] [<ffffff9f2e8d540c>] wake_up_process+0x14/0x20
[   19.903112] [<ffffff9f2e8cb3ac>] insert_kthread_work+0x8c/0xa0
[   19.903117] [<ffffff9f2e8cbe48>] queue_kthread_work+0x98/0xc0
[   19.903125] [<ffffff9f2ec9a27c>] queue_cmd+0x84/0xe8
[   19.903130] [<ffffff9f2ec9cf04>] tx_data+0x41c/0x4d8
[   19.903136] [<ffffff9f2ec9cff8>] tx+0x18/0x20
[   19.903143] [<ffffff9f2ec8d618>] glink_tx_common+0x798/0xb30
[   19.903148] [<ffffff9f2ec8f1e4>] glink_tx+0x24/0x30
[   19.903154] [<ffffff9f2eca72e4>] msm_rpm_glink_send_buffer+0x6c/0xf0
[   19.903160] [<ffffff9f2ecaa0cc>] msm_rpm_enter_sleep+0x7c/0x1c8
[   19.903168] [<ffffff9f2f108694>] cluster_prepare+0x414/0xb00
[   19.903172] [<ffffff9f2f1088cc>] cluster_prepare+0x64c/0xb00
[   19.903178] [<ffffff9f2f1096ac>] lpm_cpuidle_enter+0xd4/0x360
[   19.903183] [<ffffff9f2f105094>] cpuidle_enter_state+0x12c/0x258
[   19.903188] [<ffffff9f2f1051f8>] cpuidle_enter+0x18/0x20
[   19.903195] [<ffffff9f2e8f4c78>] call_cpuidle+0x20/0x58
[   19.903200] [<ffffff9f2e8f4de8>] cpu_startup_entry+0x138/0x278
[   19.903206] [<ffffff9f2e88ecdc>] secondary_start_kernel+0x10c/0x128
[   19.903211] [<0000000080fa603c>] 0x80fa603c
[   19.903218] ------------[ cut here ]------------
[   19.903222] WARNING: at ../kernel/sched/sched.h:1394
[   19.903226] 
[   19.903231] CPU: 6 PID: 0 Comm: swapper/6 Tainted: G        W       4.4.169-Sultan #15
[   19.903236] Hardware name: Qualcomm Technologies, Inc. MSM8998 v2.1 (DT)
[   19.903240] task: 0000000000000000 task.stack: 0000000000000000
[   19.903245] PC is at select_energy_cpu_brute+0x7d0/0x928
[   19.903250] LR is at select_energy_cpu_brute+0x408/0x928
[   19.903255] pc : [<ffffff9f2e8e82c0>] lr : [<ffffff9f2e8e7ef8>] pstate: 404001c5
[   19.903259] sp : ffffffe148f27760
[   19.903263] x29: ffffffe148f27760 x28: 0000000000000003 
[   19.903272] x27: 0000000000000003 x26: ffffff9f30a05e8c 
[   19.903281] x25: ffffff9f30a06158 x24: ffffff9f2f808000 
[   19.903290] x23: ffffffe1b326d800 x22: ffffff9f30a05f34 
[   19.903299] x21: 0000000000000066 x20: 0000000000000064 
[   19.903308] x19: ffffffe1b1489880 x18: ffffffe1ab25fb70 
[   19.903316] x17: 0000000000000000 x16: 0000000000000000 
[   19.903325] x15: 0000000000000000 x14: ffffffe148f14980 
[   19.903334] x13: 0000000000000400 x12: 0000000000000002 
[   19.903343] x11: 0000000000000003 x10: ffffffe1b37e59fc 
[   19.903351] x9 : 000000000000001a x8 : 0000000000000000 
[   19.903360] x7 : ffffff9f2e8fa32c x6 : 000000001edff655 
[   19.903369] x5 : 000000000000000c x4 : 00000042833d3000 
[   19.903377] x3 : 0000000000000009 x2 : ffffff9f3024a000 
[   19.903386] x1 : 0000000000000000 x0 : 0000000000000000 
[   19.903395] \x0aPC: 0xffffff9f2e8e8280:
[   19.903399] 8280  912c6000 52806be1 39000e83 940070a1 17fffe40 9400c1e7 35ffcac0 52800023
[   19.903427] 82a0  d00097c2 b00097e0 91154042 911b4000 52831f61 39001a83 94007096 17fffe4d
[   19.903455] 82c0  d4210000 17ffff0e b94137e0 340003f5 90011921 39692021 34000881 f0011902
[   19.903483] 82e0  91160042 7100001f 5a80a401 a9459045 4b050084 6b01009f 5400074d 7100001f
[   19.903511] \x0aLR: 0xffffff9f2e8e7eb8:
[   19.903516] 7eb8  b9400042 d376d421 9ac20821 17ffffa8 f9404fe2 b94097e1 eb02001f 1a9b2021
[   19.903543] 7ed8  9a802040 b90097e1 f9004fe0 17ffff87 b940afe0 7100001f 54fff08c 9400c2cf
[   19.903572] 7ef8  34001e40 b940c3e2 b94b12e0 37f805e2 f9403fe1 b9403021 7100003f b940cbe1
[   19.903599] 7f18  7a411000 1a81b000 b900cbe0 1a82b360 b900c3e0 17ffff75 eb0102bf 9a959020
[   19.903628] \x0aSP: 0xffffffe148f27720:
[   19.903633] 7720  2e8e7ef8 ffffff9f 48f27760 ffffffe1 2e8e82c0 ffffff9f 404001c5 00000000
[   19.903661] 7740  48f27760 ffffffe1 2e8e7db8 ffffff9f ffffffff ffffffff 2e8e7ef8 ffffff9f
[   19.903689] 7760  48f27900 ffffffe1 2e8e9184 ffffff9f 00000000 00000000 00000000 00000000
[   19.903718] 7780  00000000 00000000 00000027 00000000 00000004 00000000 48f14980 ffffffe1
[   19.903747] 
[   19.903751] ---[ end trace 24ec051baadc3c8d ]---
[   19.903756] Call trace:
[   19.903761] Exception stack(0xffffffe148f27570 to 0xffffffe148f276a0)
[   19.903767] 7560:                                   ffffffe1b1489880 0000008000000000
[   19.903773] 7580: 00000000834df000 ffffff9f2e8e82c0 00000000404001c5 ffffff9f2e901f78
[   19.903779] 75a0: ffffffe148f15338 ffffff9f30a43000 ffffff9f30a05d90 ffffffe148f15140
[   19.903785] 75c0: ffffff9f31925000 ffffff9f2e901dbc ffffffe148f276d0 ffffff9f2e904024
[   19.903790] 75e0: ffffff9f30a234c8 0000000000000000 0000000000000000 0000000000000002
[   19.903796] 7600: 0000000000000000 0000000000000000 ffffff9f2e8fa32c 00000000000001c0
[   19.903802] 7620: ffffffe148f14980 403a4a6d7a81e143 0000000000000000 0000000000000000
[   19.903807] 7640: ffffff9f3024a000 0000000000000009 00000042833d3000 000000000000000c
[   19.903813] 7660: 000000001edff655 ffffff9f2e8fa32c 0000000000000000 000000000000001a
[   19.903819] 7680: ffffffe1b37e59fc 0000000000000003 0000000000000002 0000000000000400
[   19.903824] [<ffffff9f2e8e82c0>] select_energy_cpu_brute+0x7d0/0x928
[   19.903829] [<ffffff9f2e8e9184>] select_task_rq_fair+0xd6c/0xe28
[   19.903834] [<ffffff9f2e8d4760>] try_to_wake_up+0x180/0x3c8
[   19.903839] [<ffffff9f2e8d540c>] wake_up_process+0x14/0x20
[   19.903844] [<ffffff9f2e8cb3ac>] insert_kthread_work+0x8c/0xa0
[   19.903849] [<ffffff9f2e8cbe48>] queue_kthread_work+0x98/0xc0
[   19.903855] [<ffffff9f2ec9a27c>] queue_cmd+0x84/0xe8
[   19.903860] [<ffffff9f2ec9cf04>] tx_data+0x41c/0x4d8
[   19.903865] [<ffffff9f2ec9cff8>] tx+0x18/0x20
[   19.903870] [<ffffff9f2ec8d618>] glink_tx_common+0x798/0xb30
[   19.903876] [<ffffff9f2ec8f1e4>] glink_tx+0x24/0x30
[   19.903880] [<ffffff9f2eca72e4>] msm_rpm_glink_send_buffer+0x6c/0xf0
[   19.903885] [<ffffff9f2ecaa0cc>] msm_rpm_enter_sleep+0x7c/0x1c8
[   19.903890] [<ffffff9f2f108694>] cluster_prepare+0x414/0xb00
[   19.903895] [<ffffff9f2f1088cc>] cluster_prepare+0x64c/0xb00
[   19.903900] [<ffffff9f2f1096ac>] lpm_cpuidle_enter+0xd4/0x360
[   19.903906] [<ffffff9f2f105094>] cpuidle_enter_state+0x12c/0x258
[   19.903911] [<ffffff9f2f1051f8>] cpuidle_enter+0x18/0x20
[   19.903916] [<ffffff9f2e8f4c78>] call_cpuidle+0x20/0x58
[   19.903922] [<ffffff9f2e8f4de8>] cpu_startup_entry+0x138/0x278
[   19.903927] [<ffffff9f2e88ecdc>] secondary_start_kernel+0x10c/0x128
[   19.903931] [<0000000080fa603c>] 0x80fa603c
[   20.550622] Invalid input for loop: ret = -22
[   21.931463] nq-nci 8-0028: setting ese_gpio high
[   22.165250] selinux: SELinux: Skipping restorecon_recursive(/data/system_ce/0)\x0a
[   22.167249] init: Async property child exited with status 0
[   22.169695] selinux: SELinux: Skipping restorecon_recursive(/data/misc_ce/0)\x0a
[   22.170953] init: Async property child exited with status 0
[   22.223156] init: processing action (sys.user.0.ce_available=true) from (/vendor/etc/init/hw/init.taimen.rc:232)
[   22.234279] init: starting service 'vendor.ramoops_sh'...
[   22.330429] init: processing action (sys.ramoops.decrypted=true) from (/vendor/etc/init/hw/init.taimen.rc:236)
[   22.388206] init: Service 'vendor.ramoops_sh' (pid 2467) exited with status 0
[   23.466403] SMB138X: smb138x_parallel_get_prop: parallel power supply get prop 4 not supported
[   23.576157] nq-nci 8-0028: unable to disable clock
[   23.723392] i2c-msm-v2 c1b6000.i2c: msm_bus_scale_register_client(mstr-id:84):0x26 (ok)
[   23.837035] SMB138X: smb138x_parallel_get_prop: parallel power supply get prop 4 not supported
[   23.843673] FG: fg_get_battery_temp: batt temperature original:430, tuned:375
[   23.845561] healthd: battery l=79 v=4073 t=37.5 h=2 st=2 c=85 fc=3229000 cc=291 chg=u
[   23.846209] FG: fg_get_battery_temp: batt temperature original:430, tuned:375
[   23.848440] healthd: battery l=79 v=4073 t=37.5 h=2 st=2 c=85 fc=3229000 cc=291 chg=u
[   24.202223] FG: fg_get_battery_temp: batt temperature original:430, tuned:375
[   24.202799] FG: fg_get_battery_temp: batt temperature original:430, tuned:375
[   24.206741] healthd: battery l=79 v=4073 t=37.5 h=2 st=2 c=85 fc=3229000 cc=291 chg=u
[   24.207182] healthd: battery l=79 v=4073 t=37.5 h=2 st=2 c=85 fc=3229000 cc=291 chg=u
[   27.832537] lowpool[2]: page allocation failure: order:0, mode:0x2200000
[   27.832558] CPU: 6 PID: 3103 Comm: lowpool[2] Tainted: G        W       4.4.169-Sultan #15
[   27.832563] Hardware name: Qualcomm Technologies, Inc. MSM8998 v2.1 (DT)
[   27.832567] Call trace:
[   27.832582] [<ffffff9f2e88a378>] dump_backtrace+0x0/0x1e8
[   27.832588] [<ffffff9f2e88a574>] show_stack+0x14/0x20
[   27.832595] [<ffffff9f2eb54de4>] dump_stack+0xb0/0xec
[   27.832602] [<ffffff9f2e972d20>] warn_alloc_failed+0x110/0x150
[   27.832607] [<ffffff9f2e9765a8>] __alloc_pages_nodemask+0x720/0x960
[   27.832614] [<ffffff9f2e9ae2fc>] new_slab+0x23c/0x280
[   27.832620] [<ffffff9f2e9b0298>] ___slab_alloc.isra.12.constprop.21+0x280/0x2e8
[   27.832625] [<ffffff9f2e9b0348>] __slab_alloc.isra.13.constprop.20+0x48/0x88
[   27.832629] [<ffffff9f2e9b04d0>] kmem_cache_alloc+0x138/0x188
[   27.832637] [<ffffff9f2ead8930>] avc_alloc_node+0x20/0x2b8
[   27.832642] [<ffffff9f2ead8ecc>] avc_compute_av+0xfc/0x2a0
[   27.832646] [<ffffff9f2ead9ec4>] avc_has_perm_flags+0x1f4/0x210
[   27.832651] [<ffffff9f2eadea5c>] selinux_inode_follow_link+0x74/0x98
[   27.832656] [<ffffff9f2ead46e4>] security_inode_follow_link+0x64/0x98
[   27.832663] [<ffffff9f2e9c83c8>] link_path_walk+0x210/0x598
[   27.832668] [<ffffff9f2e9c8e68>] path_lookupat.isra.19+0x98/0x118
[   27.832672] [<ffffff9f2e9c8f68>] filename_lookup.part.20+0x80/0x110
[   27.832677] [<ffffff9f2e9c952c>] user_path_at_empty+0x4c/0x60
[   27.832681] [<ffffff9f2e9c0780>] vfs_fstatat+0x70/0xd8
[   27.832686] [<ffffff9f2e9c0944>] SyS_newfstatat+0x2c/0x60
[   27.832693] [<ffffff9f2ed27ed4>] new_newfstatat+0x4c/0x88
[   27.832697] [<ffffff9f2e88330c>] __sys_trace_return+0x0/0x4
[   27.832701] Mem-Info:
[   27.832708] active_anon:105613 inactive_anon:35453 isolated_anon:0\x0a active_file:54723 inactive_file:55021 isolated_file:56\x0a unevictable:39243 dirty:14 writeback:16 unstable:0\x0a slab_reclaimable:10528 slab_unreclaimable:31501\x0a mapped:87214 shmem:339 pagetables:10298 bounce:0\x0a free:2342 free_pcp:797 free_cma:98
[   27.832720] DMA free:9368kB min:4920kB low:50988kB high:52220kB active_anon:422452kB inactive_anon:141812kB active_file:218892kB inactive_file:220084kB unevictable:156972kB isolated(anon):0kB isolated(file):224kB present:1856508kB managed:1686236kB mlocked:156972kB dirty:56kB writeback:64kB mapped:348856kB shmem:1356kB slab_reclaimable:42112kB slab_unreclaimable:126004kB kernel_stack:33184kB pagetables:41192kB unstable:0kB bounce:0kB free_pcp:3188kB local_pcp:528kB free_cma:392kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? no
[   27.832725] lowmem_reserve[]: 0 0 0
[   27.832738] DMA: 113*4kB (UME) 94*8kB (UMEH) 154*16kB (UMEH) 154*32kB (MEH) 11*64kB (MH) 0*128kB 0*256kB 0*512kB 0*1024kB 0*2048kB 0*4096kB = 9300kB
[   27.832785] 149302 total pagecache pages
[   27.832790] 78 pages in swap cache
[   27.832794] Swap cache stats: add 10402, delete 10324, find 43/219
[   27.832797] Free swap  = 483868kB
[   27.832801] Total swap = 524284kB
[   27.832804] 464127 pages RAM
[   27.832807] 0 pages HighMem/MovableOnly
[   27.832810] 42568 pages reserved
[   27.832814] 40960 pages cma reserved
[   27.832932] lowpool[2]: page allocation failure: order:0, mode:0x2200000
[   27.832937] CPU: 6 PID: 3103 Comm: lowpool[2] Tainted: G        W       4.4.169-Sultan #15
[   27.832941] Hardware name: Qualcomm Technologies, Inc. MSM8998 v2.1 (DT)
[   27.832945] Call trace:
[   27.832950] [<ffffff9f2e88a378>] dump_backtrace+0x0/0x1e8
[   27.832954] [<ffffff9f2e88a574>] show_stack+0x14/0x20
[   27.832959] [<ffffff9f2eb54de4>] dump_stack+0xb0/0xec
[   27.832963] [<ffffff9f2e972d20>] warn_alloc_failed+0x110/0x150
[   27.832968] [<ffffff9f2e9765a8>] __alloc_pages_nodemask+0x720/0x960
[   27.832973] [<ffffff9f2e9ae2fc>] new_slab+0x23c/0x280
[   27.832978] [<ffffff9f2e9b0298>] ___slab_alloc.isra.12.constprop.21+0x280/0x2e8
[   27.832982] [<ffffff9f2e9b0348>] __slab_alloc.isra.13.constprop.20+0x48/0x88
[   27.832985] [<ffffff9f2e9b04d0>] kmem_cache_alloc+0x138/0x188
[   27.832990] [<ffffff9f2ead8930>] avc_alloc_node+0x20/0x2b8
[   27.832995] [<ffffff9f2ead8ecc>] avc_compute_av+0xfc/0x2a0
[   27.832999] [<ffffff9f2ead9ec4>] avc_has_perm_flags+0x1f4/0x210
[   27.833003] [<ffffff9f2eadea5c>] selinux_inode_follow_link+0x74/0x98
[   27.833007] [<ffffff9f2ead46e4>] security_inode_follow_link+0x64/0x98
[   27.833011] [<ffffff9f2e9c83c8>] link_path_walk+0x210/0x598
[   27.833016] [<ffffff9f2e9c8e68>] path_lookupat.isra.19+0x98/0x118
[   27.833021] [<ffffff9f2e9c8f68>] filename_lookup.part.20+0x80/0x110
[   27.833025] [<ffffff9f2e9c952c>] user_path_at_empty+0x4c/0x60
[   27.833029] [<ffffff9f2e9c0780>] vfs_fstatat+0x70/0xd8
[   27.833033] [<ffffff9f2e9c0944>] SyS_newfstatat+0x2c/0x60
[   27.833037] [<ffffff9f2ed27ed4>] new_newfstatat+0x4c/0x88
[   27.833041] [<ffffff9f2e88330c>] __sys_trace_return+0x0/0x4
[   27.833044] Mem-Info:
[   27.833050] active_anon:105613 inactive_anon:35453 isolated_anon:0\x0a active_file:54723 inactive_file:55021 isolated_file:56\x0a unevictable:39243 dirty:14 writeback:16 unstable:0\x0a slab_reclaimable:10528 slab_unreclaimable:31501\x0a mapped:87214 shmem:339 pagetables:10298 bounce:0\x0a free:2342 free_pcp:828 free_cma:98
[   27.833061] DMA free:9368kB min:4920kB low:50988kB high:52220kB active_anon:422452kB inactive_anon:141812kB active_file:218892kB inactive_file:220084kB unevictable:156972kB isolated(anon):0kB isolated(file):224kB present:1856508kB managed:1686236kB mlocked:156972kB dirty:56kB writeback:64kB mapped:348856kB shmem:1356kB slab_reclaimable:42112kB slab_unreclaimable:126004kB kernel_stack:33184kB pagetables:41192kB unstable:0kB bounce:0kB free_pcp:3304kB local_pcp:528kB free_cma:392kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? no
[   27.833065] lowmem_reserve[]: 0 0 0
[   27.833077] DMA: 113*4kB (UME) 94*8kB (UMEH) 154*16kB (UMEH) 154*32kB (MEH) 11*64kB (MH) 0*128kB 0*256kB 0*512kB 0*1024kB 0*2048kB 0*4096kB = 9300kB
[   27.833122] 149302 total pagecache pages
[   27.833125] 78 pages in swap cache
[   27.833129] Swap cache stats: add 10402, delete 10324, find 43/219
[   27.833132] Free swap  = 483868kB
[   27.833135] Total swap = 524284kB
[   27.833139] 464127 pages RAM
[   27.833142] 0 pages HighMem/MovableOnly
[   27.833145] 42568 pages reserved
[   27.833149] 40960 pages cma reserved
[   27.926586] .dataservices: page allocation failure: order:0, mode:0x2200000
[   27.926603] CPU: 7 PID: 1830 Comm: .dataservices Tainted: G        W       4.4.169-Sultan #15
[   27.926608] Hardware name: Qualcomm Technologies, Inc. MSM8998 v2.1 (DT)
[   27.926612] Call trace:
[   27.926624] [<ffffff9f2e88a378>] dump_backtrace+0x0/0x1e8
[   27.926629] [<ffffff9f2e88a574>] show_stack+0x14/0x20
[   27.926636] [<ffffff9f2eb54de4>] dump_stack+0xb0/0xec
[   27.926642] [<ffffff9f2e972d20>] warn_alloc_failed+0x110/0x150
[   27.926647] [<ffffff9f2e9765a8>] __alloc_pages_nodemask+0x720/0x960
[   27.926653] [<ffffff9f2e9ae2fc>] new_slab+0x23c/0x280
[   27.926658] [<ffffff9f2e9b0298>] ___slab_alloc.isra.12.constprop.21+0x280/0x2e8
[   27.926663] [<ffffff9f2e9b0348>] __slab_alloc.isra.13.constprop.20+0x48/0x88
[   27.926667] [<ffffff9f2e9b04d0>] kmem_cache_alloc+0x138/0x188
[   27.926675] [<ffffff9f2ead8930>] avc_alloc_node+0x20/0x2b8
[   27.926679] [<ffffff9f2ead8ecc>] avc_compute_av+0xfc/0x2a0
[   27.926683] [<ffffff9f2ead9cbc>] avc_has_perm+0x1dc/0x1f0
[   27.926687] [<ffffff9f2eadd0fc>] selinux_binder_transaction+0x194/0x2c0
[   27.926694] [<ffffff9f2ead2d1c>] security_binder_transaction+0x4c/0x78
[   27.926700] [<ffffff9f2f408660>] binder_transaction+0x150/0x1ec8
[   27.926704] [<ffffff9f2f40aaec>] binder_thread_write+0x714/0x1868
[   27.926707] [<ffffff9f2f40c278>] binder_ioctl+0x638/0x7b8
[   27.926712] [<ffffff9f2e9cf358>] do_vfs_ioctl+0x4a8/0x9a8
[   27.926716] [<ffffff9f2e9cf8d0>] SyS_ioctl+0x78/0x98
[   27.926720] [<ffffff9f2e88330c>] __sys_trace_return+0x0/0x4
[   27.926724] Mem-Info:
[   27.926731] active_anon:106003 inactive_anon:35416 isolated_anon:3\x0a active_file:53602 inactive_file:53668 isolated_file:237\x0a unevictable:39243 dirty:0 writeback:8 unstable:0\x0a slab_reclaimable:10698 slab_unreclaimable:31809\x0a mapped:85968 shmem:324 pagetables:10442 bounce:0\x0a free:2355 free_pcp:214 free_cma:201
[   27.926743] DMA free:9420kB min:4920kB low:50988kB high:52220kB active_anon:424012kB inactive_anon:141664kB active_file:214408kB inactive_file:214672kB unevictable:156972kB isolated(anon):12kB isolated(file):948kB present:1856508kB managed:1686236kB mlocked:156972kB dirty:0kB writeback:32kB mapped:343872kB shmem:1296kB slab_reclaimable:42792kB slab_unreclaimable:127236kB kernel_stack:33664kB pagetables:41768kB unstable:0kB bounce:0kB free_pcp:856kB local_pcp:64kB free_cma:804kB writeback_tmp:0kB pages_scanned:1292 all_unreclaimable? no
[   27.926747] lowmem_reserve[]: 0 0 0
[   27.926758] DMA: 93*4kB (UE) 347*8kB (UMEH) 360*16kB (UMEH) 8*32kB (M) 3*64kB (MH) 6*128kB (M) 0*256kB 0*512kB 0*1024kB 0*2048kB 0*4096kB = 10124kB
[   27.926802] 146846 total pagecache pages
[   27.926805] 87 pages in swap cache
[   27.926808] Swap cache stats: add 11314, delete 11227, find 49/341
[   27.926811] Free swap  = 481116kB
[   27.926814] Total swap = 524284kB
[   27.926817] 464127 pages RAM
[   27.926820] 0 pages HighMem/MovableOnly
[   27.926822] 42568 pages reserved
[   27.926825] 40960 pages cma reserved
[   28.396336] SMB138X: smb138x_parallel_get_prop: parallel power supply get prop 4 not supported
[   28.785695] SMB138X: smb138x_parallel_get_prop: parallel power supply get prop 4 not supported
[   28.802350] FG: fg_get_battery_temp: batt temperature original:440, tuned:380
[   28.803871] healthd: battery l=79 v=3942 t=38.0 h=2 st=2 c=718 fc=3229000 cc=291 chg=u
[   28.804537] FG: fg_get_battery_temp: batt temperature original:440, tuned:380
[   28.831497] healthd: battery l=79 v=3942 t=38.0 h=2 st=2 c=718 fc=3229000 cc=291 chg=u
[   29.180626] FG: fg_get_battery_temp: batt temperature original:440, tuned:379
[   29.181927] FG: fg_get_battery_temp: batt temperature original:440, tuned:378
[   29.183673] healthd: battery l=79 v=3925 t=37.8 h=2 st=2 c=802 fc=3229000 cc=291 chg=u
[   29.186891] healthd: battery l=79 v=3925 t=37.9 h=2 st=2 c=802 fc=3229000 cc=291 chg=u
[   31.038660] Binder:1209_16: page allocation failure: order:0, mode:0x2200000
[   31.038677] CPU: 7 PID: 3233 Comm: Binder:1209_16 Tainted: G        W       4.4.169-Sultan #15
[   31.038681] Hardware name: Qualcomm Technologies, Inc. MSM8998 v2.1 (DT)
[   31.038685] Call trace:
[   31.038697] [<ffffff9f2e88a378>] dump_backtrace+0x0/0x1e8
[   31.038703] [<ffffff9f2e88a574>] show_stack+0x14/0x20
[   31.038709] [<ffffff9f2eb54de4>] dump_stack+0xb0/0xec
[   31.038715] [<ffffff9f2e972d20>] warn_alloc_failed+0x110/0x150
[   31.038720] [<ffffff9f2e9765a8>] __alloc_pages_nodemask+0x720/0x960
[   31.038726] [<ffffff9f2e9ae2fc>] new_slab+0x23c/0x280
[   31.038731] [<ffffff9f2e9b0298>] ___slab_alloc.isra.12.constprop.21+0x280/0x2e8
[   31.038735] [<ffffff9f2e9b0348>] __slab_alloc.isra.13.constprop.20+0x48/0x88
[   31.038739] [<ffffff9f2e9b04d0>] kmem_cache_alloc+0x138/0x188
[   31.038746] [<ffffff9f2ead8930>] avc_alloc_node+0x20/0x2b8
[   31.038750] [<ffffff9f2ead8ecc>] avc_compute_av+0xfc/0x2a0
[   31.038755] [<ffffff9f2ead9cbc>] avc_has_perm+0x1dc/0x1f0
[   31.038759] [<ffffff9f2eadcc48>] selinux_binder_transfer_file+0x180/0x220
[   31.038766] [<ffffff9f2ead2e18>] security_binder_transfer_file+0x58/0x88
[   31.038773] [<ffffff9f2f407418>] binder_translate_fd.isra.20+0x68/0x220
[   31.038776] [<ffffff9f2f409684>] binder_transaction+0x1174/0x1ec8
[   31.038780] [<ffffff9f2f40aaec>] binder_thread_write+0x714/0x1868
[   31.038784] [<ffffff9f2f40c278>] binder_ioctl+0x638/0x7b8
[   31.038789] [<ffffff9f2e9cf358>] do_vfs_ioctl+0x4a8/0x9a8
[   31.038793] [<ffffff9f2e9cf8d0>] SyS_ioctl+0x78/0x98
[   31.038797] [<ffffff9f2e88330c>] __sys_trace_return+0x0/0x4
[   31.038801] Mem-Info:
[   31.038808] active_anon:95719 inactive_anon:32002 isolated_anon:0\x0a active_file:43372 inactive_file:43306 isolated_file:32\x0a unevictable:39243 dirty:0 writeback:0 unstable:0\x0a slab_reclaimable:11393 slab_unreclaimable:33178\x0a mapped:77433 shmem:458 pagetables:10906 bounce:0\x0a free:3542 free_pcp:123 free_cma:1463
[   31.038820] DMA free:14168kB min:4920kB low:50988kB high:52220kB active_anon:382876kB inactive_anon:128008kB active_file:173488kB inactive_file:173224kB unevictable:156972kB isolated(anon):0kB isolated(file):128kB present:1856508kB managed:1686236kB mlocked:156972kB dirty:0kB writeback:0kB mapped:309732kB shmem:1832kB slab_reclaimable:45572kB slab_unreclaimable:132712kB kernel_stack:35664kB pagetables:43624kB unstable:0kB bounce:0kB free_pcp:492kB local_pcp:0kB free_cma:5852kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? no
[   31.038824] lowmem_reserve[]: 0 0 0
[   31.038837] DMA: 1241*4kB (MECH) 429*8kB (UMECH) 101*16kB (MECH) 25*32kB (UMCH) 34*64kB (UCH) 12*128kB (C) 3*256kB (CH) 0*512kB 0*1024kB 0*2048kB 0*4096kB = 15292kB
[   31.038888] 126737 total pagecache pages
[   31.038891] 250 pages in swap cache
[   31.038895] Swap cache stats: add 38779, delete 38529, find 268/1102
[   31.038898] Free swap  = 374612kB
[   31.038901] Total swap = 524284kB
[   31.038904] 464127 pages RAM
[   31.038907] 0 pages HighMem/MovableOnly
[   31.038910] 42568 pages reserved
[   31.038913] 40960 pages cma reserved
[   32.965365] FG: fg_get_battery_temp: batt temperature original:440, tuned:378
[   32.965383] lge_battery: bm_watch_work: PRESENT:1, CHG_STAT:1, THM_STAT:2, BAT_TEMP:378, BAT_VOLT:3933095, VOTE_CUR:1000000, SET_CUR:1000000 
[   32.969212] dmesg: page allocation failure: order:0, mode:0x2200000
[   32.969229] CPU: 4 PID: 2450 Comm: dmesg Tainted: G        W       4.4.169-Sultan #15
[   32.969233] Hardware name: Qualcomm Technologies, Inc. MSM8998 v2.1 (DT)
[   32.969238] Call trace:
[   32.969249] [<ffffff9f2e88a378>] dump_backtrace+0x0/0x1e8
[   32.969254] [<ffffff9f2e88a574>] show_stack+0x14/0x20
[   32.969260] [<ffffff9f2eb54de4>] dump_stack+0xb0/0xec
[   32.969267] [<ffffff9f2e972d20>] warn_alloc_failed+0x110/0x150
[   32.969272] [<ffffff9f2e9765a8>] __alloc_pages_nodemask+0x720/0x960
[   32.969278] [<ffffff9f2e9ae2fc>] new_slab+0x23c/0x280
[   32.969283] [<ffffff9f2e9b0298>] ___slab_alloc.isra.12.constprop.21+0x280/0x2e8
[   32.969287] [<ffffff9f2e9b0348>] __slab_alloc.isra.13.constprop.20+0x48/0x88
[   32.969291] [<ffffff9f2e9b04d0>] kmem_cache_alloc+0x138/0x188
[   32.969297] [<ffffff9f2ead8930>] avc_alloc_node+0x20/0x2b8
[   32.969302] [<ffffff9f2ead8ecc>] avc_compute_av+0xfc/0x2a0
[   32.969306] [<ffffff9f2ead9cbc>] avc_has_perm+0x1dc/0x1f0
[   32.969312] [<ffffff9f2eae0608>] inode_has_perm.isra.21+0x28/0x38
[   32.969315] [<ffffff9f2eae0a40>] file_has_perm+0xb8/0xc0
[   32.969319] [<ffffff9f2eae0c88>] selinux_file_permission+0xb0/0x118
[   32.969324] [<ffffff9f2ead4f60>] security_file_permission+0x50/0xe8
[   32.969328] [<ffffff9f2e9ba984>] rw_verify_area+0x44/0xf8
[   32.969332] [<ffffff9f2e9bb7c8>] vfs_write+0x58/0x1b0
[   32.969336] [<ffffff9f2e9bba3c>] SyS_write+0x5c/0xc0
[   32.969341] [<ffffff9f2e8832b0>] el0_svc_naked+0x24/0x28
[   32.969344] Mem-Info:
[   32.969351] active_anon:97516 inactive_anon:32535 isolated_anon:0\x0a active_file:48515 inactive_file:48971 isolated_file:52\x0a unevictable:39243 dirty:0 writeback:0 unstable:0\x0a slab_reclaimable:11618 slab_unreclaimable:33333\x0a mapped:71242 shmem:353 pagetables:11172 bounce:0\x0a free:2242 free_pcp:536 free_cma:4
[   32.969363] DMA free:8968kB min:4920kB low:50988kB high:52220kB active_anon:390064kB inactive_anon:130140kB active_file:194060kB inactive_file:195884kB unevictable:156972kB isolated(anon):0kB isolated(file):208kB present:1856508kB managed:1686236kB mlocked:156972kB dirty:0kB writeback:0kB mapped:284968kB shmem:1412kB slab_reclaimable:46472kB slab_unreclaimable:133332kB kernel_stack:36464kB pagetables:44688kB unstable:0kB bounce:0kB free_pcp:2176kB local_pcp:100kB free_cma:16kB writeback_tmp:0kB pages_scanned:156 all_unreclaimable? no
[   32.969367] lowmem_reserve[]: 0 0 0
[   32.969379] DMA: 168*4kB (UMEH) 98*8kB (UMEH) 131*16kB (MH) 19*32kB (UMEH) 7*64kB (UMH) 2*128kB (M) 2*256kB (MH) 2*512kB (UM) 1*1024kB (U) 1*2048kB (M) 0*4096kB = 9472kB
[   32.969433] 137394 total pagecache pages
[   32.969438] 592 pages in swap cache
[   32.969441] Swap cache stats: add 47408, delete 46816, find 357/1791
[   32.969444] Free swap  = 343508kB
[   32.969447] Total swap = 524284kB
[   32.969450] 464127 pages RAM
[   32.969453] 0 pages HighMem/MovableOnly
[   32.969455] 42568 pages reserved
[   32.969458] 40960 pages cma reserved
[   32.969499] dmesg: page allocation failure: order:0, mode:0x2200000
[   32.969503] CPU: 4 PID: 2450 Comm: dmesg Tainted: G        W       4.4.169-Sultan #15
[   32.969507] Hardware name: Qualcomm Technologies, Inc. MSM8998 v2.1 (DT)
[   32.969510] Call trace:
[   32.969514] [<ffffff9f2e88a378>] dump_backtrace+0x0/0x1e8
[   32.969519] [<ffffff9f2e88a574>] show_stack+0x14/0x20
[   32.969523] [<ffffff9f2eb54de4>] dump_stack+0xb0/0xec
[   32.969527] [<ffffff9f2e972d20>] warn_alloc_failed+0x110/0x150
[   32.969531] [<ffffff9f2e9765a8>] __alloc_pages_nodemask+0x720/0x960
[   32.969536] [<ffffff9f2e9ae2fc>] new_slab+0x23c/0x280
[   32.969540] [<ffffff9f2e9b0298>] ___slab_alloc.isra.12.constprop.21+0x280/0x2e8
[   32.969544] [<ffffff9f2e9b0348>] __slab_alloc.isra.13.constprop.20+0x48/0x88
[   32.969548] [<ffffff9f2e9b04d0>] kmem_cache_alloc+0x138/0x188
[   32.969552] [<ffffff9f2ead8930>] avc_alloc_node+0x20/0x2b8
[   32.969556] [<ffffff9f2ead8ecc>] avc_compute_av+0xfc/0x2a0
[   32.969560] [<ffffff9f2ead9cbc>] avc_has_perm+0x1dc/0x1f0
[   32.969565] [<ffffff9f2eadc028>] sock_has_perm+0x120/0x1e0
[   32.969569] [<ffffff9f2eadc198>] selinux_socket_sendmsg+0x18/0x20
[   32.969573] [<ffffff9f2ead7644>] security_socket_sendmsg+0x5c/0x90
[   32.969580] [<ffffff9f2f534ed0>] sock_sendmsg+0x20/0x58
[   32.969584] [<ffffff9f2f534f88>] sock_write_iter+0x80/0xd0
[   32.969588] [<ffffff9f2e9bb648>] __vfs_write+0xc0/0x100
[   32.969593] [<ffffff9f2e9bb7f8>] vfs_write+0x88/0x1b0
[   32.969597] [<ffffff9f2e9bba3c>] SyS_write+0x5c/0xc0
[   32.969601] [<ffffff9f2e8832b0>] el0_svc_naked+0x24/0x28
[   32.969604] Mem-Info:
[   32.969610] active_anon:97516 inactive_anon:32535 isolated_anon:0\x0a active_file:48515 inactive_file:48971 isolated_file:52\x0a unevictable:39243 dirty:0 writeback:0 unstable:0\x0a slab_reclaimable:11618 slab_unreclaimable:33333\x0a mapped:71242 shmem:353 pagetables:11172 bounce:0\x0a free:2494 free_pcp:54 free_cma:4
[   32.969620] DMA free:9976kB min:4920kB low:50988kB high:52220kB active_anon:390064kB inactive_anon:130140kB active_file:194060kB inactive_file:195884kB unevictable:156972kB isolated(anon):0kB isolated(file):208kB present:1856508kB managed:1686236kB mlocked:156972kB dirty:0kB writeback:0kB mapped:284968kB shmem:1412kB slab_reclaimable:46472kB slab_unreclaimable:133332kB kernel_stack:36464kB pagetables:44688kB unstable:0kB bounce:0kB free_pcp:324kB local_pcp:0kB free_cma:16kB writeback_tmp:0kB pages_scanned:156 all_unreclaimable? no
[   32.969624] lowmem_reserve[]: 0 0 0
[   32.969635] DMA: 475*4kB (UMECH) 171*8kB (UMEH) 143*16kB (UMCH) 21*32kB (UMEH) 7*64kB (UMH) 2*128kB (M) 2*256kB (MH) 2*512kB (UM) 1*1024kB (U) 1*2048kB (M) 0*4096kB = 11540kB
[   32.969688] 137394 total pagecache pages
[   32.969691] 592 pages in swap cache
[   32.969695] Swap cache stats: add 47408, delete 46816, find 357/1791
[   32.969698] Free swap  = 343508kB
[   32.969701] Total swap = 524284kB
[   32.969704] 464127 pages RAM
[   32.969707] 0 pages HighMem/MovableOnly
[   32.969710] 42568 pages reserved
[   32.969713] 40960 pages cma reserved
[   33.330619] SMB138X: smb138x_parallel_get_prop: parallel power supply get prop 4 not supported
[   33.704895] SMB138X: smb138x_parallel_get_prop: parallel power supply get prop 4 not supported
[   33.715604] FG: fg_get_battery_temp: batt temperature original:440, tuned:379
[   33.717584] healthd: battery l=79 v=3939 t=37.9 h=2 st=2 c=687 fc=3229000 cc=291 chg=u
[   33.720510] FG: fg_get_battery_temp: batt temperature original:440, tuned:379
[   33.731518] healthd: battery l=79 v=3939 t=37.9 h=2 st=2 c=687 fc=3229000 cc=291 chg=u
[   34.077267] FG: fg_get_battery_temp: batt temperature original:440, tuned:379
[   34.078862] healthd: battery l=79 v=3939 t=37.9 h=2 st=2 c=687 fc=3229000 cc=291 chg=u
[   34.079822] FG: fg_get_battery_temp: batt temperature original:440, tuned:379
[   34.083892] healthd: battery l=79 v=3939 t=37.9 h=2 st=2 c=687 fc=3229000 cc=291 chg=u
[   35.061667] simple_lmk: Killing rbandroid.sleep with adj 904 to free 42368 KiB
[   35.062953] simple_lmk: Killing droid.deskclock with adj 904 to free 40544 KiB
[   35.063141] simple_lmk: Killing android.carrier with adj 904 to free 37528 KiB
[   35.381923] simple_lmk: Killing oadcastreceiver with adj 906 to free 34516 KiB
[   35.385167] simple_lmk: Killing .apps.translate with adj 904 to free 48748 KiB
[   35.388087] simple_lmk: Killing .apps.wellbeing with adj 904 to free 40484 KiB
[   35.524064] binder_alloc: 2203: binder_alloc_buf failed to map pages in userspace, no vma
[   35.524081] binder: 1209:3223 transaction failed 29189/-3, size 76-0 line 3189
[   35.590508] binder: 2115:3009 transaction failed 29189/-22, size 160-0 line 3052
[   35.661294] simple_lmk: Killing ndroid.keychain with adj 906 to free 33836 KiB
[   35.664687] simple_lmk: Killing id.printspooler with adj 904 to free 34744 KiB
[   35.664819] simple_lmk: Killing .android.dialer with adj 904 to free 23912 KiB
[   35.665036] simple_lmk: Killing le.android.talk with adj 902 to free 54548 KiB

Sultan

^ permalink raw reply related	[flat|nested] 113+ messages in thread

* Re: [RFC] simple_lmk: Introduce Simple Low Memory Killer for Android
  2019-05-15 17:27                                                                           ` Sultan Alsawaf
@ 2019-05-15 18:32                                                                             ` Steven Rostedt
  2019-05-15 18:52                                                                               ` Sultan Alsawaf
  2019-05-16 13:54                                                                             ` Oleg Nesterov
  1 sibling, 1 reply; 113+ messages in thread
From: Steven Rostedt @ 2019-05-15 18:32 UTC (permalink / raw)
  To: Sultan Alsawaf
  Cc: Oleg Nesterov, Christian Brauner, Daniel Colascione,
	Suren Baghdasaryan, Tim Murray, Michal Hocko, Greg Kroah-Hartman,
	Arve Hjønnevåg, Todd Kjos, Martijn Coenen, Ingo Molnar,
	Peter Zijlstra, LKML, open list:ANDROID DRIVERS, linux-mm,
	kernel-team, Andy Lutomirski, Serge E. Hallyn, Kees Cook,
	Joel Fernandes

On Wed, 15 May 2019 10:27:28 -0700
Sultan Alsawaf <sultan@kerneltoast.com> wrote:

> On Wed, May 15, 2019 at 04:58:32PM +0200, Oleg Nesterov wrote:
> > Could you explain in detail what exactly did you do and what do you see in dmesg?
> > 
> > Just in case, lockdep complains only once, print_circular_bug() does debug_locks_off()
> > so if it has already reported another false positive __lock_acquire() will simply
> > return after that.
> > 
> > Oleg.  
> 
> This is what I did:
> diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
> index 774ab79d3ec7..009e7d431a88 100644
> --- a/kernel/locking/lockdep.c
> +++ b/kernel/locking/lockdep.c
> @@ -3078,6 +3078,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
>         int class_idx;
>         u64 chain_key;
> 
> +       BUG_ON(!debug_locks || !prove_locking);
>         if (unlikely(!debug_locks))
>                 return 0;
> 
> diff --git a/lib/debug_locks.c b/lib/debug_locks.c
> index 124fdf238b3d..4003a18420fb 100644
> --- a/lib/debug_locks.c
> +++ b/lib/debug_locks.c
> @@ -37,6 +37,7 @@ EXPORT_SYMBOL_GPL(debug_locks_silent);
>   */
>  int debug_locks_off(void)
>  {
> +       return 0;

I'm confused why you did this?

-- Steve

>         if (debug_locks && __debug_locks_off()) {
>                 if (!debug_locks_silent) {
>                         console_verbose();
> 
>

^ permalink raw reply	[flat|nested] 113+ messages in thread

* Re: [RFC] simple_lmk: Introduce Simple Low Memory Killer for Android
  2019-05-15 18:32                                                                             ` Steven Rostedt
@ 2019-05-15 18:52                                                                               ` Sultan Alsawaf
  2019-05-15 20:09                                                                                 ` Steven Rostedt
  0 siblings, 1 reply; 113+ messages in thread
From: Sultan Alsawaf @ 2019-05-15 18:52 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Oleg Nesterov, Christian Brauner, Daniel Colascione,
	Suren Baghdasaryan, Tim Murray, Michal Hocko, Greg Kroah-Hartman,
	Arve Hjønnevåg, Todd Kjos, Martijn Coenen, Ingo Molnar,
	Peter Zijlstra, LKML, open list:ANDROID DRIVERS, linux-mm,
	kernel-team, Andy Lutomirski, Serge E. Hallyn, Kees Cook,
	Joel Fernandes

On Wed, May 15, 2019 at 02:32:48PM -0400, Steven Rostedt wrote:
> I'm confused why you did this?

Oleg said that debug_locks_off() could've been called and thus prevented
lockdep complaints about simple_lmk from appearing. To eliminate any possibility
of that, I disabled debug_locks_off().

Oleg also said that __lock_acquire() could return early if lock debugging were
somehow turned off after lockdep reported one bug. To mitigate any possibility
of that as well, I threw in the BUG_ON() for good measure.

I think at this point it's pretty clear that lockdep truly isn't complaining
about simple_lmk's locking pattern, and that lockdep's lack of complaints isn't
due to it being mysteriously turned off...

Sultan

^ permalink raw reply	[flat|nested] 113+ messages in thread

* Re: [RFC] simple_lmk: Introduce Simple Low Memory Killer for Android
  2019-05-15 18:52                                                                               ` Sultan Alsawaf
@ 2019-05-15 20:09                                                                                 ` Steven Rostedt
  0 siblings, 0 replies; 113+ messages in thread
From: Steven Rostedt @ 2019-05-15 20:09 UTC (permalink / raw)
  To: Sultan Alsawaf
  Cc: Oleg Nesterov, Christian Brauner, Daniel Colascione,
	Suren Baghdasaryan, Tim Murray, Michal Hocko, Greg Kroah-Hartman,
	Arve Hjønnevåg, Todd Kjos, Martijn Coenen, Ingo Molnar,
	Peter Zijlstra, LKML, open list:ANDROID DRIVERS, linux-mm,
	kernel-team, Andy Lutomirski, Serge E. Hallyn, Kees Cook,
	Joel Fernandes

On Wed, 15 May 2019 11:52:57 -0700
Sultan Alsawaf <sultan@kerneltoast.com> wrote:

> On Wed, May 15, 2019 at 02:32:48PM -0400, Steven Rostedt wrote:
> > I'm confused why you did this?  
> 
> Oleg said that debug_locks_off() could've been called and thus prevented
> lockdep complaints about simple_lmk from appearing. To eliminate any possibility
> of that, I disabled debug_locks_off().

But I believe that when lockdep discovers an issue, the data from then
on is not reliable. Which is why we turn it off. But just commenting
out the disabling makes lockdep unreliable, and is not a proper way to
test your code.

Yes, it can then miss locking issues after one was discovered. Thus,
you are not properly testing the locking in your code.

-- Steve

^ permalink raw reply	[flat|nested] 113+ messages in thread

* Re: [RFC] simple_lmk: Introduce Simple Low Memory Killer for Android
  2019-05-15 17:27                                                                           ` Sultan Alsawaf
  2019-05-15 18:32                                                                             ` Steven Rostedt
@ 2019-05-16 13:54                                                                             ` Oleg Nesterov
  1 sibling, 0 replies; 113+ messages in thread
From: Oleg Nesterov @ 2019-05-16 13:54 UTC (permalink / raw)
  To: Sultan Alsawaf
  Cc: Christian Brauner, Daniel Colascione, Suren Baghdasaryan,
	Steven Rostedt, Tim Murray, Michal Hocko, Greg Kroah-Hartman,
	Arve Hjønnevåg, Todd Kjos, Martijn Coenen, Ingo Molnar,
	Peter Zijlstra, LKML, open list:ANDROID DRIVERS, linux-mm,
	kernel-team, Andy Lutomirski, Serge E. Hallyn, Kees Cook,
	Joel Fernandes

On 05/15, Sultan Alsawaf wrote:
>
> On Wed, May 15, 2019 at 04:58:32PM +0200, Oleg Nesterov wrote:
> > Could you explain in detail what exactly did you do and what do you see in dmesg?
> >
> > Just in case, lockdep complains only once, print_circular_bug() does debug_locks_off()
> > so if it has already reported another false positive __lock_acquire() will simply
> > return after that.
> >
> > Oleg.
>
> This is what I did:
> diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
> index 774ab79d3ec7..009e7d431a88 100644
> --- a/kernel/locking/lockdep.c
> +++ b/kernel/locking/lockdep.c
> @@ -3078,6 +3078,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
>         int class_idx;
>         u64 chain_key;
>
> +       BUG_ON(!debug_locks || !prove_locking);
>         if (unlikely(!debug_locks))
>                 return 0;
>
> diff --git a/lib/debug_locks.c b/lib/debug_locks.c
> index 124fdf238b3d..4003a18420fb 100644
> --- a/lib/debug_locks.c
> +++ b/lib/debug_locks.c
> @@ -37,6 +37,7 @@ EXPORT_SYMBOL_GPL(debug_locks_silent);
>   */
>  int debug_locks_off(void)
>  {
> +       return 0;
>         if (debug_locks && __debug_locks_off()) {
>                 if (!debug_locks_silent) {
>                         console_verbose();

OK, this means that debug_locks_off() always returns 0, as if debug_locks was already
cleared.

Thus print_deadlock_bug() will do nothing, it does

	if (!debug_locks_off_graph_unlock() || debug_locks_silent)
		return 0;

IOW, this means that even if lockdep finds a problem, the problem won't be reported.

> [    1.492128] BUG: key 0000000000000000 not in .data!
> [    1.492141] BUG: key 0000000000000000 not in .data!
> [    1.492152] BUG: key 0000000000000000 not in .data!
> [    1.492228] BUG: key 0000000000000000 not in .data!
> [    1.492238] BUG: key 0000000000000000 not in .data!
> [    1.492248] BUG: key 0000000000000000 not in .data!

I guess this is lockdep_init_map() which does printk("BUG:") itself, but due to your
change above it doesn't do WARN(1) and thus there is no call trace.

Oleg.


^ permalink raw reply	[flat|nested] 113+ messages in thread

end of thread, other threads:[~2019-05-16 13:54 UTC | newest]

Thread overview: 113+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-03-10 20:34 [RFC] simple_lmk: Introduce Simple Low Memory Killer for Android Sultan Alsawaf
2019-03-10 21:03 ` Greg Kroah-Hartman
2019-03-10 21:26   ` Sultan Alsawaf
2019-03-11 16:32 ` Joel Fernandes
2019-03-11 16:37   ` Joel Fernandes
2019-03-11 17:43 ` Michal Hocko
2019-03-11 17:58   ` Sultan Alsawaf
2019-03-11 20:10     ` Suren Baghdasaryan
2019-03-11 20:46       ` Sultan Alsawaf
2019-03-11 21:11         ` Joel Fernandes
2019-03-11 21:46           ` Sultan Alsawaf
2019-03-11 22:15         ` Suren Baghdasaryan
2019-03-11 22:36           ` Sultan Alsawaf
2019-03-12  8:05           ` Michal Hocko
2019-03-12 14:36             ` Suren Baghdasaryan
2019-03-12 15:25             ` Matthew Wilcox
2019-03-12 15:33               ` Michal Hocko
2019-03-12 15:39                 ` Michal Hocko
2019-03-12 16:37             ` Sultan Alsawaf
2019-03-12 16:48               ` Michal Hocko
2019-03-12 16:58               ` Michal Hocko
2019-03-12 17:15                 ` Suren Baghdasaryan
2019-03-12 17:17               ` Tim Murray
2019-03-12 17:45                 ` Sultan Alsawaf
2019-03-12 18:43                   ` Tim Murray
2019-03-12 18:50                     ` Christian Brauner
2019-03-14 17:47                 ` Joel Fernandes
2019-03-14 20:49                   ` Sultan Alsawaf
2019-03-15  2:54                     ` Joel Fernandes
2019-03-15  3:43                       ` Sultan Alsawaf
2019-03-15  3:16                     ` Steven Rostedt
2019-03-15  3:45                       ` Sultan Alsawaf
2019-03-15  4:36                       ` Daniel Colascione
2019-03-15 13:36                         ` Joel Fernandes
2019-03-15 15:56                         ` Suren Baghdasaryan
2019-03-15 16:12                           ` Daniel Colascione
2019-03-15 16:43                         ` Steven Rostedt
2019-03-15 17:17                           ` Daniel Colascione
2019-03-15 18:03                         ` Christian Brauner
2019-03-15 18:13                           ` Joel Fernandes
2019-03-15 18:24                             ` Christian Brauner
2019-03-15 18:49                               ` Joel Fernandes
2019-03-16 17:31                                 ` Suren Baghdasaryan
2019-03-16 18:00                                   ` Daniel Colascione
2019-03-16 18:57                                     ` Christian Brauner
2019-03-16 19:37                                       ` Suren Baghdasaryan
2019-03-17  1:53                                         ` Joel Fernandes
2019-03-17 11:42                                           ` Christian Brauner
2019-03-17 15:40                                             ` Daniel Colascione
2019-03-18  0:29                                               ` Christian Brauner
2019-03-18 23:50                                                 ` Joel Fernandes
2019-03-19 22:14                                                   ` Christian Brauner
2019-03-19 22:26                                                     ` Joel Fernandes
2019-03-19 22:48                                                     ` Daniel Colascione
2019-03-19 23:10                                                       ` Christian Brauner
2019-03-20  1:52                                                         ` Joel Fernandes
2019-03-20  2:42                                                           ` pidfd design Daniel Colascione
2019-03-20  3:59                                                             ` Christian Brauner
2019-03-20  7:02                                                               ` Daniel Colascione
2019-03-20 11:33                                                                 ` Joel Fernandes
2019-03-20 18:26                                                                   ` Christian Brauner
2019-03-20 18:38                                                                     ` Daniel Colascione
2019-03-20 18:51                                                                       ` Christian Brauner
2019-03-20 18:58                                                                         ` Andy Lutomirski
2019-03-20 19:14                                                                           ` Christian Brauner
2019-03-20 19:40                                                                             ` Daniel Colascione
2019-03-21 17:02                                                                               ` Andy Lutomirski
2019-03-25 20:13                                                                                 ` Jann Horn
2019-03-25 20:23                                                                                   ` Daniel Colascione
2019-03-25 23:42                                                                                     ` Andy Lutomirski
2019-03-25 23:45                                                                                       ` Christian Brauner
2019-03-26  0:00                                                                                         ` Andy Lutomirski
2019-03-26  0:12                                                                                           ` Christian Brauner
2019-03-26  0:24                                                                                             ` Andy Lutomirski
2019-03-28  9:21                                                                                               ` Christian Brauner
2019-03-20 19:19                                                                         ` Joel Fernandes
2019-03-20 19:29                                                                         ` Daniel Colascione
2019-03-24 14:44                                                                           ` Serge E. Hallyn
2019-03-24 18:48                                                                             ` Joel Fernandes
2019-03-20 19:11                                                                     ` Joel Fernandes
2019-05-07  2:16                                                           ` [RFC] simple_lmk: Introduce Simple Low Memory Killer for Android Sultan Alsawaf
2019-05-07  7:04                                                             ` Greg Kroah-Hartman
2019-05-07  7:27                                                               ` Sultan Alsawaf
2019-05-07  7:43                                                                 ` Greg Kroah-Hartman
2019-05-07  8:12                                                                   ` Sultan Alsawaf
2019-05-07 10:58                                                                     ` Christian Brauner
2019-05-07 16:28                                                                       ` Suren Baghdasaryan
2019-05-07 16:38                                                                         ` Christian Brauner
2019-05-07 16:53                                                                         ` Sultan Alsawaf
2019-05-07 20:01                                                                           ` Suren Baghdasaryan
2019-05-07 18:46                                                                         ` Joel Fernandes
2019-05-07 17:17                                                                       ` Sultan Alsawaf
2019-05-07 17:29                                                                         ` Greg Kroah-Hartman
2019-05-07 11:09                                                                     ` Greg Kroah-Hartman
2019-05-07 12:26                                                             ` Michal Hocko
2019-05-07 15:31                                                             ` Oleg Nesterov
2019-05-07 16:35                                                               ` Sultan Alsawaf
2019-05-09 15:56                                                                 ` Oleg Nesterov
2019-05-09 18:33                                                                   ` Sultan Alsawaf
2019-05-10 15:10                                                                     ` Oleg Nesterov
2019-05-13 16:45                                                                       ` Sultan Alsawaf
2019-05-14 16:44                                                                         ` Steven Rostedt
2019-05-14 17:31                                                                           ` Sultan Alsawaf
2019-05-15 14:58                                                                         ` Oleg Nesterov
2019-05-15 17:27                                                                           ` Sultan Alsawaf
2019-05-15 18:32                                                                             ` Steven Rostedt
2019-05-15 18:52                                                                               ` Sultan Alsawaf
2019-05-15 20:09                                                                                 ` Steven Rostedt
2019-05-16 13:54                                                                             ` Oleg Nesterov
2019-03-17 16:35                                             ` Serge E. Hallyn
2019-03-17 17:11                                               ` Daniel Colascione
2019-03-17 17:16                                                 ` Serge E. Hallyn
2019-03-17 22:02                                                   ` Suren Baghdasaryan

This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox,
as well as the URLs for its NNTP newsgroup(s).