linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [GIT pull] timer updates for 4.9
@ 2016-10-22 12:02 Thomas Gleixner
  2016-10-23 22:39 ` Linus Torvalds
  0 siblings, 1 reply; 12+ messages in thread
From: Thomas Gleixner @ 2016-10-22 12:02 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: LKML, Andrew Morton, Ingo Molnar, H. Peter Anvin

Linus,

please pull the latest timers-urgent-for-linus git tree from:

   git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git timers-urgent-for-linus

This update contains:

  - A revert which addresses a boot failure on ARM Sun5i platforms

  - A new clocksource driver, which has been delayed beyond rc1 due to an
    interrupt driver issue which was unearthed by this driver. The
    debugging of that issue and the discussion about the proper solution
    made this driver miss the merge window. There is no point in delaying
    it for a full cycle as it completes the basic mainline support for the
    new JCore platform and does not create any risk outside of that
    platform.

Thanks,

	tglx

------------------>
Chen-Yu Tsai (1):
      Revert "clocksource/drivers/timer_sun5i: Replace code by clocksource_mmio_init"

Rich Felker (2):
      of: Add J-Core timer bindings
      clocksource: Add J-Core timer/clocksource driver


 .../devicetree/bindings/timer/jcore,pit.txt        |  24 ++
 drivers/clocksource/Kconfig                        |  10 +
 drivers/clocksource/Makefile                       |   1 +
 drivers/clocksource/jcore-pit.c                    | 249 +++++++++++++++++++++
 drivers/clocksource/timer-sun5i.c                  |  16 +-
 include/linux/cpuhotplug.h                         |   1 +
 6 files changed, 299 insertions(+), 2 deletions(-)
 create mode 100644 Documentation/devicetree/bindings/timer/jcore,pit.txt
 create mode 100644 drivers/clocksource/jcore-pit.c

diff --git a/Documentation/devicetree/bindings/timer/jcore,pit.txt b/Documentation/devicetree/bindings/timer/jcore,pit.txt
new file mode 100644
index 000000000000..af5dd35469d7
--- /dev/null
+++ b/Documentation/devicetree/bindings/timer/jcore,pit.txt
@@ -0,0 +1,24 @@
+J-Core Programmable Interval Timer and Clocksource
+
+Required properties:
+
+- compatible: Must be "jcore,pit".
+
+- reg: Memory region(s) for timer/clocksource registers. For SMP,
+  there should be one region per cpu, indexed by the sequential,
+  zero-based hardware cpu number.
+
+- interrupts: An interrupt to assign for the timer. The actual pit
+  core is integrated with the aic and allows the timer interrupt
+  assignment to be programmed by software, but this property is
+  required in order to reserve an interrupt number that doesn't
+  conflict with other devices.
+
+
+Example:
+
+timer@200 {
+	compatible = "jcore,pit";
+	reg = < 0x200 0x30 0x500 0x30 >;
+	interrupts = < 0x48 >;
+};
diff --git a/drivers/clocksource/Kconfig b/drivers/clocksource/Kconfig
index 245190839359..e2c6e43cf8ca 100644
--- a/drivers/clocksource/Kconfig
+++ b/drivers/clocksource/Kconfig
@@ -417,6 +417,16 @@ config SYS_SUPPORTS_SH_TMU
 config SYS_SUPPORTS_EM_STI
         bool
 
+config CLKSRC_JCORE_PIT
+	bool "J-Core PIT timer driver" if COMPILE_TEST
+	depends on OF
+	depends on GENERIC_CLOCKEVENTS
+	depends on HAS_IOMEM
+	select CLKSRC_MMIO
+	help
+	  This enables build of clocksource and clockevent driver for
+	  the integrated PIT in the J-Core synthesizable, open source SoC.
+
 config SH_TIMER_CMT
 	bool "Renesas CMT timer driver" if COMPILE_TEST
 	depends on GENERIC_CLOCKEVENTS
diff --git a/drivers/clocksource/Makefile b/drivers/clocksource/Makefile
index fd9d6df0bbc0..cf87f407f1ad 100644
--- a/drivers/clocksource/Makefile
+++ b/drivers/clocksource/Makefile
@@ -5,6 +5,7 @@ obj-$(CONFIG_ATMEL_TCB_CLKSRC)	+= tcb_clksrc.o
 obj-$(CONFIG_X86_PM_TIMER)	+= acpi_pm.o
 obj-$(CONFIG_SCx200HR_TIMER)	+= scx200_hrt.o
 obj-$(CONFIG_CS5535_CLOCK_EVENT_SRC)	+= cs5535-clockevt.o
+obj-$(CONFIG_CLKSRC_JCORE_PIT)		+= jcore-pit.o
 obj-$(CONFIG_SH_TIMER_CMT)	+= sh_cmt.o
 obj-$(CONFIG_SH_TIMER_MTU2)	+= sh_mtu2.o
 obj-$(CONFIG_SH_TIMER_TMU)	+= sh_tmu.o
diff --git a/drivers/clocksource/jcore-pit.c b/drivers/clocksource/jcore-pit.c
new file mode 100644
index 000000000000..54e1665aa03c
--- /dev/null
+++ b/drivers/clocksource/jcore-pit.c
@@ -0,0 +1,249 @@
+/*
+ * J-Core SoC PIT/clocksource driver
+ *
+ * Copyright (C) 2015-2016 Smart Energy Instruments, Inc.
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/interrupt.h>
+#include <linux/clockchips.h>
+#include <linux/clocksource.h>
+#include <linux/sched_clock.h>
+#include <linux/cpu.h>
+#include <linux/cpuhotplug.h>
+#include <linux/of_address.h>
+#include <linux/of_irq.h>
+
+#define PIT_IRQ_SHIFT		12
+#define PIT_PRIO_SHIFT		20
+#define PIT_ENABLE_SHIFT	26
+#define PIT_PRIO_MASK		0xf
+
+#define REG_PITEN		0x00
+#define REG_THROT		0x10
+#define REG_COUNT		0x14
+#define REG_BUSPD		0x18
+#define REG_SECHI		0x20
+#define REG_SECLO		0x24
+#define REG_NSEC		0x28
+
+struct jcore_pit {
+	struct clock_event_device	ced;
+	void __iomem			*base;
+	unsigned long			periodic_delta;
+	u32				enable_val;
+};
+
+static void __iomem *jcore_pit_base;
+static struct jcore_pit __percpu *jcore_pit_percpu;
+
+static notrace u64 jcore_sched_clock_read(void)
+{
+	u32 seclo, nsec, seclo0;
+	__iomem void *base = jcore_pit_base;
+
+	seclo = readl(base + REG_SECLO);
+	do {
+		seclo0 = seclo;
+		nsec  = readl(base + REG_NSEC);
+		seclo = readl(base + REG_SECLO);
+	} while (seclo0 != seclo);
+
+	return seclo * NSEC_PER_SEC + nsec;
+}
+
+static cycle_t jcore_clocksource_read(struct clocksource *cs)
+{
+	return jcore_sched_clock_read();
+}
+
+static int jcore_pit_disable(struct jcore_pit *pit)
+{
+	writel(0, pit->base + REG_PITEN);
+	return 0;
+}
+
+static int jcore_pit_set(unsigned long delta, struct jcore_pit *pit)
+{
+	jcore_pit_disable(pit);
+	writel(delta, pit->base + REG_THROT);
+	writel(pit->enable_val, pit->base + REG_PITEN);
+	return 0;
+}
+
+static int jcore_pit_set_state_shutdown(struct clock_event_device *ced)
+{
+	struct jcore_pit *pit = container_of(ced, struct jcore_pit, ced);
+
+	return jcore_pit_disable(pit);
+}
+
+static int jcore_pit_set_state_oneshot(struct clock_event_device *ced)
+{
+	struct jcore_pit *pit = container_of(ced, struct jcore_pit, ced);
+
+	return jcore_pit_disable(pit);
+}
+
+static int jcore_pit_set_state_periodic(struct clock_event_device *ced)
+{
+	struct jcore_pit *pit = container_of(ced, struct jcore_pit, ced);
+
+	return jcore_pit_set(pit->periodic_delta, pit);
+}
+
+static int jcore_pit_set_next_event(unsigned long delta,
+				    struct clock_event_device *ced)
+{
+	struct jcore_pit *pit = container_of(ced, struct jcore_pit, ced);
+
+	return jcore_pit_set(delta, pit);
+}
+
+static int jcore_pit_local_init(unsigned cpu)
+{
+	struct jcore_pit *pit = this_cpu_ptr(jcore_pit_percpu);
+	unsigned buspd, freq;
+
+	pr_info("Local J-Core PIT init on cpu %u\n", cpu);
+
+	buspd = readl(pit->base + REG_BUSPD);
+	freq = DIV_ROUND_CLOSEST(NSEC_PER_SEC, buspd);
+	pit->periodic_delta = DIV_ROUND_CLOSEST(NSEC_PER_SEC, HZ * buspd);
+
+	clockevents_config_and_register(&pit->ced, freq, 1, ULONG_MAX);
+
+	return 0;
+}
+
+static irqreturn_t jcore_timer_interrupt(int irq, void *dev_id)
+{
+	struct jcore_pit *pit = this_cpu_ptr(dev_id);
+
+	if (clockevent_state_oneshot(&pit->ced))
+		jcore_pit_disable(pit);
+
+	pit->ced.event_handler(&pit->ced);
+
+	return IRQ_HANDLED;
+}
+
+static int __init jcore_pit_init(struct device_node *node)
+{
+	int err;
+	unsigned pit_irq, cpu;
+	unsigned long hwirq;
+	u32 irqprio, enable_val;
+
+	jcore_pit_base = of_iomap(node, 0);
+	if (!jcore_pit_base) {
+		pr_err("Error: Cannot map base address for J-Core PIT\n");
+		return -ENXIO;
+	}
+
+	pit_irq = irq_of_parse_and_map(node, 0);
+	if (!pit_irq) {
+		pr_err("Error: J-Core PIT has no IRQ\n");
+		return -ENXIO;
+	}
+
+	pr_info("Initializing J-Core PIT at %p IRQ %d\n",
+		jcore_pit_base, pit_irq);
+
+	err = clocksource_mmio_init(jcore_pit_base, "jcore_pit_cs",
+				    NSEC_PER_SEC, 400, 32,
+				    jcore_clocksource_read);
+	if (err) {
+		pr_err("Error registering clocksource device: %d\n", err);
+		return err;
+	}
+
+	sched_clock_register(jcore_sched_clock_read, 32, NSEC_PER_SEC);
+
+	jcore_pit_percpu = alloc_percpu(struct jcore_pit);
+	if (!jcore_pit_percpu) {
+		pr_err("Failed to allocate memory for clock event device\n");
+		return -ENOMEM;
+	}
+
+	err = request_irq(pit_irq, jcore_timer_interrupt,
+			  IRQF_TIMER | IRQF_PERCPU,
+			  "jcore_pit", jcore_pit_percpu);
+	if (err) {
+		pr_err("pit irq request failed: %d\n", err);
+		free_percpu(jcore_pit_percpu);
+		return err;
+	}
+
+	/*
+	 * The J-Core PIT is not hard-wired to a particular IRQ, but
+	 * integrated with the interrupt controller such that the IRQ it
+	 * generates is programmable, as follows:
+	 *
+	 * The bit layout of the PIT enable register is:
+	 *
+	 *	.....e..ppppiiiiiiii............
+	 *
+	 * where the .'s indicate unrelated/unused bits, e is enable,
+	 * p is priority, and i is hard irq number.
+	 *
+	 * For the PIT included in AIC1 (obsolete but still in use),
+	 * any hard irq (trap number) can be programmed via the 8
+	 * iiiiiiii bits, and a priority (0-15) is programmable
+	 * separately in the pppp bits.
+	 *
+	 * For the PIT included in AIC2 (current), the programming
+	 * interface is equivalent modulo interrupt mapping. This is
+	 * why a different compatible tag was not used. However only
+	 * traps 64-127 (the ones actually intended to be used for
+	 * interrupts, rather than syscalls/exceptions/etc.) can be
+	 * programmed (the high 2 bits of i are ignored) and the
+	 * priority pppp is <<2'd and or'd onto the irq number. This
+	 * choice seems to have been made on the hardware engineering
+	 * side under an assumption that preserving old AIC1 priority
+	 * mappings was important. Future models will likely ignore
+	 * the pppp field.
+	 */
+	hwirq = irq_get_irq_data(pit_irq)->hwirq;
+	irqprio = (hwirq >> 2) & PIT_PRIO_MASK;
+	enable_val = (1U << PIT_ENABLE_SHIFT)
+		   | (hwirq << PIT_IRQ_SHIFT)
+		   | (irqprio << PIT_PRIO_SHIFT);
+
+	for_each_present_cpu(cpu) {
+		struct jcore_pit *pit = per_cpu_ptr(jcore_pit_percpu, cpu);
+
+		pit->base = of_iomap(node, cpu);
+		if (!pit->base) {
+			pr_err("Unable to map PIT for cpu %u\n", cpu);
+			continue;
+		}
+
+		pit->ced.name = "jcore_pit";
+		pit->ced.features = CLOCK_EVT_FEAT_PERIODIC
+				  | CLOCK_EVT_FEAT_ONESHOT
+				  | CLOCK_EVT_FEAT_PERCPU;
+		pit->ced.cpumask = cpumask_of(cpu);
+		pit->ced.rating = 400;
+		pit->ced.irq = pit_irq;
+		pit->ced.set_state_shutdown = jcore_pit_set_state_shutdown;
+		pit->ced.set_state_periodic = jcore_pit_set_state_periodic;
+		pit->ced.set_state_oneshot = jcore_pit_set_state_oneshot;
+		pit->ced.set_next_event = jcore_pit_set_next_event;
+
+		pit->enable_val = enable_val;
+	}
+
+	cpuhp_setup_state(CPUHP_AP_JCORE_TIMER_STARTING,
+			  "AP_JCORE_TIMER_STARTING",
+			  jcore_pit_local_init, NULL);
+
+	return 0;
+}
+
+CLOCKSOURCE_OF_DECLARE(jcore_pit, "jcore,pit", jcore_pit_init);
diff --git a/drivers/clocksource/timer-sun5i.c b/drivers/clocksource/timer-sun5i.c
index c184eb84101e..4f87f3e76d83 100644
--- a/drivers/clocksource/timer-sun5i.c
+++ b/drivers/clocksource/timer-sun5i.c
@@ -152,6 +152,13 @@ static irqreturn_t sun5i_timer_interrupt(int irq, void *dev_id)
 	return IRQ_HANDLED;
 }
 
+static cycle_t sun5i_clksrc_read(struct clocksource *clksrc)
+{
+	struct sun5i_timer_clksrc *cs = to_sun5i_timer_clksrc(clksrc);
+
+	return ~readl(cs->timer.base + TIMER_CNTVAL_LO_REG(1));
+}
+
 static int sun5i_rate_cb_clksrc(struct notifier_block *nb,
 				unsigned long event, void *data)
 {
@@ -210,8 +217,13 @@ static int __init sun5i_setup_clocksource(struct device_node *node,
 	writel(TIMER_CTL_ENABLE | TIMER_CTL_RELOAD,
 	       base + TIMER_CTL_REG(1));
 
-	ret = clocksource_mmio_init(base + TIMER_CNTVAL_LO_REG(1), node->name,
-				    rate, 340, 32, clocksource_mmio_readl_down);
+	cs->clksrc.name = node->name;
+	cs->clksrc.rating = 340;
+	cs->clksrc.read = sun5i_clksrc_read;
+	cs->clksrc.mask = CLOCKSOURCE_MASK(32);
+	cs->clksrc.flags = CLOCK_SOURCE_IS_CONTINUOUS;
+
+	ret = clocksource_register_hz(&cs->clksrc, rate);
 	if (ret) {
 		pr_err("Couldn't register clock source.\n");
 		goto err_remove_notifier;
diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index 9b207a8c5af3..afe641c02dca 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -81,6 +81,7 @@ enum cpuhp_state {
 	CPUHP_AP_ARM_ARCH_TIMER_STARTING,
 	CPUHP_AP_ARM_GLOBAL_TIMER_STARTING,
 	CPUHP_AP_DUMMY_TIMER_STARTING,
+	CPUHP_AP_JCORE_TIMER_STARTING,
 	CPUHP_AP_EXYNOS4_MCT_TIMER_STARTING,
 	CPUHP_AP_ARM_TWD_STARTING,
 	CPUHP_AP_METAG_TIMER_STARTING,

^ permalink raw reply related	[flat|nested] 12+ messages in thread

* Re: [GIT pull] timer updates for 4.9
  2016-10-22 12:02 [GIT pull] timer updates for 4.9 Thomas Gleixner
@ 2016-10-23 22:39 ` Linus Torvalds
  2016-10-23 23:20   ` Linus Torvalds
  0 siblings, 1 reply; 12+ messages in thread
From: Linus Torvalds @ 2016-10-23 22:39 UTC (permalink / raw)
  To: Thomas Gleixner; +Cc: LKML, Andrew Morton, Ingo Molnar, H. Peter Anvin

On Sat, Oct 22, 2016 at 5:02 AM, Thomas Gleixner <tglx@linutronix.de> wrote:
>
> please pull the latest timers-urgent-for-linus git tree from:
>
>    git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git timers-urgent-for-linus

Hmm. I just hit something that looks very much like a timer bug.

I'm working on cleaning up the KERN_CONT handling of printk.c, and in
the process I'm adding a timer that flushes incomplete lines to the
logs. The code is pretty simple, and it basically just does

  static void deferred_cont_flush(void)
  {
        static DEFINE_TIMER(timer, flush_timer, 0, 0);

        if (!cont.len)
                return;
        mod_timer(&timer, jiffies + HZ/10);
  }

which is then called whenever we have an incomplete line (the
"flush_timer()" function will just flush the pending line to the
console and to the logs).

It worked fine in testing, but at shutdown I just got this:

    do_trap: 165 callbacks suppressed
    traps: gnome-shell[1339] trap int3 ip:7f465aace15b sp:7fffc54f1670 error:0

followed by

    general protection fault: 0000 [#1] SMP
    Modules linked in: fuse xt_CHECKSUM ipt_MASQUERADE
nf_nat_masquerade_ipv4 tun nf_conntrack_netbios_ns
nf_conntrack_broadcast ip6t_rpfilter ip6t_REJECT nf_reject_ipv6
xt_conntrack ebtable_broute bridge stp llc ebtable_nat ip6tabl$
     wmi pinctrl_intel acpi_als kfifo_buf tpm_tis tpm_tis_core
industrialio acpi_pad tpm nfsd auth_rpcgss nfs_acl lockd grace sunrpc
dm_crypt i915 crct10dif_pclmul crc32_pclmul crc32c_intel i2c_algo_bit
drm_kms_helper syscopyarea se$
    CPU: 7 PID: 1339 Comm: gnome-shell Tainted: G           O
4.9.0-rc1-00244-g0c2b6dc4fd4f-dirty #6
    Hardware name: System manufacturer System Product Name/Z170-K,
BIOS 1803 05/06/2016
    task: ffff95c443bcd940 task.stack: ffffa6ce82d6c000
    RIP: 0010:[<ffffffffb10f9df1>]  [<ffffffffb10f9df1>]
detach_if_pending+0x41/0xd0
    RSP: 0000:ffffa6ce82d6fd08  EFLAGS: 00010086
    RAX: dead000000000200 RBX: 0000000100089285 RCX: 0000000000000000
    RDX: ffffffffb1c99580 RSI: ffff95c476ccf140 RDI: ffffffffb1c50920
    RBP: ffffa6ce82d6fd20 R08: 000000000000004a R09: 0000000000000000
    R10: 000000000000000f R11: ffffffffb1f0912c R12: ffffffffb1c50920
    R13: 0000000000000000 R14: ffff95c476ccf140 R15: ffffffffb1c50920
    FS:  00007f46639adac0(0000) GS:ffff95c476dc0000(0000) knlGS:0000000000000000
    CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
    CR2: 00007fea54725010 CR3: 0000000410fa3000 CR4: 00000000003406e0
    DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
    DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
    Stack:
     0000000100089285 0000000000000052 0000000000000051 ffffa6ce82d6fd90
     ffffffffb10fc4e0 ffffffffb1c50948 0000000100089222 00000000cbeae39b
     ffffffffb1f0912b 0000000000000246 00000000cbeae39b ffffffffb1a074af
    Call Trace:
     [<ffffffffb10fc4e0>] mod_timer+0xd0/0x3b0
     [<ffffffffb10e6670>] deferred_cont_flush.part.12+0x20/0x30
     [<ffffffffb10e7139>] console_unlock+0x489/0x4a0
     [<ffffffffb10e092e>] ? down_trylock+0x2e/0x40
     [<ffffffffb10e7443>] vprintk_emit+0x2f3/0x480
     [<ffffffffb10e77a9>] vprintk_default+0x29/0x40
     [<ffffffffb11a1a3b>] printk+0x57/0x73
     [<ffffffffb13c7521>] ? ___ratelimit+0x91/0xd0
     [<ffffffffb101ed76>] do_trap+0x116/0x140
     [<ffffffffb101f2bf>] do_int3+0xdf/0x110
     [<ffffffffb17a17cb>] int3+0x5b/0x60
    Code: 41 55 41 54 53 48 83 3f 00 0f 84 81 00 00 00 41 89 d5 49 89
fc 0f 1f 44 00 00 49 8b 04 24 49 8b 54 24 08 48 85 c0 48 89 02 74 04
<48> 89 50 08 45 84 ed 74 09 49 c7 44 24 08 00 00 00 00 48 b8 00
    RIP  [<ffffffffb10f9df1>] detach_if_pending+0x41/0xd0
     RSP <ffffa6ce82d6fd08>
    ---[ end trace e4a7055c685d5e1e ]---

(there was another GP fault at the same address immediately
afterwards, and then the machine was dead).

The code disassembly at the faulting instruction is

   0: 48 89 50 08           mov    %rdx,0x8(%rax)
   4: 45 84 ed             test   %r13b,%r13b
   7: 74 09                 je     0x12

and obviously %rax is bogus. In fact, %rax has the value
"dead000000000200", which is just LIST_POISON2.

Now, normally, I'd just say "my code is obviously buggy crap", and I
did spend a bit of time on that theory. But quite frankly, that timer
really _is_ local to that one function, and the absolutely only thing
that is ever done on that timer is that "mod_timer()" call.

So I don't think it's my new code. What triggers this is do_trap() doing two

                pr_info("%s[%d] trap %s ip:%lx sp:%lx error:%lx",
                        tsk->comm, tsk->pid, str,
                        regs->ip, regs->sp, error_code);
                print_vma_addr(" in ", regs->ip);
                pr_cont("\n");

which does two printk's without '\n' in close succession, so the first
one will turn on the timer, and then the second one will update it
again very soon afterwards.

So I started looking at mod_timer() and detach_if_pending(), but I
don't see anything obvious. I can certainly find the code: the oops
clearly happens in detach_timer():

        __hlist_del(entry);
        if (clear_pending)
                entry->pprev = NULL;
        entry->next = LIST_POISON2;

and the "test   %r13b,%r13b" instruction that follows the GP
instruction is the test of "clear_pending", and the access that faults
is the

        if (next)
                next->pprev = pprev;

in __hlist_del(), where "next" is that %rax LIST_POISON2 value.

So it smells like "entry" was simply detached _twice_. The second time
it hit that LIST_POISON2 poisoning and died.

Some locking problem in the timer handling?

Without knowing anything at all, I'd be inclined to blame f00c0afdfa62
("timers: Implement optimization for same expiry time in mod_timer()")
which seems to do unlocked games with mod_timer().

But I may be full of crap. I literally just looked at "what has
changed recently in this area", on the assumption that timers have
historically been robust.

The fact that this happened not during my testing (I had test cases
that did similar things), but at shutdown, makes me also wonder if
maybe some CPU hot-unplug thing might have been involved. Concurrent
timer migration from a CPU that was going down? Although  honestly, it
doesn't seem like that, this happens when gnome-shell is exiting, and
it's simply not that late in the shutdown process yet.

I don't know. It still bothers me that this happens with new code that
I was just testing out, and my gut feel is that "timers are so core
and tested from other code that it can't *possibly* be bug in
mod_timer()". But the code really is *so* simple: literally a single
static timer that is only _ever_ touched by "mod_timer()" and
absolutely nothing else. So even if everything else in my KERN_CONT
cleanup was completely buggy crap, I'm not seeing how it could screw
up the timer.

Ideas?

               Linus

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [GIT pull] timer updates for 4.9
  2016-10-23 22:39 ` Linus Torvalds
@ 2016-10-23 23:20   ` Linus Torvalds
  2016-10-24  9:39     ` Thomas Gleixner
  0 siblings, 1 reply; 12+ messages in thread
From: Linus Torvalds @ 2016-10-23 23:20 UTC (permalink / raw)
  To: Thomas Gleixner; +Cc: LKML, Andrew Morton, Ingo Molnar, H. Peter Anvin

On Sun, Oct 23, 2016 at 3:39 PM, Linus Torvalds
<torvalds@linux-foundation.org> wrote:
>
> Some locking problem in the timer handling?

So I found what looks like a bug in lock_timer_base() wrt migration.

This code:

        for (;;) {
                struct timer_base *base;
                u32 tf = timer->flags;

                if (!(tf & TIMER_MIGRATING)) {
                        base = get_timer_base(tf);
                        spin_lock_irqsave(&base->lock, *flags);
                        if (timer->flags == tf)
                                return base;
                        spin_unlock_irqrestore(&base->lock, *flags);
                }
                cpu_relax();
        }

looks subtly buggy. I think that load of "tf" needs a READ_ONCE() to
make sure that gcc doesn't simply reload the value of "timer->flags"
at random points.

Yes, the spin_lock_irqsave() is a barrier, but that's the only one.
Afaik, gcc could decide that "I need to spill tf, so I'll just reload
it" after looking up get_timer_base().

And no, I don't think this is the cause of my problem, but I suspect
that something _like_ fragility in lock_timer_base() could cause this.
Looking at "timer->flags" *before* you hold the lock that actually
locks down the value sounds very very fragile.

Another example of the exact same problem is in __mod_timer() itself:

                base = get_timer_base(timer->flags);
    ...
                if (idx == timer_get_idx(timer)) {

and since "timer_get_idx()" depends on timer->flags, doing this all
without holding the base lock smells really really bad.

Again, _if_ that whole "let's check timer_pending without holding the
lock" is valid, I'd suggest doing

                u32 flags = READ_ONCE(timer->flags);
                base = get_timer_base(flags);
    ...
                if (idx == timer_flags_get_idx(flags)) {


or something similar, so that at least we know that even if the
"flags" value changes, we use the *same* flag values to look up the
base and to then checking the index of the wheel of that base.

I dunno. That whole thing looks very fragile to begin with: is it
really ok to change the expiry time of a timer without holding any
locks what-so-ever? The timer may just be firing on another CPU, and
you may be setting the expiry time on a timer that isn't ever going to
fire again.

Anyway, the timer locking (and lack of it) makes me nervous.

And there may well be valid reasons why I'm full of crap, and there's
some reason why this is all safe. Maybe the GP fault I saw was my
fault after all, in some way that I can't for the life of me figure
out right now..

                Linus

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [GIT pull] timer updates for 4.9
  2016-10-23 23:20   ` Linus Torvalds
@ 2016-10-24  9:39     ` Thomas Gleixner
  2016-10-24 14:51       ` Thomas Gleixner
  0 siblings, 1 reply; 12+ messages in thread
From: Thomas Gleixner @ 2016-10-24  9:39 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: LKML, Andrew Morton, Ingo Molnar, H. Peter Anvin

On Sun, 23 Oct 2016, Linus Torvalds wrote:
> So I found what looks like a bug in lock_timer_base() wrt migration.
> 
> This code:
> 
>         for (;;) {
>                 struct timer_base *base;
>                 u32 tf = timer->flags;
> 
>                 if (!(tf & TIMER_MIGRATING)) {
>                         base = get_timer_base(tf);
>                         spin_lock_irqsave(&base->lock, *flags);
>                         if (timer->flags == tf)
>                                 return base;
>                         spin_unlock_irqrestore(&base->lock, *flags);
>                 }
>                 cpu_relax();
>         }
> 
> looks subtly buggy. I think that load of "tf" needs a READ_ONCE() to
> make sure that gcc doesn't simply reload the valid of "timer->flags"
> at random points.

You are right, that needs a READ_ONCE(). Stupid me.
 
> Yes, the spin_lock_irqsave() is a barrier, but that's the only one.
> Afaik, gcc could decide that "I need to spill tf, so I'll just reload
> it" after looking up get_timer_base().
>
> And no, I don't think this is the cause of my problem, but I suspect
> that something _like_ fragility in lock_timer_base() could cause this.

It might explain it, when this really ends up with the wrong base.

> I dunno. That whole thing looks very fragile to begin with: is it
> really ok to change the expiry time of a timer without holding any
> locks what-so-ever? The timer may just be firing on another CPU, and
> you may be setting the expiry time on a timer that isn't ever going to
> fire again.

A timer firing on the other CPU is not an issue, but yes, we should not do
that unlocked. This certainly is a naive over optimization.

I'll go through the locking once more with a fine comb and send out fixes
later today along with another NOHZ issue which I decoded over the weekend.
 
> And there may well be valid reasons why I'm full of crap, and there's
> some reason why this is all safe. Maybe the GP fault I saw was my
> fault after all, in some way that I can't for the life of me figure
> out right now..

Is your pr_cont thing serialized or does it rely on the timer locking (or
the lack of it)?

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [GIT pull] timer updates for 4.9
  2016-10-24  9:39     ` Thomas Gleixner
@ 2016-10-24 14:51       ` Thomas Gleixner
  2016-10-24 15:13         ` Thomas Gleixner
  2016-10-24 17:16         ` [GIT pull] timer updates for 4.9 Linus Torvalds
  0 siblings, 2 replies; 12+ messages in thread
From: Thomas Gleixner @ 2016-10-24 14:51 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: LKML, Andrew Morton, Ingo Molnar, H. Peter Anvin

On Mon, 24 Oct 2016, Thomas Gleixner wrote:
> On Sun, 23 Oct 2016, Linus Torvalds wrote:
> > So I found what looks like a bug in lock_timer_base() wrt migration.
> > 
> > This code:
> > 
> >         for (;;) {
> >                 struct timer_base *base;
> >                 u32 tf = timer->flags;
> > 
> >                 if (!(tf & TIMER_MIGRATING)) {
> >                         base = get_timer_base(tf);
> >                         spin_lock_irqsave(&base->lock, *flags);
> >                         if (timer->flags == tf)
> >                                 return base;
> >                         spin_unlock_irqrestore(&base->lock, *flags);
> >                 }
> >                 cpu_relax();
> >         }
> > 
> > looks subtly buggy. I think that load of "tf" needs a READ_ONCE() to
> > make sure that gcc doesn't simply reload the valid of "timer->flags"
> > at random points.
> 
> You are right, that needs a READ_ONCE(). Stupid me.
>  
> > Yes, the spin_lock_irqsave() is a barrier, but that's the only one.
> > Afaik, gcc could decide that "I need to spill tf, so I'll just reload
> > it" after looking up get_timer_base().
> >
> > And no, I don't think this is the cause of my problem, but I suspect
> > that something _like_ fragility in lock_timer_base() could cause this.
> 
> It might explain it, when this really ends up with the wrong base.

Can you please check in the disassembly whether gcc really reloads
timer->flags? Mine does not...

Another thing you might try is to enable debugobjects. As this happens only
at shutdown time the problem might be caused by something else which
wreckages timers in some subtle way.

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [GIT pull] timer updates for 4.9
  2016-10-24 14:51       ` Thomas Gleixner
@ 2016-10-24 15:13         ` Thomas Gleixner
  2016-10-25 14:57           ` [tip:timers/urgent] timers: Plug locking race vs. timer migration tip-bot for Thomas Gleixner
  2016-10-25 14:58           ` [tip:timers/urgent] timers: Lock base for same bucket optimization tip-bot for Thomas Gleixner
  2016-10-24 17:16         ` [GIT pull] timer updates for 4.9 Linus Torvalds
  1 sibling, 2 replies; 12+ messages in thread
From: Thomas Gleixner @ 2016-10-24 15:13 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: LKML, Andrew Morton, Ingo Molnar, H. Peter Anvin

On Mon, 24 Oct 2016, Thomas Gleixner wrote:
> Can you please check in the disassembly whether gcc really reloads
> timer->flags? Mine does not...
> 
> Another thing you might try is to enable debugobjects. As this happens only
> at shutdown time the problem might be caused by something else which
> wreckages timers in some subtle way.

Find below a combo patch which cures your findings and the NOHZ related
wreckage, which just makes timers fire early, but does not corrupt stuff in
a way which could explain your splat.

Thanks,

	tglx

8<---------------------
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -878,7 +878,7 @@ static inline struct timer_base *get_tim
 
 #ifdef CONFIG_NO_HZ_COMMON
 static inline struct timer_base *
-__get_target_base(struct timer_base *base, unsigned tflags)
+get_target_base(struct timer_base *base, unsigned tflags)
 {
 #ifdef CONFIG_SMP
 	if ((tflags & TIMER_PINNED) || !base->migration_enabled)
@@ -891,25 +891,27 @@ static inline struct timer_base *
 
 static inline void forward_timer_base(struct timer_base *base)
 {
+	unsigned long jnow = READ_ONCE(jiffies);
+
 	/*
 	 * We only forward the base when it's idle and we have a delta between
 	 * base clock and jiffies.
 	 */
-	if (!base->is_idle || (long) (jiffies - base->clk) < 2)
+	if (!base->is_idle || (long) (jnow - base->clk) < 2)
 		return;
 
 	/*
 	 * If the next expiry value is > jiffies, then we fast forward to
 	 * jiffies otherwise we forward to the next expiry value.
 	 */
-	if (time_after(base->next_expiry, jiffies))
-		base->clk = jiffies;
+	if (time_after(base->next_expiry, jnow))
+		base->clk = jnow;
 	else
 		base->clk = base->next_expiry;
 }
 #else
 static inline struct timer_base *
-__get_target_base(struct timer_base *base, unsigned tflags)
+get_target_base(struct timer_base *base, unsigned tflags)
 {
 	return get_timer_this_cpu_base(tflags);
 }
@@ -917,14 +919,6 @@ static inline struct timer_base *
 static inline void forward_timer_base(struct timer_base *base) { }
 #endif
 
-static inline struct timer_base *
-get_target_base(struct timer_base *base, unsigned tflags)
-{
-	struct timer_base *target = __get_target_base(base, tflags);
-
-	forward_timer_base(target);
-	return target;
-}
 
 /*
  * We are using hashed locking: Holding per_cpu(timer_bases[x]).lock means
@@ -943,7 +937,14 @@ static struct timer_base *lock_timer_bas
 {
 	for (;;) {
 		struct timer_base *base;
-		u32 tf = timer->flags;
+		u32 tf;
+
+		/*
+		 * We need to use READ_ONCE() here, otherwise the compiler
+		 * might re-read @tf between the check for TIMER_MIGRATING
+		 * and spin_lock().
+		 */
+		tf = READ_ONCE(timer->flags);
 
 		if (!(tf & TIMER_MIGRATING)) {
 			base = get_timer_base(tf);
@@ -964,6 +965,8 @@ static inline int
 	unsigned long clk = 0, flags;
 	int ret = 0;
 
+	BUG_ON(!timer->function);
+
 	/*
 	 * This is a common optimization triggered by the networking code - if
 	 * the timer is re-modified to have the same timeout or ends up in the
@@ -972,13 +975,16 @@ static inline int
 	if (timer_pending(timer)) {
 		if (timer->expires == expires)
 			return 1;
+
 		/*
-		 * Take the current timer_jiffies of base, but without holding
-		 * the lock!
+		 * We lock timer base and calculate the bucket index right
+		 * here. If the timer ends up in the same bucket, then we
+		 * just update the expiry time and avoid the whole
+		 * dequeue/enqueue dance.
 		 */
-		base = get_timer_base(timer->flags);
-		clk = base->clk;
+		base = lock_timer_base(timer, &flags);
 
+		clk = base->clk;
 		idx = calc_wheel_index(expires, clk);
 
 		/*
@@ -988,14 +994,14 @@ static inline int
 		 */
 		if (idx == timer_get_idx(timer)) {
 			timer->expires = expires;
-			return 1;
+			ret = 1;
+			goto out_unlock;
 		}
+	} else {
+		base = lock_timer_base(timer, &flags);
 	}
 
 	timer_stats_timer_set_start_info(timer);
-	BUG_ON(!timer->function);
-
-	base = lock_timer_base(timer, &flags);
 
 	ret = detach_if_pending(timer, base, false);
 	if (!ret && pending_only)
@@ -1025,12 +1031,16 @@ static inline int
 		}
 	}
 
+	/* Try to forward a stale timer base clock */
+	forward_timer_base(base);
+
 	timer->expires = expires;
 	/*
 	 * If 'idx' was calculated above and the base time did not advance
-	 * between calculating 'idx' and taking the lock, only enqueue_timer()
-	 * and trigger_dyntick_cpu() is required. Otherwise we need to
-	 * (re)calculate the wheel index via internal_add_timer().
+	 * between calculating 'idx' and possibly switching the base, only
+	 * enqueue_timer() and trigger_dyntick_cpu() is required. Otherwise
+	 * we need to (re)calculate the wheel index via
+	 * internal_add_timer().
 	 */
 	if (idx != UINT_MAX && clk == base->clk) {
 		enqueue_timer(base, timer, idx);
@@ -1510,12 +1520,16 @@ u64 get_next_timer_interrupt(unsigned lo
 	is_max_delta = (nextevt == base->clk + NEXT_TIMER_MAX_DELTA);
 	base->next_expiry = nextevt;
 	/*
-	 * We have a fresh next event. Check whether we can forward the base:
-	 */
-	if (time_after(nextevt, jiffies))
-		base->clk = jiffies;
-	else if (time_after(nextevt, base->clk))
-		base->clk = nextevt;
+	 * We have a fresh next event. Check whether we can forward the
+	 * base. We can only do that when @basej is past base->clk
+	 * otherwise we might rewind base->clk.
+	 */
+	if (time_after(basej, base->clk)) {
+		if (time_after(nextevt, basej))
+			base->clk = basej;
+		else if (time_after(nextevt, base->clk))
+			base->clk = nextevt;
+	}
 
 	if (time_before_eq(nextevt, basej)) {
 		expires = basem;

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [GIT pull] timer updates for 4.9
  2016-10-24 14:51       ` Thomas Gleixner
  2016-10-24 15:13         ` Thomas Gleixner
@ 2016-10-24 17:16         ` Linus Torvalds
  2016-10-24 19:09           ` Thomas Gleixner
  1 sibling, 1 reply; 12+ messages in thread
From: Linus Torvalds @ 2016-10-24 17:16 UTC (permalink / raw)
  To: Thomas Gleixner, Sergey Senozhatsky
  Cc: LKML, Andrew Morton, Ingo Molnar, H. Peter Anvin

On Mon, Oct 24, 2016 at 7:51 AM, Thomas Gleixner <tglx@linutronix.de> wrote:
>
> Can you please check in the disassembly whether gcc really reloads
> timer->flags? Mine does not...

No, me neither. The code generation for lock_timer_base() looks
reasonable, although not pretty (it needs one spill for the
complexities in get_timer_cpu_base(), and the "*flags" games result
in some unnecessary indirection too).

I will try your patch, but also stare at my code some more.

I'm starting to think that the problem could be due to the timer code
being triggered _way_ too early (printk() ends up being obviously used
long before most things end up using timers), and that the problem I
see is just later fallout from that.

Sergey (added to participants) tried an earlier version of my patch,
and had more debug options enabled, and got

  BUG: spinlock bad magic on CPU#0

from mod_timer() doing _raw_spin_unlock_irqrestore(), when the
printk() callchain happens very early in setup_arch ->
setup_memory_map -> e820_print_map().

So I think the timer bugs I found were _potentially_ true bugs, but
likely not the cause of this all.

init_timers() happens early, but we do printk's even earlier.

I'll add a "if (system_state > SYSTEM_BOOTING)" or something to test
that theory.

                     Linus

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [GIT pull] timer updates for 4.9
  2016-10-24 17:16         ` [GIT pull] timer updates for 4.9 Linus Torvalds
@ 2016-10-24 19:09           ` Thomas Gleixner
  2016-10-24 19:30             ` Linus Torvalds
  0 siblings, 1 reply; 12+ messages in thread
From: Thomas Gleixner @ 2016-10-24 19:09 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Sergey Senozhatsky, LKML, Andrew Morton, Ingo Molnar, H. Peter Anvin

On Mon, 24 Oct 2016, Linus Torvalds wrote:
> On Mon, Oct 24, 2016 at 7:51 AM, Thomas Gleixner <tglx@linutronix.de> wrote:
> >
> > Can you please check in the disassembly whether gcc really reloads
> > timer->flags? Mine does not...
> 
> No, me neither. The code generation for lock_timer_base() looks
> reasonable, although not pretty (it needs one spill for the
> complexities in get_timer_cpu_base(), and the "*flags" games results
> in some unnecessary indirection too).
> 
> I will try your patch, but also stare at my code some more.
> 
> I'm starting to think that the problem could be due to the timer code
> being triggered _way_ too early (printk() ends up being obviously used
> long before most things end up using timers), and that the problem I
> see is just later fallout from that.
> 
> Sergey (added to participants) tried an earlier version of my patch,
> and had more debug options enabled, and got
> 
>   BUG: spinlock bad magic on CPU#0
> 
> from mod_timer() doing _raw_spin_unlock_irqrestore(), when the

Weird, that should have triggered in raw_spin_lock() already.

Can you bounce me the patch you are currently testing?

> printk() callchain happens very early in setup_arch ->
> setup_memory_map -> e820_print_map().
> 
> So I think the timer bugs I found were _potentially_ true bugs, but
> likely not the cause of this all.
> 
> init_timers() happens early, but we do printk's even earlier.

These are the things which are not initialized:

1) base->spinlock

   That's a non issue for !debug kernels as the lock initializer is 0
   (unlocked).

2) base->clk
 
   That makes the timer get queued at some random array bucket.

3) base->cpu

   That's a non issue as base->cpu is 0 and at this point you are on CPU 0
   and the stupid NOHZ remote queueing is not yet possible.

The hlist_head is not touched by init_timers() as it's NULL initialized
already, so we do not scribble over an already queued timer.

So anything you queue _before_ init_timers() will just be queued to some
random bucket, but it does not explain the wreckage you are seeing.

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [GIT pull] timer updates for 4.9
  2016-10-24 19:09           ` Thomas Gleixner
@ 2016-10-24 19:30             ` Linus Torvalds
  2016-10-24 21:36               ` Thomas Gleixner
  0 siblings, 1 reply; 12+ messages in thread
From: Linus Torvalds @ 2016-10-24 19:30 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Sergey Senozhatsky, LKML, Andrew Morton, Ingo Molnar, H. Peter Anvin

[-- Attachment #1: Type: text/plain, Size: 1610 bytes --]

On Mon, Oct 24, 2016 at 12:09 PM, Thomas Gleixner <tglx@linutronix.de> wrote:
> On Mon, 24 Oct 2016, Linus Torvalds wrote:
>
> Weird, that should have triggered in raw_spin_lock() already.
>
> Can you bounce me the patch you are currently testing?

The one I'm *currently* testing seems to work for me, but that's
because I moved the timer into klogd and that just avoids the issue.

But I'll attach my old patch just for your testing pleasure.

The fact that it actually mostly worked - and then had some very odd
behavior much later on - makes me wonder if we might have other users
that could have timers that get started early, and nobody notices
because most of the time it doesn't cause any real issues.


> These are the things which are not initialized:
>
> 1) base->spinlock
>
>    That's a non issue for !debug kernels as the lock initializer is 0
>    (unlocked).
>
> 2) base->clk
>
>    That makes the timer queued at some random array bucket.
>
> 3) base->cpu
>
>    That's a non issue as base->cpu is 0 and at this point you are on CPU 0
>    and the stupid NOHZ remote queueing is not yet possible.
>
> The hlist_head is not touched by init_timers() as it's NULL initialized
> already, so we do not scribble over an already queued timer.
>
> So anything you queue _before_ init_timers() will just be queued to some
> random bucket, but it does not explain the wreckage you are seeing.

Here's the old version of the patch that triggered the problem. The
second attachment is the new version that works for me (and removes a
lot more printk cruft, which is why it's bigger).

              Linus

[-- Attachment #2: old.diff --]
[-- Type: text/plain, Size: 7743 bytes --]

 kernel/printk/printk.c | 132 +++++++++++--------------------------------------
 1 file changed, 28 insertions(+), 104 deletions(-)

diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index de08fc90baaf..ed1bfa774427 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -794,7 +794,7 @@ static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from)
 	return ret;
 }
 
-static void cont_flush(void);
+static void deferred_cont_flush(void);
 
 static ssize_t devkmsg_read(struct file *file, char __user *buf,
 			    size_t count, loff_t *ppos)
@@ -811,7 +811,6 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
 	if (ret)
 		return ret;
 	raw_spin_lock_irq(&logbuf_lock);
-	cont_flush();
 	while (user->seq == log_next_seq) {
 		if (file->f_flags & O_NONBLOCK) {
 			ret = -EAGAIN;
@@ -860,6 +859,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
 	ret = len;
 out:
 	mutex_unlock(&user->lock);
+	deferred_cont_flush();
 	return ret;
 }
 
@@ -874,7 +874,6 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence)
 		return -ESPIPE;
 
 	raw_spin_lock_irq(&logbuf_lock);
-	cont_flush();
 	switch (whence) {
 	case SEEK_SET:
 		/* the first record */
@@ -913,7 +912,6 @@ static unsigned int devkmsg_poll(struct file *file, poll_table *wait)
 	poll_wait(file, &log_wait, wait);
 
 	raw_spin_lock_irq(&logbuf_lock);
-	cont_flush();
 	if (user->seq < log_next_seq) {
 		/* return error when data has vanished underneath us */
 		if (user->seq < log_first_seq)
@@ -922,6 +920,7 @@ static unsigned int devkmsg_poll(struct file *file, poll_table *wait)
 			ret = POLLIN|POLLRDNORM;
 	}
 	raw_spin_unlock_irq(&logbuf_lock);
+	deferred_cont_flush();
 
 	return ret;
 }
@@ -1300,7 +1299,6 @@ static int syslog_print(char __user *buf, int size)
 		size_t skip;
 
 		raw_spin_lock_irq(&logbuf_lock);
-		cont_flush();
 		if (syslog_seq < log_first_seq) {
 			/* messages are gone, move to first one */
 			syslog_seq = log_first_seq;
@@ -1360,7 +1358,6 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
 		return -ENOMEM;
 
 	raw_spin_lock_irq(&logbuf_lock);
-	cont_flush();
 	if (buf) {
 		u64 next_seq;
 		u64 seq;
@@ -1522,7 +1519,6 @@ int do_syslog(int type, char __user *buf, int len, int source)
 	/* Number of chars in the log buffer */
 	case SYSLOG_ACTION_SIZE_UNREAD:
 		raw_spin_lock_irq(&logbuf_lock);
-		cont_flush();
 		if (syslog_seq < log_first_seq) {
 			/* messages are gone, move to first one */
 			syslog_seq = log_first_seq;
@@ -1563,6 +1559,7 @@ int do_syslog(int type, char __user *buf, int len, int source)
 		error = -EINVAL;
 		break;
 	}
+	deferred_cont_flush();
 out:
 	return error;
 }
@@ -1650,46 +1647,41 @@ static inline void printk_delay(void)
 static struct cont {
 	char buf[LOG_LINE_MAX];
 	size_t len;			/* length == 0 means unused buffer */
-	size_t cons;			/* bytes written to console */
 	struct task_struct *owner;	/* task of first print*/
 	u64 ts_nsec;			/* time of first print */
 	u8 level;			/* log level of first message */
 	u8 facility;			/* log facility of first message */
 	enum log_flags flags;		/* prefix, newline flags */
-	bool flushed:1;			/* buffer sealed and committed */
 } cont;
 
-static void cont_flush(void)
+static bool cont_flush(void)
 {
-	if (cont.flushed)
-		return;
-	if (cont.len == 0)
+	if (!cont.len)
+		return false;
+
+	log_store(cont.facility, cont.level, cont.flags, cont.ts_nsec,
+		  NULL, 0, cont.buf, cont.len);
+	cont.len = 0;
+	return true;
+}
+
+static void flush_timer(unsigned long data)
+{
+	if (cont_flush())
+		wake_up_klogd();
+}
+
+static void deferred_cont_flush(void)
+{
+	static DEFINE_TIMER(timer, flush_timer, 0, 0);
+
+	if (!cont.len)
 		return;
-	if (cont.cons) {
-		/*
-		 * If a fragment of this line was directly flushed to the
-		 * console; wait for the console to pick up the rest of the
-		 * line. LOG_NOCONS suppresses a duplicated output.
-		 */
-		log_store(cont.facility, cont.level, cont.flags | LOG_NOCONS,
-			  cont.ts_nsec, NULL, 0, cont.buf, cont.len);
-		cont.flushed = true;
-	} else {
-		/*
-		 * If no fragment of this line ever reached the console,
-		 * just submit it to the store and free the buffer.
-		 */
-		log_store(cont.facility, cont.level, cont.flags, 0,
-			  NULL, 0, cont.buf, cont.len);
-		cont.len = 0;
-	}
+	mod_timer(&timer, jiffies + HZ/10);
 }
 
 static bool cont_add(int facility, int level, enum log_flags flags, const char *text, size_t len)
 {
-	if (cont.len && cont.flushed)
-		return false;
-
 	/*
 	 * If ext consoles are present, flush and skip in-kernel
 	 * continuation.  See nr_ext_console_drivers definition.  Also, if
@@ -1706,8 +1698,6 @@ static bool cont_add(int facility, int level, enum log_flags flags, const char *
 		cont.owner = current;
 		cont.ts_nsec = local_clock();
 		cont.flags = flags;
-		cont.cons = 0;
-		cont.flushed = false;
 	}
 
 	memcpy(cont.buf + cont.len, text, len);
@@ -1726,34 +1716,6 @@ static bool cont_add(int facility, int level, enum log_flags flags, const char *
 	return true;
 }
 
-static size_t cont_print_text(char *text, size_t size)
-{
-	size_t textlen = 0;
-	size_t len;
-
-	if (cont.cons == 0 && (console_prev & LOG_NEWLINE)) {
-		textlen += print_time(cont.ts_nsec, text);
-		size -= textlen;
-	}
-
-	len = cont.len - cont.cons;
-	if (len > 0) {
-		if (len+1 > size)
-			len = size-1;
-		memcpy(text + textlen, cont.buf + cont.cons, len);
-		textlen += len;
-		cont.cons = cont.len;
-	}
-
-	if (cont.flushed) {
-		if (cont.flags & LOG_NEWLINE)
-			text[textlen++] = '\n';
-		/* got everything, release buffer */
-		cont.len = 0;
-	}
-	return textlen;
-}
-
 static size_t log_output(int facility, int level, enum log_flags lflags, const char *dict, size_t dictlen, char *text, size_t text_len)
 {
 	/*
@@ -2025,7 +1987,6 @@ static void call_console_drivers(int level,
 				 const char *text, size_t len) {}
 static size_t msg_print_text(const struct printk_log *msg, enum log_flags prev,
 			     bool syslog, char *buf, size_t size) { return 0; }
-static size_t cont_print_text(char *text, size_t size) { return 0; }
 static bool suppress_message_printing(int level) { return false; }
 
 /* Still needs to be defined for users */
@@ -2296,42 +2257,6 @@ static inline int can_use_console(void)
 	return cpu_online(raw_smp_processor_id()) || have_callable_console();
 }
 
-static void console_cont_flush(char *text, size_t size)
-{
-	unsigned long flags;
-	size_t len;
-
-	raw_spin_lock_irqsave(&logbuf_lock, flags);
-
-	if (!cont.len)
-		goto out;
-
-	if (suppress_message_printing(cont.level)) {
-		cont.cons = cont.len;
-		if (cont.flushed)
-			cont.len = 0;
-		goto out;
-	}
-
-	/*
-	 * We still queue earlier records, likely because the console was
-	 * busy. The earlier ones need to be printed before this one, we
-	 * did not flush any fragment so far, so just let it queue up.
-	 */
-	if (console_seq < log_next_seq && !cont.cons)
-		goto out;
-
-	len = cont_print_text(text, size);
-	raw_spin_unlock(&logbuf_lock);
-	stop_critical_timings();
-	call_console_drivers(cont.level, NULL, 0, text, len);
-	start_critical_timings();
-	local_irq_restore(flags);
-	return;
-out:
-	raw_spin_unlock_irqrestore(&logbuf_lock, flags);
-}
-
 /**
  * console_unlock - unlock the console system
  *
@@ -2360,6 +2285,8 @@ void console_unlock(void)
 		return;
 	}
 
+	deferred_cont_flush();
+
 	/*
 	 * Console drivers are called under logbuf_lock, so
 	 * @console_may_schedule should be cleared before; however, we may
@@ -2385,9 +2312,6 @@ void console_unlock(void)
 		return;
 	}
 
-	/* flush buffered message fragment immediately to console */
-	console_cont_flush(text, sizeof(text));
-
 	for (;;) {
 		struct printk_log *msg;
 		size_t ext_len = 0;

[-- Attachment #3: new.diff --]
[-- Type: text/plain, Size: 19679 bytes --]

 kernel/printk/printk.c | 249 +++++++++++--------------------------------------
 1 file changed, 52 insertions(+), 197 deletions(-)

diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index de08fc90baaf..6df4d3f6c276 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -367,7 +367,6 @@ DECLARE_WAIT_QUEUE_HEAD(log_wait);
 /* the next printk record to read by syslog(READ) or /proc/kmsg */
 static u64 syslog_seq;
 static u32 syslog_idx;
-static enum log_flags syslog_prev;
 static size_t syslog_partial;
 
 /* index and sequence number of the first record stored in the buffer */
@@ -381,7 +380,6 @@ static u32 log_next_idx;
 /* the next printk record to write to the console */
 static u64 console_seq;
 static u32 console_idx;
-static enum log_flags console_prev;
 
 /* the next printk record to read after the last 'clear' command */
 static u64 clear_seq;
@@ -650,27 +648,15 @@ static void append_char(char **pp, char *e, char c)
 }
 
 static ssize_t msg_print_ext_header(char *buf, size_t size,
-				    struct printk_log *msg, u64 seq,
-				    enum log_flags prev_flags)
+				    struct printk_log *msg, u64 seq)
 {
 	u64 ts_usec = msg->ts_nsec;
-	char cont = '-';
 
 	do_div(ts_usec, 1000);
 
-	/*
-	 * If we couldn't merge continuation line fragments during the print,
-	 * export the stored flags to allow an optional external merge of the
-	 * records. Merging the records isn't always neccessarily correct, like
-	 * when we hit a race during printing. In most cases though, it produces
-	 * better readable output. 'c' in the record flags mark the first
-	 * fragment of a line, '+' the following.
-	 */
-	if (msg->flags & LOG_CONT)
-		cont = (prev_flags & LOG_CONT) ? '+' : 'c';
-
 	return scnprintf(buf, size, "%u,%llu,%llu,%c;",
-		       (msg->facility << 3) | msg->level, seq, ts_usec, cont);
+		       (msg->facility << 3) | msg->level, seq, ts_usec,
+		       msg->flags & LOG_CONT ? 'c' : '-');
 }
 
 static ssize_t msg_print_ext_body(char *buf, size_t size,
@@ -725,7 +711,6 @@ static ssize_t msg_print_ext_body(char *buf, size_t size,
 struct devkmsg_user {
 	u64 seq;
 	u32 idx;
-	enum log_flags prev;
 	struct ratelimit_state rs;
 	struct mutex lock;
 	char buf[CONSOLE_EXT_LOG_MAX];
@@ -794,7 +779,7 @@ static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from)
 	return ret;
 }
 
-static void cont_flush(void);
+static void deferred_cont_flush(void);
 
 static ssize_t devkmsg_read(struct file *file, char __user *buf,
 			    size_t count, loff_t *ppos)
@@ -811,7 +796,6 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
 	if (ret)
 		return ret;
 	raw_spin_lock_irq(&logbuf_lock);
-	cont_flush();
 	while (user->seq == log_next_seq) {
 		if (file->f_flags & O_NONBLOCK) {
 			ret = -EAGAIN;
@@ -838,12 +822,11 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
 
 	msg = log_from_idx(user->idx);
 	len = msg_print_ext_header(user->buf, sizeof(user->buf),
-				   msg, user->seq, user->prev);
+				   msg, user->seq);
 	len += msg_print_ext_body(user->buf + len, sizeof(user->buf) - len,
 				  log_dict(msg), msg->dict_len,
 				  log_text(msg), msg->text_len);
 
-	user->prev = msg->flags;
 	user->idx = log_next(user->idx);
 	user->seq++;
 	raw_spin_unlock_irq(&logbuf_lock);
@@ -860,6 +843,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
 	ret = len;
 out:
 	mutex_unlock(&user->lock);
+	deferred_cont_flush();
 	return ret;
 }
 
@@ -874,7 +858,6 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence)
 		return -ESPIPE;
 
 	raw_spin_lock_irq(&logbuf_lock);
-	cont_flush();
 	switch (whence) {
 	case SEEK_SET:
 		/* the first record */
@@ -913,7 +896,6 @@ static unsigned int devkmsg_poll(struct file *file, poll_table *wait)
 	poll_wait(file, &log_wait, wait);
 
 	raw_spin_lock_irq(&logbuf_lock);
-	cont_flush();
 	if (user->seq < log_next_seq) {
 		/* return error when data has vanished underneath us */
 		if (user->seq < log_first_seq)
@@ -922,6 +904,7 @@ static unsigned int devkmsg_poll(struct file *file, poll_table *wait)
 			ret = POLLIN|POLLRDNORM;
 	}
 	raw_spin_unlock_irq(&logbuf_lock);
+	deferred_cont_flush();
 
 	return ret;
 }
@@ -1226,26 +1209,12 @@ static size_t print_prefix(const struct printk_log *msg, bool syslog, char *buf)
 	return len;
 }
 
-static size_t msg_print_text(const struct printk_log *msg, enum log_flags prev,
-			     bool syslog, char *buf, size_t size)
+static size_t msg_print_text(const struct printk_log *msg, bool syslog, char *buf, size_t size)
 {
 	const char *text = log_text(msg);
 	size_t text_size = msg->text_len;
-	bool prefix = true;
-	bool newline = true;
 	size_t len = 0;
 
-	if ((prev & LOG_CONT) && !(msg->flags & LOG_PREFIX))
-		prefix = false;
-
-	if (msg->flags & LOG_CONT) {
-		if ((prev & LOG_CONT) && !(prev & LOG_NEWLINE))
-			prefix = false;
-
-		if (!(msg->flags & LOG_NEWLINE))
-			newline = false;
-	}
-
 	do {
 		const char *next = memchr(text, '\n', text_size);
 		size_t text_len;
@@ -1263,22 +1232,17 @@ static size_t msg_print_text(const struct printk_log *msg, enum log_flags prev,
 			    text_len + 1 >= size - len)
 				break;
 
-			if (prefix)
-				len += print_prefix(msg, syslog, buf + len);
+			len += print_prefix(msg, syslog, buf + len);
 			memcpy(buf + len, text, text_len);
 			len += text_len;
-			if (next || newline)
-				buf[len++] = '\n';
+			buf[len++] = '\n';
 		} else {
 			/* SYSLOG_ACTION_* buffer size only calculation */
-			if (prefix)
-				len += print_prefix(msg, syslog, NULL);
+			len += print_prefix(msg, syslog, NULL);
 			len += text_len;
-			if (next || newline)
-				len++;
+			len++;
 		}
 
-		prefix = true;
 		text = next;
 	} while (text);
 
@@ -1300,12 +1264,10 @@ static int syslog_print(char __user *buf, int size)
 		size_t skip;
 
 		raw_spin_lock_irq(&logbuf_lock);
-		cont_flush();
 		if (syslog_seq < log_first_seq) {
 			/* messages are gone, move to first one */
 			syslog_seq = log_first_seq;
 			syslog_idx = log_first_idx;
-			syslog_prev = 0;
 			syslog_partial = 0;
 		}
 		if (syslog_seq == log_next_seq) {
@@ -1315,13 +1277,11 @@ static int syslog_print(char __user *buf, int size)
 
 		skip = syslog_partial;
 		msg = log_from_idx(syslog_idx);
-		n = msg_print_text(msg, syslog_prev, true, text,
-				   LOG_LINE_MAX + PREFIX_MAX);
+		n = msg_print_text(msg, true, text, LOG_LINE_MAX + PREFIX_MAX);
 		if (n - syslog_partial <= size) {
 			/* message fits into buffer, move forward */
 			syslog_idx = log_next(syslog_idx);
 			syslog_seq++;
-			syslog_prev = msg->flags;
 			n -= syslog_partial;
 			syslog_partial = 0;
 		} else if (!len){
@@ -1360,12 +1320,10 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
 		return -ENOMEM;
 
 	raw_spin_lock_irq(&logbuf_lock);
-	cont_flush();
 	if (buf) {
 		u64 next_seq;
 		u64 seq;
 		u32 idx;
-		enum log_flags prev;
 
 		/*
 		 * Find first record that fits, including all following records,
@@ -1373,12 +1331,10 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
 		 */
 		seq = clear_seq;
 		idx = clear_idx;
-		prev = 0;
 		while (seq < log_next_seq) {
 			struct printk_log *msg = log_from_idx(idx);
 
-			len += msg_print_text(msg, prev, true, NULL, 0);
-			prev = msg->flags;
+			len += msg_print_text(msg, true, NULL, 0);
 			idx = log_next(idx);
 			seq++;
 		}
@@ -1386,12 +1342,10 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
 		/* move first record forward until length fits into the buffer */
 		seq = clear_seq;
 		idx = clear_idx;
-		prev = 0;
 		while (len > size && seq < log_next_seq) {
 			struct printk_log *msg = log_from_idx(idx);
 
-			len -= msg_print_text(msg, prev, true, NULL, 0);
-			prev = msg->flags;
+			len -= msg_print_text(msg, true, NULL, 0);
 			idx = log_next(idx);
 			seq++;
 		}
@@ -1404,7 +1358,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
 			struct printk_log *msg = log_from_idx(idx);
 			int textlen;
 
-			textlen = msg_print_text(msg, prev, true, text,
+			textlen = msg_print_text(msg, true, text,
 						 LOG_LINE_MAX + PREFIX_MAX);
 			if (textlen < 0) {
 				len = textlen;
@@ -1412,7 +1366,6 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
 			}
 			idx = log_next(idx);
 			seq++;
-			prev = msg->flags;
 
 			raw_spin_unlock_irq(&logbuf_lock);
 			if (copy_to_user(buf + len, text, textlen))
@@ -1425,7 +1378,6 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
 				/* messages are gone, move to next one */
 				seq = log_first_seq;
 				idx = log_first_idx;
-				prev = 0;
 			}
 		}
 	}
@@ -1522,12 +1474,10 @@ int do_syslog(int type, char __user *buf, int len, int source)
 	/* Number of chars in the log buffer */
 	case SYSLOG_ACTION_SIZE_UNREAD:
 		raw_spin_lock_irq(&logbuf_lock);
-		cont_flush();
 		if (syslog_seq < log_first_seq) {
 			/* messages are gone, move to first one */
 			syslog_seq = log_first_seq;
 			syslog_idx = log_first_idx;
-			syslog_prev = 0;
 			syslog_partial = 0;
 		}
 		if (source == SYSLOG_FROM_PROC) {
@@ -1540,16 +1490,14 @@ int do_syslog(int type, char __user *buf, int len, int source)
 		} else {
 			u64 seq = syslog_seq;
 			u32 idx = syslog_idx;
-			enum log_flags prev = syslog_prev;
 
 			error = 0;
 			while (seq < log_next_seq) {
 				struct printk_log *msg = log_from_idx(idx);
 
-				error += msg_print_text(msg, prev, true, NULL, 0);
+				error += msg_print_text(msg, true, NULL, 0);
 				idx = log_next(idx);
 				seq++;
-				prev = msg->flags;
 			}
 			error -= syslog_partial;
 		}
@@ -1563,6 +1511,7 @@ int do_syslog(int type, char __user *buf, int len, int source)
 		error = -EINVAL;
 		break;
 	}
+	deferred_cont_flush();
 out:
 	return error;
 }
@@ -1650,46 +1599,41 @@ static inline void printk_delay(void)
 static struct cont {
 	char buf[LOG_LINE_MAX];
 	size_t len;			/* length == 0 means unused buffer */
-	size_t cons;			/* bytes written to console */
 	struct task_struct *owner;	/* task of first print*/
 	u64 ts_nsec;			/* time of first print */
 	u8 level;			/* log level of first message */
 	u8 facility;			/* log facility of first message */
 	enum log_flags flags;		/* prefix, newline flags */
-	bool flushed:1;			/* buffer sealed and committed */
 } cont;
 
-static void cont_flush(void)
+static bool cont_flush(void)
 {
-	if (cont.flushed)
-		return;
-	if (cont.len == 0)
+	if (!cont.len)
+		return false;
+
+	log_store(cont.facility, cont.level, cont.flags, cont.ts_nsec,
+		  NULL, 0, cont.buf, cont.len);
+	cont.len = 0;
+	return true;
+}
+
+static void flush_timer(unsigned long data)
+{
+	if (cont_flush())
+		wake_up_klogd();
+}
+
+static void deferred_cont_flush(void)
+{
+	static DEFINE_TIMER(timer, flush_timer, 0, 0);
+
+	if (!cont.len)
 		return;
-	if (cont.cons) {
-		/*
-		 * If a fragment of this line was directly flushed to the
-		 * console; wait for the console to pick up the rest of the
-		 * line. LOG_NOCONS suppresses a duplicated output.
-		 */
-		log_store(cont.facility, cont.level, cont.flags | LOG_NOCONS,
-			  cont.ts_nsec, NULL, 0, cont.buf, cont.len);
-		cont.flushed = true;
-	} else {
-		/*
-		 * If no fragment of this line ever reached the console,
-		 * just submit it to the store and free the buffer.
-		 */
-		log_store(cont.facility, cont.level, cont.flags, 0,
-			  NULL, 0, cont.buf, cont.len);
-		cont.len = 0;
-	}
+	mod_timer(&timer, jiffies + HZ/10);
 }
 
 static bool cont_add(int facility, int level, enum log_flags flags, const char *text, size_t len)
 {
-	if (cont.len && cont.flushed)
-		return false;
-
 	/*
 	 * If ext consoles are present, flush and skip in-kernel
 	 * continuation.  See nr_ext_console_drivers definition.  Also, if
@@ -1706,8 +1650,6 @@ static bool cont_add(int facility, int level, enum log_flags flags, const char *
 		cont.owner = current;
 		cont.ts_nsec = local_clock();
 		cont.flags = flags;
-		cont.cons = 0;
-		cont.flushed = false;
 	}
 
 	memcpy(cont.buf + cont.len, text, len);
@@ -1726,34 +1668,6 @@ static bool cont_add(int facility, int level, enum log_flags flags, const char *
 	return true;
 }
 
-static size_t cont_print_text(char *text, size_t size)
-{
-	size_t textlen = 0;
-	size_t len;
-
-	if (cont.cons == 0 && (console_prev & LOG_NEWLINE)) {
-		textlen += print_time(cont.ts_nsec, text);
-		size -= textlen;
-	}
-
-	len = cont.len - cont.cons;
-	if (len > 0) {
-		if (len+1 > size)
-			len = size-1;
-		memcpy(text + textlen, cont.buf + cont.cons, len);
-		textlen += len;
-		cont.cons = cont.len;
-	}
-
-	if (cont.flushed) {
-		if (cont.flags & LOG_NEWLINE)
-			text[textlen++] = '\n';
-		/* got everything, release buffer */
-		cont.len = 0;
-	}
-	return textlen;
-}
-
 static size_t log_output(int facility, int level, enum log_flags lflags, const char *dict, size_t dictlen, char *text, size_t text_len)
 {
 	/*
@@ -1999,11 +1913,9 @@ static u64 syslog_seq;
 static u32 syslog_idx;
 static u64 console_seq;
 static u32 console_idx;
-static enum log_flags syslog_prev;
 static u64 log_first_seq;
 static u32 log_first_idx;
 static u64 log_next_seq;
-static enum log_flags console_prev;
 static struct cont {
 	size_t len;
 	size_t cons;
@@ -2015,17 +1927,16 @@ static char *log_dict(const struct printk_log *msg) { return NULL; }
 static struct printk_log *log_from_idx(u32 idx) { return NULL; }
 static u32 log_next(u32 idx) { return 0; }
 static ssize_t msg_print_ext_header(char *buf, size_t size,
-				    struct printk_log *msg, u64 seq,
-				    enum log_flags prev_flags) { return 0; }
+				    struct printk_log *msg,
+				    u64 seq) { return 0; }
 static ssize_t msg_print_ext_body(char *buf, size_t size,
 				  char *dict, size_t dict_len,
 				  char *text, size_t text_len) { return 0; }
 static void call_console_drivers(int level,
 				 const char *ext_text, size_t ext_len,
 				 const char *text, size_t len) {}
-static size_t msg_print_text(const struct printk_log *msg, enum log_flags prev,
+static size_t msg_print_text(const struct printk_log *msg,
 			     bool syslog, char *buf, size_t size) { return 0; }
-static size_t cont_print_text(char *text, size_t size) { return 0; }
 static bool suppress_message_printing(int level) { return false; }
 
 /* Still needs to be defined for users */
@@ -2296,42 +2207,6 @@ static inline int can_use_console(void)
 	return cpu_online(raw_smp_processor_id()) || have_callable_console();
 }
 
-static void console_cont_flush(char *text, size_t size)
-{
-	unsigned long flags;
-	size_t len;
-
-	raw_spin_lock_irqsave(&logbuf_lock, flags);
-
-	if (!cont.len)
-		goto out;
-
-	if (suppress_message_printing(cont.level)) {
-		cont.cons = cont.len;
-		if (cont.flushed)
-			cont.len = 0;
-		goto out;
-	}
-
-	/*
-	 * We still queue earlier records, likely because the console was
-	 * busy. The earlier ones need to be printed before this one, we
-	 * did not flush any fragment so far, so just let it queue up.
-	 */
-	if (console_seq < log_next_seq && !cont.cons)
-		goto out;
-
-	len = cont_print_text(text, size);
-	raw_spin_unlock(&logbuf_lock);
-	stop_critical_timings();
-	call_console_drivers(cont.level, NULL, 0, text, len);
-	start_critical_timings();
-	local_irq_restore(flags);
-	return;
-out:
-	raw_spin_unlock_irqrestore(&logbuf_lock, flags);
-}
-
 /**
  * console_unlock - unlock the console system
  *
@@ -2385,9 +2260,6 @@ void console_unlock(void)
 		return;
 	}
 
-	/* flush buffered message fragment immediately to console */
-	console_cont_flush(text, sizeof(text));
-
 	for (;;) {
 		struct printk_log *msg;
 		size_t ext_len = 0;
@@ -2407,7 +2279,6 @@ void console_unlock(void)
 			/* messages are gone, move to first one */
 			console_seq = log_first_seq;
 			console_idx = log_first_idx;
-			console_prev = 0;
 		} else {
 			len = 0;
 		}
@@ -2417,8 +2288,7 @@ void console_unlock(void)
 
 		msg = log_from_idx(console_idx);
 		level = msg->level;
-		if ((msg->flags & LOG_NOCONS) ||
-				suppress_message_printing(level)) {
+		if (suppress_message_printing(level)) {
 			/*
 			 * Skip record we have buffered and already printed
 			 * directly to the console when we received it, and
@@ -2426,22 +2296,14 @@ void console_unlock(void)
 			 */
 			console_idx = log_next(console_idx);
 			console_seq++;
-			/*
-			 * We will get here again when we register a new
-			 * CON_PRINTBUFFER console. Clear the flag so we
-			 * will properly dump everything later.
-			 */
-			msg->flags &= ~LOG_NOCONS;
-			console_prev = msg->flags;
 			goto skip;
 		}
 
-		len += msg_print_text(msg, console_prev, false,
-				      text + len, sizeof(text) - len);
+		len += msg_print_text(msg, false, text + len, sizeof(text) - len);
 		if (nr_ext_console_drivers) {
 			ext_len = msg_print_ext_header(ext_text,
 						sizeof(ext_text),
-						msg, console_seq, console_prev);
+						msg, console_seq);
 			ext_len += msg_print_ext_body(ext_text + ext_len,
 						sizeof(ext_text) - ext_len,
 						log_dict(msg), msg->dict_len,
@@ -2449,7 +2311,6 @@ void console_unlock(void)
 		}
 		console_idx = log_next(console_idx);
 		console_seq++;
-		console_prev = msg->flags;
 		raw_spin_unlock(&logbuf_lock);
 
 		stop_critical_timings();	/* don't trace print latency */
@@ -2483,7 +2344,7 @@ void console_unlock(void)
 	if (retry && console_trylock())
 		goto again;
 
-	if (wake_klogd)
+	if (wake_klogd || cont.len)
 		wake_up_klogd();
 }
 EXPORT_SYMBOL(console_unlock);
@@ -2744,7 +2605,6 @@ void register_console(struct console *newcon)
 		raw_spin_lock_irqsave(&logbuf_lock, flags);
 		console_seq = syslog_seq;
 		console_idx = syslog_idx;
-		console_prev = syslog_prev;
 		raw_spin_unlock_irqrestore(&logbuf_lock, flags);
 		/*
 		 * We're about to replay the log buffer.  Only do this to the
@@ -2883,6 +2743,7 @@ static void wake_up_klogd_work_func(struct irq_work *irq_work)
 
 	if (pending & PRINTK_PENDING_WAKEUP)
 		wake_up_interruptible(&log_wait);
+	deferred_cont_flush();
 }
 
 static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = {
@@ -3095,7 +2956,7 @@ bool kmsg_dump_get_line_nolock(struct kmsg_dumper *dumper, bool syslog,
 		goto out;
 
 	msg = log_from_idx(dumper->cur_idx);
-	l = msg_print_text(msg, 0, syslog, line, size);
+	l = msg_print_text(msg, syslog, line, size);
 
 	dumper->cur_idx = log_next(dumper->cur_idx);
 	dumper->cur_seq++;
@@ -3165,7 +3026,6 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
 	u32 idx;
 	u64 next_seq;
 	u32 next_idx;
-	enum log_flags prev;
 	size_t l = 0;
 	bool ret = false;
 
@@ -3189,27 +3049,23 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
 	/* calculate length of entire buffer */
 	seq = dumper->cur_seq;
 	idx = dumper->cur_idx;
-	prev = 0;
 	while (seq < dumper->next_seq) {
 		struct printk_log *msg = log_from_idx(idx);
 
-		l += msg_print_text(msg, prev, true, NULL, 0);
+		l += msg_print_text(msg, true, NULL, 0);
 		idx = log_next(idx);
 		seq++;
-		prev = msg->flags;
 	}
 
 	/* move first record forward until length fits into the buffer */
 	seq = dumper->cur_seq;
 	idx = dumper->cur_idx;
-	prev = 0;
 	while (l > size && seq < dumper->next_seq) {
 		struct printk_log *msg = log_from_idx(idx);
 
-		l -= msg_print_text(msg, prev, true, NULL, 0);
+		l -= msg_print_text(msg, true, NULL, 0);
 		idx = log_next(idx);
 		seq++;
-		prev = msg->flags;
 	}
 
 	/* last message in next iteration */
@@ -3220,10 +3076,9 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
 	while (seq < dumper->next_seq) {
 		struct printk_log *msg = log_from_idx(idx);
 
-		l += msg_print_text(msg, prev, syslog, buf + l, size - l);
+		l += msg_print_text(msg, syslog, buf + l, size - l);
 		idx = log_next(idx);
 		seq++;
-		prev = msg->flags;
 	}
 
 	dumper->next_seq = next_seq;

^ permalink raw reply related	[flat|nested] 12+ messages in thread

* Re: [GIT pull] timer updates for 4.9
  2016-10-24 19:30             ` Linus Torvalds
@ 2016-10-24 21:36               ` Thomas Gleixner
  0 siblings, 0 replies; 12+ messages in thread
From: Thomas Gleixner @ 2016-10-24 21:36 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Sergey Senozhatsky, LKML, Andrew Morton, Ingo Molnar, H. Peter Anvin

On Mon, 24 Oct 2016, Linus Torvalds wrote:
> The one I'm *currently* testing seems to work for me, but that's
> because I moved the timer into klogd and that just avoids the issue.
> 
> But I'll attach my old patch just for your testing pleasure.

Of course it does not trigger... I probably need to test it on a desktop
machine to get all the goodies of that gnome shell ignoring signals.

> The fact that it actually mostly worked - and then had some very odd
> behavior much later on - makes me wonder if we might have other users
> that could have timers that get started early, and nobody notices
> because most of the time it doesn't cause any real issues.

I added a bunch of timers before init_timers(). There is no fallout aside
from the timers being queued into a random bucket and the spinlock bad magic
bug, which actually happens at spin_lock and not at unlock as I assumed.

While queueing timers before init_timers() is not correct, I think that the
early queueing is a red herring. We should really try to isolate the root
cause of this wreckage.

Can you retest that old patch with my combo patch applied and see if it
still triggers? If yes, which I expect, debugobjects might give us a hint
about what to look for.

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 12+ messages in thread

* [tip:timers/urgent] timers: Plug locking race vs. timer migration
  2016-10-24 15:13         ` Thomas Gleixner
@ 2016-10-25 14:57           ` tip-bot for Thomas Gleixner
  2016-10-25 14:58           ` [tip:timers/urgent] timers: Lock base for same bucket optimization tip-bot for Thomas Gleixner
  1 sibling, 0 replies; 12+ messages in thread
From: tip-bot for Thomas Gleixner @ 2016-10-25 14:57 UTC (permalink / raw)
  To: linux-tip-commits; +Cc: hpa, tglx, mingo, peterz, linux-kernel, torvalds, akpm

Commit-ID:  b831275a3553c32091222ac619cfddd73a5553fb
Gitweb:     http://git.kernel.org/tip/b831275a3553c32091222ac619cfddd73a5553fb
Author:     Thomas Gleixner <tglx@linutronix.de>
AuthorDate: Mon, 24 Oct 2016 11:41:56 +0200
Committer:  Thomas Gleixner <tglx@linutronix.de>
CommitDate: Tue, 25 Oct 2016 16:27:39 +0200

timers: Plug locking race vs. timer migration

Linus noticed that lock_timer_base() lacks a READ_ONCE() for accessing the
timer flags. As a consequence the compiler is allowed to reload the flags
between the initial check for TIMER_MIGRATION and the following timer base
computation and the spin lock of the base.

While this has not been observed (yet), we need to make sure that it never
happens.

Fixes: 0eeda71bc30d ("timer: Replace timer base by a cpu index")
Reported-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1610241711220.4983@nanos
Cc: stable@vger.kernel.org
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
---
 kernel/time/timer.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 2d47980..0d4b91c 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -943,7 +943,14 @@ static struct timer_base *lock_timer_base(struct timer_list *timer,
 {
 	for (;;) {
 		struct timer_base *base;
-		u32 tf = timer->flags;
+		u32 tf;
+
+		/*
+		 * We need to use READ_ONCE() here, otherwise the compiler
+		 * might re-read @tf between the check for TIMER_MIGRATING
+		 * and spin_lock().
+		 */
+		tf = READ_ONCE(timer->flags);
 
 		if (!(tf & TIMER_MIGRATING)) {
 			base = get_timer_base(tf);

^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [tip:timers/urgent] timers: Lock base for same bucket optimization
  2016-10-24 15:13         ` Thomas Gleixner
  2016-10-25 14:57           ` [tip:timers/urgent] timers: Plug locking race vs. timer migration tip-bot for Thomas Gleixner
@ 2016-10-25 14:58           ` tip-bot for Thomas Gleixner
  1 sibling, 0 replies; 12+ messages in thread
From: tip-bot for Thomas Gleixner @ 2016-10-25 14:58 UTC (permalink / raw)
  To: linux-tip-commits; +Cc: peterz, mingo, akpm, tglx, torvalds, linux-kernel, hpa

Commit-ID:  4da9152a4308dcbf611cde399c695c359fc9145f
Gitweb:     http://git.kernel.org/tip/4da9152a4308dcbf611cde399c695c359fc9145f
Author:     Thomas Gleixner <tglx@linutronix.de>
AuthorDate: Mon, 24 Oct 2016 11:55:10 +0200
Committer:  Thomas Gleixner <tglx@linutronix.de>
CommitDate: Tue, 25 Oct 2016 16:27:39 +0200

timers: Lock base for same bucket optimization

Linus stumbled over the unlocked modification of the timer expiry value in
mod_timer() which is an optimization for timers which stay in the same
bucket - due to the bucket granularity - despite their expiry time getting
updated.

The optimization itself still makes sense even if we take the lock, because
in case that the bucket stays the same, we avoid the pointless
queue/enqueue dance.

Make the check and the modification of timer->expires protected by the base
lock and shuffle the remaining code around so we can keep the lock held
when we actually have to requeue the timer to a different bucket.

Fixes: f00c0afdfa62 ("timers: Implement optimization for same expiry time in mod_timer()")
Reported-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1610241711220.4983@nanos
Cc: stable@vger.kernel.org
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
---
 kernel/time/timer.c | 28 +++++++++++++++++-----------
 1 file changed, 17 insertions(+), 11 deletions(-)

diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 0d4b91c..ccf9130 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -971,6 +971,8 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
 	unsigned long clk = 0, flags;
 	int ret = 0;
 
+	BUG_ON(!timer->function);
+
 	/*
 	 * This is a common optimization triggered by the networking code - if
 	 * the timer is re-modified to have the same timeout or ends up in the
@@ -979,13 +981,16 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
 	if (timer_pending(timer)) {
 		if (timer->expires == expires)
 			return 1;
+
 		/*
-		 * Take the current timer_jiffies of base, but without holding
-		 * the lock!
+		 * We lock timer base and calculate the bucket index right
+		 * here. If the timer ends up in the same bucket, then we
+		 * just update the expiry time and avoid the whole
+		 * dequeue/enqueue dance.
 		 */
-		base = get_timer_base(timer->flags);
-		clk = base->clk;
+		base = lock_timer_base(timer, &flags);
 
+		clk = base->clk;
 		idx = calc_wheel_index(expires, clk);
 
 		/*
@@ -995,14 +1000,14 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
 		 */
 		if (idx == timer_get_idx(timer)) {
 			timer->expires = expires;
-			return 1;
+			ret = 1;
+			goto out_unlock;
 		}
+	} else {
+		base = lock_timer_base(timer, &flags);
 	}
 
 	timer_stats_timer_set_start_info(timer);
-	BUG_ON(!timer->function);
-
-	base = lock_timer_base(timer, &flags);
 
 	ret = detach_if_pending(timer, base, false);
 	if (!ret && pending_only)
@@ -1035,9 +1040,10 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
 	timer->expires = expires;
 	/*
 	 * If 'idx' was calculated above and the base time did not advance
-	 * between calculating 'idx' and taking the lock, only enqueue_timer()
-	 * and trigger_dyntick_cpu() is required. Otherwise we need to
-	 * (re)calculate the wheel index via internal_add_timer().
+	 * between calculating 'idx' and possibly switching the base, only
+	 * enqueue_timer() and trigger_dyntick_cpu() is required. Otherwise
+	 * we need to (re)calculate the wheel index via
+	 * internal_add_timer().
 	 */
 	if (idx != UINT_MAX && clk == base->clk) {
 		enqueue_timer(base, timer, idx);

^ permalink raw reply related	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2016-10-25 14:59 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-10-22 12:02 [GIT pull] timer updates for 4.9 Thomas Gleixner
2016-10-23 22:39 ` Linus Torvalds
2016-10-23 23:20   ` Linus Torvalds
2016-10-24  9:39     ` Thomas Gleixner
2016-10-24 14:51       ` Thomas Gleixner
2016-10-24 15:13         ` Thomas Gleixner
2016-10-25 14:57           ` [tip:timers/urgent] timers: Plug locking race vs. timer migration tip-bot for Thomas Gleixner
2016-10-25 14:58           ` [tip:timers/urgent] timers: Lock base for same bucket optimization tip-bot for Thomas Gleixner
2016-10-24 17:16         ` [GIT pull] timer updates for 4.9 Linus Torvalds
2016-10-24 19:09           ` Thomas Gleixner
2016-10-24 19:30             ` Linus Torvalds
2016-10-24 21:36               ` Thomas Gleixner

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).