* [ANNOUNCE] v5.9.1-rt18
@ 2020-10-21 12:53 Sebastian Andrzej Siewior
From: Sebastian Andrzej Siewior @ 2020-10-21 12:53 UTC
  To: Thomas Gleixner; +Cc: LKML, linux-rt-users, Steven Rostedt

Dear RT folks!

I'm pleased to announce the v5.9.1-rt18 patch set. 

Changes since v5.9.1-rt17:

  - Update the migrate-disable series by Peter Zijlstra to v3. Also
    include the fixes discussed in the thread.

  - UP builds did not boot since the replacement of the migrate-disable
    code. Reported by Christian Egger. Fixed as part of v3 by Peter
    Zijlstra.

  - Rebase the printk code on top of the ring buffer designed for
    printk, which was merged in the v5.10 merge window. Patches by John
    Ogness.

Known issues
     - It has been pointed out that, due to the changes in the printk code,
       the internal buffer representation changed. This is only an issue if
       tools like `crash' are used to extract the printk buffer from a kernel
       memory image.

The delta patch against v5.9.1-rt17 is appended below and can be found here:
 
     https://cdn.kernel.org/pub/linux/kernel/projects/rt/5.9/incr/patch-5.9.1-rt17-rt18.patch.xz

You can get this release via the git tree at:

    git://git.kernel.org/pub/scm/linux/kernel/git/rt/linux-rt-devel.git v5.9.1-rt18

The RT patch against v5.9.1 can be found here:

    https://cdn.kernel.org/pub/linux/kernel/projects/rt/5.9/older/patch-5.9.1-rt18.patch.xz

The split quilt queue is available at:

    https://cdn.kernel.org/pub/linux/kernel/projects/rt/5.9/older/patches-5.9.1-rt18.tar.xz
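
For reference, a minimal sketch of fetching and applying the full patch on
top of a vanilla v5.9.1 tree (file names and directory layout below are
examples only; the delta patch, the git tag or the quilt queue above work
just as well):

    # assumes a vanilla v5.9.1 source tree in linux-5.9.1/
    wget https://cdn.kernel.org/pub/linux/kernel/projects/rt/5.9/older/patch-5.9.1-rt18.patch.xz
    cd linux-5.9.1
    xzcat ../patch-5.9.1-rt18.patch.xz | patch -p1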

Sebastian

diff --git a/Documentation/admin-guide/kdump/gdbmacros.txt b/Documentation/admin-guide/kdump/gdbmacros.txt
index 220d0a80ca2c9..82aecdcae8a6c 100644
--- a/Documentation/admin-guide/kdump/gdbmacros.txt
+++ b/Documentation/admin-guide/kdump/gdbmacros.txt
@@ -170,57 +170,82 @@ document trapinfo
 	address the kernel panicked.
 end
 
-define dump_log_idx
-	set $idx = $arg0
-	if ($argc > 1)
-		set $prev_flags = $arg1
+define dump_record
+	set var $desc = $arg0
+	set var $info = $arg1
+	if ($argc > 2)
+		set var $prev_flags = $arg2
 	else
-		set $prev_flags = 0
-	end
-	set $msg = ((struct printk_log *) (log_buf + $idx))
-	set $prefix = 1
-	set $newline = 1
-	set $log = log_buf + $idx + sizeof(*$msg)
-
-	# prev & LOG_CONT && !(msg->flags & LOG_PREIX)
-	if (($prev_flags & 8) && !($msg->flags & 4))
-		set $prefix = 0
+		set var $prev_flags = 0
 	end
 
-	# msg->flags & LOG_CONT
-	if ($msg->flags & 8)
+	set var $prefix = 1
+	set var $newline = 1
+
+	set var $begin = $desc->text_blk_lpos.begin % (1U << prb->text_data_ring.size_bits)
+	set var $next = $desc->text_blk_lpos.next % (1U << prb->text_data_ring.size_bits)
+
+	# handle data-less record
+	if ($begin & 1)
+		set var $text_len = 0
+		set var $log = ""
+	else
+		# handle wrapping data block
+		if ($begin > $next)
+			set var $begin = 0
+		end
+
+		# skip over descriptor id
+		set var $begin = $begin + sizeof(long)
+
+		# handle truncated message
+		if ($next - $begin < $info->text_len)
+			set var $text_len = $next - $begin
+		else
+			set var $text_len = $info->text_len
+		end
+
+		set var $log = &prb->text_data_ring.data[$begin]
+	end
+
+	# prev & LOG_CONT && !(info->flags & LOG_PREIX)
+	if (($prev_flags & 8) && !($info->flags & 4))
+		set var $prefix = 0
+	end
+
+	# info->flags & LOG_CONT
+	if ($info->flags & 8)
 		# (prev & LOG_CONT && !(prev & LOG_NEWLINE))
 		if (($prev_flags & 8) && !($prev_flags & 2))
-			set $prefix = 0
+			set var $prefix = 0
 		end
-		# (!(msg->flags & LOG_NEWLINE))
-		if (!($msg->flags & 2))
-			set $newline = 0
+		# (!(info->flags & LOG_NEWLINE))
+		if (!($info->flags & 2))
+			set var $newline = 0
 		end
 	end
 
 	if ($prefix)
-		printf "[%5lu.%06lu] ", $msg->ts_nsec / 1000000000, $msg->ts_nsec % 1000000000
+		printf "[%5lu.%06lu] ", $info->ts_nsec / 1000000000, $info->ts_nsec % 1000000000
 	end
-	if ($msg->text_len != 0)
-		eval "printf \"%%%d.%ds\", $log", $msg->text_len, $msg->text_len
+	if ($text_len)
+		eval "printf \"%%%d.%ds\", $log", $text_len, $text_len
 	end
 	if ($newline)
 		printf "\n"
 	end
-	if ($msg->dict_len > 0)
-		set $dict = $log + $msg->text_len
-		set $idx = 0
-		set $line = 1
-		while ($idx < $msg->dict_len)
-			if ($line)
-				printf " "
-				set $line = 0
-			end
-			set $c = $dict[$idx]
+
+	# handle dictionary data
+
+	set var $dict = &$info->dev_info.subsystem[0]
+	set var $dict_len = sizeof($info->dev_info.subsystem)
+	if ($dict[0] != '\0')
+		printf " SUBSYSTEM="
+		set var $idx = 0
+		while ($idx < $dict_len)
+			set var $c = $dict[$idx]
 			if ($c == '\0')
-				printf "\n"
-				set $line = 1
+				loop_break
 			else
 				if ($c < ' ' || $c >= 127 || $c == '\\')
 					printf "\\x%02x", $c
@@ -228,33 +253,67 @@ define dump_log_idx
 					printf "%c", $c
 				end
 			end
-			set $idx = $idx + 1
+			set var $idx = $idx + 1
+		end
+		printf "\n"
+	end
+
+	set var $dict = &$info->dev_info.device[0]
+	set var $dict_len = sizeof($info->dev_info.device)
+	if ($dict[0] != '\0')
+		printf " DEVICE="
+		set var $idx = 0
+		while ($idx < $dict_len)
+			set var $c = $dict[$idx]
+			if ($c == '\0')
+				loop_break
+			else
+				if ($c < ' ' || $c >= 127 || $c == '\\')
+					printf "\\x%02x", $c
+				else
+					printf "%c", $c
+				end
+			end
+			set var $idx = $idx + 1
 		end
 		printf "\n"
 	end
 end
-document dump_log_idx
-	Dump a single log given its index in the log buffer.  The first
-	parameter is the index into log_buf, the second is optional and
-	specified the previous log buffer's flags, used for properly
-	formatting continued lines.
+document dump_record
+	Dump a single record. The first parameter is the descriptor,
+	the second parameter is the info, the third parameter is
+	optional and specifies the previous record's flags, used for
+	properly formatting continued lines.
 end
 
 define dmesg
-	set $i = log_first_idx
-	set $end_idx = log_first_idx
-	set $prev_flags = 0
+	# definitions from kernel/printk/printk_ringbuffer.h
+	set var $desc_committed = 1
+	set var $desc_finalized = 2
+	set var $desc_sv_bits = sizeof(long) * 8
+	set var $desc_flags_shift = $desc_sv_bits - 2
+	set var $desc_flags_mask = 3 << $desc_flags_shift
+	set var $id_mask = ~$desc_flags_mask
+
+	set var $desc_count = 1U << prb->desc_ring.count_bits
+	set var $prev_flags = 0
+
+	set var $id = prb->desc_ring.tail_id.counter
+	set var $end_id = prb->desc_ring.head_id.counter
 
 	while (1)
-		set $msg = ((struct printk_log *) (log_buf + $i))
-		if ($msg->len == 0)
-			set $i = 0
-		else
-			dump_log_idx $i $prev_flags
-			set $i = $i + $msg->len
-			set $prev_flags = $msg->flags
+		set var $desc = &prb->desc_ring.descs[$id % $desc_count]
+		set var $info = &prb->desc_ring.infos[$id % $desc_count]
+
+		# skip non-committed record
+		set var $state = 3 & ($desc->state_var.counter >> $desc_flags_shift)
+		if ($state == $desc_committed || $state == $desc_finalized)
+			dump_record $desc $info $prev_flags
+			set var $prev_flags = $info->flags
 		end
-		if ($i == $end_idx)
+
+		set var $id = ($id + 1) & $id_mask
+		if ($id == $end_id)
 			loop_break
 		end
 	end
diff --git a/Documentation/admin-guide/kdump/vmcoreinfo.rst b/Documentation/admin-guide/kdump/vmcoreinfo.rst
index 2baad0bfb09d0..e44a6c01f3362 100644
--- a/Documentation/admin-guide/kdump/vmcoreinfo.rst
+++ b/Documentation/admin-guide/kdump/vmcoreinfo.rst
@@ -189,50 +189,123 @@ from this.
 Free areas descriptor. User-space tools use this value to iterate the
 free_area ranges. MAX_ORDER is used by the zone buddy allocator.
 
-log_first_idx
--------------
+prb
+---
 
-Index of the first record stored in the buffer log_buf. Used by
-user-space tools to read the strings in the log_buf.
+A pointer to the printk ringbuffer (struct printk_ringbuffer). This
+may be pointing to the static boot ringbuffer or the dynamically
+allocated ringbuffer, depending on when the the core dump occurred.
+Used by user-space tools to read the active kernel log buffer.
 
-log_buf
--------
+printk_rb_static
+----------------
 
-Console output is written to the ring buffer log_buf at index
-log_first_idx. Used to get the kernel log.
+A pointer to the static boot printk ringbuffer. If @prb has a
+different value, this is useful for viewing the initial boot messages,
+which may have been overwritten in the dynamically allocated
+ringbuffer.
 
-log_buf_len
------------
-
-log_buf's length.
-
-clear_idx
+clear_seq
 ---------
 
-The index that the next printk() record to read after the last clear
-command. It indicates the first record after the last SYSLOG_ACTION
-_CLEAR, like issued by 'dmesg -c'. Used by user-space tools to dump
-the dmesg log.
+The sequence number of the printk() record after the last clear
+command. It indicates the first record after the last
+SYSLOG_ACTION_CLEAR, like issued by 'dmesg -c'. Used by user-space
+tools to dump a subset of the dmesg log.
 
-log_next_idx
-------------
+printk_ringbuffer
+-----------------
 
-The index of the next record to store in the buffer log_buf. Used to
-compute the index of the current buffer position.
+The size of a printk_ringbuffer structure. This structure contains all
+information required for accessing the various components of the
+kernel log buffer.
 
-printk_log
-----------
+(printk_ringbuffer, desc_ring|text_data_ring|dict_data_ring|fail)
+-----------------------------------------------------------------
 
-The size of a structure printk_log. Used to compute the size of
-messages, and extract dmesg log. It encapsulates header information for
-log_buf, such as timestamp, syslog level, etc.
+Offsets for the various components of the printk ringbuffer. Used by
+user-space tools to view the kernel log buffer without requiring the
+declaration of the structure.
 
-(printk_log, ts_nsec|len|text_len|dict_len)
--------------------------------------------
+prb_desc_ring
+-------------
 
-It represents field offsets in struct printk_log. User space tools
-parse it and check whether the values of printk_log's members have been
-changed.
+The size of the prb_desc_ring structure. This structure contains
+information about the set of record descriptors.
+
+(prb_desc_ring, count_bits|descs|head_id|tail_id)
+-------------------------------------------------
+
+Offsets for the fields describing the set of record descriptors. Used
+by user-space tools to be able to traverse the descriptors without
+requiring the declaration of the structure.
+
+prb_desc
+--------
+
+The size of the prb_desc structure. This structure contains
+information about a single record descriptor.
+
+(prb_desc, info|state_var|text_blk_lpos|dict_blk_lpos)
+------------------------------------------------------
+
+Offsets for the fields describing a record descriptors. Used by
+user-space tools to be able to read descriptors without requiring
+the declaration of the structure.
+
+prb_data_blk_lpos
+-----------------
+
+The size of the prb_data_blk_lpos structure. This structure contains
+information about where the text or dictionary data (data block) is
+located within the respective data ring.
+
+(prb_data_blk_lpos, begin|next)
+-------------------------------
+
+Offsets for the fields describing the location of a data block. Used
+by user-space tools to be able to locate data blocks without
+requiring the declaration of the structure.
+
+printk_info
+-----------
+
+The size of the printk_info structure. This structure contains all
+the meta-data for a record.
+
+(printk_info, seq|ts_nsec|text_len|dict_len|caller_id)
+------------------------------------------------------
+
+Offsets for the fields providing the meta-data for a record. Used by
+user-space tools to be able to read the information without requiring
+the declaration of the structure.
+
+prb_data_ring
+-------------
+
+The size of the prb_data_ring structure. This structure contains
+information about a set of data blocks.
+
+(prb_data_ring, size_bits|data|head_lpos|tail_lpos)
+---------------------------------------------------
+
+Offsets for the fields describing a set of data blocks. Used by
+user-space tools to be able to access the data blocks without
+requiring the declaration of the structure.
+
+atomic_long_t
+-------------
+
+The size of the atomic_long_t structure. Used by user-space tools to
+be able to copy the full structure, regardless of its
+architecture-specific implementation.
+
+(atomic_long_t, counter)
+------------------------
+
+Offset for the long value of an atomic_long_t variable. Used by
+user-space tools to access the long value without requiring the
+architecture-specific declaration.
 
 (free_area.free_list, MIGRATE_TYPES)
 ------------------------------------
diff --git a/Documentation/printk-ringbuffer.txt b/Documentation/printk-ringbuffer.txt
deleted file mode 100644
index 6bde5dbd8545b..0000000000000
--- a/Documentation/printk-ringbuffer.txt
+++ /dev/null
@@ -1,377 +0,0 @@
-struct printk_ringbuffer
-------------------------
-John Ogness <john.ogness@linutronix.de>
-
-Overview
-~~~~~~~~
-As the name suggests, this ring buffer was implemented specifically to serve
-the needs of the printk() infrastructure. The ring buffer itself is not
-specific to printk and could be used for other purposes. _However_, the
-requirements and semantics of printk are rather unique. If you intend to use
-this ring buffer for anything other than printk, you need to be very clear on
-its features, behavior, and pitfalls.
-
-Features
-^^^^^^^^
-The printk ring buffer has the following features:
-
-- single global buffer
-- resides in initialized data section (available at early boot)
-- lockless readers
-- supports multiple writers
-- supports multiple non-consuming readers
-- safe from any context (including NMI)
-- groups bytes into variable length blocks (referenced by entries)
-- entries tagged with sequence numbers
-
-Behavior
-^^^^^^^^
-Since the printk ring buffer readers are lockless, there exists no
-synchronization between readers and writers. Basically writers are the tasks
-in control and may overwrite any and all committed data at any time and from
-any context. For this reason readers can miss entries if they are overwritten
-before the reader was able to access the data. The reader API implementation
-is such that reader access to entries is atomic, so there is no risk of
-readers having to deal with partial or corrupt data. Also, entries are
-tagged with sequence numbers so readers can recognize if entries were missed.
-
-Writing to the ring buffer consists of 2 steps. First a writer must reserve
-an entry of desired size. After this step the writer has exclusive access
-to the memory region. Once the data has been written to memory, it needs to
-be committed to the ring buffer. After this step the entry has been inserted
-into the ring buffer and assigned an appropriate sequence number.
-
-Once committed, a writer must no longer access the data directly. This is
-because the data may have been overwritten and no longer exists. If a
-writer must access the data, it should either keep a private copy before
-committing the entry or use the reader API to gain access to the data.
-
-Because of how the data backend is implemented, entries that have been
-reserved but not yet committed act as barriers, preventing future writers
-from filling the ring buffer beyond the location of the reserved but not
-yet committed entry region. For this reason it is *important* that writers
-perform both reserve and commit as quickly as possible. Also, be aware that
-preemption and local interrupts are disabled and writing to the ring buffer
-is processor-reentrant locked during the reserve/commit window. Writers in
-NMI contexts can still preempt any other writers, but as long as these
-writers do not write a large amount of data with respect to the ring buffer
-size, this should not become an issue.
-
-API
-~~~
-
-Declaration
-^^^^^^^^^^^
-The printk ring buffer can be instantiated as a static structure:
-
- /* declare a static struct printk_ringbuffer */
- #define DECLARE_STATIC_PRINTKRB(name, szbits, cpulockptr)
-
-The value of szbits specifies the size of the ring buffer in bits. The
-cpulockptr field is a pointer to a prb_cpulock struct that is used to
-perform processor-reentrant spin locking for the writers. It is specified
-externally because it may be used for multiple ring buffers (or other
-code) to synchronize writers without risk of deadlock.
-
-Here is an example of a declaration of a printk ring buffer specifying a
-32KB (2^15) ring buffer:
-
-....
-DECLARE_STATIC_PRINTKRB_CPULOCK(rb_cpulock);
-DECLARE_STATIC_PRINTKRB(rb, 15, &rb_cpulock);
-....
-
-If writers will be using multiple ring buffers and the ordering of that usage
-is not clear, the same prb_cpulock should be used for both ring buffers.
-
-Writer API
-^^^^^^^^^^
-The writer API consists of 2 functions. The first is to reserve an entry in
-the ring buffer, the second is to commit that data to the ring buffer. The
-reserved entry information is stored within a provided `struct prb_handle`.
-
- /* reserve an entry */
- char *prb_reserve(struct prb_handle *h, struct printk_ringbuffer *rb,
-                   unsigned int size);
-
- /* commit a reserved entry to the ring buffer */
- void prb_commit(struct prb_handle *h);
-
-Here is an example of a function to write data to a ring buffer:
-
-....
-int write_data(struct printk_ringbuffer *rb, char *data, int size)
-{
-    struct prb_handle h;
-    char *buf;
-
-    buf = prb_reserve(&h, rb, size);
-    if (!buf)
-        return -1;
-    memcpy(buf, data, size);
-    prb_commit(&h);
-
-    return 0;
-}
-....
-
-Pitfalls
-++++++++
-Be aware that prb_reserve() can fail. A retry might be successful, but it
-depends entirely on whether or not the next part of the ring buffer to
-overwrite belongs to reserved but not yet committed entries of other writers.
-Writers can use the prb_inc_lost() function to allow readers to notice that a
-message was lost.
-
-Reader API
-^^^^^^^^^^
-The reader API utilizes a `struct prb_iterator` to track the reader's
-position in the ring buffer.
-
- /* declare a pre-initialized static iterator for a ring buffer */
- #define DECLARE_STATIC_PRINTKRB_ITER(name, rbaddr)
-
- /* initialize iterator for a ring buffer (if static macro NOT used) */
- void prb_iter_init(struct prb_iterator *iter,
-                    struct printk_ringbuffer *rb, u64 *seq);
-
- /* make a deep copy of an iterator */
- void prb_iter_copy(struct prb_iterator *dest,
-                    struct prb_iterator *src);
-
- /* non-blocking, advance to next entry (and read the data) */
- int prb_iter_next(struct prb_iterator *iter, char *buf,
-                   int size, u64 *seq);
-
- /* blocking, advance to next entry (and read the data) */
- int prb_iter_wait_next(struct prb_iterator *iter, char *buf,
-                        int size, u64 *seq);
-
- /* position iterator at the entry seq */
- int prb_iter_seek(struct prb_iterator *iter, u64 seq);
-
- /* read data at current position */
- int prb_iter_data(struct prb_iterator *iter, char *buf,
-                   int size, u64 *seq);
-
-Typically prb_iter_data() is not needed because the data can be retrieved
-directly with prb_iter_next().
-
-Here is an example of a non-blocking function that will read all the data in
-a ring buffer:
-
-....
-void read_all_data(struct printk_ringbuffer *rb, char *buf, int size)
-{
-    struct prb_iterator iter;
-    u64 prev_seq = 0;
-    u64 seq;
-    int ret;
-
-    prb_iter_init(&iter, rb, NULL);
-
-    for (;;) {
-        ret = prb_iter_next(&iter, buf, size, &seq);
-        if (ret > 0) {
-            if (seq != ++prev_seq) {
-                /* "seq - prev_seq" entries missed */
-                prev_seq = seq;
-            }
-            /* process buf here */
-        } else if (ret == 0) {
-            /* hit the end, done */
-            break;
-        } else if (ret < 0) {
-            /*
-             * iterator is invalid, a writer overtook us, reset the
-             * iterator and keep going, entries were missed
-             */
-            prb_iter_init(&iter, rb, NULL);
-        }
-    }
-}
-....
-
-Pitfalls
-++++++++
-The reader's iterator can become invalid at any time because the reader was
-overtaken by a writer. Typically the reader should reset the iterator back
-to the current oldest entry (which will be newer than the entry the reader
-was at) and continue, noting the number of entries that were missed.
-
-Utility API
-^^^^^^^^^^^
-Several functions are available as convenience for external code.
-
- /* query the size of the data buffer */
- int prb_buffer_size(struct printk_ringbuffer *rb);
-
- /* skip a seq number to signify a lost record */
- void prb_inc_lost(struct printk_ringbuffer *rb);
-
- /* processor-reentrant spin lock */
- void prb_lock(struct prb_cpulock *cpu_lock, unsigned int *cpu_store);
-
- /* processor-reentrant spin unlock */
- void prb_lock(struct prb_cpulock *cpu_lock, unsigned int *cpu_store);
-
-Pitfalls
-++++++++
-Although the value returned by prb_buffer_size() does represent an absolute
-upper bound, the amount of data that can be stored within the ring buffer
-is actually less because of the additional storage space of a header for each
-entry.
-
-The prb_lock() and prb_unlock() functions can be used to synchronize between
-ring buffer writers and other external activities. The function of a
-processor-reentrant spin lock is to disable preemption and local interrupts
-and synchronize against other processors. It does *not* protect against
-multiple contexts of a single processor, i.e NMI.
-
-Implementation
-~~~~~~~~~~~~~~
-This section describes several of the implementation concepts and details to
-help developers better understand the code.
-
-Entries
-^^^^^^^
-All ring buffer data is stored within a single static byte array. The reason
-for this is to ensure that any pointers to the data (past and present) will
-always point to valid memory. This is important because the lockless readers
-may be preempted for long periods of time and when they resume may be working
-with expired pointers.
-
-Entries are identified by start index and size. (The start index plus size
-is the start index of the next entry.) The start index is not simply an
-offset into the byte array, but rather a logical position (lpos) that maps
-directly to byte array offsets.
-
-For example, for a byte array of 1000, an entry may have have a start index
-of 100. Another entry may have a start index of 1100. And yet another 2100.
-All of these entry are pointing to the same memory region, but only the most
-recent entry is valid. The other entries are pointing to valid memory, but
-represent entries that have been overwritten.
-
-Note that due to overflowing, the most recent entry is not necessarily the one
-with the highest lpos value. Indeed, the printk ring buffer initializes its
-data such that an overflow happens relatively quickly in order to validate the
-handling of this situation. The implementation assumes that an lpos (unsigned
-long) will never completely wrap while a reader is preempted. If this were to
-become an issue, the seq number (which never wraps) could be used to increase
-the robustness of handling this situation.
-
-Buffer Wrapping
-^^^^^^^^^^^^^^^
-If an entry starts near the end of the byte array but would extend beyond it,
-a special terminating entry (size = -1) is inserted into the byte array and
-the real entry is placed at the beginning of the byte array. This can waste
-space at the end of the byte array, but simplifies the implementation by
-allowing writers to always work with contiguous buffers.
-
-Note that the size field is the first 4 bytes of the entry header. Also note
-that calc_next() always ensures that there are at least 4 bytes left at the
-end of the byte array to allow room for a terminating entry.
-
-Ring Buffer Pointers
-^^^^^^^^^^^^^^^^^^^^
-Three pointers (lpos values) are used to manage the ring buffer:
-
- - _tail_: points to the oldest entry
- - _head_: points to where the next new committed entry will be
- - _reserve_: points to where the next new reserved entry will be
-
-These pointers always maintain a logical ordering:
-
- tail <= head <= reserve
-
-The reserve pointer moves forward when a writer reserves a new entry. The
-head pointer moves forward when a writer commits a new entry.
-
-The reserve pointer cannot overwrite the tail pointer in a wrap situation. In
-such a situation, the tail pointer must be "pushed forward", thus
-invalidating that oldest entry. Readers identify if they are accessing a
-valid entry by ensuring their entry pointer is `>= tail && < head`.
-
-If the tail pointer is equal to the head pointer, it cannot be pushed and any
-reserve operation will fail. The only resolution is for writers to commit
-their reserved entries.
-
-Processor-Reentrant Locking
-^^^^^^^^^^^^^^^^^^^^^^^^^^^
-The purpose of the processor-reentrant locking is to limit the interruption
-scenarios of writers to 2 contexts. This allows for a simplified
-implementation where:
-
-- The reserve/commit window only exists on 1 processor at a time. A reserve
-  can never fail due to uncommitted entries of other processors.
-
-- When committing entries, it is trivial to handle the situation when
-  subsequent entries have already been committed, i.e. managing the head
-  pointer.
-
-Performance
-~~~~~~~~~~~
-Some basic tests were performed on a quad Intel(R) Xeon(R) CPU E5-2697 v4 at
-2.30GHz (36 cores / 72 threads). All tests involved writing a total of
-32,000,000 records at an average of 33 bytes each. Each writer was pinned to
-its own CPU and would write as fast as it could until a total of 32,000,000
-records were written. All tests involved 2 readers that were both pinned
-together to another CPU. Each reader would read as fast as it could and track
-how many of the 32,000,000 records it could read. All tests used a ring buffer
-of 16KB in size, which holds around 350 records (header + data for each
-entry).
-
-The only difference between the tests is the number of writers (and thus also
-the number of records per writer). As more writers are added, the time to
-write a record increases. This is because data pointers, modified via cmpxchg,
-and global data access in general become more contended.
-
-1 writer
-^^^^^^^^
- runtime: 0m 18s
- reader1: 16219900/32000000 (50%) records
- reader2: 16141582/32000000 (50%) records
-
-2 writers
-^^^^^^^^^
- runtime: 0m 32s
- reader1: 16327957/32000000 (51%) records
- reader2: 16313988/32000000 (50%) records
-
-4 writers
-^^^^^^^^^
- runtime: 0m 42s
- reader1: 16421642/32000000 (51%) records
- reader2: 16417224/32000000 (51%) records
-
-8 writers
-^^^^^^^^^
- runtime: 0m 43s
- reader1: 16418300/32000000 (51%) records
- reader2: 16432222/32000000 (51%) records
-
-16 writers
-^^^^^^^^^^
- runtime: 0m 54s
- reader1: 16539189/32000000 (51%) records
- reader2: 16542711/32000000 (51%) records
-
-32 writers
-^^^^^^^^^^
- runtime: 1m 13s
- reader1: 16731808/32000000 (52%) records
- reader2: 16735119/32000000 (52%) records
-
-Comments
-^^^^^^^^
-It is particularly interesting to compare/contrast the 1-writer and 32-writer
-tests. Despite the writing of the 32,000,000 records taking over 4 times
-longer, the readers (which perform no cmpxchg) were still unable to keep up.
-This shows that the memory contention between the increasing number of CPUs
-also has a dramatic effect on readers.
-
-It should also be noted that in all cases each reader was able to read >=50%
-of the records. This means that a single reader would have been able to keep
-up with the writer(s) in all cases, becoming slightly easier as more writers
-are added. This was the purpose of pinning 2 readers to 1 CPU: to observe how
-maximum reader performance changes.
diff --git a/MAINTAINERS b/MAINTAINERS
index 867157311dc8b..7ae63272d994c 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -13960,6 +13960,7 @@ PRINTK
 M:	Petr Mladek <pmladek@suse.com>
 M:	Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
 R:	Steven Rostedt <rostedt@goodmis.org>
+R:	John Ogness <john.ogness@linutronix.de>
 S:	Maintained
 F:	include/linux/printk.h
 F:	kernel/printk/
diff --git a/drivers/base/core.c b/drivers/base/core.c
index bb5806a2bd4ca..f90e9f77bf8c2 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -4061,22 +4061,21 @@ void device_shutdown(void)
  */
 
 #ifdef CONFIG_PRINTK
-static int
-create_syslog_header(const struct device *dev, char *hdr, size_t hdrlen)
+static void
+set_dev_info(const struct device *dev, struct dev_printk_info *dev_info)
 {
 	const char *subsys;
-	size_t pos = 0;
+
+	memset(dev_info, 0, sizeof(*dev_info));
 
 	if (dev->class)
 		subsys = dev->class->name;
 	else if (dev->bus)
 		subsys = dev->bus->name;
 	else
-		return 0;
+		return;
 
-	pos += snprintf(hdr + pos, hdrlen - pos, "SUBSYSTEM=%s", subsys);
-	if (pos >= hdrlen)
-		goto overflow;
+	strscpy(dev_info->subsystem, subsys, sizeof(dev_info->subsystem));
 
 	/*
 	 * Add device identifier DEVICE=:
@@ -4092,41 +4091,28 @@ create_syslog_header(const struct device *dev, char *hdr, size_t hdrlen)
 			c = 'b';
 		else
 			c = 'c';
-		pos++;
-		pos += snprintf(hdr + pos, hdrlen - pos,
-				"DEVICE=%c%u:%u",
-				c, MAJOR(dev->devt), MINOR(dev->devt));
+
+		snprintf(dev_info->device, sizeof(dev_info->device),
+			 "%c%u:%u", c, MAJOR(dev->devt), MINOR(dev->devt));
 	} else if (strcmp(subsys, "net") == 0) {
 		struct net_device *net = to_net_dev(dev);
 
-		pos++;
-		pos += snprintf(hdr + pos, hdrlen - pos,
-				"DEVICE=n%u", net->ifindex);
+		snprintf(dev_info->device, sizeof(dev_info->device),
+			 "n%u", net->ifindex);
 	} else {
-		pos++;
-		pos += snprintf(hdr + pos, hdrlen - pos,
-				"DEVICE=+%s:%s", subsys, dev_name(dev));
+		snprintf(dev_info->device, sizeof(dev_info->device),
+			 "+%s:%s", subsys, dev_name(dev));
 	}
-
-	if (pos >= hdrlen)
-		goto overflow;
-
-	return pos;
-
-overflow:
-	dev_WARN(dev, "device/subsystem name too long");
-	return 0;
 }
 
 int dev_vprintk_emit(int level, const struct device *dev,
 		     const char *fmt, va_list args)
 {
-	char hdr[128];
-	size_t hdrlen;
+	struct dev_printk_info dev_info;
 
-	hdrlen = create_syslog_header(dev, hdr, sizeof(hdr));
+	set_dev_info(dev, &dev_info);
 
-	return vprintk_emit(0, level, hdrlen ? hdr : NULL, hdrlen, fmt, args);
+	return vprintk_emit(0, level, &dev_info, fmt, args);
 }
 EXPORT_SYMBOL(dev_vprintk_emit);
 
diff --git a/fs/proc/kmsg.c b/fs/proc/kmsg.c
index 18ed6e4e0c7e7..b38ad552887fb 100644
--- a/fs/proc/kmsg.c
+++ b/fs/proc/kmsg.c
@@ -18,6 +18,8 @@
 #include <linux/uaccess.h>
 #include <asm/io.h>
 
+extern wait_queue_head_t log_wait;
+
 static int kmsg_open(struct inode * inode, struct file * file)
 {
 	return do_syslog(SYSLOG_ACTION_OPEN, NULL, 0, SYSLOG_FROM_PROC);
@@ -40,7 +42,7 @@ static ssize_t kmsg_read(struct file *file, char __user *buf,
 
 static __poll_t kmsg_poll(struct file *file, poll_table *wait)
 {
-	poll_wait(file, printk_wait_queue(), wait);
+	poll_wait(file, &log_wait, wait);
 	if (do_syslog(SYSLOG_ACTION_SIZE_UNREAD, NULL, 0, SYSLOG_FROM_PROC))
 		return EPOLLIN | EPOLLRDNORM;
 	return 0;
diff --git a/include/linux/console.h b/include/linux/console.h
index 1badb57ba82f3..00d7437a92e11 100644
--- a/include/linux/console.h
+++ b/include/linux/console.h
@@ -137,6 +137,7 @@ static inline int con_debug_leave(void)
 #define CON_ANYTIME	(16) /* Safe to call when cpu is offline */
 #define CON_BRL		(32) /* Used for a braille device */
 #define CON_EXTENDED	(64) /* Use the extended output format a la /dev/kmsg */
+#define CON_HANDOVER	(128) /* Device was previously a boot console. */
 
 struct console {
 	char	name[16];
@@ -151,8 +152,8 @@ struct console {
 	short	flags;
 	short	index;
 	int	cflag;
-	unsigned long printk_seq;
-	int	wrote_history;
+	atomic64_t printk_seq;
+	struct task_struct *thread;
 	void	*data;
 	struct	 console *next;
 };
diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h
index 6594dbc34a374..206bde8308b2d 100644
--- a/include/linux/crash_core.h
+++ b/include/linux/crash_core.h
@@ -55,6 +55,9 @@ phys_addr_t paddr_vmcoreinfo_note(void);
 #define VMCOREINFO_OFFSET(name, field) \
 	vmcoreinfo_append_str("OFFSET(%s.%s)=%lu\n", #name, #field, \
 			      (unsigned long)offsetof(struct name, field))
+#define VMCOREINFO_TYPE_OFFSET(name, field) \
+	vmcoreinfo_append_str("OFFSET(%s.%s)=%lu\n", #name, #field, \
+			      (unsigned long)offsetof(name, field))
 #define VMCOREINFO_LENGTH(name, value) \
 	vmcoreinfo_append_str("LENGTH(%s)=%lu\n", #name, (unsigned long)value)
 #define VMCOREINFO_NUMBER(name) \
diff --git a/include/linux/dev_printk.h b/include/linux/dev_printk.h
index 3028b644b4fbd..6f009559ee540 100644
--- a/include/linux/dev_printk.h
+++ b/include/linux/dev_printk.h
@@ -21,6 +21,14 @@
 
 struct device;
 
+#define PRINTK_INFO_SUBSYSTEM_LEN	16
+#define PRINTK_INFO_DEVICE_LEN		48
+
+struct dev_printk_info {
+	char subsystem[PRINTK_INFO_SUBSYSTEM_LEN];
+	char device[PRINTK_INFO_DEVICE_LEN];
+};
+
 #ifdef CONFIG_PRINTK
 
 __printf(3, 0) __cold
diff --git a/include/linux/kmsg_dump.h b/include/linux/kmsg_dump.h
index 25f6652c05d53..3378bcbe585ea 100644
--- a/include/linux/kmsg_dump.h
+++ b/include/linux/kmsg_dump.h
@@ -45,8 +45,10 @@ struct kmsg_dumper {
 	bool registered;
 
 	/* private state of the kmsg iterator */
-	u64 line_seq;
-	u64 buffer_end_seq;
+	u32 cur_idx;
+	u32 next_idx;
+	u64 cur_seq;
+	u64 next_seq;
 };
 
 #ifdef CONFIG_PRINTK
diff --git a/include/linux/preempt.h b/include/linux/preempt.h
index e72b67e0ced8c..8a47b9b1bade1 100644
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -236,11 +236,16 @@ do { \
 		__preempt_schedule(); \
 } while (0)
 
+/*
+ * open code preempt_check_resched() because it is not exported to modules and
+ * used by local_unlock() or bpf_enable_instrumentation().
+ */
 #define preempt_lazy_enable() \
 do { \
 	dec_preempt_lazy_count(); \
 	barrier(); \
-	preempt_check_resched(); \
+	if (should_resched(0)) \
+		__preempt_schedule(); \
 } while (0)
 
 #else /* !CONFIG_PREEMPTION */
@@ -441,7 +446,19 @@ static inline void preempt_notifier_init(struct preempt_notifier *notifier,
 extern void migrate_disable(void);
 extern void migrate_enable(void);
 
-#else /* !(CONFIG_SMP && CONFIG_PREEMPT_RT) */
+#elif defined(CONFIG_PREEMPT_RT)
+
+static inline void migrate_disable(void)
+{
+	preempt_lazy_disable();
+}
+
+static inline void migrate_enable(void)
+{
+	preempt_lazy_enable();
+}
+
+#else /* !CONFIG_PREEMPT_RT */
 
 /**
  * migrate_disable - Prevent migration of the current task
diff --git a/include/linux/printk.h b/include/linux/printk.h
index 4318e2190408a..c49d5bb3f8ffa 100644
--- a/include/linux/printk.h
+++ b/include/linux/printk.h
@@ -59,7 +59,6 @@ static inline const char *printk_skip_headers(const char *buffer)
  */
 #define CONSOLE_LOGLEVEL_DEFAULT CONFIG_CONSOLE_LOGLEVEL_DEFAULT
 #define CONSOLE_LOGLEVEL_QUIET	 CONFIG_CONSOLE_LOGLEVEL_QUIET
-#define CONSOLE_LOGLEVEL_EMERGENCY CONFIG_CONSOLE_LOGLEVEL_EMERGENCY
 
 extern int console_printk[];
 
@@ -67,7 +66,6 @@ extern int console_printk[];
 #define default_message_loglevel (console_printk[1])
 #define minimum_console_loglevel (console_printk[2])
 #define default_console_loglevel (console_printk[3])
-#define emergency_console_loglevel (console_printk[4])
 
 static inline void console_silent(void)
 {
@@ -149,10 +147,12 @@ static inline __printf(1, 2) __cold
 void early_printk(const char *s, ...) { }
 #endif
 
+struct dev_printk_info;
+
 #ifdef CONFIG_PRINTK
-asmlinkage __printf(5, 0)
+asmlinkage __printf(4, 0)
 int vprintk_emit(int facility, int level,
-		 const char *dict, size_t dictlen,
+		 const struct dev_printk_info *dev_info,
 		 const char *fmt, va_list args);
 
 asmlinkage __printf(1, 0)
@@ -193,7 +193,6 @@ __printf(1, 2) void dump_stack_set_arch_desc(const char *fmt, ...);
 void dump_stack_print_info(const char *log_lvl);
 void show_regs_print_info(const char *log_lvl);
 extern asmlinkage void dump_stack(void) __cold;
-struct wait_queue_head *printk_wait_queue(void);
 #else
 static inline __printf(1, 0)
 int vprintk(const char *s, va_list args)
@@ -257,7 +256,6 @@ static inline void show_regs_print_info(const char *log_lvl)
 static inline void dump_stack(void)
 {
 }
-
 #endif
 
 extern int kptr_restrict;
diff --git a/include/linux/printk_ringbuffer.h b/include/linux/printk_ringbuffer.h
deleted file mode 100644
index afd03305d2066..0000000000000
--- a/include/linux/printk_ringbuffer.h
+++ /dev/null
@@ -1,114 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _LINUX_PRINTK_RINGBUFFER_H
-#define _LINUX_PRINTK_RINGBUFFER_H
-
-#include <linux/irq_work.h>
-#include <linux/atomic.h>
-#include <linux/percpu.h>
-#include <linux/wait.h>
-
-struct prb_cpulock {
-	atomic_t owner;
-	unsigned long __percpu *irqflags;
-};
-
-struct printk_ringbuffer {
-	void *buffer;
-	unsigned int size_bits;
-
-	u64 seq;
-	atomic_long_t lost;
-
-	atomic_long_t tail;
-	atomic_long_t head;
-	atomic_long_t reserve;
-
-	struct prb_cpulock *cpulock;
-	atomic_t ctx;
-
-	struct wait_queue_head *wq;
-	atomic_long_t wq_counter;
-	struct irq_work *wq_work;
-};
-
-struct prb_entry {
-	unsigned int size;
-	u64 seq;
-	char data[0];
-};
-
-struct prb_handle {
-	struct printk_ringbuffer *rb;
-	unsigned int cpu;
-	struct prb_entry *entry;
-};
-
-#define DECLARE_STATIC_PRINTKRB_CPULOCK(name)				\
-static DEFINE_PER_CPU(unsigned long, _##name##_percpu_irqflags);	\
-static struct prb_cpulock name = {					\
-	.owner = ATOMIC_INIT(-1),					\
-	.irqflags = &_##name##_percpu_irqflags,				\
-}
-
-#define PRB_INIT ((unsigned long)-1)
-
-#define DECLARE_STATIC_PRINTKRB_ITER(name, rbaddr)			\
-static struct prb_iterator name = {					\
-	.rb = rbaddr,							\
-	.lpos = PRB_INIT,						\
-}
-
-struct prb_iterator {
-	struct printk_ringbuffer *rb;
-	unsigned long lpos;
-};
-
-#define DECLARE_STATIC_PRINTKRB(name, szbits, cpulockptr)		\
-static char _##name##_buffer[1 << (szbits)]				\
-	__aligned(__alignof__(long));					\
-static DECLARE_WAIT_QUEUE_HEAD(_##name##_wait);				\
-static void _##name##_wake_work_func(struct irq_work *irq_work)		\
-{									\
-	wake_up_interruptible_all(&_##name##_wait);			\
-}									\
-static struct irq_work _##name##_wake_work = {				\
-	.func = _##name##_wake_work_func,				\
-	.flags = ATOMIC_INIT(IRQ_WORK_LAZY),				\
-};									\
-static struct printk_ringbuffer name = {				\
-	.buffer = &_##name##_buffer[0],					\
-	.size_bits = szbits,						\
-	.seq = 0,							\
-	.lost = ATOMIC_LONG_INIT(0),					\
-	.tail = ATOMIC_LONG_INIT(-111 * sizeof(long)),			\
-	.head = ATOMIC_LONG_INIT(-111 * sizeof(long)),			\
-	.reserve = ATOMIC_LONG_INIT(-111 * sizeof(long)),		\
-	.cpulock = cpulockptr,						\
-	.ctx = ATOMIC_INIT(0),						\
-	.wq = &_##name##_wait,						\
-	.wq_counter = ATOMIC_LONG_INIT(0),				\
-	.wq_work = &_##name##_wake_work,				\
-}
-
-/* writer interface */
-char *prb_reserve(struct prb_handle *h, struct printk_ringbuffer *rb,
-		  unsigned int size);
-void prb_commit(struct prb_handle *h);
-
-/* reader interface */
-void prb_iter_init(struct prb_iterator *iter, struct printk_ringbuffer *rb,
-		   u64 *seq);
-void prb_iter_copy(struct prb_iterator *dest, struct prb_iterator *src);
-int prb_iter_next(struct prb_iterator *iter, char *buf, int size, u64 *seq);
-int prb_iter_wait_next(struct prb_iterator *iter, char *buf, int size,
-		       u64 *seq);
-int prb_iter_seek(struct prb_iterator *iter, u64 seq);
-int prb_iter_data(struct prb_iterator *iter, char *buf, int size, u64 *seq);
-
-/* utility functions */
-int prb_buffer_size(struct printk_ringbuffer *rb);
-void prb_inc_lost(struct printk_ringbuffer *rb);
-void prb_lock(struct prb_cpulock *cpu_lock, unsigned int *cpu_store);
-void prb_unlock(struct prb_cpulock *cpu_lock, unsigned int cpu_store);
-
-#endif /*_LINUX_PRINTK_RINGBUFFER_H */
diff --git a/include/linux/ratelimit.h b/include/linux/ratelimit.h
index 5ca206a41d678..b17e0cd0a30cf 100644
--- a/include/linux/ratelimit.h
+++ b/include/linux/ratelimit.h
@@ -28,7 +28,7 @@ static inline void ratelimit_state_exit(struct ratelimit_state *rs)
 		return;
 
 	if (rs->missed) {
-		pr_info("%s: %d output lines suppressed due to ratelimiting\n",
+		pr_warn("%s: %d output lines suppressed due to ratelimiting\n",
 			current->comm, rs->missed);
 		rs->missed = 0;
 	}
diff --git a/init/Kconfig b/init/Kconfig
index 7743d6e62a06a..c48887283f88a 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -682,7 +682,8 @@ config IKHEADERS
 
 config LOG_BUF_SHIFT
 	int "Kernel log buffer size (16 => 64KB, 17 => 128KB)"
-	range 12 25
+	range 12 25 if !H8300
+	range 12 19 if H8300
 	default 17
 	depends on PRINTK
 	help
diff --git a/kernel/printk/Makefile b/kernel/printk/Makefile
index 7b219d824c0fb..59cb24e25f004 100644
--- a/kernel/printk/Makefile
+++ b/kernel/printk/Makefile
@@ -1,3 +1,4 @@
 # SPDX-License-Identifier: GPL-2.0-only
 obj-y	= printk.o
 obj-$(CONFIG_A11Y_BRAILLE_CONSOLE)	+= braille.o
+obj-$(CONFIG_PRINTK)	+= printk_ringbuffer.o
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index ee7008c436ca1..78a277ea5c351 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -46,10 +46,10 @@
 #include <linux/uio.h>
 #include <linux/kthread.h>
 #include <linux/clocksource.h>
-#include <linux/printk_ringbuffer.h>
 #include <linux/sched/clock.h>
 #include <linux/sched/debug.h>
 #include <linux/sched/task_stack.h>
+#include <linux/kdb.h>
 
 #include <linux/uaccess.h>
 #include <asm/sections.h>
@@ -58,15 +58,15 @@
 #define CREATE_TRACE_POINTS
 #include <trace/events/printk.h>
 
+#include "printk_ringbuffer.h"
 #include "console_cmdline.h"
 #include "braille.h"
 
-int console_printk[5] = {
+int console_printk[4] = {
 	CONSOLE_LOGLEVEL_DEFAULT,	/* console_loglevel */
 	MESSAGE_LOGLEVEL_DEFAULT,	/* default_message_loglevel */
 	CONSOLE_LOGLEVEL_MIN,		/* minimum_console_loglevel */
 	CONSOLE_LOGLEVEL_DEFAULT,	/* default_console_loglevel */
-	CONSOLE_LOGLEVEL_EMERGENCY,	/* emergency_console_loglevel */
 };
 EXPORT_SYMBOL_GPL(console_printk);
 
@@ -80,6 +80,9 @@ EXPORT_SYMBOL(ignore_console_lock_warning);
 int oops_in_progress;
 EXPORT_SYMBOL(oops_in_progress);
 
+/* Set to enable sync mode. Once set, it is never cleared. */
+static bool sync_mode;
+
 /*
  * console_sem protects the console_drivers list, and also
  * provides serialisation for access to the entire console
@@ -276,30 +279,22 @@ enum con_msg_format_flags {
 static int console_msg_format = MSG_FORMAT_DEFAULT;
 
 /*
- * The printk log buffer consists of a chain of concatenated variable
- * length records. Every record starts with a record header, containing
- * the overall length of the record.
+ * The printk log buffer consists of a sequenced collection of records, each
+ * containing variable length message text. Every record also contains its
+ * own meta-data (@info).
  *
- * The heads to the first and last entry in the buffer, as well as the
- * sequence numbers of these entries are maintained when messages are
- * stored.
+ * Every record meta-data carries the timestamp in microseconds, as well as
+ * the standard userspace syslog level and syslog facility. The usual kernel
+ * messages use LOG_KERN; userspace-injected messages always carry a matching
+ * syslog facility, by default LOG_USER. The origin of every message can be
+ * reliably determined that way.
  *
- * If the heads indicate available messages, the length in the header
- * tells the start next message. A length == 0 for the next message
- * indicates a wrap-around to the beginning of the buffer.
+ * The human readable log message of a record is available in @text, the
+ * length of the message text in @text_len. The stored message is not
+ * terminated.
  *
- * Every record carries the monotonic timestamp in microseconds, as well as
- * the standard userspace syslog level and syslog facility. The usual
- * kernel messages use LOG_KERN; userspace-injected messages always carry
- * a matching syslog facility, by default LOG_USER. The origin of every
- * message can be reliably determined that way.
- *
- * The human readable log message directly follows the message header. The
- * length of the message text is stored in the header, the stored message
- * is not terminated.
- *
- * Optionally, a message can carry a dictionary of properties (key/value pairs),
- * to provide userspace with a machine-readable message context.
+ * Optionally, a record can carry a dictionary of properties (key/value
+ * pairs), to provide userspace with a machine-readable message context.
  *
  * Examples for well-defined, commonly used property names are:
  *   DEVICE=b12:8               device identifier
@@ -309,25 +304,22 @@ static int console_msg_format = MSG_FORMAT_DEFAULT;
  *                                +sound:card0  subsystem:devname
  *   SUBSYSTEM=pci              driver-core subsystem name
  *
- * Valid characters in property names are [a-zA-Z0-9.-_]. The plain text value
- * follows directly after a '=' character. Every property is terminated by
- * a '\0' character. The last property is not terminated.
+ * Valid characters in property names are [a-zA-Z0-9.-_]. Property names
+ * and values are terminated by a '\0' character.
  *
- * Example of a message structure:
- *   0000  ff 8f 00 00 00 00 00 00      monotonic time in nsec
- *   0008  34 00                        record is 52 bytes long
- *   000a        0b 00                  text is 11 bytes long
- *   000c              1f 00            dictionary is 23 bytes long
- *   000e                    03 00      LOG_KERN (facility) LOG_ERR (level)
- *   0010  69 74 27 73 20 61 20 6c      "it's a l"
- *         69 6e 65                     "ine"
- *   001b           44 45 56 49 43      "DEVIC"
- *         45 3d 62 38 3a 32 00 44      "E=b8:2\0D"
- *         52 49 56 45 52 3d 62 75      "RIVER=bu"
- *         67                           "g"
- *   0032     00 00 00                  padding to next message header
+ * Example of record values:
+ *   record.text_buf                = "it's a line" (unterminated)
+ *   record.info.seq                = 56
+ *   record.info.ts_nsec            = 36863
+ *   record.info.text_len           = 11
+ *   record.info.facility           = 0 (LOG_KERN)
+ *   record.info.flags              = 0
+ *   record.info.level              = 3 (LOG_ERR)
+ *   record.info.caller_id          = 299 (task 299)
+ *   record.info.dev_info.subsystem = "pci" (terminated)
+ *   record.info.dev_info.device    = "+pci:0000:00:01.0" (terminated)
  *
- * The 'struct printk_log' buffer header must never be directly exported to
+ * The 'struct printk_info' buffer must never be directly exported to
  * userspace, it is a kernel-private implementation detail that might
  * need to be changed in the future, when the requirements change.
  *
@@ -347,40 +339,23 @@ enum log_flags {
 	LOG_CONT	= 8,	/* text is a fragment of a continuation line */
 };
 
-struct printk_log {
-	u64 ts_nsec;		/* timestamp in nanoseconds */
-	u16 cpu;		/* cpu that generated record */
-	u16 len;		/* length of entire record */
-	u16 text_len;		/* length of text buffer */
-	u16 dict_len;		/* length of dictionary buffer */
-	u8 facility;		/* syslog facility */
-	u8 flags:5;		/* internal record flags */
-	u8 level:3;		/* syslog level */
-#ifdef CONFIG_PRINTK_CALLER
-	u32 caller_id;            /* thread id or processor id */
-#endif
-}
-#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
-__packed __aligned(4)
-#endif
-;
-
-DECLARE_STATIC_PRINTKRB_CPULOCK(printk_cpulock);
+/* The syslog_lock protects syslog_* variables. */
+static DEFINE_SPINLOCK(syslog_lock);
+#define syslog_lock_irq() spin_lock_irq(&syslog_lock)
+#define syslog_unlock_irq() spin_unlock_irq(&syslog_lock)
+#define syslog_lock_irqsave(flags) spin_lock_irqsave(&syslog_lock, flags)
+#define syslog_unlock_irqrestore(flags) spin_unlock_irqrestore(&syslog_lock, flags)
 
 #ifdef CONFIG_PRINTK
-/* record buffer */
-DECLARE_STATIC_PRINTKRB(printk_rb, CONFIG_LOG_BUF_SHIFT, &printk_cpulock);
-
-static DEFINE_MUTEX(syslog_lock);
-DECLARE_STATIC_PRINTKRB_ITER(syslog_iter, &printk_rb);
-
-/* the last printk record to read by syslog(READ) or /proc/kmsg */
+DECLARE_WAIT_QUEUE_HEAD(log_wait);
+/* All 3 protected by @syslog_lock. */
+/* the next printk record to read by syslog(READ) or /proc/kmsg */
 static u64 syslog_seq;
 static size_t syslog_partial;
 static bool syslog_time;
 
 /* the next printk record to read after the last 'clear' command */
-static u64 clear_seq;
+static atomic64_t clear_seq = ATOMIC64_INIT(0);
 
 #ifdef CONFIG_PRINTK_CALLER
 #define PREFIX_MAX		48
@@ -392,76 +367,80 @@ static u64 clear_seq;
 #define LOG_LEVEL(v)		((v) & 0x07)
 #define LOG_FACILITY(v)		((v) >> 3 & 0xff)
 
+/* record buffer */
+#define LOG_ALIGN __alignof__(unsigned long)
+#define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT)
+#define LOG_BUF_LEN_MAX (u32)(1 << 31)
+static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN);
+static char *log_buf = __log_buf;
+static u32 log_buf_len = __LOG_BUF_LEN;
+
+/*
+ * Define the average message size. This only affects the number of
+ * descriptors that will be available. Underestimating is better than
+ * overestimating (too many available descriptors is better than not enough).
+ */
+#define PRB_AVGBITS 5	/* 32 character average length */
+
+#if CONFIG_LOG_BUF_SHIFT <= PRB_AVGBITS
+#error CONFIG_LOG_BUF_SHIFT value too small.
+#endif
+_DEFINE_PRINTKRB(printk_rb_static, CONFIG_LOG_BUF_SHIFT - PRB_AVGBITS,
+		 PRB_AVGBITS, &__log_buf[0]);
+
+static struct printk_ringbuffer printk_rb_dynamic;
+
+static struct printk_ringbuffer *prb = &printk_rb_static;
+
+/*
+ * We cannot access per-CPU data (e.g. per-CPU flush irq_work) before
+ * per_cpu_areas are initialised. This variable is set to true when
+ * it's safe to access per-CPU data.
+ */
+static bool __printk_percpu_data_ready __read_mostly;
+
+static bool printk_percpu_data_ready(void)
+{
+	return __printk_percpu_data_ready;
+}
+
 /* Return log buffer address */
 char *log_buf_addr_get(void)
 {
-	return printk_rb.buffer;
+	return log_buf;
 }
 
 /* Return log buffer size */
 u32 log_buf_len_get(void)
 {
-	return (1 << printk_rb.size_bits);
+	return log_buf_len;
 }
 
-/* human readable text of the record */
-static char *log_text(const struct printk_log *msg)
+/*
+ * Define how much of the log buffer we could take at maximum. The value
+ * must be greater than two. Note that only half of the buffer is available
+ * when the index points to the middle.
+ */
+#define MAX_LOG_TAKE_PART 4
+static const char trunc_msg[] = "<truncated>";
+
+static void truncate_msg(u16 *text_len, u16 *trunc_msg_len)
 {
-	return (char *)msg + sizeof(struct printk_log);
-}
+	/*
+	 * The message should not take the whole buffer. Otherwise, it might
+	 * get removed too soon.
+	 */
+	u32 max_text_len = log_buf_len / MAX_LOG_TAKE_PART;
 
-/* optional key/value pair dictionary attached to the record */
-static char *log_dict(const struct printk_log *msg)
-{
-	return (char *)msg + sizeof(struct printk_log) + msg->text_len;
-}
+	if (*text_len > max_text_len)
+		*text_len = max_text_len;
 
-static void printk_emergency(char *buffer, int level, u64 ts_nsec, u16 cpu,
-			     char *text, u16 text_len);
-
-/* insert record into the buffer, discard old ones, update heads */
-static int log_store(u32 caller_id, int facility, int level,
-		     enum log_flags flags, u64 ts_nsec, u16 cpu,
-		     const char *dict, u16 dict_len,
-		     const char *text, u16 text_len)
-{
-	struct printk_log *msg;
-	struct prb_handle h;
-	char *rbuf;
-	u32 size;
-
-	size = sizeof(*msg) + text_len + dict_len;
-
-	rbuf = prb_reserve(&h, &printk_rb, size);
-	if (!rbuf) {
-		/*
-		 * An emergency message would have been printed, but
-		 * it cannot be stored in the log.
-		 */
-		prb_inc_lost(&printk_rb);
-		return 0;
-	}
-
-	/* fill message */
-	msg = (struct printk_log *)rbuf;
-	memcpy(log_text(msg), text, text_len);
-	msg->text_len = text_len;
-	memcpy(log_dict(msg), dict, dict_len);
-	msg->dict_len = dict_len;
-	msg->facility = facility;
-	msg->level = level & 7;
-	msg->flags = flags & 0x1f;
-	msg->ts_nsec = ts_nsec;
-#ifdef CONFIG_PRINTK_CALLER
-	msg->caller_id = caller_id;
-#endif
-	msg->cpu = cpu;
-	msg->len = size;
-
-	/* insert message */
-	prb_commit(&h);
-
-	return msg->text_len;
+	/* enable the warning message (if there is room) */
+	*trunc_msg_len = strlen(trunc_msg);
+	if (*text_len >= *trunc_msg_len)
+		*text_len -= *trunc_msg_len;
+	else
+		*trunc_msg_len = 0;
 }
 
 int dmesg_restrict = IS_ENABLED(CONFIG_SECURITY_DMESG_RESTRICT);
@@ -513,13 +492,13 @@ static void append_char(char **pp, char *e, char c)
 		*(*pp)++ = c;
 }
 
-static ssize_t msg_print_ext_header(char *buf, size_t size,
-				    struct printk_log *msg, u64 seq)
+static ssize_t info_print_ext_header(char *buf, size_t size,
+				     struct printk_info *info)
 {
-	u64 ts_usec = msg->ts_nsec;
+	u64 ts_usec = info->ts_nsec;
 	char caller[20];
 #ifdef CONFIG_PRINTK_CALLER
-	u32 id = msg->caller_id;
+	u32 id = info->caller_id;
 
 	snprintf(caller, sizeof(caller), ",caller=%c%u",
 		 id & 0x80000000 ? 'C' : 'T', id & ~0x80000000);
@@ -529,14 +508,14 @@ static ssize_t msg_print_ext_header(char *buf, size_t size,
 
 	do_div(ts_usec, 1000);
 
-	return scnprintf(buf, size, "%u,%llu,%llu,%c%s,%hu;",
-			 (msg->facility << 3) | msg->level, seq, ts_usec,
-			 msg->flags & LOG_CONT ? 'c' : '-', caller, msg->cpu);
+	return scnprintf(buf, size, "%u,%llu,%llu,%c%s;",
+			 (info->facility << 3) | info->level, info->seq,
+			 ts_usec, info->flags & LOG_CONT ? 'c' : '-', caller);
 }
 
-static ssize_t msg_print_ext_body(char *buf, size_t size,
-				  char *dict, size_t dict_len,
-				  char *text, size_t text_len)
+static ssize_t msg_add_ext_text(char *buf, size_t size,
+				const char *text, size_t text_len,
+				unsigned char endc)
 {
 	char *p = buf, *e = buf + size;
 	size_t i;
@@ -550,50 +529,56 @@ static ssize_t msg_print_ext_body(char *buf, size_t size,
 		else
 			append_char(&p, e, c);
 	}
-	append_char(&p, e, '\n');
-
-	if (dict_len) {
-		bool line = true;
-
-		for (i = 0; i < dict_len; i++) {
-			unsigned char c = dict[i];
-
-			if (line) {
-				append_char(&p, e, ' ');
-				line = false;
-			}
-
-			if (c == '\0') {
-				append_char(&p, e, '\n');
-				line = true;
-				continue;
-			}
-
-			if (c < ' ' || c >= 127 || c == '\\') {
-				p += scnprintf(p, e - p, "\\x%02x", c);
-				continue;
-			}
-
-			append_char(&p, e, c);
-		}
-		append_char(&p, e, '\n');
-	}
+	append_char(&p, e, endc);
 
 	return p - buf;
 }
 
-#define PRINTK_SPRINT_MAX (LOG_LINE_MAX + PREFIX_MAX)
-#define PRINTK_RECORD_MAX (sizeof(struct printk_log) + \
-				CONSOLE_EXT_LOG_MAX + PRINTK_SPRINT_MAX)
+static ssize_t msg_add_dict_text(char *buf, size_t size,
+				 const char *key, const char *val)
+{
+	size_t val_len = strlen(val);
+	ssize_t len;
+
+	if (!val_len)
+		return 0;
+
+	len = msg_add_ext_text(buf, size, "", 0, ' ');	/* dict prefix */
+	len += msg_add_ext_text(buf + len, size - len, key, strlen(key), '=');
+	len += msg_add_ext_text(buf + len, size - len, val, val_len, '\n');
+
+	return len;
+}
+
+static ssize_t msg_print_ext_body(char *buf, size_t size,
+				  char *text, size_t text_len,
+				  struct dev_printk_info *dev_info)
+{
+	ssize_t len;
+
+	len = msg_add_ext_text(buf, size, text, text_len, '\n');
+
+	if (!dev_info)
+		goto out;
+
+	len += msg_add_dict_text(buf + len, size - len, "SUBSYSTEM",
+				 dev_info->subsystem);
+	len += msg_add_dict_text(buf + len, size - len, "DEVICE",
+				 dev_info->device);
+out:
+	return len;
+}
 
 /* /dev/kmsg - userspace message inject/listen interface */
 struct devkmsg_user {
 	u64 seq;
-	struct prb_iterator iter;
 	struct ratelimit_state rs;
 	struct mutex lock;
 	char buf[CONSOLE_EXT_LOG_MAX];
-	char msgbuf[PRINTK_RECORD_MAX];
+
+	struct printk_info info;
+	char text_buf[CONSOLE_EXT_LOG_MAX];
+	struct printk_record record;
 };
 
 static __printf(3, 4) __cold
@@ -603,7 +588,7 @@ int devkmsg_emit(int facility, int level, const char *fmt, ...)
 	int r;
 
 	va_start(args, fmt);
-	r = vprintk_emit(facility, level, NULL, 0, fmt, args);
+	r = vprintk_emit(facility, level, NULL, fmt, args);
 	va_end(args);
 
 	return r;
@@ -676,11 +661,9 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
 			    size_t count, loff_t *ppos)
 {
 	struct devkmsg_user *user = file->private_data;
-	struct prb_iterator backup_iter;
-	struct printk_log *msg;
-	ssize_t ret;
+	struct printk_record *r = &user->record;
 	size_t len;
-	u64 seq;
+	ssize_t ret;
 
 	if (!user)
 		return -EBADF;
@@ -689,63 +672,42 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
 	if (ret)
 		return ret;
 
-	/* make a backup copy in case there is a problem */
-	prb_iter_copy(&backup_iter, &user->iter);
+	if (!prb_read_valid(prb, user->seq, r)) {
+		if (file->f_flags & O_NONBLOCK) {
+			ret = -EAGAIN;
+			goto out;
+		}
 
-	if (file->f_flags & O_NONBLOCK) {
-		ret = prb_iter_next(&user->iter, &user->msgbuf[0],
-				      sizeof(user->msgbuf), &seq);
-	} else {
-		ret = prb_iter_wait_next(&user->iter, &user->msgbuf[0],
-					   sizeof(user->msgbuf), &seq);
+		ret = wait_event_interruptible(log_wait,
+					prb_read_valid(prb, user->seq, r));
+		if (ret)
+			goto out;
 	}
-	if (ret == 0) {
-		/* end of list */
-		ret = -EAGAIN;
-		goto out;
-	} else if (ret == -EINVAL) {
-		/* iterator invalid, return error and reset */
+
+	if (user->seq < prb_first_valid_seq(prb)) {
+		/* our last seen message is gone, return error and reset */
+		user->seq = prb_first_valid_seq(prb);
 		ret = -EPIPE;
-		prb_iter_init(&user->iter, &printk_rb, &user->seq);
-		goto out;
-	} else if (ret < 0) {
-		/* interrupted by signal */
 		goto out;
 	}
 
-	user->seq++;
-	if (user->seq < seq) {
-		ret = -EPIPE;
-		goto restore_out;
-	}
-
-	msg = (struct printk_log *)&user->msgbuf[0];
-	len = msg_print_ext_header(user->buf, sizeof(user->buf),
-				   msg, user->seq);
+	len = info_print_ext_header(user->buf, sizeof(user->buf), r->info);
 	len += msg_print_ext_body(user->buf + len, sizeof(user->buf) - len,
-				  log_dict(msg), msg->dict_len,
-				  log_text(msg), msg->text_len);
+				  &r->text_buf[0], r->info->text_len,
+				  &r->info->dev_info);
+
+	user->seq = r->info->seq + 1;
 
 	if (len > count) {
 		ret = -EINVAL;
-		goto restore_out;
+		goto out;
 	}
 
 	if (copy_to_user(buf, user->buf, len)) {
 		ret = -EFAULT;
-		goto restore_out;
+		goto out;
 	}
-
 	ret = len;
-	goto out;
-restore_out:
-	/*
-	 * There was an error, but this message should not be
-	 * lost because of it. Restore the backup and setup
-	 * seq so that it will work with the next read.
-	 */
-	prb_iter_copy(&user->iter, &backup_iter);
-	user->seq = seq - 1;
 out:
 	mutex_unlock(&user->lock);
 	return ret;
@@ -762,22 +724,17 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
 static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence)
 {
 	struct devkmsg_user *user = file->private_data;
-	loff_t ret;
-	u64 seq;
+	loff_t ret = 0;
 
 	if (!user)
 		return -EBADF;
 	if (offset)
 		return -ESPIPE;
 
-	ret = mutex_lock_interruptible(&user->lock);
-	if (ret)
-		return ret;
-
 	switch (whence) {
 	case SEEK_SET:
 		/* the first record */
-		prb_iter_init(&user->iter, &printk_rb, &user->seq);
+		user->seq = prb_first_valid_seq(prb);
 		break;
 	case SEEK_DATA:
 		/*
@@ -785,87 +742,35 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence)
 		 * like issued by 'dmesg -c'. Reading /dev/kmsg itself
 		 * changes no global state, and does not clear anything.
 		 */
-		for (;;) {
-			prb_iter_init(&user->iter, &printk_rb, &seq);
-			ret = prb_iter_seek(&user->iter, clear_seq);
-			if (ret > 0) {
-				/* seeked to clear seq */
-				user->seq = clear_seq;
-				break;
-			} else if (ret == 0) {
-				/*
-				 * The end of the list was hit without
-				 * ever seeing the clear seq. Just
-				 * seek to the beginning of the list.
-				 */
-				prb_iter_init(&user->iter, &printk_rb,
-					      &user->seq);
-				break;
-			}
-			/* iterator invalid, start over */
-
-			/* reset clear_seq if it is no longer available */
-			if (seq > clear_seq)
-				clear_seq = 0;
-		}
-		ret = 0;
+		user->seq = atomic64_read(&clear_seq);
 		break;
 	case SEEK_END:
 		/* after the last record */
-		for (;;) {
-			ret = prb_iter_next(&user->iter, NULL, 0, &user->seq);
-			if (ret == 0)
-				break;
-			else if (ret > 0)
-				continue;
-			/* iterator invalid, start over */
-			prb_iter_init(&user->iter, &printk_rb, &user->seq);
-		}
-		ret = 0;
+		user->seq = prb_next_seq(prb);
 		break;
 	default:
 		ret = -EINVAL;
 	}
-
-	mutex_unlock(&user->lock);
 	return ret;
 }
 
-struct wait_queue_head *printk_wait_queue(void)
-{
-	/* FIXME: using prb internals! */
-	return printk_rb.wq;
-}
-
 static __poll_t devkmsg_poll(struct file *file, poll_table *wait)
 {
 	struct devkmsg_user *user = file->private_data;
-	struct prb_iterator iter;
 	__poll_t ret = 0;
-	int rbret;
-	u64 seq;
 
 	if (!user)
 		return EPOLLERR|EPOLLNVAL;
 
-	poll_wait(file, printk_wait_queue(), wait);
+	poll_wait(file, &log_wait, wait);
 
-	mutex_lock(&user->lock);
-
-	/* use copy so no actual iteration takes place */
-	prb_iter_copy(&iter, &user->iter);
-
-	rbret = prb_iter_next(&iter, &user->msgbuf[0],
-			      sizeof(user->msgbuf), &seq);
-	if (rbret == 0)
-		goto out;
-
-	ret = EPOLLIN|EPOLLRDNORM;
-
-	if (rbret < 0 || (seq - user->seq) != 1)
-		ret |= EPOLLERR|EPOLLPRI;
-out:
-	mutex_unlock(&user->lock);
+	if (prb_read_valid(prb, user->seq, NULL)) {
+		/* return error when data has vanished underneath us */
+		if (user->seq < prb_first_valid_seq(prb))
+			ret = EPOLLIN|EPOLLRDNORM|EPOLLERR|EPOLLPRI;
+		else
+			ret = EPOLLIN|EPOLLRDNORM;
+	}
 
 	return ret;
 }
@@ -895,7 +800,10 @@ static int devkmsg_open(struct inode *inode, struct file *file)
 
 	mutex_init(&user->lock);
 
-	prb_iter_init(&user->iter, &printk_rb, &user->seq);
+	prb_rec_init_rd(&user->record, &user->info,
+			&user->text_buf[0], sizeof(user->text_buf));
+
+	user->seq = prb_first_valid_seq(prb);
 
 	file->private_data = user;
 	return 0;
@@ -935,23 +843,64 @@ const struct file_operations kmsg_fops = {
  */
 void log_buf_vmcoreinfo_setup(void)
 {
+	struct dev_printk_info *dev_info = NULL;
+
+	VMCOREINFO_SYMBOL(prb);
+	VMCOREINFO_SYMBOL(printk_rb_static);
+	VMCOREINFO_SYMBOL(clear_seq);
+
 	/*
-	 * Export struct printk_log size and field offsets. User space tools can
+	 * Export struct size and field offsets. User space tools can
 	 * parse it and detect any changes to structure down the line.
 	 */
-	VMCOREINFO_STRUCT_SIZE(printk_log);
-	VMCOREINFO_OFFSET(printk_log, ts_nsec);
-	VMCOREINFO_OFFSET(printk_log, len);
-	VMCOREINFO_OFFSET(printk_log, text_len);
-	VMCOREINFO_OFFSET(printk_log, dict_len);
-#ifdef CONFIG_PRINTK_CALLER
-	VMCOREINFO_OFFSET(printk_log, caller_id);
-#endif
+
+	VMCOREINFO_SIZE(atomic64_t);
+	VMCOREINFO_TYPE_OFFSET(atomic64_t, counter);
+
+	VMCOREINFO_STRUCT_SIZE(printk_ringbuffer);
+	VMCOREINFO_OFFSET(printk_ringbuffer, desc_ring);
+	VMCOREINFO_OFFSET(printk_ringbuffer, text_data_ring);
+	VMCOREINFO_OFFSET(printk_ringbuffer, fail);
+
+	VMCOREINFO_STRUCT_SIZE(prb_desc_ring);
+	VMCOREINFO_OFFSET(prb_desc_ring, count_bits);
+	VMCOREINFO_OFFSET(prb_desc_ring, descs);
+	VMCOREINFO_OFFSET(prb_desc_ring, infos);
+	VMCOREINFO_OFFSET(prb_desc_ring, head_id);
+	VMCOREINFO_OFFSET(prb_desc_ring, tail_id);
+
+	VMCOREINFO_STRUCT_SIZE(prb_desc);
+	VMCOREINFO_OFFSET(prb_desc, state_var);
+	VMCOREINFO_OFFSET(prb_desc, text_blk_lpos);
+
+	VMCOREINFO_STRUCT_SIZE(prb_data_blk_lpos);
+	VMCOREINFO_OFFSET(prb_data_blk_lpos, begin);
+	VMCOREINFO_OFFSET(prb_data_blk_lpos, next);
+
+	VMCOREINFO_STRUCT_SIZE(printk_info);
+	VMCOREINFO_OFFSET(printk_info, seq);
+	VMCOREINFO_OFFSET(printk_info, ts_nsec);
+	VMCOREINFO_OFFSET(printk_info, text_len);
+	VMCOREINFO_OFFSET(printk_info, caller_id);
+	VMCOREINFO_OFFSET(printk_info, dev_info);
+
+	VMCOREINFO_STRUCT_SIZE(dev_printk_info);
+	VMCOREINFO_OFFSET(dev_printk_info, subsystem);
+	VMCOREINFO_LENGTH(printk_info_subsystem, sizeof(dev_info->subsystem));
+	VMCOREINFO_OFFSET(dev_printk_info, device);
+	VMCOREINFO_LENGTH(printk_info_device, sizeof(dev_info->device));
+
+	VMCOREINFO_STRUCT_SIZE(prb_data_ring);
+	VMCOREINFO_OFFSET(prb_data_ring, size_bits);
+	VMCOREINFO_OFFSET(prb_data_ring, data);
+	VMCOREINFO_OFFSET(prb_data_ring, head_lpos);
+	VMCOREINFO_OFFSET(prb_data_ring, tail_lpos);
+
+	VMCOREINFO_SIZE(atomic_long_t);
+	VMCOREINFO_TYPE_OFFSET(atomic_long_t, counter);
 }
 #endif
 
-/* FIXME: no support for buffer resizing */
-#if 0
 /* requested log_buf_len from kernel cmdline */
 static unsigned long __initdata new_log_buf_len;
 
@@ -1017,15 +966,59 @@ static void __init log_buf_add_cpu(void)
 #else /* !CONFIG_SMP */
 static inline void log_buf_add_cpu(void) {}
 #endif /* CONFIG_SMP */
-#endif /* 0 */
+
+static void __init set_percpu_data_ready(void)
+{
+	__printk_percpu_data_ready = true;
+}
+
+static unsigned int __init add_to_rb(struct printk_ringbuffer *rb,
+				     struct printk_record *r)
+{
+	struct prb_reserved_entry e;
+	struct printk_record dest_r;
+
+	prb_rec_init_wr(&dest_r, r->info->text_len);
+
+	if (!prb_reserve(&e, rb, &dest_r))
+		return 0;
+
+	memcpy(&dest_r.text_buf[0], &r->text_buf[0], r->info->text_len);
+	dest_r.info->text_len = r->info->text_len;
+	dest_r.info->facility = r->info->facility;
+	dest_r.info->level = r->info->level;
+	dest_r.info->flags = r->info->flags;
+	dest_r.info->ts_nsec = r->info->ts_nsec;
+	dest_r.info->caller_id = r->info->caller_id;
+	memcpy(&dest_r.info->dev_info, &r->info->dev_info, sizeof(dest_r.info->dev_info));
+
+	prb_final_commit(&e);
+
+	return prb_record_text_space(&e);
+}
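/*
 * add_to_rb() is used by setup_log_buf() further below to migrate the
 * early boot records from the static ringbuffer into the dynamically
 * allocated one, roughly:
 *
 *   prb_for_each_record(0, &printk_rb_static, seq, &r)
 *           free -= add_to_rb(&printk_rb_dynamic, &r);
 *
 * Each record is re-reserved in the destination, its text and metadata
 * copied, and then finalized with prb_final_commit().
 */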
+
+static char setup_text_buf[LOG_LINE_MAX] __initdata;
 
 void __init setup_log_buf(int early)
 {
-/* FIXME: no support for buffer resizing */
-#if 0
-	unsigned long flags;
+	struct printk_info *new_infos;
+	unsigned int new_descs_count;
+	struct prb_desc *new_descs;
+	struct printk_info info;
+	struct printk_record r;
+	size_t new_descs_size;
+	size_t new_infos_size;
 	char *new_log_buf;
 	unsigned int free;
+	u64 seq;
+
+	/*
+	 * Some archs call setup_log_buf() multiple times - first is very
+	 * early, e.g. from setup_arch(), and second - when percpu_areas
+	 * are initialised.
+	 */
+	if (!early)
+		set_percpu_data_ready();
 
 	if (log_buf != __log_buf)
 		return;
@@ -1036,25 +1029,71 @@ void __init setup_log_buf(int early)
 	if (!new_log_buf_len)
 		return;
 
-	new_log_buf = memblock_alloc(new_log_buf_len, LOG_ALIGN);
-	if (unlikely(!new_log_buf)) {
-		pr_err("log_buf_len: %lu bytes not available\n",
-			new_log_buf_len);
+	new_descs_count = new_log_buf_len >> PRB_AVGBITS;
+	if (new_descs_count == 0) {
+		pr_err("new_log_buf_len: %lu too small\n", new_log_buf_len);
 		return;
 	}
 
-	logbuf_lock_irqsave(flags);
+	new_log_buf = memblock_alloc(new_log_buf_len, LOG_ALIGN);
+	if (unlikely(!new_log_buf)) {
+		pr_err("log_buf_len: %lu text bytes not available\n",
+		       new_log_buf_len);
+		return;
+	}
+
+	new_descs_size = new_descs_count * sizeof(struct prb_desc);
+	new_descs = memblock_alloc(new_descs_size, LOG_ALIGN);
+	if (unlikely(!new_descs)) {
+		pr_err("log_buf_len: %zu desc bytes not available\n",
+		       new_descs_size);
+		goto err_free_log_buf;
+	}
+
+	new_infos_size = new_descs_count * sizeof(struct printk_info);
+	new_infos = memblock_alloc(new_infos_size, LOG_ALIGN);
+	if (unlikely(!new_infos)) {
+		pr_err("log_buf_len: %zu info bytes not available\n",
+		       new_infos_size);
+		goto err_free_descs;
+	}
+
+	prb_rec_init_rd(&r, &info, &setup_text_buf[0], sizeof(setup_text_buf));
+
+	prb_init(&printk_rb_dynamic,
+		 new_log_buf, ilog2(new_log_buf_len),
+		 new_descs, ilog2(new_descs_count),
+		 new_infos);
+
 	log_buf_len = new_log_buf_len;
 	log_buf = new_log_buf;
 	new_log_buf_len = 0;
-	free = __LOG_BUF_LEN - log_next_idx;
-	memcpy(log_buf, __log_buf, __LOG_BUF_LEN);
-	logbuf_unlock_irqrestore(flags);
+
+	free = __LOG_BUF_LEN;
+	prb_for_each_record(0, &printk_rb_static, seq, &r)
+		free -= add_to_rb(&printk_rb_dynamic, &r);
+
+	/*
+	 * This is early enough that everything is still running on the
+	 * boot CPU and interrupts are disabled. So no new messages will
+	 * appear during the transition to the dynamic buffer.
+	 */
+	prb = &printk_rb_dynamic;
+
+	if (seq != prb_next_seq(&printk_rb_static)) {
+		pr_err("dropped %llu messages\n",
+		       prb_next_seq(&printk_rb_static) - seq);
+	}
 
 	pr_info("log_buf_len: %u bytes\n", log_buf_len);
 	pr_info("early log buf free: %u(%u%%)\n",
 		free, (free * 100) / __LOG_BUF_LEN);
-#endif
+	return;
+
+err_free_descs:
+	memblock_free(__pa(new_descs), new_descs_size);
+err_free_log_buf:
+	memblock_free(__pa(new_log_buf), new_log_buf_len);
 }
 
 static bool __read_mostly ignore_loglevel;
@@ -1135,11 +1174,6 @@ static inline void boot_delay_msec(int level)
 static bool printk_time = IS_ENABLED(CONFIG_PRINTK_TIME);
 module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR);
 
-static size_t print_cpu(u16 cpu, char *buf)
-{
-	return sprintf(buf, "%03hu: ", cpu);
-}
-
 static size_t print_syslog(unsigned int level, char *buf)
 {
 	return sprintf(buf, "<%u>", level);
@@ -1166,104 +1200,169 @@ static size_t print_caller(u32 id, char *buf)
 #define print_caller(id, buf) 0
 #endif
 
-static size_t print_prefix(const struct printk_log *msg, bool syslog,
-			   bool time, char *buf)
+static size_t info_print_prefix(const struct printk_info *info, bool syslog,
+				bool time, char *buf)
 {
 	size_t len = 0;
 
 	if (syslog)
-		len = print_syslog((msg->facility << 3) | msg->level, buf);
+		len = print_syslog((info->facility << 3) | info->level, buf);
 
 	if (time)
-		len += print_time(msg->ts_nsec, buf + len);
+		len += print_time(info->ts_nsec, buf + len);
 
-	len += print_caller(msg->caller_id, buf + len);
+	len += print_caller(info->caller_id, buf + len);
 
 	if (IS_ENABLED(CONFIG_PRINTK_CALLER) || time) {
 		buf[len++] = ' ';
 		buf[len] = '\0';
 	}
-	len += print_cpu(msg->cpu, buf + len);
 
 	return len;
 }
 
-static size_t msg_print_text(const struct printk_log *msg, bool syslog,
-			     bool time, char *buf, size_t size)
+/*
+ * Prepare the record for printing. The text is shifted within the given
+ * buffer to avoid a need for another one. The following operations are
+ * done:
+ *
+ *   - Add prefix for each line.
+ *   - Add the trailing newline that has been removed in vprintk_store().
+ *   - Drop truncated lines that no longer fit into the buffer.
+ *
+ * Return: The length of the updated/prepared text, including the added
+ * prefixes and the newline. The dropped line(s) are not counted.
+ */
+static size_t record_print_text(struct printk_record *r, bool syslog,
+				bool time)
 {
-	const char *text = log_text(msg);
-	size_t text_size = msg->text_len;
-	size_t len = 0;
+	size_t text_len = r->info->text_len;
+	size_t buf_size = r->text_buf_size;
+	char *text = r->text_buf;
 	char prefix[PREFIX_MAX];
-	const size_t prefix_len = print_prefix(msg, syslog, time, prefix);
+	bool truncated = false;
+	size_t prefix_len;
+	size_t line_len;
+	size_t len = 0;
+	char *next;
 
-	do {
-		const char *next = memchr(text, '\n', text_size);
-		size_t text_len;
+	/*
+	 * If the message was truncated because the buffer was not large
+	 * enough, treat the available text as if it were the full text.
+	 */
+	if (text_len > buf_size)
+		text_len = buf_size;
 
+	prefix_len = info_print_prefix(r->info, syslog, time, prefix);
+
+	/*
+	 * @text_len: bytes of unprocessed text
+	 * @line_len: bytes of current line _without_ newline
+	 * @text:     pointer to beginning of current line
+	 * @len:      number of bytes prepared in r->text_buf
+	 */
+	for (;;) {
+		next = memchr(text, '\n', text_len);
 		if (next) {
-			text_len = next - text;
-			next++;
-			text_size -= next - text;
+			line_len = next - text;
 		} else {
-			text_len = text_size;
+			/* Drop truncated line(s). */
+			if (truncated)
+				break;
+			line_len = text_len;
 		}
 
-		if (buf) {
-			if (prefix_len + text_len + 1 >= size - len)
+		/*
+		 * Truncate the text if there is not enough space to add the
+		 * prefix and a trailing newline.
+		 */
+		if (len + prefix_len + text_len + 1 > buf_size) {
+			/* Drop even the current line if no space. */
+			if (len + prefix_len + line_len + 1 > buf_size)
 				break;
 
-			memcpy(buf + len, prefix, prefix_len);
-			len += prefix_len;
-			memcpy(buf + len, text, text_len);
-			len += text_len;
-			buf[len++] = '\n';
-		} else {
-			/* SYSLOG_ACTION_* buffer size only calculation */
-			len += prefix_len + text_len + 1;
+			text_len = buf_size - len - prefix_len - 1;
+			truncated = true;
 		}
 
-		text = next;
-	} while (text);
+		memmove(text + prefix_len, text, text_len);
+		memcpy(text, prefix, prefix_len);
+
+		len += prefix_len + line_len + 1;
+
+		if (text_len == line_len) {
+			/*
+			 * Add the trailing newline removed in
+			 * vprintk_store().
+			 */
+			text[prefix_len + line_len] = '\n';
+			break;
+		}
+
+		/*
+		 * Advance beyond the added prefix and the related line with
+		 * its newline.
+		 */
+		text += prefix_len + line_len + 1;
+
+		/*
+		 * The remaining text has only decreased by the line with its
+		 * newline.
+		 *
+		 * Note that @text_len can become zero. It happens when @text
+		 * ended with a newline (either due to truncation or the
+		 * original string ending with "\n\n"). The loop is correctly
+		 * repeated and (if not truncated) an empty line with a prefix
+		 * will be prepared.
+		 */
+		text_len -= line_len + 1;
+	}
 
 	return len;
 }
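/*
 * A worked example of record_print_text() (timestamp made up): with
 * syslog and time prefixes enabled, a record whose text is
 * "line1\nline2" is rewritten in place to
 *
 *   "<6>[    1.234567] line1\n<6>[    1.234567] line2\n"
 *
 * and the returned length covers both prefixed lines including the
 * added trailing newline. A line that no longer fits together with its
 * prefix and newline is dropped rather than emitted truncated.
 */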
 
-static int syslog_print(char __user *buf, int size, char *text,
-			char *msgbuf, int *locked)
+static size_t get_record_print_text_size(struct printk_info *info,
+					 unsigned int line_count,
+					 bool syslog, bool time)
 {
-	struct prb_iterator iter;
-	struct printk_log *msg;
+	char prefix[PREFIX_MAX];
+	size_t prefix_len;
+
+	prefix_len = info_print_prefix(info, syslog, time, prefix);
+
+	/*
+	 * Each line will be preceded with a prefix. The intermediate
+	 * newlines are already within the text, but a final trailing
+	 * newline will be added.
+	 */
+	return ((prefix_len * line_count) + info->text_len + 1);
+}
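/*
 * For the example record above ("line1\nline2", text_len == 11,
 * line_count == 2) and an assumed 18-byte prefix, the estimate is
 * 2 * 18 + 11 + 1 = 48 bytes: one prefix per line, the text with its
 * embedded newline, plus the trailing newline added when printing.
 */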
+
+static int syslog_print(char __user *buf, int size)
+{
+	struct printk_info info;
+	struct printk_record r;
+	char *text;
 	int len = 0;
-	u64 seq;
-	int ret;
+
+	text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL);
+	if (!text)
+		return -ENOMEM;
+
+	prb_rec_init_rd(&r, &info, text, LOG_LINE_MAX + PREFIX_MAX);
 
 	while (size > 0) {
 		size_t n;
 		size_t skip;
 
-		for (;;) {
-			prb_iter_copy(&iter, &syslog_iter);
-			ret = prb_iter_next(&iter, msgbuf,
-					    PRINTK_RECORD_MAX, &seq);
-			if (ret < 0) {
-				/* messages are gone, move to first one */
-				prb_iter_init(&syslog_iter, &printk_rb,
-					      &syslog_seq);
-				syslog_partial = 0;
-				continue;
-			}
+		syslog_lock_irq();
+		if (!prb_read_valid(prb, syslog_seq, &r)) {
+			syslog_unlock_irq();
 			break;
 		}
-		if (ret == 0)
-			break;
-
-		/*
-		 * If messages have been missed, the partial tracker
-		 * is no longer valid and must be reset.
-		 */
-		if (syslog_seq > 0 && seq - 1 != syslog_seq) {
-			syslog_seq = seq - 1;
+		if (r.info->seq != syslog_seq) {
+			/* message is gone, move to next valid one */
+			syslog_seq = r.info->seq;
 			syslog_partial = 0;
 		}
 
@@ -1274,213 +1373,124 @@ static int syslog_print(char __user *buf, int size, char *text,
 		if (!syslog_partial)
 			syslog_time = printk_time;
 
-		msg = (struct printk_log *)msgbuf;
-
 		skip = syslog_partial;
-		n = msg_print_text(msg, true, syslog_time, text,
-				   PRINTK_SPRINT_MAX);
+		n = record_print_text(&r, true, syslog_time);
 		if (n - syslog_partial <= size) {
 			/* message fits into buffer, move forward */
-			prb_iter_next(&syslog_iter, NULL, 0, &syslog_seq);
+			syslog_seq = r.info->seq + 1;
 			n -= syslog_partial;
 			syslog_partial = 0;
-		} else if (!len) {
+		} else if (!len){
 			/* partial read(), remember position */
 			n = size;
 			syslog_partial += n;
 		} else
 			n = 0;
+		syslog_unlock_irq();
 
 		if (!n)
 			break;
 
-		mutex_unlock(&syslog_lock);
 		if (copy_to_user(buf, text + skip, n)) {
 			if (!len)
 				len = -EFAULT;
-			*locked = 0;
 			break;
 		}
-		ret = mutex_lock_interruptible(&syslog_lock);
 
 		len += n;
 		size -= n;
 		buf += n;
-
-		if (ret) {
-			if (!len)
-				len = ret;
-			*locked = 0;
-			break;
-		}
 	}
 
+	kfree(text);
 	return len;
 }
 
-static int count_remaining(struct prb_iterator *iter, u64 until_seq,
-			   char *msgbuf, int size, bool records, bool time)
-{
-	struct prb_iterator local_iter;
-	struct printk_log *msg;
-	int len = 0;
-	u64 seq;
-	int ret;
-
-	prb_iter_copy(&local_iter, iter);
-	for (;;) {
-		ret = prb_iter_next(&local_iter, msgbuf, size, &seq);
-		if (ret == 0) {
-			break;
-		} else if (ret < 0) {
-			/* the iter is invalid, restart from head */
-			prb_iter_init(&local_iter, &printk_rb, NULL);
-			len = 0;
-			continue;
-		}
-
-		if (until_seq && seq >= until_seq)
-			break;
-
-		if (records) {
-			len++;
-		} else {
-			msg = (struct printk_log *)msgbuf;
-			len += msg_print_text(msg, true, time, NULL, 0);
-		}
-	}
-
-	return len;
-}
-
-static void syslog_clear(void)
-{
-	struct prb_iterator iter;
-	int ret;
-
-	prb_iter_init(&iter, &printk_rb, &clear_seq);
-	for (;;) {
-		ret = prb_iter_next(&iter, NULL, 0, &clear_seq);
-		if (ret == 0)
-			break;
-		else if (ret < 0)
-			prb_iter_init(&iter, &printk_rb, &clear_seq);
-	}
-}
-
 static int syslog_print_all(char __user *buf, int size, bool clear)
 {
-	struct prb_iterator iter;
-	struct printk_log *msg;
-	char *msgbuf = NULL;
-	char *text = NULL;
-	int textlen;
-	u64 seq = 0;
+	struct printk_info info;
+	unsigned int line_count;
+	struct printk_record r;
+	u64 newest_seq;
+	u64 clr_seq;
+	char *text;
 	int len = 0;
+	u64 seq;
 	bool time;
-	int ret;
 
-	text = kmalloc(PRINTK_SPRINT_MAX, GFP_KERNEL);
+	text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL);
 	if (!text)
 		return -ENOMEM;
-	msgbuf = kmalloc(PRINTK_RECORD_MAX, GFP_KERNEL);
-	if (!msgbuf) {
-		kfree(text);
-		return -ENOMEM;
-	}
 
 	time = printk_time;
+	clr_seq = atomic64_read(&clear_seq);
 
 	/*
-	 * Setup iter to last event before clear. Clear may
-	 * be lost, but keep going with a best effort.
+	 * Find first record that fits, including all following records,
+	 * into the user-provided buffer for this dump.
 	 */
-	prb_iter_init(&iter, &printk_rb, NULL);
-	prb_iter_seek(&iter, clear_seq);
 
-	/* count the total bytes after clear */
-	len = count_remaining(&iter, 0, msgbuf, PRINTK_RECORD_MAX,
-			      false, time);
+	prb_for_each_info(clr_seq, prb, seq, &info, &line_count)
+		len += get_record_print_text_size(&info, line_count, true, time);
 
-	/* move iter forward until length fits into the buffer */
-	while (len > size) {
-		ret = prb_iter_next(&iter, msgbuf,
-				    PRINTK_RECORD_MAX, &seq);
-		if (ret == 0) {
+	/*
+	 * Keep track of the latest in case new records are coming in fast
+	 * and overwriting the older records.
+	 */
+	newest_seq = seq;
+
+	/*
+	 * Move first record forward until length fits into the buffer. This
+	 * is a best effort attempt. If @newest_seq is reached because the
+	 * ringbuffer is wrapping too fast, just start filling the buffer
+	 * from there.
+	 */
+	prb_for_each_info(clr_seq, prb, seq, &info, &line_count) {
+		if (len <= size || info.seq > newest_seq)
 			break;
-		} else if (ret < 0) {
-			/*
-			 * The iter is now invalid so clear will
-			 * also be invalid. Restart from the head.
-			 */
-			prb_iter_init(&iter, &printk_rb, NULL);
-			len = count_remaining(&iter, 0, msgbuf,
-					      PRINTK_RECORD_MAX, false, time);
-			continue;
-		}
-
-		msg = (struct printk_log *)msgbuf;
-		len -= msg_print_text(msg, true, time, NULL, 0);
-
-		if (clear)
-			clear_seq = seq;
+		len -= get_record_print_text_size(&info, line_count, true, time);
 	}
 
-	/* copy messages to buffer */
+	prb_rec_init_rd(&r, &info, text, LOG_LINE_MAX + PREFIX_MAX);
+
 	len = 0;
-	while (len >= 0 && len < size) {
-		if (clear)
-			clear_seq = seq;
+	prb_for_each_record(seq, prb, seq, &r) {
+		int textlen;
 
-		ret = prb_iter_next(&iter, msgbuf,
-				    PRINTK_RECORD_MAX, &seq);
-		if (ret == 0) {
-			break;
-		} else if (ret < 0) {
-			/*
-			 * The iter is now invalid. Make a best
-			 * effort to grab the rest of the log
-			 * from the new head.
-			 */
-			prb_iter_init(&iter, &printk_rb, NULL);
-			continue;
-		}
+		textlen = record_print_text(&r, true, time);
 
-		msg = (struct printk_log *)msgbuf;
-		textlen = msg_print_text(msg, true, time, text,
-					 PRINTK_SPRINT_MAX);
-		if (textlen < 0) {
-			len = textlen;
+		if (len + textlen > size) {
+			seq--;
 			break;
 		}
 
-		if (len + textlen > size)
-			break;
-
 		if (copy_to_user(buf + len, text, textlen))
 			len = -EFAULT;
 		else
 			len += textlen;
+
+		if (len < 0)
+			break;
 	}
 
-	if (clear && !seq)
-		syslog_clear();
+	if (clear)
+		atomic64_set(&clear_seq, seq);
 
 	kfree(text);
-	kfree(msgbuf);
 	return len;
 }
 
+static void syslog_clear(void)
+{
+	atomic64_set(&clear_seq, prb_next_seq(prb));
+}
+
 int do_syslog(int type, char __user *buf, int len, int source)
 {
 	bool clear = false;
 	static int saved_console_loglevel = LOGLEVEL_DEFAULT;
-	struct prb_iterator iter;
-	char *msgbuf = NULL;
-	char *text = NULL;
-	int locked;
 	int error;
-	int ret;
+	u64 seq;
 
 	error = check_syslog_permissions(type, source);
 	if (error)
@@ -1498,54 +1508,19 @@ int do_syslog(int type, char __user *buf, int len, int source)
 			return 0;
 		if (!access_ok(buf, len))
 			return -EFAULT;
-
-		text = kmalloc(PRINTK_SPRINT_MAX, GFP_KERNEL);
-		msgbuf = kmalloc(PRINTK_RECORD_MAX, GFP_KERNEL);
-		if (!text || !msgbuf) {
-			error = -ENOMEM;
-			goto out;
-		}
-
-		error = mutex_lock_interruptible(&syslog_lock);
+		syslog_lock_irq();
+		seq = syslog_seq;
+		syslog_unlock_irq();
+		error = wait_event_interruptible(log_wait,
+				prb_read_valid(prb, seq, NULL));
 		if (error)
-			goto out;
-
-		/*
-		 * Wait until a first message is available. Use a copy
-		 * because no iteration should occur for syslog now.
-		 */
-		for (;;) {
-			prb_iter_copy(&iter, &syslog_iter);
-
-			mutex_unlock(&syslog_lock);
-			ret = prb_iter_wait_next(&iter, NULL, 0, NULL);
-			if (ret == -ERESTARTSYS) {
-				error = ret;
-				goto out;
-			}
-			error = mutex_lock_interruptible(&syslog_lock);
-			if (error)
-				goto out;
-
-			if (ret == -EINVAL) {
-				prb_iter_init(&syslog_iter, &printk_rb,
-					      &syslog_seq);
-				syslog_partial = 0;
-				continue;
-			}
-			break;
-		}
-
-		/* print as much as will fit in the user buffer */
-		locked = 1;
-		error = syslog_print(buf, len, text, msgbuf, &locked);
-		if (locked)
-			mutex_unlock(&syslog_lock);
+			return error;
+		error = syslog_print(buf, len);
 		break;
 	/* Read/clear last kernel messages */
 	case SYSLOG_ACTION_READ_CLEAR:
 		clear = true;
-		/* FALL THRU */
+		fallthrough;
 	/* Read last kernel messages */
 	case SYSLOG_ACTION_READ_ALL:
 		if (!buf || len < 0)
@@ -1585,43 +1560,44 @@ int do_syslog(int type, char __user *buf, int len, int source)
 		break;
 	/* Number of chars in the log buffer */
 	case SYSLOG_ACTION_SIZE_UNREAD:
-		msgbuf = kmalloc(PRINTK_RECORD_MAX, GFP_KERNEL);
-		if (!msgbuf)
-			return -ENOMEM;
-
-		error = mutex_lock_interruptible(&syslog_lock);
-		if (error)
-			goto out;
-
+		syslog_lock_irq();
+		if (syslog_seq < prb_first_valid_seq(prb)) {
+			/* messages are gone, move to first one */
+			syslog_seq = prb_first_valid_seq(prb);
+			syslog_partial = 0;
+		}
 		if (source == SYSLOG_FROM_PROC) {
 			/*
 			 * Short-cut for poll(/"proc/kmsg") which simply checks
 			 * for pending data, not the size; return the count of
 			 * records, not the length.
 			 */
-			error = count_remaining(&syslog_iter, 0, msgbuf,
-						PRINTK_RECORD_MAX, true,
-						printk_time);
+			error = prb_next_seq(prb) - syslog_seq;
 		} else {
-			error = count_remaining(&syslog_iter, 0, msgbuf,
-						PRINTK_RECORD_MAX, false,
-						printk_time);
+			bool time = syslog_partial ? syslog_time : printk_time;
+			struct printk_info info;
+			unsigned int line_count;
+			u64 seq;
+
+			prb_for_each_info(syslog_seq, prb, seq, &info,
+					  &line_count) {
+				error += get_record_print_text_size(&info, line_count,
+								    true, time);
+				time = printk_time;
+			}
 			error -= syslog_partial;
 		}
-
-		mutex_unlock(&syslog_lock);
+		syslog_unlock_irq();
 		break;
 	/* Size of the log buffer */
 	case SYSLOG_ACTION_SIZE_BUFFER:
-		error = prb_buffer_size(&printk_rb);
+		error = log_buf_len;
 		break;
 	default:
 		error = -EINVAL;
 		break;
 	}
-out:
-	kfree(msgbuf);
-	kfree(text);
+
 	return error;
 }
 
@@ -1630,11 +1606,135 @@ SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len)
 	return do_syslog(type, buf, len, SYSLOG_FROM_READER);
 }
 
+/*
+ * The per-cpu sprint buffers are used with interrupts disabled, so each CPU
+ * only requires 2 buffers: for non-NMI and NMI contexts. Recursive printk()
+ * calls are handled by the global sprint buffers.
+ */
+#define SPRINT_CTX_DEPTH 2
+
+/* Static sprint buffers for early boot (only 1 CPU) and recursion. */
+static DECLARE_BITMAP(sprint_global_buffer_map, SPRINT_CTX_DEPTH);
+static char sprint_global_buffer[SPRINT_CTX_DEPTH][PREFIX_MAX + LOG_LINE_MAX];
+
+struct sprint_buffers {
+	char		buf[SPRINT_CTX_DEPTH][PREFIX_MAX + LOG_LINE_MAX];
+	atomic_t	index;
+};
+
+static DEFINE_PER_CPU(struct sprint_buffers, percpu_sprint_buffers);
+
+/*
+ * Acquire an unused buffer, returning its index. If no buffer is
+ * available, @count is returned.
+ */
+static int _get_sprint_buf(unsigned long *map, int count)
+{
+	int index;
+
+	do {
+		index = find_first_zero_bit(map, count);
+		if (index == count)
+			break;
+	/*
+	 * Guarantee map changes are ordered for the other CPUs.
+	 * Pairs with clear_bit() in _put_sprint_buf().
+	 */
+	} while (test_and_set_bit(index, map));
+
+	return index;
+}
+
+/* Mark the buffer @index as unused. */
+static void _put_sprint_buf(unsigned long *map, unsigned int count, unsigned int index)
+{
+	/*
+	 * Guarantee map changes are ordered for the other CPUs.
+	 * Pairs with test_and_set_bit() in _get_sprint_buf().
+	 */
+	clear_bit(index, map);
+}
+
+/*
+ * Get a buffer sized PREFIX_MAX+LOG_LINE_MAX for sprinting. On success, @id
+ * is set and interrupts are disabled. @id is used to put back the buffer.
+ *
+ * @id is non-negative for per-cpu buffers, negative for global buffers.
+ */
+static char *get_sprint_buf(int *id, unsigned long *flags)
+{
+	struct sprint_buffers *bufs;
+	unsigned int index;
+	unsigned int cpu;
+
+	local_irq_save(*flags);
+	cpu = get_cpu();
+
+	if (printk_percpu_data_ready()) {
+
+		/*
+		 * First try with per-cpu pool. Note that the last
+		 * buffer is reserved for NMI context.
+		 */
+		bufs = per_cpu_ptr(&percpu_sprint_buffers, cpu);
+		index = atomic_read(&bufs->index);
+		if (index < (SPRINT_CTX_DEPTH - 1) ||
+		    (in_nmi() && index < SPRINT_CTX_DEPTH)) {
+			atomic_set(&bufs->index, index + 1);
+			*id = cpu;
+			return &bufs->buf[index][0];
+		}
+	}
+
+	/*
+	 * Fallback to global pool.
+	 *
+	 * The global pool will only ever be used if per-cpu data is not ready
+	 * yet or printk recurses. Recursion will not occur unless printk is
+	 * having internal issues.
+	 */
+	index = _get_sprint_buf(sprint_global_buffer_map, SPRINT_CTX_DEPTH);
+	if (index != SPRINT_CTX_DEPTH) {
+		/* Convert to global buffer representation. */
+		*id = -index - 1;
+		return &sprint_global_buffer[index][0];
+	}
+
+	/* Failed to get a buffer. */
+	put_cpu();
+	local_irq_restore(*flags);
+	return NULL;
+}
+
+/* Put back an sprint buffer and restore interrupts. */
+static void put_sprint_buf(int id, unsigned long flags)
+{
+	struct sprint_buffers *bufs;
+	unsigned int index;
+	unsigned int cpu;
+
+	if (id >= 0) {
+		cpu = id;
+		bufs = per_cpu_ptr(&percpu_sprint_buffers, cpu);
+		index = atomic_read(&bufs->index);
+		atomic_set(&bufs->index, index - 1);
+	} else {
+		/* Convert from global buffer representation. */
+		index = -id - 1;
+		_put_sprint_buf(sprint_global_buffer_map,
+				SPRINT_CTX_DEPTH, index);
+	}
+
+	put_cpu();
+	local_irq_restore(flags);
+}
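/*
 * Typical usage, mirroring vprintk_store() below (sketch only):
 *
 *   unsigned long irqflags;
 *   int sprint_id;
 *   char *text;
 *
 *   text = get_sprint_buf(&sprint_id, &irqflags);
 *   if (!text)
 *           return 0;
 *   ...format the message into text...
 *   put_sprint_buf(sprint_id, irqflags);
 *
 * Interrupts remain disabled between the two calls, which is why a pool
 * of SPRINT_CTX_DEPTH buffers per CPU is sufficient.
 */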
+
 int printk_delay_msec __read_mostly;
 
 static inline void printk_delay(int level)
 {
 	boot_delay_msec(level);
+
 	if (unlikely(printk_delay_msec)) {
 		int m = printk_delay_msec;
 
@@ -1645,168 +1745,116 @@ static inline void printk_delay(int level)
 	}
 }
 
-static void print_console_dropped(struct console *con, u64 count)
+static bool kernel_sync_mode(void)
 {
-	char text[64];
-	int len;
-
-	len = sprintf(text, "** %llu printk message%s dropped **\n",
-		      count, count > 1 ? "s" : "");
-	con->write(con, text, len);
+	return (oops_in_progress || sync_mode);
 }
 
-static void format_text(struct printk_log *msg, u64 seq,
-			char *ext_text, size_t *ext_len,
-			char *text, size_t *len, bool time)
+static bool console_can_sync(struct console *con)
 {
-	if (suppress_message_printing(msg->level)) {
-		/*
-		 * Skip record that has level above the console
-		 * loglevel and update each console's local seq.
-		 */
-		*len = 0;
-		*ext_len = 0;
-		return;
-	}
-
-	*len = msg_print_text(msg, console_msg_format & MSG_FORMAT_SYSLOG,
-			      time, text, PRINTK_SPRINT_MAX);
-	if (nr_ext_console_drivers) {
-		*ext_len = msg_print_ext_header(ext_text, CONSOLE_EXT_LOG_MAX,
-						msg, seq);
-		*ext_len += msg_print_ext_body(ext_text + *ext_len,
-					       CONSOLE_EXT_LOG_MAX - *ext_len,
-					       log_dict(msg), msg->dict_len,
-					       log_text(msg), msg->text_len);
-	} else {
-		*ext_len = 0;
-	}
-}
-
-static void printk_write_history(struct console *con, u64 master_seq)
-{
-	struct prb_iterator iter;
-	bool time = printk_time;
-	static char *ext_text;
-	static char *text;
-	static char *buf;
-	u64 seq;
-
-	ext_text = kmalloc(CONSOLE_EXT_LOG_MAX, GFP_KERNEL);
-	text = kmalloc(PRINTK_SPRINT_MAX, GFP_KERNEL);
-	buf = kmalloc(PRINTK_RECORD_MAX, GFP_KERNEL);
-	if (!ext_text || !text || !buf)
-		return;
-
 	if (!(con->flags & CON_ENABLED))
-		goto out;
-
-	if (!con->write)
-		goto out;
-
-	if (!cpu_online(raw_smp_processor_id()) &&
-	    !(con->flags & CON_ANYTIME))
-		goto out;
-
-	prb_iter_init(&iter, &printk_rb, NULL);
-
-	for (;;) {
-		struct printk_log *msg;
-		size_t ext_len;
-		size_t len;
-		int ret;
-
-		ret = prb_iter_next(&iter, buf, PRINTK_RECORD_MAX, &seq);
-		if (ret == 0) {
-			break;
-		} else if (ret < 0) {
-			prb_iter_init(&iter, &printk_rb, NULL);
-			continue;
-		}
-
-		if (seq > master_seq)
-			break;
-
-		con->printk_seq++;
-		if (con->printk_seq < seq) {
-			print_console_dropped(con, seq - con->printk_seq);
-			con->printk_seq = seq;
-		}
-
-		msg = (struct printk_log *)buf;
-		format_text(msg, master_seq, ext_text, &ext_len, text,
-			    &len, time);
-
-		if (len == 0 && ext_len == 0)
-			continue;
-
-		if (con->flags & CON_EXTENDED)
-			con->write(con, ext_text, ext_len);
-		else
-			con->write(con, text, len);
-
-		printk_delay(msg->level);
-	}
-out:
-	con->wrote_history = 1;
-	kfree(ext_text);
-	kfree(text);
-	kfree(buf);
+		return false;
+	if (con->write_atomic && kernel_sync_mode())
+		return true;
+	if (con->write_atomic && (con->flags & CON_HANDOVER) && !con->thread)
+		return true;
+	if (con->write && (con->flags & CON_BOOT) && !con->thread)
+		return true;
+	return false;
 }
 
-/*
- * Call the console drivers, asking them to write out
- * log_buf[start] to log_buf[end - 1].
- * The console_lock must be held.
- */
-static void call_console_drivers(u64 seq, const char *ext_text, size_t ext_len,
-				 const char *text, size_t len, int level,
-				 int facility)
+static bool call_sync_console_driver(struct console *con, const char *text, size_t text_len)
+{
+	if (!(con->flags & CON_ENABLED))
+		return false;
+	if (con->write_atomic && kernel_sync_mode())
+		con->write_atomic(con, text, text_len);
+	else if (con->write_atomic && (con->flags & CON_HANDOVER) && !con->thread)
+		con->write_atomic(con, text, text_len);
+	else if (con->write && (con->flags & CON_BOOT) && !con->thread)
+		con->write(con, text, text_len);
+	else
+		return false;
+
+	return true;
+}
+
+static bool any_console_can_sync(void)
 {
 	struct console *con;
 
-	trace_console_rcuidle(text, len);
+	for_each_console(con) {
+		if (console_can_sync(con))
+			return true;
+	}
+	return false;
+}
+
+static bool have_atomic_console(void)
+{
+	struct console *con;
 
 	for_each_console(con) {
 		if (!(con->flags & CON_ENABLED))
 			continue;
-		if (!con->wrote_history) {
-			if (con->flags & CON_PRINTBUFFER) {
-				printk_write_history(con, seq);
-				continue;
-			}
-			con->wrote_history = 1;
-			con->printk_seq = seq - 1;
-		}
-		if (con->flags & CON_BOOT && facility == 0) {
-			/* skip boot messages, already printed */
-			if (con->printk_seq < seq)
-				con->printk_seq = seq;
-			continue;
-		}
-		if (!con->write)
-			continue;
-		if (!cpu_online(raw_smp_processor_id()) &&
-		    !(con->flags & CON_ANYTIME))
-			continue;
-		if (con->printk_seq >= seq)
-			continue;
-
-		con->printk_seq++;
-		if (con->printk_seq < seq) {
-			print_console_dropped(con, seq - con->printk_seq);
-			con->printk_seq = seq;
-		}
-
-		/* for supressed messages, only seq is updated */
-		if (len == 0 && ext_len == 0)
-			continue;
-
-		if (con->flags & CON_EXTENDED)
-			con->write(con, ext_text, ext_len);
-		else
-			con->write(con, text, len);
+		if (con->write_atomic)
+			return true;
 	}
+	return false;
+}
+
+static bool print_sync(struct console *con, char *buf, size_t buf_size, u64 *seq)
+{
+	struct printk_info info;
+	struct printk_record r;
+	size_t text_len;
+
+	prb_rec_init_rd(&r, &info, buf, buf_size);
+
+	if (!prb_read_valid(prb, *seq, &r))
+		return false;
+
+	text_len = record_print_text(&r, console_msg_format & MSG_FORMAT_SYSLOG, printk_time);
+
+	if (!call_sync_console_driver(con, buf, text_len))
+		return false;
+
+	*seq = r.info->seq;
+
+	touch_softlockup_watchdog_sync();
+	clocksource_touch_watchdog();
+	rcu_cpu_stall_reset();
+	touch_nmi_watchdog();
+
+	if (text_len)
+		printk_delay(r.info->level);
+
+	return true;
+}
+
+static void print_sync_until(u64 seq, struct console *con, char *buf, size_t buf_size)
+{
+	unsigned int flags;
+	u64 printk_seq;
+
+	if (!con) {
+		for_each_console(con) {
+			if (console_can_sync(con))
+				print_sync_until(seq, con, buf, buf_size);
+		}
+		return;
+	}
+
+	console_atomic_lock(&flags);
+	for (;;) {
+		printk_seq = atomic64_read(&con->printk_seq);
+		if (printk_seq >= seq)
+			break;
+		if (!print_sync(con, buf, buf_size, &printk_seq))
+			break;
+		atomic64_set(&con->printk_seq, printk_seq + 1);
+	}
+	console_atomic_unlock(flags);
 }
 
 static inline u32 printk_caller_id(void)
@@ -1815,105 +1863,39 @@ static inline u32 printk_caller_id(void)
 		0x80000000 + raw_smp_processor_id();
 }
 
-/*
- * Continuation lines are buffered, and not committed to the record buffer
- * until the line is complete, or a race forces it. The line fragments
- * though, are printed immediately to the consoles to ensure everything has
- * reached the console in case of a kernel crash.
- */
-static struct cont {
-	char buf[LOG_LINE_MAX];
-	size_t len;			/* length == 0 means unused buffer */
-	u32 caller_id;			/* printk_caller_id() of first print */
-	int cpu_owner;			/* cpu of first print */
-	u64 ts_nsec;			/* time of first print */
-	u8 level;			/* log level of first message */
-	u8 facility;			/* log facility of first message */
-	enum log_flags flags;		/* prefix, newline flags */
-} cont[2];
-
-static void cont_flush(int ctx)
-{
-	struct cont *c = &cont[ctx];
-
-	if (c->len == 0)
-		return;
-
-	log_store(c->caller_id, c->facility, c->level, c->flags,
-		  c->ts_nsec, c->cpu_owner, NULL, 0, c->buf, c->len);
-	c->len = 0;
-}
-
-static void cont_add(int ctx, int cpu, u32 caller_id, int facility, int level,
-		     enum log_flags flags, const char *text, size_t len)
-{
-	struct cont *c = &cont[ctx];
-
-	if (cpu != c->cpu_owner || !(flags & LOG_CONT))
-		cont_flush(ctx);
-
-	/* If the line gets too long, split it up in separate records. */
-	while (c->len + len > sizeof(c->buf))
-		cont_flush(ctx);
-
-	if (!c->len) {
-		c->facility = facility;
-		c->level = level;
-		c->caller_id = caller_id;
-		c->ts_nsec = local_clock();
-		c->flags = flags;
-		c->cpu_owner = cpu;
-	}
-
-	memcpy(c->buf + c->len, text, len);
-	c->len += len;
-
-	// The original flags come from the first line,
-	// but later continuations can add a newline.
-	if (flags & LOG_NEWLINE) {
-		c->flags |= LOG_NEWLINE;
-		cont_flush(ctx);
-	}
-}
-
-/* ring buffer used as memory allocator for temporary sprint buffers */
-DECLARE_STATIC_PRINTKRB(sprint_rb,
-			ilog2(PRINTK_RECORD_MAX + sizeof(struct prb_entry) +
-			      sizeof(long)) + 2, &printk_cpulock);
-
-asmlinkage int vprintk_emit(int facility, int level,
-			    const char *dict, size_t dictlen,
-			    const char *fmt, va_list args)
+__printf(4, 0)
+static int vprintk_store(int facility, int level,
+			 const struct dev_printk_info *dev_info,
+			 const char *fmt, va_list args)
 {
 	const u32 caller_id = printk_caller_id();
-	int ctx = !!in_nmi();
+	struct prb_reserved_entry e;
 	enum log_flags lflags = 0;
-	int printed_len = 0;
-	struct prb_handle h;
-	size_t text_len;
+	bool final_commit = false;
+	unsigned long irqflags;
+	struct printk_record r;
+	u16 trunc_msg_len = 0;
+	int sprint_id;
+	u16 text_len;
 	u64 ts_nsec;
+	int ret = 0;
 	char *text;
-	char *rbuf;
-	int cpu;
+	u64 seq;
 
 	ts_nsec = local_clock();
 
-	rbuf = prb_reserve(&h, &sprint_rb, PRINTK_SPRINT_MAX);
-	if (!rbuf) {
-		prb_inc_lost(&printk_rb);
-		return printed_len;
-	}
-
-	cpu = raw_smp_processor_id();
+	/* No buffer is available if printk has recursed too much. */
+	text = get_sprint_buf(&sprint_id, &irqflags);
+	if (!text)
+		return 0;
 
 	/*
-	 * If this turns out to be an emergency message, there
-	 * may need to be a prefix added. Leave room for it.
+	 * The printf needs to come first; we need the syslog
+	 * prefix which might be passed-in as a parameter.
 	 */
-	text = rbuf + PREFIX_MAX;
-	text_len = vscnprintf(text, PRINTK_SPRINT_MAX - PREFIX_MAX, fmt, args);
+	text_len = vscnprintf(text, LOG_LINE_MAX, fmt, args);
 
-	/* strip and flag a trailing newline */
+	/* mark and strip a trailing newline */
 	if (text_len && text[text_len-1] == '\n') {
 		text_len--;
 		lflags |= LOG_NEWLINE;
@@ -1941,38 +1923,108 @@ asmlinkage int vprintk_emit(int facility, int level,
 	if (level == LOGLEVEL_DEFAULT)
 		level = default_message_loglevel;
 
-	if (dict)
+	if (dev_info)
 		lflags |= LOG_NEWLINE;
 
-	/*
-	 * NOTE:
-	 * - rbuf points to beginning of allocated buffer
-	 * - text points to beginning of text
-	 * - there is room before text for prefix
-	 */
-	if (facility == 0) {
-		/* only the kernel can create emergency messages */
-		printk_emergency(rbuf, level & 7, ts_nsec, cpu, text, text_len);
+	if (lflags & LOG_CONT) {
+		prb_rec_init_wr(&r, text_len);
+		if (prb_reserve_in_last(&e, prb, &r, caller_id, LOG_LINE_MAX)) {
+			seq = r.info->seq;
+			memcpy(&r.text_buf[r.info->text_len], text, text_len);
+			r.info->text_len += text_len;
+			if (lflags & LOG_NEWLINE) {
+				r.info->flags |= LOG_NEWLINE;
+				prb_final_commit(&e);
+				final_commit = true;
+			} else {
+				prb_commit(&e);
+			}
+			ret = text_len;
+			goto out;
+		}
 	}
 
+	/* Store it in the record log */
+
+	prb_rec_init_wr(&r, text_len);
+
+	if (!prb_reserve(&e, prb, &r)) {
+		/* truncate the message if it is too long for empty buffer */
+		truncate_msg(&text_len, &trunc_msg_len);
+		prb_rec_init_wr(&r, text_len + trunc_msg_len);
+		/* survive when the log buffer is too small for trunc_msg */
+		if (!prb_reserve(&e, prb, &r))
+			goto out;
+	}
+
+	seq = r.info->seq;
+
+	/* fill message */
+	memcpy(&r.text_buf[0], text, text_len);
+	if (trunc_msg_len)
+		memcpy(&r.text_buf[text_len], trunc_msg, trunc_msg_len);
+	r.info->text_len = text_len + trunc_msg_len;
+	r.info->facility = facility;
+	r.info->level = level & 7;
+	r.info->flags = lflags & 0x1f;
+	r.info->ts_nsec = ts_nsec;
+	r.info->caller_id = caller_id;
+	if (dev_info)
+		memcpy(&r.info->dev_info, dev_info, sizeof(r.info->dev_info));
+
+	/* insert message */
 	if ((lflags & LOG_CONT) || !(lflags & LOG_NEWLINE)) {
-		 cont_add(ctx, cpu, caller_id, facility, level, lflags, text, text_len);
-		 printed_len = text_len;
+		prb_commit(&e);
 	} else {
-		if (cpu == cont[ctx].cpu_owner)
-			cont_flush(ctx);
-		printed_len = log_store(caller_id, facility, level, lflags, ts_nsec, cpu,
-					dict, dictlen, text, text_len);
+		prb_final_commit(&e);
+		final_commit = true;
 	}
 
-	prb_commit(&h);
+	ret = text_len + trunc_msg_len;
+out:
+	/* only the kernel may perform synchronous printing */
+	if (facility == 0 && final_commit && any_console_can_sync())
+		print_sync_until(seq + 1, NULL, text, PREFIX_MAX + LOG_LINE_MAX);
+
+	put_sprint_buf(sprint_id, irqflags);
+	return ret;
+}
+
+asmlinkage int vprintk_emit(int facility, int level,
+			    const struct dev_printk_info *dev_info,
+			    const char *fmt, va_list args)
+{
+	int printed_len;
+
+	/* Suppress unimportant messages after panic happens */
+	if (unlikely(suppress_printk))
+		return 0;
+
+	if (level == LOGLEVEL_SCHED)
+		level = LOGLEVEL_DEFAULT;
+
+	printed_len = vprintk_store(facility, level, dev_info, fmt, args);
+
+	wake_up_klogd();
 	return printed_len;
 }
 EXPORT_SYMBOL(vprintk_emit);
 
-static __printf(1, 0) int vprintk_func(const char *fmt, va_list args)
+ __printf(1, 0)
+static int vprintk_default(const char *fmt, va_list args)
 {
-	return vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, 0, fmt, args);
+	return vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, fmt, args);
+}
+
+__printf(1, 0)
+static int vprintk_func(const char *fmt, va_list args)
+{
+#ifdef CONFIG_KGDB_KDB
+	/* Allow to pass printk() to kdb but avoid a recursion. */
+	if (unlikely(kdb_trap_printk && kdb_printf_cpu < 0))
+		return vkdb_printf(KDB_MSGSRC_PRINTK, fmt, args);
+#endif
+	return vprintk_default(fmt, args);
 }
 
 asmlinkage int vprintk(const char *fmt, va_list args)
@@ -2014,6 +2066,35 @@ asmlinkage __visible int printk(const char *fmt, ...)
 	return r;
 }
 EXPORT_SYMBOL(printk);
+
+#else /* CONFIG_PRINTK */
+
+#define LOG_LINE_MAX		0
+#define PREFIX_MAX		0
+#define printk_time		false
+
+#define prb_read_valid(rb, seq, r)	false
+#define prb_first_valid_seq(rb)		0
+
+static u64 syslog_seq;
+
+static size_t record_print_text(const struct printk_record *r,
+				bool syslog, bool time)
+{
+	return 0;
+}
+static ssize_t info_print_ext_header(char *buf, size_t size,
+				     struct printk_info *info)
+{
+	return 0;
+}
+static ssize_t msg_print_ext_body(char *buf, size_t size,
+				  char *text, size_t text_len,
+				  struct dev_printk_info *dev_info) { return 0; }
+static void call_console_drivers(const char *ext_text, size_t ext_len,
+				 const char *text, size_t len) {}
+static bool suppress_message_printing(int level) { return false; }
+
 #endif /* CONFIG_PRINTK */
 
 #ifdef CONFIG_EARLY_PRINTK
@@ -2256,6 +2337,12 @@ EXPORT_SYMBOL(is_console_locked);
  * Releases the console_lock which the caller holds on the console system
  * and the console driver list.
  *
+ * While the console_lock was held, console output may have been buffered
+ * by printk().  If this is the case, console_unlock(); emits
+ * the output prior to releasing the lock.
+ *
+ * If there is output waiting, we wake /dev/kmsg and syslog() users.
+ *
  * console_unlock(); may be called from any context.
  */
 void console_unlock(void)
@@ -2317,11 +2404,21 @@ void console_unblank(void)
  */
 void console_flush_on_panic(enum con_flush_mode mode)
 {
-	/*
-	 * FIXME: This is currently a NOP. Emergency messages will have been
-	 * printed, but what about if write_atomic is not available on the
-	 * console? What if the printk kthread is still alive?
-	 */
+	struct console *c;
+	u64 seq;
+
+	if (!console_trylock())
+		return;
+
+	console_may_schedule = 0;
+
+	if (mode == CONSOLE_REPLAY_ALL) {
+		seq = prb_first_valid_seq(prb);
+		for_each_console(c)
+			atomic64_set(&c->printk_seq, seq);
+	}
+
+	console_unlock();
 }
 
 /*
@@ -2434,6 +2531,8 @@ static int try_enable_new_console(struct console *newcon, bool user_specified)
 	return -ENOENT;
 }
 
+static void console_try_thread(struct console *con);
+
 /*
  * The console driver calls this routine during kernel initialization
  * to register the console printing procedure with printk() and to
@@ -2478,6 +2577,8 @@ void register_console(struct console *newcon)
 		}
 	}
 
+	newcon->thread = NULL;
+
 	if (console_drivers && console_drivers->flags & CON_BOOT)
 		bcon = console_drivers;
 
@@ -2519,8 +2620,10 @@ void register_console(struct console *newcon)
 	 * the real console are the same physical device, it's annoying to
 	 * see the beginning boot messages twice
 	 */
-	if (bcon && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV))
+	if (bcon && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV)) {
 		newcon->flags &= ~CON_PRINTBUFFER;
+		newcon->flags |= CON_HANDOVER;
+	}
 
 	/*
 	 *	Put this console in the list - keep the
@@ -2542,6 +2645,12 @@ void register_console(struct console *newcon)
 	if (newcon->flags & CON_EXTENDED)
 		nr_ext_console_drivers++;
 
+	if (newcon->flags & CON_PRINTBUFFER)
+		atomic64_set(&newcon->printk_seq, 0);
+	else
+		atomic64_set(&newcon->printk_seq, prb_next_seq(prb));
+
+	console_try_thread(newcon);
 	console_unlock();
 	console_sysfs_notify();
 
@@ -2551,10 +2660,6 @@ void register_console(struct console *newcon)
 	 * boot consoles, real consoles, etc - this is to ensure that end
 	 * users know there might be something in the kernel's log buffer that
 	 * went to the bootconsole (that they do not see on the real console)
-	 *
-	 * This message is also important because it will trigger the
-	 * printk kthread to begin dumping the log buffer to the newly
-	 * registered console.
 	 */
 	pr_info("%sconsole [%s%d] enabled\n",
 		(newcon->flags & CON_BOOT) ? "boot" : "" ,
@@ -2619,6 +2724,9 @@ int unregister_console(struct console *console)
 	console_unlock();
 	console_sysfs_notify();
 
+	if (console->thread && !IS_ERR(console->thread))
+		kthread_stop(console->thread);
+
 	if (console->exit)
 		res = console->exit(console);
 
@@ -2662,6 +2770,154 @@ void __init console_init(void)
 	}
 }
 
+static int printk_kthread_func(void *data)
+{
+	struct console *con = data;
+	unsigned long dropped = 0;
+	struct printk_info info;
+	struct printk_record r;
+	char *ext_text = NULL;
+	size_t dropped_len;
+	char *dropped_text;
+	int ret = -ENOMEM;
+	char *write_text;
+	u64 printk_seq;
+	size_t len;
+	char *text;
+	int error;
+	u64 seq;
+
+	if (con->flags & CON_EXTENDED) {
+		ext_text = kmalloc(CONSOLE_EXT_LOG_MAX, GFP_KERNEL);
+		if (!ext_text)
+			return ret;
+	}
+	text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL);
+	dropped_text = kmalloc(64, GFP_KERNEL);
+	if (!text || !dropped_text)
+		goto out;
+
+	if (con->flags & CON_EXTENDED)
+		write_text = ext_text;
+	else
+		write_text = text;
+
+	seq = atomic64_read(&con->printk_seq);
+
+	prb_rec_init_rd(&r, &info, text, LOG_LINE_MAX + PREFIX_MAX);
+
+	for (;;) {
+		error = wait_event_interruptible(log_wait,
+				prb_read_valid(prb, seq, &r) || kthread_should_stop());
+
+		if (kthread_should_stop())
+			break;
+
+		if (error)
+			continue;
+
+		if (seq != r.info->seq) {
+			dropped += r.info->seq - seq;
+			seq = r.info->seq;
+		}
+
+		seq++;
+
+		if (!(con->flags & CON_ENABLED))
+			continue;
+
+		if (suppress_message_printing(r.info->level))
+			continue;
+
+		if (con->flags & CON_EXTENDED) {
+			len = info_print_ext_header(ext_text,
+				CONSOLE_EXT_LOG_MAX,
+				r.info);
+			len += msg_print_ext_body(ext_text + len,
+				CONSOLE_EXT_LOG_MAX - len,
+				&r.text_buf[0], r.info->text_len,
+				&r.info->dev_info);
+		} else {
+			len = record_print_text(&r,
+				console_msg_format & MSG_FORMAT_SYSLOG,
+				printk_time);
+		}
+
+		printk_seq = atomic64_read(&con->printk_seq);
+
+		console_lock();
+		console_may_schedule = 0;
+
+		if (kernel_sync_mode() && con->write_atomic) {
+			console_unlock();
+			break;
+		}
+
+		if (!(con->flags & CON_EXTENDED) && dropped) {
+			dropped_len = snprintf(dropped_text, 64,
+					       "** %lu printk messages dropped **\n",
+					       dropped);
+			dropped = 0;
+
+			con->write(con, dropped_text, dropped_len);
+			printk_delay(r.info->level);
+		}
+
+		con->write(con, write_text, len);
+		if (len)
+			printk_delay(r.info->level);
+
+		atomic64_cmpxchg_relaxed(&con->printk_seq, printk_seq, seq);
+
+		console_unlock();
+	}
+out:
+	kfree(dropped_text);
+	kfree(text);
+	kfree(ext_text);
+	pr_info("%sconsole [%s%d]: printing thread stopped\n",
+		(con->flags & CON_BOOT) ? "boot" : "" ,
+		con->name, con->index);
+	return ret;
+}
+
+static void start_printk_kthread(struct console *con)
+{
+	con->thread = kthread_run(printk_kthread_func, con,
+				  "pr/%s%d", con->name, con->index);
+	if (IS_ERR(con->thread)) {
+		pr_err("%sconsole [%s%d]: unable to start printing thread\n",
+			(con->flags & CON_BOOT) ? "boot" : "" ,
+			con->name, con->index);
+		return;
+	}
+	pr_info("%sconsole [%s%d]: printing thread started\n",
+		(con->flags & CON_BOOT) ? "boot" : "" ,
+		con->name, con->index);
+}
+
+static bool kthreads_started;
+
+static void console_try_thread(struct console *con)
+{
+	unsigned long irqflags;
+	int sprint_id;
+	char *buf;
+
+	if (kthreads_started) {
+		start_printk_kthread(con);
+		return;
+	}
+
+	buf = get_sprint_buf(&sprint_id, &irqflags);
+	if (!buf)
+		return;
+
+	print_sync_until(prb_next_seq(prb), con, buf, PREFIX_MAX + LOG_LINE_MAX);
+
+	put_sprint_buf(sprint_id, irqflags);
+}
+
 /*
  * Some boot consoles access data that is in the init section and which will
  * be discarded after the initcalls have been run. To make sure that no code
@@ -2701,6 +2957,13 @@ static int __init printk_late_init(void)
 			unregister_console(con);
 		}
 	}
+
+	console_lock();
+	for_each_console(con)
+		start_printk_kthread(con);
+	kthreads_started = true;
+	console_unlock();
+
 	ret = cpuhp_setup_state_nocalls(CPUHP_PRINTK_DEAD, "printk:dead", NULL,
 					console_cpu_notify);
 	WARN_ON(ret < 0);
@@ -2712,75 +2975,43 @@ static int __init printk_late_init(void)
 late_initcall(printk_late_init);
 
 #if defined CONFIG_PRINTK
-static int printk_kthread_func(void *data)
+/*
+ * Delayed printk version, for scheduler-internal messages:
+ */
+#define PRINTK_PENDING_WAKEUP	0x01
+
+static DEFINE_PER_CPU(int, printk_pending);
+
+static void wake_up_klogd_work_func(struct irq_work *irq_work)
 {
-	struct prb_iterator iter;
-	struct printk_log *msg;
-	size_t ext_len;
-	char *ext_text;
-	u64 master_seq;
-	size_t len;
-	char *text;
-	char *buf;
-	int ret;
+	int pending = __this_cpu_xchg(printk_pending, 0);
 
-	ext_text = kmalloc(CONSOLE_EXT_LOG_MAX, GFP_KERNEL);
-	text = kmalloc(PRINTK_SPRINT_MAX, GFP_KERNEL);
-	buf = kmalloc(PRINTK_RECORD_MAX, GFP_KERNEL);
-	if (!ext_text || !text || !buf)
-		return -1;
-
-	prb_iter_init(&iter, &printk_rb, NULL);
-
-	/* the printk kthread never exits */
-	for (;;) {
-		ret = prb_iter_wait_next(&iter, buf,
-					 PRINTK_RECORD_MAX, &master_seq);
-		if (ret == -ERESTARTSYS) {
-			continue;
-		} else if (ret < 0) {
-			/* iterator invalid, start over */
-			prb_iter_init(&iter, &printk_rb, NULL);
-			continue;
-		}
-
-		msg = (struct printk_log *)buf;
-		format_text(msg, master_seq, ext_text, &ext_len, text,
-			    &len, printk_time);
-
-		console_lock();
-		console_may_schedule = 0;
-		call_console_drivers(master_seq, ext_text, ext_len, text, len,
-				     msg->level, msg->facility);
-		if (len > 0 || ext_len > 0)
-			printk_delay(msg->level);
-		console_unlock();
-	}
-
-	kfree(ext_text);
-	kfree(text);
-	kfree(buf);
-
-	return 0;
+	if (pending & PRINTK_PENDING_WAKEUP)
+		wake_up_interruptible(&log_wait);
 }
 
-static int __init init_printk_kthread(void)
-{
-	struct task_struct *thread;
+static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = {
+	.func = wake_up_klogd_work_func,
+	.flags = ATOMIC_INIT(IRQ_WORK_LAZY),
+};
 
-	thread = kthread_run(printk_kthread_func, NULL, "printk");
-	if (IS_ERR(thread)) {
-		pr_err("printk: unable to create printing thread\n");
-		return PTR_ERR(thread);
+void wake_up_klogd(void)
+{
+	if (!printk_percpu_data_ready())
+		return;
+
+	preempt_disable();
+	if (waitqueue_active(&log_wait)) {
+		this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP);
+		irq_work_queue(this_cpu_ptr(&wake_up_klogd_work));
 	}
-
-	return 0;
+	preempt_enable();
 }
-late_initcall(init_printk_kthread);
 
-__printf(1, 0) static int vprintk_deferred(const char *fmt, va_list args)
+__printf(1, 0)
+static int vprintk_deferred(const char *fmt, va_list args)
 {
-	return vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, 0, fmt, args);
+	return vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, fmt, args);
 }
 
 int printk_deferred(const char *fmt, ...)
@@ -2909,6 +3140,66 @@ const char *kmsg_dump_reason_str(enum kmsg_dump_reason reason)
 }
 EXPORT_SYMBOL_GPL(kmsg_dump_reason_str);
 
+/**
+ * pr_flush() - Wait for printing threads to catch up.
+ *
+ * @timeout_ms:        The maximum time (in ms) to wait.
+ * @reset_on_progress: Reset the timeout if forward progress is seen.
+ *
+ * A value of 0 for @timeout_ms means no waiting will occur. A value of -1
+ * represents infinite waiting.
+ *
+ * If @reset_on_progress is true, the timeout will be reset whenever any
+ * printer has been seen to make some forward progress.
+ *
+ * Context: Any context if @timeout_ms is 0. Otherwise process context and
+ * may sleep if a printer is not caught up.
+ * Return: true if all enabled printers are caught up.
+ */
+static bool pr_flush(int timeout_ms, bool reset_on_progress)
+{
+	int remaining = timeout_ms;
+	struct console *con;
+	u64 last_diff = 0;
+	u64 printk_seq;
+	u64 diff;
+	u64 seq;
+
+	seq = prb_next_seq(prb);
+
+	for (;;) {
+		diff = 0;
+
+		for_each_console(con) {
+			if (!(con->flags & CON_ENABLED))
+				continue;
+			printk_seq = atomic64_read(&con->printk_seq);
+			if (printk_seq < seq)
+				diff += seq - printk_seq;
+		}
+
+		if (diff != last_diff && reset_on_progress)
+			remaining = timeout_ms;
+
+		if (!diff || remaining == 0)
+			break;
+
+		if (remaining < 0) {
+			msleep(100);
+		} else if (remaining < 100) {
+			msleep(remaining);
+			remaining = 0;
+		} else {
+			msleep(100);
+			remaining -= 100;
+		}
+
+		last_diff = diff;
+	}
+
+	return (diff == 0);
+}
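
A minimal calling sketch (hypothetical caller; pr_flush() is static here, so
such a call would live elsewhere in printk.c):

    /* Give the consoles up to 1s to drain; warn if they do not catch up. */
    if (!pr_flush(1000, true))
        pr_warn("printk: console output may not be fully flushed\n");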
+
 /**
  * kmsg_dump - dump kernel log to kernel message dumpers.
  * @reason: the reason (oops, panic etc) for dumping
@@ -2919,9 +3210,26 @@ EXPORT_SYMBOL_GPL(kmsg_dump_reason_str);
  */
 void kmsg_dump(enum kmsg_dump_reason reason)
 {
-	struct kmsg_dumper dumper_local;
 	struct kmsg_dumper *dumper;
 
+	if (!oops_in_progress) {
+		/*
+		 * If atomic consoles are available, activate kernel sync mode
+		 * to make sure any final messages are visible. The trailing
+		 * printk message is important to flush any pending messages.
+		 */
+		if (have_atomic_console()) {
+			sync_mode = true;
+			pr_info("enabled sync mode\n");
+		}
+
+		/*
+		 * Give the printing threads time to flush, allowing up to 1
+		 * second of no printing forward progress before giving up.
+		 */
+		pr_flush(1000, true);
+	}
+
 	rcu_read_lock();
 	list_for_each_entry_rcu(dumper, &dump_list, list) {
 		enum kmsg_dump_reason max_reason = dumper->max_reason;
@@ -2937,18 +3245,16 @@ void kmsg_dump(enum kmsg_dump_reason reason)
 		if (reason > max_reason)
 			continue;
 
-		/*
-		 * use a local copy to avoid modifying the
-		 * iterator used by any other cpus/contexts
-		 */
-		memcpy(&dumper_local, dumper, sizeof(dumper_local));
-
 		/* initialize iterator with data about the stored records */
-		dumper_local.active = true;
-		kmsg_dump_rewind(&dumper_local);
+		dumper->active = true;
+
+		kmsg_dump_rewind_nolock(dumper);
 
 		/* invoke dumper which will iterate over records */
-		dumper_local.dump(&dumper_local, reason);
+		dumper->dump(dumper, reason);
+
+		/* reset iterator */
+		dumper->active = false;
 	}
 	rcu_read_unlock();
 }
@@ -2975,67 +3281,38 @@ void kmsg_dump(enum kmsg_dump_reason reason)
 bool kmsg_dump_get_line_nolock(struct kmsg_dumper *dumper, bool syslog,
 			       char *line, size_t size, size_t *len)
 {
-	struct prb_iterator iter;
-	struct printk_log *msg;
-	struct prb_handle h;
-	bool cont = false;
-	char *msgbuf;
-	char *rbuf;
-	size_t l;
-	u64 seq;
-	int ret;
+	struct printk_info info;
+	unsigned int line_count;
+	struct printk_record r;
+	size_t l = 0;
+	bool ret = false;
+
+	prb_rec_init_rd(&r, &info, line, size);
 
 	if (!dumper->active)
-		return cont;
+		goto out;
 
-	rbuf = prb_reserve(&h, &sprint_rb, PRINTK_RECORD_MAX);
-	if (!rbuf)
-		return cont;
-	msgbuf = rbuf;
-retry:
-	for (;;) {
-		prb_iter_init(&iter, &printk_rb, &seq);
-
-		if (dumper->line_seq == seq) {
-			/* already where we want to be */
-			break;
-		} else if (dumper->line_seq < seq) {
-			/* messages are gone, move to first available one */
-			dumper->line_seq = seq;
-			break;
+	/* Read text or count text lines? */
+	if (line) {
+		if (!prb_read_valid(prb, dumper->cur_seq, &r))
+			goto out;
+		l = record_print_text(&r, syslog, printk_time);
+	} else {
+		if (!prb_read_valid_info(prb, dumper->cur_seq,
+					 &info, &line_count)) {
+			goto out;
 		}
+		l = get_record_print_text_size(&info, line_count, syslog,
+					       printk_time);
 
-		ret = prb_iter_seek(&iter, dumper->line_seq);
-		if (ret > 0) {
-			/* seeked to line_seq */
-			break;
-		} else if (ret == 0) {
-			/*
-			 * The end of the list was hit without ever seeing
-			 * line_seq. Reset it to the beginning of the list.
-			 */
-			prb_iter_init(&iter, &printk_rb, &dumper->line_seq);
-			break;
-		}
-		/* iterator invalid, start over */
 	}
 
-	ret = prb_iter_next(&iter, msgbuf, PRINTK_RECORD_MAX,
-			    &dumper->line_seq);
-	if (ret == 0)
-		goto out;
-	else if (ret < 0)
-		goto retry;
-
-	msg = (struct printk_log *)msgbuf;
-	l = msg_print_text(msg, syslog, printk_time, line, size);
-
+	dumper->cur_seq = r.info->seq + 1;
+	ret = true;
+out:
 	if (len)
 		*len = l;
-	cont = true;
-out:
-	prb_commit(&h);
-	return cont;
+	return ret;
 }
 
 /**
@@ -3058,11 +3335,7 @@ bool kmsg_dump_get_line_nolock(struct kmsg_dumper *dumper, bool syslog,
 bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog,
 			char *line, size_t size, size_t *len)
 {
-	bool ret;
-
-	ret = kmsg_dump_get_line_nolock(dumper, syslog, line, size, len);
-
-	return ret;
+	return kmsg_dump_get_line_nolock(dumper, syslog, line, size, len);
 }
 EXPORT_SYMBOL_GPL(kmsg_dump_get_line);
 
@@ -3072,7 +3345,7 @@ EXPORT_SYMBOL_GPL(kmsg_dump_get_line);
  * @syslog: include the "<4>" prefixes
  * @buf: buffer to copy the line to
  * @size: maximum size of the buffer
- * @len: length of line placed into buffer
+ * @len_out: length of line placed into buffer
  *
  * Start at the end of the kmsg buffer and fill the provided buffer
  * with as many of the *youngest* kmsg records as fit into it.
@@ -3086,103 +3359,74 @@ EXPORT_SYMBOL_GPL(kmsg_dump_get_line);
  * read.
  */
 bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
-			  char *buf, size_t size, size_t *len)
+			  char *buf, size_t size, size_t *len_out)
 {
-	struct prb_iterator iter;
+	struct printk_info info;
+	unsigned int line_count;
+	struct printk_record r;
+	u64 seq;
+	u64 next_seq;
+	size_t len = 0;
+	bool ret = false;
 	bool time = printk_time;
-	struct printk_log *msg;
-	u64 new_end_seq = 0;
-	struct prb_handle h;
-	bool cont = false;
-	char *msgbuf;
-	u64 end_seq;
-	int textlen;
-	u64 seq = 0;
-	char *rbuf;
-	int l = 0;
-	int ret;
 
-	if (!dumper->active)
-		return cont;
-
-	rbuf = prb_reserve(&h, &sprint_rb, PRINTK_RECORD_MAX);
-	if (!rbuf)
-		return cont;
-	msgbuf = rbuf;
-
-	prb_iter_init(&iter, &printk_rb, NULL);
-
-	/*
-	 * seek to the start record, which is set/modified
-	 * by kmsg_dump_get_line_nolock()
-	 */
-	ret = prb_iter_seek(&iter, dumper->line_seq);
-	if (ret <= 0)
-		prb_iter_init(&iter, &printk_rb, &seq);
-
-	/* work with a local end seq to have a constant value */
-	end_seq = dumper->buffer_end_seq;
-	if (!end_seq) {
-		/* initialize end seq to "infinity" */
-		end_seq = -1;
-		dumper->buffer_end_seq = end_seq;
-	}
-retry:
-	if (seq >= end_seq)
+	if (!dumper->active || !buf || !size)
 		goto out;
 
-	/* count the total bytes after seq */
-	textlen = count_remaining(&iter, end_seq, msgbuf,
-				  PRINTK_RECORD_MAX, 0, time);
-
-	/* move iter forward until length fits into the buffer */
-	while (textlen > size) {
-		ret = prb_iter_next(&iter, msgbuf, PRINTK_RECORD_MAX, &seq);
-		if (ret == 0) {
-			break;
-		} else if (ret < 0 || seq >= end_seq) {
-			prb_iter_init(&iter, &printk_rb, &seq);
-			goto retry;
-		}
-
-		msg = (struct printk_log *)msgbuf;
-		textlen -= msg_print_text(msg, true, time, NULL, 0);
+	if (dumper->cur_seq < prb_first_valid_seq(prb)) {
+		/* messages are gone, move to first available one */
+		dumper->cur_seq = prb_first_valid_seq(prb);
 	}
 
-	/* save end seq for the next interation */
-	new_end_seq = seq + 1;
+	/* last entry */
+	if (dumper->cur_seq >= dumper->next_seq)
+		goto out;
 
-	/* copy messages to buffer */
-	while (l < size) {
-		ret = prb_iter_next(&iter, msgbuf, PRINTK_RECORD_MAX, &seq);
-		if (ret == 0) {
+	/*
+	 * Find first record that fits, including all following records,
+	 * into the user-provided buffer for this dump.
+	 */
+
+	prb_for_each_info(dumper->cur_seq, prb, seq, &info, &line_count) {
+		if (info.seq >= dumper->next_seq)
 			break;
-		} else if (ret < 0) {
-			/*
-			 * iterator (and thus also the start position)
-			 * invalid, start over from beginning of list
-			 */
-			prb_iter_init(&iter, &printk_rb, NULL);
-			continue;
-		}
-
-		if (seq >= end_seq)
-			break;
-
-		msg = (struct printk_log *)msgbuf;
-		textlen = msg_print_text(msg, syslog, time, buf + l, size - l);
-		if (textlen > 0)
-			l += textlen;
-		cont = true;
+		len += get_record_print_text_size(&info, line_count, true, time);
 	}
 
-	if (cont && len)
-		*len = l;
+	/*
+	 * Move first record forward until length fits into the buffer. This
+	 * is a best effort attempt. If @dumper->next_seq is reached because
+	 * the ringbuffer is wrapping too fast, just start filling the buffer
+	 * from there.
+	 */
+	prb_for_each_info(dumper->cur_seq, prb, seq, &info, &line_count) {
+		if (len <= size || info.seq >= dumper->next_seq)
+			break;
+		len -= get_record_print_text_size(&info, line_count, true, time);
+	}
+
+	/* Keep track of the last message for the next iteration. */
+	next_seq = seq;
+
+	prb_rec_init_rd(&r, &info, buf, size);
+
+	len = 0;
+	prb_for_each_record(seq, prb, seq, &r) {
+		if (r.info->seq >= dumper->next_seq)
+			break;
+
+		len += record_print_text(&r, syslog, time);
+
+		/* Adjust record to store to remaining buffer space. */
+		prb_rec_init_rd(&r, &info, buf + len, size - len);
+	}
+
+	dumper->next_seq = next_seq;
+	ret = true;
 out:
-	prb_commit(&h);
-	if (new_end_seq)
-		dumper->buffer_end_seq = new_end_seq;
-	return cont;
+	if (len_out)
+		*len_out = len;
+	return ret;
 }
 EXPORT_SYMBOL_GPL(kmsg_dump_get_buffer);
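
As a rough usage sketch (hypothetical dumper; only kmsg_dump_register(),
struct kmsg_dumper and KMSG_DUMP_PANIC are taken from the existing API, the
rest is illustrative):

    static char dump_buf[4096];

    /* Hypothetical callback: copy the youngest records on panic. */
    static void my_dump(struct kmsg_dumper *dumper, enum kmsg_dump_reason reason)
    {
        size_t len;

        if (kmsg_dump_get_buffer(dumper, true, dump_buf, sizeof(dump_buf), &len))
            store_dump(dump_buf, len); /* hypothetical persistent sink */
    }

    static struct kmsg_dumper my_dumper = {
        .dump       = my_dump,
        .max_reason = KMSG_DUMP_PANIC,
    };

    /* registered once at init time: kmsg_dump_register(&my_dumper); */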
 
@@ -3193,13 +3437,11 @@ EXPORT_SYMBOL_GPL(kmsg_dump_get_buffer);
  * Reset the dumper's iterator so that kmsg_dump_get_line() and
  * kmsg_dump_get_buffer() can be called again and used multiple
  * times within the same dumper.dump() callback.
- *
- * The function is similar to kmsg_dump_rewind(), but grabs no locks.
  */
 void kmsg_dump_rewind_nolock(struct kmsg_dumper *dumper)
 {
-	dumper->line_seq = 0;
-	dumper->buffer_end_seq = 0;
+	dumper->cur_seq = atomic64_read(&clear_seq);
+	dumper->next_seq = prb_next_seq(prb);
 }
 
 /**
@@ -3216,76 +3458,95 @@ void kmsg_dump_rewind(struct kmsg_dumper *dumper)
 }
 EXPORT_SYMBOL_GPL(kmsg_dump_rewind);
 
-static bool console_can_emergency(int level)
-{
-	struct console *con;
+#endif
 
-	for_each_console(con) {
-		if (!(con->flags & CON_ENABLED))
-			continue;
-		if (con->write_atomic && oops_in_progress)
-			return true;
-		if (con->write && (con->flags & CON_BOOT))
+struct prb_cpulock {
+	atomic_t owner;
+	unsigned long __percpu *irqflags;
+};
+
+#define DECLARE_STATIC_PRINTKRB_CPULOCK(name)				\
+static DEFINE_PER_CPU(unsigned long, _##name##_percpu_irqflags);	\
+static struct prb_cpulock name = {					\
+	.owner = ATOMIC_INIT(-1),					\
+	.irqflags = &_##name##_percpu_irqflags,				\
+}
+
+static bool __prb_trylock(struct prb_cpulock *cpu_lock,
+			  unsigned int *cpu_store)
+{
+	unsigned long *flags;
+	unsigned int cpu;
+
+	cpu = get_cpu();
+
+	*cpu_store = atomic_read(&cpu_lock->owner);
+	/* memory barrier to ensure the current lock owner is visible */
+	smp_rmb();
+	if (*cpu_store == -1) {
+		flags = per_cpu_ptr(cpu_lock->irqflags, cpu);
+		local_irq_save(*flags);
+		if (atomic_try_cmpxchg_acquire(&cpu_lock->owner,
+					       cpu_store, cpu)) {
 			return true;
+		}
+		local_irq_restore(*flags);
+	} else if (*cpu_store == cpu) {
+		return true;
 	}
+
+	put_cpu();
 	return false;
 }
 
-static void call_emergency_console_drivers(int level, const char *text,
-					   size_t text_len)
+/*
+ * prb_lock: Perform a processor-reentrant spin lock.
+ * @cpu_lock: A pointer to the lock object.
+ * @cpu_store: A "flags" pointer to store lock status information.
+ *
+ * If no processor has the lock, the calling processor takes the lock and
+ * becomes the owner. If the calling processor is already the owner of the
+ * lock, this function succeeds immediately. If the lock is held by another
+ * processor, this function spins until the calling processor becomes the
+ * owner.
+ *
+ * It is safe to call this function from any context and state.
+ */
+static void prb_lock(struct prb_cpulock *cpu_lock, unsigned int *cpu_store)
 {
-	struct console *con;
-
-	for_each_console(con) {
-		if (!(con->flags & CON_ENABLED))
-			continue;
-		if (con->write_atomic && oops_in_progress) {
-			con->write_atomic(con, text, text_len);
-			continue;
-		}
-		if (con->write && (con->flags & CON_BOOT)) {
-			con->write(con, text, text_len);
-			continue;
-		}
+	for (;;) {
+		if (__prb_trylock(cpu_lock, cpu_store))
+			break;
+		cpu_relax();
 	}
 }
 
-static void printk_emergency(char *buffer, int level, u64 ts_nsec, u16 cpu,
-			     char *text, u16 text_len)
+/*
+ * prb_unlock: Perform a processor-reentrant spin unlock.
+ * @cpu_lock: A pointer to the lock object.
+ * @cpu_store: A "flags" object storing lock status information.
+ *
+ * Release the lock. The calling processor must be the owner of the lock.
+ *
+ * It is safe to call this function from any context and state.
+ */
+static void prb_unlock(struct prb_cpulock *cpu_lock, unsigned int cpu_store)
 {
-	struct printk_log msg;
-	size_t prefix_len;
+	unsigned long *flags;
+	unsigned int cpu;
 
-	if (!console_can_emergency(level))
-		return;
+	cpu = atomic_read(&cpu_lock->owner);
+	atomic_set_release(&cpu_lock->owner, cpu_store);
 
-	msg.level = level;
-	msg.ts_nsec = ts_nsec;
-	msg.cpu = cpu;
-	msg.facility = 0;
+	if (cpu_store == -1) {
+		flags = per_cpu_ptr(cpu_lock->irqflags, cpu);
+		local_irq_restore(*flags);
+	}
 
-	/* "text" must have PREFIX_MAX preceding bytes available */
-
-	prefix_len = print_prefix(&msg,
-				  console_msg_format & MSG_FORMAT_SYSLOG,
-				  printk_time, buffer);
-	/* move the prefix forward to the beginning of the message text */
-	text -= prefix_len;
-	memmove(text, buffer, prefix_len);
-	text_len += prefix_len;
-
-	text[text_len++] = '\n';
-
-	call_emergency_console_drivers(level, text, text_len);
-
-	touch_softlockup_watchdog_sync();
-	clocksource_touch_watchdog();
-	rcu_cpu_stall_reset();
-	touch_nmi_watchdog();
-
-	printk_delay(level);
+	put_cpu();
 }
-#endif
+
+DECLARE_STATIC_PRINTKRB_CPULOCK(printk_cpulock);
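
The console_atomic_lock() path below is built on this primitive. A minimal
usage sketch (hypothetical caller):

    unsigned int cpu_store;

    prb_lock(&printk_cpulock, &cpu_store);
    /* critical section; safe even if this CPU already owns the lock */
    prb_unlock(&printk_cpulock, cpu_store);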
 
 void console_atomic_lock(unsigned int *flags)
 {
diff --git a/kernel/printk/printk_ringbuffer.c b/kernel/printk/printk_ringbuffer.c
new file mode 100644
index 0000000000000..24a960a89aa89
--- /dev/null
+++ b/kernel/printk/printk_ringbuffer.c
@@ -0,0 +1,2086 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/kernel.h>
+#include <linux/irqflags.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/bug.h>
+#include "printk_ringbuffer.h"
+
+/**
+ * DOC: printk_ringbuffer overview
+ *
+ * Data Structure
+ * --------------
+ * The printk_ringbuffer is made up of 3 internal ringbuffers:
+ *
+ *   desc_ring
+ *     A ring of descriptors and their meta data (such as sequence number,
+ *     timestamp, loglevel, etc.) as well as internal state information about
+ *     the record and logical positions specifying where in the other
+ *     ringbuffer the text strings are located.
+ *
+ *   text_data_ring
+ *     A ring of data blocks. A data block consists of an unsigned long
+ *     integer (ID) that maps to a desc_ring index followed by the text
+ *     string of the record.
+ *
+ * The internal state information of a descriptor is the key element to allow
+ * readers and writers to locklessly synchronize access to the data.
+ *
+ * Implementation
+ * --------------
+ *
+ * Descriptor Ring
+ * ~~~~~~~~~~~~~~~
+ * The descriptor ring is an array of descriptors. A descriptor contains
+ * essential meta data to track the data of a printk record using
+ * blk_lpos structs pointing to associated text data blocks (see
+ * "Data Rings" below). Each descriptor is assigned an ID that maps
+ * directly to index values of the descriptor array and has a state. The ID
+ * and the state are bitwise combined into a single descriptor field named
+ * @state_var, allowing ID and state to be synchronously and atomically
+ * updated.
+ *
+ * Descriptors have four states:
+ *
+ *   reserved
+ *     A writer is modifying the record.
+ *
+ *   committed
+ *     The record and all its data are written. A writer can reopen the
+ *     descriptor (transitioning it back to reserved), but in the committed
+ *     state the data is consistent.
+ *
+ *   finalized
+ *     The record and all its data are complete and available for reading. A
+ *     writer cannot reopen the descriptor.
+ *
+ *   reusable
+ *     The record exists, but its text and/or meta data may no longer be
+ *     available.
+ *
+ * Querying the @state_var of a record requires providing the ID of the
+ * descriptor to query. This can yield a possible fifth (pseudo) state:
+ *
+ *   miss
+ *     The descriptor being queried has an unexpected ID.
+ *
+ * The descriptor ring has a @tail_id that contains the ID of the oldest
+ * descriptor and @head_id that contains the ID of the newest descriptor.
+ *
+ * When a new descriptor should be created (and the ring is full), the tail
+ * descriptor is invalidated by first transitioning to the reusable state and
+ * then invalidating all tail data blocks up to and including the data blocks
+ * associated with the tail descriptor (for the text ring). Then
+ * @tail_id is advanced, followed by advancing @head_id. And finally the
+ * @state_var of the new descriptor is initialized to the new ID and reserved
+ * state.
+ *
+ * The @tail_id can only be advanced if the new @tail_id would be in the
+ * finalized or reusable queried state. This ensures that a valid
+ * sequence number for the tail is always available.
+ *
+ * Descriptor Finalization
+ * ~~~~~~~~~~~~~~~~~~~~~~~
+ * When a writer calls the commit function prb_commit(), record data is
+ * fully stored and is consistent within the ringbuffer. However, a writer can
+ * reopen that record, claiming exclusive access (as with prb_reserve()), and
+ * modify that record. When finished, the writer must again commit the record.
+ *
+ * In order for a record to be made available to readers (and also become
+ * recyclable for writers), it must be finalized. A finalized record cannot be
+ * reopened and can never become "unfinalized". Record finalization can occur
+ * in three different scenarios:
+ *
+ *   1) A writer can simultaneously commit and finalize its record by calling
+ *      prb_final_commit() instead of prb_commit().
+ *
+ *   2) When a new record is reserved and the previous record has been
+ *      committed via prb_commit(), that previous record is automatically
+ *      finalized.
+ *
+ *   3) When a record is committed via prb_commit() and a newer record
+ *      already exists, the record being committed is automatically finalized.
+ *
+ * Data Ring
+ * ~~~~~~~~~
+ * The text data ring is a byte array composed of data blocks. Data blocks are
+ * referenced by blk_lpos structs that point to the logical position of the
+ * beginning of a data block and the beginning of the next adjacent data
+ * block. Logical positions are mapped directly to index values of the byte
+ * array ringbuffer.
+ *
+ * Each data block consists of an ID followed by the writer data. The ID is
+ * the identifier of a descriptor that is associated with the data block. A
+ * given data block is considered valid if all of the following conditions
+ * are met:
+ *
+ *   1) The descriptor associated with the data block is in the committed
+ *      or finalized queried state.
+ *
+ *   2) The blk_lpos struct within the descriptor associated with the data
+ *      block references back to the same data block.
+ *
+ *   3) The data block is within the head/tail logical position range.
+ *
+ * If the writer data of a data block would extend beyond the end of the
+ * byte array, only the ID of the data block is stored at the logical
+ * position and the full data block (ID and writer data) is stored at the
+ * beginning of the byte array. The referencing blk_lpos will point to the
+ * ID before the wrap and the next data block will be at the logical
+ * position adjacent to the full data block after the wrap.
+ *
+ * Data rings have a @tail_lpos that points to the beginning of the oldest
+ * data block and a @head_lpos that points to the logical position of the
+ * next (not yet existing) data block.
+ *
+ * When a new data block should be created (and the ring is full), tail data
+ * blocks will first be invalidated by putting their associated descriptors
+ * into the reusable state and then pushing the @tail_lpos forward beyond
+ * them. Then the @head_lpos is pushed forward and is associated with a new
+ * descriptor. If a data block is not valid, the @tail_lpos cannot be
+ * advanced beyond it.
+ *
+ * Info Array
+ * ~~~~~~~~~~
+ * The general meta data of printk records is stored in printk_info structs,
+ * held in an array with the same number of elements as the descriptor ring.
+ * Each info corresponds to the descriptor of the same index in the
+ * descriptor ring. Info validity is confirmed by evaluating the corresponding
+ * descriptor before and after loading the info.
+ *
+ * Usage
+ * -----
+ * Here are some simple examples demonstrating writers and readers. For the
+ * examples a global ringbuffer (test_rb) is available (which is not the
+ * actual ringbuffer used by printk)::
+ *
+ *	DEFINE_PRINTKRB(test_rb, 15, 5);
+ *
+ * This ringbuffer allows up to 32768 records (2 ^ 15) and has a size of
+ * 1 MiB (2 ^ (15 + 5)) for text data.
+ *
+ * Sample writer code::
+ *
+ *	const char *textstr = "message text";
+ *	struct prb_reserved_entry e;
+ *	struct printk_record r;
+ *
+ *	// specify how much to allocate
+ *	prb_rec_init_wr(&r, strlen(textstr) + 1);
+ *
+ *	if (prb_reserve(&e, &test_rb, &r)) {
+ *		snprintf(r.text_buf, r.text_buf_size, "%s", textstr);
+ *
+ *		r.info->text_len = strlen(textstr);
+ *		r.info->ts_nsec = local_clock();
+ *		r.info->caller_id = printk_caller_id();
+ *
+ *		// commit and finalize the record
+ *		prb_final_commit(&e);
+ *	}
+ *
+ * Note that additional writer functions are available to extend a record
+ * after it has been committed but not yet finalized. This can be done as
+ * long as no new records have been reserved and the caller is the same.
+ *
+ * Sample writer code (record extending)::
+ *
+ *		// alternate rest of previous example
+ *
+ *		r.info->text_len = strlen(textstr);
+ *		r.info->ts_nsec = local_clock();
+ *		r.info->caller_id = printk_caller_id();
+ *
+ *		// commit the record (but do not finalize yet)
+ *		prb_commit(&e);
+ *	}
+ *
+ *	...
+ *
+ *	// specify additional 5 bytes text space to extend
+ *	prb_rec_init_wr(&r, 5);
+ *
+ *	// try to extend, but only if it does not exceed 32 bytes
+ *	if (prb_reserve_in_last(&e, &test_rb, &r, printk_caller_id(), 32)) {
+ *		snprintf(&r.text_buf[r.info->text_len],
+ *			 r.text_buf_size - r.info->text_len, "hello");
+ *
+ *		r.info->text_len += 5;
+ *
+ *		// commit and finalize the record
+ *		prb_final_commit(&e);
+ *	}
+ *
+ * Sample reader code::
+ *
+ *	struct printk_info info;
+ *	struct printk_record r;
+ *	char text_buf[32];
+ *	u64 seq;
+ *
+ *	prb_rec_init_rd(&r, &info, &text_buf[0], sizeof(text_buf));
+ *
+ *	prb_for_each_record(0, &test_rb, &seq, &r) {
+ *		if (info.seq != seq)
+ *			pr_warn("lost %llu records\n", info.seq - seq);
+ *
+ *		if (info.text_len > r.text_buf_size) {
+ *			pr_warn("record %llu text truncated\n", info.seq);
+ *			text_buf[r.text_buf_size - 1] = 0;
+ *		}
+ *
+ *		pr_info("%llu: %llu: %s\n", info.seq, info.ts_nsec,
+ *			&text_buf[0]);
+ *	}
+ *
+ * Note that additional less convenient reader functions are available to
+ * allow complex record access.
+ *
+ * ABA Issues
+ * ~~~~~~~~~~
+ * To help avoid ABA issues, descriptors are referenced by IDs (array index
+ * values combined with tagged bits counting array wraps) and data blocks are
+ * referenced by logical positions (array index values combined with tagged
+ * bits counting array wraps). However, on 32-bit systems the number of
+ * tagged bits is relatively small such that an ABA incident is (at least
+ * theoretically) possible. For example, if 4 million maximally sized (1KiB)
+ * printk messages were to occur in NMI context on a 32-bit system, the
+ * interrupted context would not be able to recognize that the 32-bit integer
+ * completely wrapped and thus represents a different data block than the one
+ * the interrupted context expects.
+ *
+ * To help combat this possibility, additional state checking is performed
+ * (such as using cmpxchg() even though set() would suffice). These extra
+ * checks are commented as such and will hopefully catch any ABA issue that
+ * a 32-bit system might experience.
+ *
+ * Memory Barriers
+ * ~~~~~~~~~~~~~~~
+ * Multiple memory barriers are used. To simplify proving correctness and
+ * generating litmus tests, lines of code related to memory barriers
+ * (loads, stores, and the associated memory barriers) are labeled::
+ *
+ *	LMM(function:letter)
+ *
+ * Comments reference the labels using only the "function:letter" part.
+ *
+ * The memory barrier pairs and their ordering are:
+ *
+ *   desc_reserve:D / desc_reserve:B
+ *     push descriptor tail (id), then push descriptor head (id)
+ *
+ *   desc_reserve:D / data_push_tail:B
+ *     push data tail (lpos), then set new descriptor reserved (state)
+ *
+ *   desc_reserve:D / desc_push_tail:C
+ *     push descriptor tail (id), then set new descriptor reserved (state)
+ *
+ *   desc_reserve:D / prb_first_seq:C
+ *     push descriptor tail (id), then set new descriptor reserved (state)
+ *
+ *   desc_reserve:F / desc_read:D
+ *     set new descriptor id and reserved (state), then allow writer changes
+ *
+ *   data_alloc:A (or data_realloc:A) / desc_read:D
+ *     set old descriptor reusable (state), then modify new data block area
+ *
+ *   data_alloc:A (or data_realloc:A) / data_push_tail:B
+ *     push data tail (lpos), then modify new data block area
+ *
+ *   _prb_commit:B / desc_read:B
+ *     store writer changes, then set new descriptor committed (state)
+ *
+ *   desc_reopen_last:A / _prb_commit:B
+ *     set descriptor reserved (state), then read descriptor data
+ *
+ *   _prb_commit:B / desc_reserve:D
+ *     set new descriptor committed (state), then check descriptor head (id)
+ *
+ *   data_push_tail:D / data_push_tail:A
+ *     set descriptor reusable (state), then push data tail (lpos)
+ *
+ *   desc_push_tail:B / desc_reserve:D
+ *     set descriptor reusable (state), then push descriptor tail (id)
+ */
+
+#define DATA_SIZE(data_ring)		_DATA_SIZE((data_ring)->size_bits)
+#define DATA_SIZE_MASK(data_ring)	(DATA_SIZE(data_ring) - 1)
+
+#define DESCS_COUNT(desc_ring)		_DESCS_COUNT((desc_ring)->count_bits)
+#define DESCS_COUNT_MASK(desc_ring)	(DESCS_COUNT(desc_ring) - 1)
+
+/* Determine the data array index from a logical position. */
+#define DATA_INDEX(data_ring, lpos)	((lpos) & DATA_SIZE_MASK(data_ring))
+
+/* Determine the desc array index from an ID or sequence number. */
+#define DESC_INDEX(desc_ring, n)	((n) & DESCS_COUNT_MASK(desc_ring))
+
+/* Determine how many times the data array has wrapped. */
+#define DATA_WRAPS(data_ring, lpos)	((lpos) >> (data_ring)->size_bits)
+
+/* Determine if a logical position refers to a data-less block. */
+#define LPOS_DATALESS(lpos)		((lpos) & 1UL)
+#define BLK_DATALESS(blk)		(LPOS_DATALESS((blk)->begin) && \
+					 LPOS_DATALESS((blk)->next))
+
+/* Get the logical position at index 0 of the current wrap. */
+#define DATA_THIS_WRAP_START_LPOS(data_ring, lpos) \
+((lpos) & ~DATA_SIZE_MASK(data_ring))
+
+/* Get the ID for the same index of the previous wrap as the given ID. */
+#define DESC_ID_PREV_WRAP(desc_ring, id) \
+DESC_ID((id) - DESCS_COUNT(desc_ring))
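
To make the index arithmetic concrete, a small worked sketch (the numbers are
purely illustrative, not the real printk configuration):

    /*
     * With size_bits = 5 the data array holds 32 bytes, so
     * DATA_SIZE_MASK(ring) == 31. For lpos == 70:
     *   DATA_INDEX(ring, 70) == 70 & 31 == 6    (offset within the array)
     *   DATA_WRAPS(ring, 70) == 70 >> 5 == 2    (the array has wrapped twice)
     */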
+
+/*
+ * A data block: mapped directly to the beginning of the data block area
+ * specified as a logical position within the data ring.
+ *
+ * @id:   the ID of the associated descriptor
+ * @data: the writer data
+ *
+ * Note that the size of a data block is only known by its associated
+ * descriptor.
+ */
+struct prb_data_block {
+	unsigned long	id;
+	char		data[0];
+};
+
+/*
+ * Return the descriptor associated with @n. @n can be either a
+ * descriptor ID or a sequence number.
+ */
+static struct prb_desc *to_desc(struct prb_desc_ring *desc_ring, u64 n)
+{
+	return &desc_ring->descs[DESC_INDEX(desc_ring, n)];
+}
+
+/*
+ * Return the printk_info associated with @n. @n can be either a
+ * descriptor ID or a sequence number.
+ */
+static struct printk_info *to_info(struct prb_desc_ring *desc_ring, u64 n)
+{
+	return &desc_ring->infos[DESC_INDEX(desc_ring, n)];
+}
+
+static struct prb_data_block *to_block(struct prb_data_ring *data_ring,
+				       unsigned long begin_lpos)
+{
+	return (void *)&data_ring->data[DATA_INDEX(data_ring, begin_lpos)];
+}
+
+/*
+ * Increase the data size to account for data block meta data plus any
+ * padding so that the adjacent data block is aligned on the ID size.
+ */
+static unsigned int to_blk_size(unsigned int size)
+{
+	struct prb_data_block *db = NULL;
+
+	size += sizeof(*db);
+	size = ALIGN(size, sizeof(db->id));
+	return size;
+}
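
For example (a sketch assuming a 64-bit build where sizeof(unsigned long) == 8):

    /*
     * to_blk_size(13): 13 bytes of text + 8 bytes for the block ID = 21,
     * aligned up to the 8-byte ID size = 24 bytes reserved in the ring.
     */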
+
+/*
+ * Sanity checker for reserve size. The ringbuffer code assumes that a data
+ * block does not exceed the maximum possible size that could fit within the
+ * ringbuffer. This function provides that basic size check so that the
+ * assumption is safe.
+ */
+static bool data_check_size(struct prb_data_ring *data_ring, unsigned int size)
+{
+	struct prb_data_block *db = NULL;
+
+	if (size == 0)
+		return true;
+
+	/*
+	 * Ensure the alignment padded size could possibly fit in the data
+	 * array. The largest possible data block must still leave room for
+	 * at least the ID of the next block.
+	 */
+	size = to_blk_size(size);
+	if (size > DATA_SIZE(data_ring) - sizeof(db->id))
+		return false;
+
+	return true;
+}
+
+/* Query the state of a descriptor. */
+static enum desc_state get_desc_state(unsigned long id,
+				      unsigned long state_val)
+{
+	if (id != DESC_ID(state_val))
+		return desc_miss;
+
+	return DESC_STATE(state_val);
+}
+
+/*
+ * Get a copy of a specified descriptor and return its queried state. If the
+ * descriptor is in an inconsistent state (miss or reserved), the caller can
+ * only expect the descriptor's @state_var field to be valid.
+ *
+ * The sequence number and caller_id can be optionally retrieved. Like all
+ * non-state_var data, they are only valid if the descriptor is in a
+ * consistent state.
+ */
+static enum desc_state desc_read(struct prb_desc_ring *desc_ring,
+				 unsigned long id, struct prb_desc *desc_out,
+				 u64 *seq_out, u32 *caller_id_out)
+{
+	struct printk_info *info = to_info(desc_ring, id);
+	struct prb_desc *desc = to_desc(desc_ring, id);
+	atomic_long_t *state_var = &desc->state_var;
+	enum desc_state d_state;
+	unsigned long state_val;
+
+	/* Check the descriptor state. */
+	state_val = atomic_long_read(state_var); /* LMM(desc_read:A) */
+	d_state = get_desc_state(id, state_val);
+	if (d_state == desc_miss || d_state == desc_reserved) {
+		/*
+		 * The descriptor is in an inconsistent state. Set at least
+		 * @state_var so that the caller can see the details of
+		 * the inconsistent state.
+		 */
+		goto out;
+	}
+
+	/*
+	 * Guarantee the state is loaded before copying the descriptor
+	 * content. This avoids copying obsolete descriptor content that might
+	 * not apply to the descriptor state. This pairs with _prb_commit:B.
+	 *
+	 * Memory barrier involvement:
+	 *
+	 * If desc_read:A reads from _prb_commit:B, then desc_read:C reads
+	 * from _prb_commit:A.
+	 *
+	 * Relies on:
+	 *
+	 * WMB from _prb_commit:A to _prb_commit:B
+	 *    matching
+	 * RMB from desc_read:A to desc_read:C
+	 */
+	smp_rmb(); /* LMM(desc_read:B) */
+
+	/*
+	 * Copy the descriptor data. The data is not valid until the
+	 * state has been re-checked. A memcpy() for all of @desc
+	 * cannot be used because of the atomic_t @state_var field.
+	 */
+	memcpy(&desc_out->text_blk_lpos, &desc->text_blk_lpos,
+	       sizeof(desc_out->text_blk_lpos)); /* LMM(desc_read:C) */
+	if (seq_out)
+		*seq_out = info->seq; /* also part of desc_read:C */
+	if (caller_id_out)
+		*caller_id_out = info->caller_id; /* also part of desc_read:C */
+
+	/*
+	 * 1. Guarantee the descriptor content is loaded before re-checking
+	 *    the state. This avoids reading an obsolete descriptor state
+	 *    that may not apply to the copied content. This pairs with
+	 *    desc_reserve:F.
+	 *
+	 *    Memory barrier involvement:
+	 *
+	 *    If desc_read:C reads from desc_reserve:G, then desc_read:E
+	 *    reads from desc_reserve:F.
+	 *
+	 *    Relies on:
+	 *
+	 *    WMB from desc_reserve:F to desc_reserve:G
+	 *       matching
+	 *    RMB from desc_read:C to desc_read:E
+	 *
+	 * 2. Guarantee the record data is loaded before re-checking the
+	 *    state. This avoids reading an obsolete descriptor state that may
+	 *    not apply to the copied data. This pairs with data_alloc:A and
+	 *    data_realloc:A.
+	 *
+	 *    Memory barrier involvement:
+	 *
+	 *    If copy_data:A reads from data_alloc:B, then desc_read:E
+	 *    reads from desc_make_reusable:A.
+	 *
+	 *    Relies on:
+	 *
+	 *    MB from desc_make_reusable:A to data_alloc:B
+	 *       matching
+	 *    RMB from desc_read:C to desc_read:E
+	 *
+	 *    Note: desc_make_reusable:A and data_alloc:B can be different
+	 *          CPUs. However, the data_alloc:B CPU (which performs the
+	 *          full memory barrier) must have previously seen
+	 *          desc_make_reusable:A.
+	 */
+	smp_rmb(); /* LMM(desc_read:D) */
+
+	/*
+	 * The data has been copied. Return the current descriptor state,
+	 * which may have changed since the load above.
+	 */
+	state_val = atomic_long_read(state_var); /* LMM(desc_read:E) */
+	d_state = get_desc_state(id, state_val);
+out:
+	atomic_long_set(&desc_out->state_var, state_val);
+	return d_state;
+}
+
+/*
+ * Take a specified descriptor out of the finalized state by attempting
+ * the transition from finalized to reusable. Either this context or some
+ * other context will have been successful.
+ */
+static void desc_make_reusable(struct prb_desc_ring *desc_ring,
+			       unsigned long id)
+{
+	unsigned long val_finalized = DESC_SV(id, desc_finalized);
+	unsigned long val_reusable = DESC_SV(id, desc_reusable);
+	struct prb_desc *desc = to_desc(desc_ring, id);
+	atomic_long_t *state_var = &desc->state_var;
+
+	atomic_long_cmpxchg_relaxed(state_var, val_finalized,
+				    val_reusable); /* LMM(desc_make_reusable:A) */
+}
+
+/*
+ * Given the text data ring, put the associated descriptor of each
+ * data block from @lpos_begin until @lpos_end into the reusable state.
+ *
+ * If there is any problem making the associated descriptor reusable, either
+ * the descriptor has not yet been finalized or another writer context has
+ * already pushed the tail lpos past the problematic data block. Regardless,
+ * on error the caller can re-load the tail lpos to determine the situation.
+ */
+static bool data_make_reusable(struct printk_ringbuffer *rb,
+			       struct prb_data_ring *data_ring,
+			       unsigned long lpos_begin,
+			       unsigned long lpos_end,
+			       unsigned long *lpos_out)
+{
+	struct prb_desc_ring *desc_ring = &rb->desc_ring;
+	struct prb_data_block *blk;
+	enum desc_state d_state;
+	struct prb_desc desc;
+	struct prb_data_blk_lpos *blk_lpos = &desc.text_blk_lpos;
+	unsigned long id;
+
+	/* Loop until @lpos_begin has advanced to or beyond @lpos_end. */
+	while ((lpos_end - lpos_begin) - 1 < DATA_SIZE(data_ring)) {
+		blk = to_block(data_ring, lpos_begin);
+
+		/*
+		 * Load the block ID from the data block. This is a data race
+		 * against a writer that may have newly reserved this data
+		 * area. If the loaded value matches a valid descriptor ID,
+		 * the blk_lpos of that descriptor will be checked to make
+		 * sure it points back to this data block. If the check fails,
+		 * the data area has been recycled by another writer.
+		 */
+		id = blk->id; /* LMM(data_make_reusable:A) */
+
+		d_state = desc_read(desc_ring, id, &desc,
+				    NULL, NULL); /* LMM(data_make_reusable:B) */
+
+		switch (d_state) {
+		case desc_miss:
+		case desc_reserved:
+		case desc_committed:
+			return false;
+		case desc_finalized:
+			/*
+			 * This data block is invalid if the descriptor
+			 * does not point back to it.
+			 */
+			if (blk_lpos->begin != lpos_begin)
+				return false;
+			desc_make_reusable(desc_ring, id);
+			break;
+		case desc_reusable:
+			/*
+			 * This data block is invalid if the descriptor
+			 * does not point back to it.
+			 */
+			if (blk_lpos->begin != lpos_begin)
+				return false;
+			break;
+		}
+
+		/* Advance @lpos_begin to the next data block. */
+		lpos_begin = blk_lpos->next;
+	}
+
+	*lpos_out = lpos_begin;
+	return true;
+}
+
+/*
+ * Advance the data ring tail to at least @lpos. This function puts
+ * descriptors into the reusable state if the tail is pushed beyond
+ * their associated data block.
+ */
+static bool data_push_tail(struct printk_ringbuffer *rb,
+			   struct prb_data_ring *data_ring,
+			   unsigned long lpos)
+{
+	unsigned long tail_lpos_new;
+	unsigned long tail_lpos;
+	unsigned long next_lpos;
+
+	/* If @lpos is from a data-less block, there is nothing to do. */
+	if (LPOS_DATALESS(lpos))
+		return true;
+
+	/*
+	 * Any descriptor states that have transitioned to reusable due to the
+	 * data tail being pushed to this loaded value will be visible to this
+	 * CPU. This pairs with data_push_tail:D.
+	 *
+	 * Memory barrier involvement:
+	 *
+	 * If data_push_tail:A reads from data_push_tail:D, then this CPU can
+	 * see desc_make_reusable:A.
+	 *
+	 * Relies on:
+	 *
+	 * MB from desc_make_reusable:A to data_push_tail:D
+	 *    matches
+	 * READFROM from data_push_tail:D to data_push_tail:A
+	 *    thus
+	 * READFROM from desc_make_reusable:A to this CPU
+	 */
+	tail_lpos = atomic_long_read(&data_ring->tail_lpos); /* LMM(data_push_tail:A) */
+
+	/*
+	 * Loop until the tail lpos is at or beyond @lpos. This condition
+	 * may already be satisfied, resulting in no full memory barrier
+	 * from data_push_tail:D being performed. However, since this CPU
+	 * sees the new tail lpos, any descriptor states that transitioned to
+	 * the reusable state must already be visible.
+	 */
+	while ((lpos - tail_lpos) - 1 < DATA_SIZE(data_ring)) {
+		/*
+		 * Make all descriptors reusable that are associated with
+		 * data blocks before @lpos.
+		 */
+		if (!data_make_reusable(rb, data_ring, tail_lpos, lpos,
+					&next_lpos)) {
+			/*
+			 * 1. Guarantee the block ID loaded in
+			 *    data_make_reusable() is performed before
+			 *    reloading the tail lpos. The failed
+			 *    data_make_reusable() may be due to a newly
+			 *    recycled data area causing the tail lpos to
+			 *    have been previously pushed. This pairs with
+			 *    data_alloc:A and data_realloc:A.
+			 *
+			 *    Memory barrier involvement:
+			 *
+			 *    If data_make_reusable:A reads from data_alloc:B,
+			 *    then data_push_tail:C reads from
+			 *    data_push_tail:D.
+			 *
+			 *    Relies on:
+			 *
+			 *    MB from data_push_tail:D to data_alloc:B
+			 *       matching
+			 *    RMB from data_make_reusable:A to
+			 *    data_push_tail:C
+			 *
+			 *    Note: data_push_tail:D and data_alloc:B can be
+			 *          different CPUs. However, the data_alloc:B
+			 *          CPU (which performs the full memory
+			 *          barrier) must have previously seen
+			 *          data_push_tail:D.
+			 *
+			 * 2. Guarantee the descriptor state loaded in
+			 *    data_make_reusable() is performed before
+			 *    reloading the tail lpos. The failed
+			 *    data_make_reusable() may be due to a newly
+			 *    recycled descriptor causing the tail lpos to
+			 *    have been previously pushed. This pairs with
+			 *    desc_reserve:D.
+			 *
+			 *    Memory barrier involvement:
+			 *
+			 *    If data_make_reusable:B reads from
+			 *    desc_reserve:F, then data_push_tail:C reads
+			 *    from data_push_tail:D.
+			 *
+			 *    Relies on:
+			 *
+			 *    MB from data_push_tail:D to desc_reserve:F
+			 *       matching
+			 *    RMB from data_make_reusable:B to
+			 *    data_push_tail:C
+			 *
+			 *    Note: data_push_tail:D and desc_reserve:F can
+			 *          be different CPUs. However, the
+			 *          desc_reserve:F CPU (which performs the
+			 *          full memory barrier) must have previously
+			 *          seen data_push_tail:D.
+			 */
+			smp_rmb(); /* LMM(data_push_tail:B) */
+
+			tail_lpos_new = atomic_long_read(&data_ring->tail_lpos
+							); /* LMM(data_push_tail:C) */
+			if (tail_lpos_new == tail_lpos)
+				return false;
+
+			/* Another CPU pushed the tail. Try again. */
+			tail_lpos = tail_lpos_new;
+			continue;
+		}
+
+		/*
+		 * Guarantee any descriptor states that have transitioned to
+		 * reusable are stored before pushing the tail lpos. A full
+		 * memory barrier is needed since other CPUs may have made
+		 * the descriptor states reusable. This pairs with
+		 * data_push_tail:A.
+		 */
+		if (atomic_long_try_cmpxchg(&data_ring->tail_lpos, &tail_lpos,
+					    next_lpos)) { /* LMM(data_push_tail:D) */
+			break;
+		}
+	}
+
+	return true;
+}
+
+/*
+ * Advance the desc ring tail. This function advances the tail by one
+ * descriptor, thus invalidating the oldest descriptor. Before advancing
+ * the tail, the tail descriptor is made reusable and all data blocks up to
+ * and including the descriptor's data block are invalidated (i.e. the data
+ * ring tail is pushed past the data block of the descriptor being made
+ * reusable).
+ */
+static bool desc_push_tail(struct printk_ringbuffer *rb,
+			   unsigned long tail_id)
+{
+	struct prb_desc_ring *desc_ring = &rb->desc_ring;
+	enum desc_state d_state;
+	struct prb_desc desc;
+
+	d_state = desc_read(desc_ring, tail_id, &desc, NULL, NULL);
+
+	switch (d_state) {
+	case desc_miss:
+		/*
+		 * If the ID is exactly 1 wrap behind the expected, it is
+		 * in the process of being reserved by another writer and
+		 * must be considered reserved.
+		 */
+		if (DESC_ID(atomic_long_read(&desc.state_var)) ==
+		    DESC_ID_PREV_WRAP(desc_ring, tail_id)) {
+			return false;
+		}
+
+		/*
+		 * The ID has changed. Another writer must have pushed the
+		 * tail and recycled the descriptor already. Success is
+		 * returned because the caller is only interested in the
+		 * specified tail being pushed, which it was.
+		 */
+		return true;
+	case desc_reserved:
+	case desc_committed:
+		return false;
+	case desc_finalized:
+		desc_make_reusable(desc_ring, tail_id);
+		break;
+	case desc_reusable:
+		break;
+	}
+
+	/*
+	 * Data blocks must be invalidated before their associated
+	 * descriptor can be made available for recycling. Invalidating
+	 * them later is not possible because there is no way to trust
+	 * data blocks once their associated descriptor is gone.
+	 */
+
+	if (!data_push_tail(rb, &rb->text_data_ring, desc.text_blk_lpos.next))
+		return false;
+
+	/*
+	 * Check the next descriptor after @tail_id before pushing the tail
+	 * to it because the tail must always be in a finalized or reusable
+	 * state. The implementation of prb_first_seq() relies on this.
+	 *
+	 * A successful read implies that the next descriptor is less than or
+	 * equal to @head_id so there is no risk of pushing the tail past the
+	 * head.
+	 */
+	d_state = desc_read(desc_ring, DESC_ID(tail_id + 1), &desc,
+			    NULL, NULL); /* LMM(desc_push_tail:A) */
+
+	if (d_state == desc_finalized || d_state == desc_reusable) {
+		/*
+		 * Guarantee any descriptor states that have transitioned to
+		 * reusable are stored before pushing the tail ID. This allows
+		 * verifying the recycled descriptor state. A full memory
+		 * barrier is needed since other CPUs may have made the
+		 * descriptor states reusable. This pairs with desc_reserve:D.
+		 */
+		atomic_long_cmpxchg(&desc_ring->tail_id, tail_id,
+				    DESC_ID(tail_id + 1)); /* LMM(desc_push_tail:B) */
+	} else {
+		/*
+		 * Guarantee the last state load from desc_read() is before
+		 * reloading @tail_id in order to see a new tail ID in the
+		 * case that the descriptor has been recycled. This pairs
+		 * with desc_reserve:D.
+		 *
+		 * Memory barrier involvement:
+		 *
+		 * If desc_push_tail:A reads from desc_reserve:F, then
+		 * desc_push_tail:D reads from desc_push_tail:B.
+		 *
+		 * Relies on:
+		 *
+		 * MB from desc_push_tail:B to desc_reserve:F
+		 *    matching
+		 * RMB from desc_push_tail:A to desc_push_tail:D
+		 *
+		 * Note: desc_push_tail:B and desc_reserve:F can be different
+		 *       CPUs. However, the desc_reserve:F CPU (which performs
+		 *       the full memory barrier) must have previously seen
+		 *       desc_push_tail:B.
+		 */
+		smp_rmb(); /* LMM(desc_push_tail:C) */
+
+		/*
+		 * Re-check the tail ID. The descriptor following @tail_id is
+		 * not in an allowed tail state. But if the tail has since
+		 * been moved by another CPU, then it does not matter.
+		 */
+		if (atomic_long_read(&desc_ring->tail_id) == tail_id) /* LMM(desc_push_tail:D) */
+			return false;
+	}
+
+	return true;
+}
+
+/* Reserve a new descriptor, invalidating the oldest if necessary. */
+static bool desc_reserve(struct printk_ringbuffer *rb, unsigned long *id_out)
+{
+	struct prb_desc_ring *desc_ring = &rb->desc_ring;
+	unsigned long prev_state_val;
+	unsigned long id_prev_wrap;
+	struct prb_desc *desc;
+	unsigned long head_id;
+	unsigned long id;
+
+	head_id = atomic_long_read(&desc_ring->head_id); /* LMM(desc_reserve:A) */
+
+	do {
+		desc = to_desc(desc_ring, head_id);
+
+		id = DESC_ID(head_id + 1);
+		id_prev_wrap = DESC_ID_PREV_WRAP(desc_ring, id);
+
+		/*
+		 * Guarantee the head ID is read before reading the tail ID.
+		 * Since the tail ID is updated before the head ID, this
+		 * guarantees that @id_prev_wrap is never ahead of the tail
+		 * ID. This pairs with desc_reserve:D.
+		 *
+		 * Memory barrier involvement:
+		 *
+		 * If desc_reserve:A reads from desc_reserve:D, then
+		 * desc_reserve:C reads from desc_push_tail:B.
+		 *
+		 * Relies on:
+		 *
+		 * MB from desc_push_tail:B to desc_reserve:D
+		 *    matching
+		 * RMB from desc_reserve:A to desc_reserve:C
+		 *
+		 * Note: desc_push_tail:B and desc_reserve:D can be different
+		 *       CPUs. However, the desc_reserve:D CPU (which performs
+		 *       the full memory barrier) must have previously seen
+		 *       desc_push_tail:B.
+		 */
+		smp_rmb(); /* LMM(desc_reserve:B) */
+
+		if (id_prev_wrap == atomic_long_read(&desc_ring->tail_id
+						    )) { /* LMM(desc_reserve:C) */
+			/*
+			 * Make space for the new descriptor by
+			 * advancing the tail.
+			 */
+			if (!desc_push_tail(rb, id_prev_wrap))
+				return false;
+		}
+
+		/*
+		 * 1. Guarantee the tail ID is read before validating the
+		 *    recycled descriptor state. A read memory barrier is
+		 *    sufficient for this. This pairs with desc_push_tail:B.
+		 *
+		 *    Memory barrier involvement:
+		 *
+		 *    If desc_reserve:C reads from desc_push_tail:B, then
+		 *    desc_reserve:E reads from desc_make_reusable:A.
+		 *
+		 *    Relies on:
+		 *
+		 *    MB from desc_make_reusable:A to desc_push_tail:B
+		 *       matching
+		 *    RMB from desc_reserve:C to desc_reserve:E
+		 *
+		 *    Note: desc_make_reusable:A and desc_push_tail:B can be
+		 *          different CPUs. However, the desc_push_tail:B CPU
+		 *          (which performs the full memory barrier) must have
+		 *          previously seen desc_make_reusable:A.
+		 *
+		 * 2. Guarantee the tail ID is stored before storing the head
+		 *    ID. This pairs with desc_reserve:B.
+		 *
+		 * 3. Guarantee any data ring tail changes are stored before
+		 *    recycling the descriptor. Data ring tail changes can
+		 *    happen via desc_push_tail()->data_push_tail(). A full
+		 *    memory barrier is needed since another CPU may have
+		 *    pushed the data ring tails. This pairs with
+		 *    data_push_tail:B.
+		 *
+		 * 4. Guarantee a new tail ID is stored before recycling the
+		 *    descriptor. A full memory barrier is needed since
+		 *    another CPU may have pushed the tail ID. This pairs
+		 *    with desc_push_tail:C and this also pairs with
+		 *    prb_first_seq:C.
+		 *
+		 * 5. Guarantee the head ID is stored before trying to
+		 *    finalize the previous descriptor. This pairs with
+		 *    _prb_commit:B.
+		 */
+	} while (!atomic_long_try_cmpxchg(&desc_ring->head_id, &head_id,
+					  id)); /* LMM(desc_reserve:D) */
+
+	desc = to_desc(desc_ring, id);
+
+	/*
+	 * If the descriptor has been recycled, verify the old state val.
+	 * See "ABA Issues" about why this verification is performed.
+	 */
+	prev_state_val = atomic_long_read(&desc->state_var); /* LMM(desc_reserve:E) */
+	if (prev_state_val &&
+	    get_desc_state(id_prev_wrap, prev_state_val) != desc_reusable) {
+		WARN_ON_ONCE(1);
+		return false;
+	}
+
+	/*
+	 * Assign the descriptor a new ID and set its state to reserved.
+	 * See "ABA Issues" about why cmpxchg() instead of set() is used.
+	 *
+	 * Guarantee the new descriptor ID and state is stored before making
+	 * any other changes. A write memory barrier is sufficient for this.
+	 * This pairs with desc_read:D.
+	 */
+	if (!atomic_long_try_cmpxchg(&desc->state_var, &prev_state_val,
+			DESC_SV(id, desc_reserved))) { /* LMM(desc_reserve:F) */
+		WARN_ON_ONCE(1);
+		return false;
+	}
+
+	/* Now data in @desc can be modified: LMM(desc_reserve:G) */
+
+	*id_out = id;
+	return true;
+}
+
+/* Determine the end of a data block. */
+static unsigned long get_next_lpos(struct prb_data_ring *data_ring,
+				   unsigned long lpos, unsigned int size)
+{
+	unsigned long begin_lpos;
+	unsigned long next_lpos;
+
+	begin_lpos = lpos;
+	next_lpos = lpos + size;
+
+	/* First check if the data block does not wrap. */
+	if (DATA_WRAPS(data_ring, begin_lpos) == DATA_WRAPS(data_ring, next_lpos))
+		return next_lpos;
+
+	/* Wrapping data blocks store their data at the beginning. */
+	return (DATA_THIS_WRAP_START_LPOS(data_ring, next_lpos) + size);
+}
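
A short worked sketch of the wrap handling (illustrative numbers only):

    /*
     * With a 32-byte data ring (size_bits = 5), get_next_lpos(ring, 60, 16):
     * lpos 60 and 76 lie in different wraps, so the block is placed at the
     * start of the next wrap and the returned lpos is (76 & ~31) + 16 = 80.
     */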
+
+/*
+ * Allocate a new data block, invalidating the oldest data block(s)
+ * if necessary. This function also associates the data block with
+ * a specified descriptor.
+ */
+static char *data_alloc(struct printk_ringbuffer *rb,
+			struct prb_data_ring *data_ring, unsigned int size,
+			struct prb_data_blk_lpos *blk_lpos, unsigned long id)
+{
+	struct prb_data_block *blk;
+	unsigned long begin_lpos;
+	unsigned long next_lpos;
+
+	if (size == 0) {
+		/* Specify a data-less block. */
+		blk_lpos->begin = NO_LPOS;
+		blk_lpos->next = NO_LPOS;
+		return NULL;
+	}
+
+	size = to_blk_size(size);
+
+	begin_lpos = atomic_long_read(&data_ring->head_lpos);
+
+	do {
+		next_lpos = get_next_lpos(data_ring, begin_lpos, size);
+
+		if (!data_push_tail(rb, data_ring, next_lpos - DATA_SIZE(data_ring))) {
+			/* Failed to allocate, specify a data-less block. */
+			blk_lpos->begin = FAILED_LPOS;
+			blk_lpos->next = FAILED_LPOS;
+			return NULL;
+		}
+
+		/*
+		 * 1. Guarantee any descriptor states that have transitioned
+		 *    to reusable are stored before modifying the newly
+		 *    allocated data area. A full memory barrier is needed
+		 *    since other CPUs may have made the descriptor states
+		 *    reusable. See data_push_tail:A about why the reusable
+		 *    states are visible. This pairs with desc_read:D.
+		 *
+		 * 2. Guarantee any updated tail lpos is stored before
+		 *    modifying the newly allocated data area. Another CPU may
+		 *    be in data_make_reusable() and is reading a block ID
+		 *    from this area. data_make_reusable() can handle reading
+		 *    a garbage block ID value, but then it must be able to
+		 *    load a new tail lpos. A full memory barrier is needed
+		 *    since other CPUs may have updated the tail lpos. This
+		 *    pairs with data_push_tail:B.
+		 */
+	} while (!atomic_long_try_cmpxchg(&data_ring->head_lpos, &begin_lpos,
+					  next_lpos)); /* LMM(data_alloc:A) */
+
+	blk = to_block(data_ring, begin_lpos);
+	blk->id = id; /* LMM(data_alloc:B) */
+
+	if (DATA_WRAPS(data_ring, begin_lpos) != DATA_WRAPS(data_ring, next_lpos)) {
+		/* Wrapping data blocks store their data at the beginning. */
+		blk = to_block(data_ring, 0);
+
+		/*
+		 * Store the ID on the wrapped block for consistency.
+		 * The printk_ringbuffer does not actually use it.
+		 */
+		blk->id = id;
+	}
+
+	blk_lpos->begin = begin_lpos;
+	blk_lpos->next = next_lpos;
+
+	return &blk->data[0];
+}
+
+/*
+ * Try to resize an existing data block associated with the descriptor
+ * specified by @id. If the resized data block should become wrapped, it
+ * copies the old data to the new data block. If @size yields a data block
+ * with the same or less size, the data block is left as is.
+ *
+ * Fail if this is not the last allocated data block or if there is not
+ * enough space or it is not possible to make enough space.
+ *
+ * Return a pointer to the beginning of the entire data buffer or NULL on
+ * failure.
+ */
+static char *data_realloc(struct printk_ringbuffer *rb,
+			  struct prb_data_ring *data_ring, unsigned int size,
+			  struct prb_data_blk_lpos *blk_lpos, unsigned long id)
+{
+	struct prb_data_block *blk;
+	unsigned long head_lpos;
+	unsigned long next_lpos;
+	bool wrapped;
+
+	/* Reallocation only works if @blk_lpos is the newest data block. */
+	head_lpos = atomic_long_read(&data_ring->head_lpos);
+	if (head_lpos != blk_lpos->next)
+		return NULL;
+
+	/* Keep track if @blk_lpos was a wrapping data block. */
+	wrapped = (DATA_WRAPS(data_ring, blk_lpos->begin) != DATA_WRAPS(data_ring, blk_lpos->next));
+
+	size = to_blk_size(size);
+
+	next_lpos = get_next_lpos(data_ring, blk_lpos->begin, size);
+
+	/* If the data block does not increase, there is nothing to do. */
+	if (head_lpos - next_lpos < DATA_SIZE(data_ring)) {
+		if (wrapped)
+			blk = to_block(data_ring, 0);
+		else
+			blk = to_block(data_ring, blk_lpos->begin);
+		return &blk->data[0];
+	}
+
+	if (!data_push_tail(rb, data_ring, next_lpos - DATA_SIZE(data_ring)))
+		return NULL;
+
+	/* The memory barrier involvement is the same as data_alloc:A. */
+	if (!atomic_long_try_cmpxchg(&data_ring->head_lpos, &head_lpos,
+				     next_lpos)) { /* LMM(data_realloc:A) */
+		return NULL;
+	}
+
+	blk = to_block(data_ring, blk_lpos->begin);
+
+	if (DATA_WRAPS(data_ring, blk_lpos->begin) != DATA_WRAPS(data_ring, next_lpos)) {
+		struct prb_data_block *old_blk = blk;
+
+		/* Wrapping data blocks store their data at the beginning. */
+		blk = to_block(data_ring, 0);
+
+		/*
+		 * Store the ID on the wrapped block for consistency.
+		 * The printk_ringbuffer does not actually use it.
+		 */
+		blk->id = id;
+
+		if (!wrapped) {
+			/*
+			 * Since the allocated space is now in the newly
+			 * created wrapping data block, copy the content
+			 * from the old data block.
+			 */
+			memcpy(&blk->data[0], &old_blk->data[0],
+			       (blk_lpos->next - blk_lpos->begin) - sizeof(blk->id));
+		}
+	}
+
+	blk_lpos->next = next_lpos;
+
+	return &blk->data[0];
+}
+
+/* Return the number of bytes used by a data block. */
+static unsigned int space_used(struct prb_data_ring *data_ring,
+			       struct prb_data_blk_lpos *blk_lpos)
+{
+	/* Data-less blocks take no space. */
+	if (BLK_DATALESS(blk_lpos))
+		return 0;
+
+	if (DATA_WRAPS(data_ring, blk_lpos->begin) == DATA_WRAPS(data_ring, blk_lpos->next)) {
+		/* Data block does not wrap. */
+		return (DATA_INDEX(data_ring, blk_lpos->next) -
+			DATA_INDEX(data_ring, blk_lpos->begin));
+	}
+
+	/*
+	 * For wrapping data blocks, the trailing (wasted) space is
+	 * also counted.
+	 */
+	return (DATA_INDEX(data_ring, blk_lpos->next) +
+		DATA_SIZE(data_ring) - DATA_INDEX(data_ring, blk_lpos->begin));
+}
+
+/*
+ * Given @blk_lpos, return a pointer to the writer data from the data block
+ * and calculate the size of the data part. A NULL pointer is returned if
+ * @blk_lpos specifies values that could never be legal.
+ *
+ * This function (used by readers) performs strict validation on the lpos
+ * values to possibly detect bugs in the writer code. A WARN_ON_ONCE() is
+ * triggered if an internal error is detected.
+ */
+static const char *get_data(struct prb_data_ring *data_ring,
+			    struct prb_data_blk_lpos *blk_lpos,
+			    unsigned int *data_size)
+{
+	struct prb_data_block *db;
+
+	/* Data-less data block description. */
+	if (BLK_DATALESS(blk_lpos)) {
+		if (blk_lpos->begin == NO_LPOS && blk_lpos->next == NO_LPOS) {
+			*data_size = 0;
+			return "";
+		}
+		return NULL;
+	}
+
+	/* Regular data block: @begin less than @next and in same wrap. */
+	if (DATA_WRAPS(data_ring, blk_lpos->begin) == DATA_WRAPS(data_ring, blk_lpos->next) &&
+	    blk_lpos->begin < blk_lpos->next) {
+		db = to_block(data_ring, blk_lpos->begin);
+		*data_size = blk_lpos->next - blk_lpos->begin;
+
+	/* Wrapping data block: @begin is one wrap behind @next. */
+	} else if (DATA_WRAPS(data_ring, blk_lpos->begin + DATA_SIZE(data_ring)) ==
+		   DATA_WRAPS(data_ring, blk_lpos->next)) {
+		db = to_block(data_ring, 0);
+		*data_size = DATA_INDEX(data_ring, blk_lpos->next);
+
+	/* Illegal block description. */
+	} else {
+		WARN_ON_ONCE(1);
+		return NULL;
+	}
+
+	/* A valid data block will always be aligned to the ID size. */
+	if (WARN_ON_ONCE(blk_lpos->begin != ALIGN(blk_lpos->begin, sizeof(db->id))) ||
+	    WARN_ON_ONCE(blk_lpos->next != ALIGN(blk_lpos->next, sizeof(db->id)))) {
+		return NULL;
+	}
+
+	/* A valid data block will always have at least an ID. */
+	if (WARN_ON_ONCE(*data_size < sizeof(db->id)))
+		return NULL;
+
+	/* Subtract block ID space from size to reflect data size. */
+	*data_size -= sizeof(db->id);
+
+	return &db->data[0];
+}
+
+/*
+ * Attempt to transition the newest descriptor from committed back to reserved
+ * so that the record can be modified by a writer again. This is only possible
+ * if the descriptor is not yet finalized and the provided @caller_id matches.
+ */
+static struct prb_desc *desc_reopen_last(struct prb_desc_ring *desc_ring,
+					 u32 caller_id, unsigned long *id_out)
+{
+	unsigned long prev_state_val;
+	enum desc_state d_state;
+	struct prb_desc desc;
+	struct prb_desc *d;
+	unsigned long id;
+	u32 cid;
+
+	id = atomic_long_read(&desc_ring->head_id);
+
+	/*
+	 * To reduce unnecessary reopening, first check if the descriptor
+	 * state and caller ID are correct.
+	 */
+	d_state = desc_read(desc_ring, id, &desc, NULL, &cid);
+	if (d_state != desc_committed || cid != caller_id)
+		return NULL;
+
+	d = to_desc(desc_ring, id);
+
+	prev_state_val = DESC_SV(id, desc_committed);
+
+	/*
+	 * Guarantee the reserved state is stored before reading any
+	 * record data. A full memory barrier is needed because @state_var
+	 * modification is followed by reading. This pairs with _prb_commit:B.
+	 *
+	 * Memory barrier involvement:
+	 *
+	 * If desc_reopen_last:A reads from _prb_commit:B, then
+	 * prb_reserve_in_last:A reads from _prb_commit:A.
+	 *
+	 * Relies on:
+	 *
+	 * WMB from _prb_commit:A to _prb_commit:B
+	 *    matching
+	 * MB from desc_reopen_last:A to prb_reserve_in_last:A
+	 */
+	if (!atomic_long_try_cmpxchg(&d->state_var, &prev_state_val,
+			DESC_SV(id, desc_reserved))) { /* LMM(desc_reopen_last:A) */
+		return NULL;
+	}
+
+	*id_out = id;
+	return d;
+}
+
+/**
+ * prb_reserve_in_last() - Re-reserve and extend the space in the ringbuffer
+ *                         used by the newest record.
+ *
+ * @e:         The entry structure to setup.
+ * @rb:        The ringbuffer to re-reserve and extend data in.
+ * @r:         The record structure to allocate buffers for.
+ * @caller_id: The caller ID of the caller (reserving writer).
+ * @max_size:  Fail if the extended size would be greater than this.
+ *
+ * This is the public function available to writers to re-reserve and extend
+ * data.
+ *
+ * The writer specifies the text size to extend (not the new total size) by
+ * setting the @text_buf_size field of @r. To ensure proper initialization
+ * of @r, prb_rec_init_wr() should be used.
+ *
+ * This function will fail if @caller_id does not match the caller ID of the
+ * newest record. In that case the caller must reserve new data using
+ * prb_reserve().
+ *
+ * Context: Any context. Disables local interrupts on success.
+ * Return: true if text data could be extended, otherwise false.
+ *
+ * On success:
+ *
+ *   - @r->text_buf points to the beginning of the entire text buffer.
+ *
+ *   - @r->text_buf_size is set to the new total size of the buffer.
+ *
+ *   - @r->info is not touched so that @r->info->text_len could be used
+ *     to append the text.
+ *
+ *   - prb_record_text_space() can be used on @e to query the new
+ *     actually used space.
+ *
+ * Important: All @r->info fields will already be set with the current values
+ *            for the record. I.e. @r->info->text_len will be less than
+ *            @text_buf_size. Writers can use @r->info->text_len to know
+ *            where concatenation begins and writers should update
+ *            @r->info->text_len after concatenating.
+ */
+bool prb_reserve_in_last(struct prb_reserved_entry *e, struct printk_ringbuffer *rb,
+			 struct printk_record *r, u32 caller_id, unsigned int max_size)
+{
+	struct prb_desc_ring *desc_ring = &rb->desc_ring;
+	struct printk_info *info;
+	unsigned int data_size;
+	struct prb_desc *d;
+	unsigned long id;
+
+	local_irq_save(e->irqflags);
+
+	/* Transition the newest descriptor back to the reserved state. */
+	d = desc_reopen_last(desc_ring, caller_id, &id);
+	if (!d) {
+		local_irq_restore(e->irqflags);
+		goto fail_reopen;
+	}
+
+	/* Now the writer has exclusive access: LMM(prb_reserve_in_last:A) */
+
+	info = to_info(desc_ring, id);
+
+	/*
+	 * Set the @e fields here so that prb_commit() can be used if
+	 * anything fails from now on.
+	 */
+	e->rb = rb;
+	e->id = id;
+
+	/*
+	 * desc_reopen_last() checked the caller_id, but there was no
+	 * exclusive access at that point. The descriptor may have
+	 * changed since then.
+	 */
+	if (caller_id != info->caller_id)
+		goto fail;
+
+	if (BLK_DATALESS(&d->text_blk_lpos)) {
+		if (WARN_ON_ONCE(info->text_len != 0)) {
+			pr_warn_once("wrong text_len value (%hu, expecting 0)\n",
+				     info->text_len);
+			info->text_len = 0;
+		}
+
+		if (!data_check_size(&rb->text_data_ring, r->text_buf_size))
+			goto fail;
+
+		if (r->text_buf_size > max_size)
+			goto fail;
+
+		r->text_buf = data_alloc(rb, &rb->text_data_ring, r->text_buf_size,
+					 &d->text_blk_lpos, id);
+	} else {
+		if (!get_data(&rb->text_data_ring, &d->text_blk_lpos, &data_size))
+			goto fail;
+
+		/*
+		 * Increase the buffer size to include the original size. If
+		 * the meta data (@text_len) is not sane, use the full data
+		 * block size.
+		 */
+		if (WARN_ON_ONCE(info->text_len > data_size)) {
+			pr_warn_once("wrong text_len value (%hu, expecting <=%u)\n",
+				     info->text_len, data_size);
+			info->text_len = data_size;
+		}
+		r->text_buf_size += info->text_len;
+
+		if (!data_check_size(&rb->text_data_ring, r->text_buf_size))
+			goto fail;
+
+		if (r->text_buf_size > max_size)
+			goto fail;
+
+		r->text_buf = data_realloc(rb, &rb->text_data_ring, r->text_buf_size,
+					   &d->text_blk_lpos, id);
+	}
+	if (r->text_buf_size && !r->text_buf)
+		goto fail;
+
+	r->info = info;
+
+	e->text_space = space_used(&rb->text_data_ring, &d->text_blk_lpos);
+
+	return true;
+fail:
+	prb_commit(e);
+	/* prb_commit() re-enabled interrupts. */
+fail_reopen:
+	/* Make it clear to the caller that the re-reserve failed. */
+	memset(r, 0, sizeof(*r));
+	return false;
+}
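
For illustration only (not part of the patch): a minimal sketch of how a writer
might use prb_reserve_in_last() to append to its own newest record. The
ringbuffer `rb`, the original `caller_id`, the `text`/`text_len` pair and
`max_size` are assumed to be in scope; the names are placeholders.

	struct prb_reserved_entry e;
	struct printk_record r;

	/* Request text_len additional bytes (not the new total size). */
	prb_rec_init_wr(&r, text_len);

	if (prb_reserve_in_last(&e, &rb, &r, caller_id, max_size)) {
		/* Concatenate after the existing text, then update the length. */
		memcpy(&r.text_buf[r.info->text_len], text, text_len);
		r.info->text_len += text_len;
		prb_commit(&e);
	} else {
		/*
		 * Extension failed (different caller, already finalized, ...):
		 * fall back to reserving a brand new record with prb_reserve().
		 */
	}
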
+
+/*
+ * Attempt to finalize a specified descriptor. If this fails, the descriptor
+ * is either already final or it will finalize itself when the writer commits.
+ */
+static void desc_make_final(struct prb_desc_ring *desc_ring, unsigned long id)
+{
+	unsigned long prev_state_val = DESC_SV(id, desc_committed);
+	struct prb_desc *d = to_desc(desc_ring, id);
+
+	atomic_long_cmpxchg_relaxed(&d->state_var, prev_state_val,
+			DESC_SV(id, desc_finalized)); /* LMM(desc_make_final:A) */
+}
+
+/**
+ * prb_reserve() - Reserve space in the ringbuffer.
+ *
+ * @e:  The entry structure to setup.
+ * @rb: The ringbuffer to reserve data in.
+ * @r:  The record structure to allocate buffers for.
+ *
+ * This is the public function available to writers to reserve data.
+ *
+ * The writer specifies the text size to reserve by setting the
+ * @text_buf_size field of @r. To ensure proper initialization of @r,
+ * prb_rec_init_wr() should be used.
+ *
+ * Context: Any context. Disables local interrupts on success.
+ * Return: true if at least text data could be allocated, otherwise false.
+ *
+ * On success, the fields @info and @text_buf of @r will be set by this
+ * function and should be filled in by the writer before committing. Also
+ * on success, prb_record_text_space() can be used on @e to query the actual
+ * space used for the text data block.
+ *
+ * Important: @info->text_len needs to be set correctly by the writer in
+ *            order for data to be readable and/or extended. Its value
+ *            is initialized to 0.
+ */
+bool prb_reserve(struct prb_reserved_entry *e, struct printk_ringbuffer *rb,
+		 struct printk_record *r)
+{
+	struct prb_desc_ring *desc_ring = &rb->desc_ring;
+	struct printk_info *info;
+	struct prb_desc *d;
+	unsigned long id;
+	u64 seq;
+
+	if (!data_check_size(&rb->text_data_ring, r->text_buf_size))
+		goto fail;
+
+	/*
+	 * Descriptors in the reserved state act as blockers to all further
+	 * reservations once the desc_ring has fully wrapped. Disable
+	 * interrupts during the reserve/commit window in order to minimize
+	 * the likelihood of this happening.
+	 */
+	local_irq_save(e->irqflags);
+
+	if (!desc_reserve(rb, &id)) {
+		/* Descriptor reservation failures are tracked. */
+		atomic_long_inc(&rb->fail);
+		local_irq_restore(e->irqflags);
+		goto fail;
+	}
+
+	d = to_desc(desc_ring, id);
+	info = to_info(desc_ring, id);
+
+	/*
+	 * All @info fields (except @seq) are cleared and must be filled in
+	 * by the writer. Save @seq before clearing because it is used to
+	 * determine the new sequence number.
+	 */
+	seq = info->seq;
+	memset(info, 0, sizeof(*info));
+
+	/*
+	 * Set the @e fields here so that prb_commit() can be used if
+	 * text data allocation fails.
+	 */
+	e->rb = rb;
+	e->id = id;
+
+	/*
+	 * Initialize the sequence number if it has "never been set".
+	 * Otherwise just increment it by a full wrap.
+	 *
+	 * @seq is considered "never been set" if it has a value of 0,
+	 * _except_ for @infos[0], which was specially setup by the ringbuffer
+	 * initializer and therefore is always considered as set.
+	 *
+	 * See the "Bootstrap" comment block in printk_ringbuffer.h for
+	 * details about how the initializer bootstraps the descriptors.
+	 */
+	if (seq == 0 && DESC_INDEX(desc_ring, id) != 0)
+		info->seq = DESC_INDEX(desc_ring, id);
+	else
+		info->seq = seq + DESCS_COUNT(desc_ring);
+
+	/*
+	 * New data is about to be reserved. Once that happens, previous
+	 * descriptors are no longer able to be extended. Finalize the
+	 * previous descriptor now so that it can be made available to
+	 * readers. (For seq==0 there is no previous descriptor.)
+	 */
+	if (info->seq > 0)
+		desc_make_final(desc_ring, DESC_ID(id - 1));
+
+	r->text_buf = data_alloc(rb, &rb->text_data_ring, r->text_buf_size,
+				 &d->text_blk_lpos, id);
+	/* If text data allocation fails, a data-less record is committed. */
+	if (r->text_buf_size && !r->text_buf) {
+		prb_commit(e);
+		/* prb_commit() re-enabled interrupts. */
+		goto fail;
+	}
+
+	r->info = info;
+
+	/* Record full text space used by record. */
+	e->text_space = space_used(&rb->text_data_ring, &d->text_blk_lpos);
+
+	return true;
+fail:
+	/* Make it clear to the caller that the reserve failed. */
+	memset(r, 0, sizeof(*r));
+	return false;
+}
+
+/* Commit the data (possibly finalizing it) and restore interrupts. */
+static void _prb_commit(struct prb_reserved_entry *e, unsigned long state_val)
+{
+	struct prb_desc_ring *desc_ring = &e->rb->desc_ring;
+	struct prb_desc *d = to_desc(desc_ring, e->id);
+	unsigned long prev_state_val = DESC_SV(e->id, desc_reserved);
+
+	/* Now the writer has finished all writing: LMM(_prb_commit:A) */
+
+	/*
+	 * Set the descriptor as committed. See "ABA Issues" about why
+	 * cmpxchg() instead of set() is used.
+	 *
+	 * 1. Guarantee all record data is stored before the descriptor state
+	 *    is stored as committed. A write memory barrier is sufficient
+	 *    for this. This pairs with desc_read:B and desc_reopen_last:A.
+	 *
+	 * 2. Guarantee the descriptor state is stored as committed before
+	 *    re-checking the head ID in order to possibly finalize this
+	 *    descriptor. This pairs with desc_reserve:D.
+	 *
+	 *    Memory barrier involvement:
+	 *
+	 *    If prb_commit:A reads from desc_reserve:D, then
+	 *    desc_make_final:A reads from _prb_commit:B.
+	 *
+	 *    Relies on:
+	 *
+	 *    MB _prb_commit:B to prb_commit:A
+	 *       matching
+	 *    MB desc_reserve:D to desc_make_final:A
+	 */
+	if (!atomic_long_try_cmpxchg(&d->state_var, &prev_state_val,
+			DESC_SV(e->id, state_val))) { /* LMM(_prb_commit:B) */
+		WARN_ON_ONCE(1);
+	}
+
+	/* Restore interrupts, the reserve/commit window is finished. */
+	local_irq_restore(e->irqflags);
+}
+
+/**
+ * prb_commit() - Commit (previously reserved) data to the ringbuffer.
+ *
+ * @e: The entry containing the reserved data information.
+ *
+ * This is the public function available to writers to commit data.
+ *
+ * Note that the data is not yet available to readers until it is finalized.
+ * Finalizing happens automatically when space for the next record is
+ * reserved.
+ *
+ * See prb_final_commit() for a version of this function that finalizes
+ * immediately.
+ *
+ * Context: Any context. Enables local interrupts.
+ */
+void prb_commit(struct prb_reserved_entry *e)
+{
+	struct prb_desc_ring *desc_ring = &e->rb->desc_ring;
+	unsigned long head_id;
+
+	_prb_commit(e, desc_committed);
+
+	/*
+	 * If this descriptor is no longer the head (i.e. a new record has
+	 * been allocated), extending the data for this record is no longer
+	 * allowed and therefore it must be finalized.
+	 */
+	head_id = atomic_long_read(&desc_ring->head_id); /* LMM(prb_commit:A) */
+	if (head_id != e->id)
+		desc_make_final(desc_ring, e->id);
+}
+
+/**
+ * prb_final_commit() - Commit and finalize (previously reserved) data to
+ *                      the ringbuffer.
+ *
+ * @e: The entry containing the reserved data information.
+ *
+ * This is the public function available to writers to commit+finalize data.
+ *
+ * By finalizing, the data is made immediately available to readers.
+ *
+ * This function should only be used if there are no intentions of extending
+ * this data using prb_reserve_in_last().
+ *
+ * Context: Any context. Enables local interrupts.
+ */
+void prb_final_commit(struct prb_reserved_entry *e)
+{
+	_prb_commit(e, desc_finalized);
+}
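
For illustration only (not part of the patch): a minimal writer sketch showing
the reserve/fill/commit sequence documented above, assuming a ringbuffer `rb`
is in scope (the name is a placeholder).

	const char *msg = "hello from a writer";
	struct prb_reserved_entry e;
	struct printk_record r;

	prb_rec_init_wr(&r, strlen(msg) + 1);

	if (prb_reserve(&e, &rb, &r)) {
		snprintf(r.text_buf, r.text_buf_size, "%s", msg);

		/* @text_len must be set for the record to be readable/extendable. */
		r.info->text_len = strlen(msg);
		r.info->ts_nsec = local_clock();

		/* Finalize so the record is immediately visible to readers. */
		prb_final_commit(&e);
	}
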
+
+/*
+ * Count the number of lines in the provided text. All text has at least 1 line
+ * (even if @text_size is 0). Each '\n' processed is counted as an additional
+ * line.
+ */
+static unsigned int count_lines(const char *text, unsigned int text_size)
+{
+	unsigned int next_size = text_size;
+	unsigned int line_count = 1;
+	const char *next = text;
+
+	while (next_size) {
+		next = memchr(next, '\n', next_size);
+		if (!next)
+			break;
+		line_count++;
+		next++;
+		next_size = text_size - (next - text);
+	}
+
+	return line_count;
+}
+
+/*
+ * Given @blk_lpos, copy an expected @len of data into the provided buffer.
+ * If @line_count is provided, count the number of lines in the data.
+ *
+ * This function (used by readers) performs strict validation on the data
+ * size to possibly detect bugs in the writer code. A WARN_ON_ONCE() is
+ * triggered if an internal error is detected.
+ */
+static bool copy_data(struct prb_data_ring *data_ring,
+		      struct prb_data_blk_lpos *blk_lpos, u16 len, char *buf,
+		      unsigned int buf_size, unsigned int *line_count)
+{
+	unsigned int data_size;
+	const char *data;
+
+	/* Caller might not want any data. */
+	if ((!buf || !buf_size) && !line_count)
+		return true;
+
+	data = get_data(data_ring, blk_lpos, &data_size);
+	if (!data)
+		return false;
+
+	/*
+	 * The actual size cannot be less than the expected @len. It can be
+	 * more than expected because of the trailing alignment padding.
+	 *
+	 * Note that invalid @len values can occur because the caller loads
+	 * the value during an allowed data race.
+	 */
+	if (data_size < (unsigned int)len)
+		return false;
+
+	/* Caller interested in the line count? */
+	if (line_count)
+		*line_count = count_lines(data, data_size);
+
+	/* Caller interested in the data content? */
+	if (!buf || !buf_size)
+		return true;
+
+	data_size = min_t(u16, buf_size, len);
+
+	memcpy(&buf[0], data, data_size); /* LMM(copy_data:A) */
+	return true;
+}
+
+/*
+ * This is an extended version of desc_read(). It gets a copy of a specified
+ * descriptor. However, it also verifies that the record is finalized and has
+ * the sequence number @seq. On success, 0 is returned.
+ *
+ * Error return values:
+ * -EINVAL: A finalized record with sequence number @seq does not exist.
+ * -ENOENT: A finalized record with sequence number @seq exists, but its data
+ *          is not available. This is a valid record, so readers should
+ *          continue with the next record.
+ */
+static int desc_read_finalized_seq(struct prb_desc_ring *desc_ring,
+				   unsigned long id, u64 seq,
+				   struct prb_desc *desc_out)
+{
+	struct prb_data_blk_lpos *blk_lpos = &desc_out->text_blk_lpos;
+	enum desc_state d_state;
+	u64 s;
+
+	d_state = desc_read(desc_ring, id, desc_out, &s, NULL);
+
+	/*
+	 * An unexpected @id (desc_miss) or @seq mismatch means the record
+	 * does not exist. A descriptor in the reserved or committed state
+	 * means the record does not yet exist for the reader.
+	 */
+	if (d_state == desc_miss ||
+	    d_state == desc_reserved ||
+	    d_state == desc_committed ||
+	    s != seq) {
+		return -EINVAL;
+	}
+
+	/*
+	 * A descriptor in the reusable state may no longer have its data
+	 * available; report it as existing but with lost data. Or the record
+	 * may actually be a record with lost data.
+	 */
+	if (d_state == desc_reusable ||
+	    (blk_lpos->begin == FAILED_LPOS && blk_lpos->next == FAILED_LPOS)) {
+		return -ENOENT;
+	}
+
+	return 0;
+}
+
+/*
+ * Copy the ringbuffer data from the record with @seq to the provided
+ * @r buffer. On success, 0 is returned.
+ *
+ * See desc_read_finalized_seq() for error return values.
+ */
+static int prb_read(struct printk_ringbuffer *rb, u64 seq,
+		    struct printk_record *r, unsigned int *line_count)
+{
+	struct prb_desc_ring *desc_ring = &rb->desc_ring;
+	struct printk_info *info = to_info(desc_ring, seq);
+	struct prb_desc *rdesc = to_desc(desc_ring, seq);
+	atomic_long_t *state_var = &rdesc->state_var;
+	struct prb_desc desc;
+	unsigned long id;
+	int err;
+
+	/* Extract the ID, used to specify the descriptor to read. */
+	id = DESC_ID(atomic_long_read(state_var));
+
+	/* Get a local copy of the correct descriptor (if available). */
+	err = desc_read_finalized_seq(desc_ring, id, seq, &desc);
+
+	/*
+	 * If @r is NULL, the caller is only interested in the availability
+	 * of the record.
+	 */
+	if (err || !r)
+		return err;
+
+	/* If requested, copy meta data. */
+	if (r->info)
+		memcpy(r->info, info, sizeof(*(r->info)));
+
+	/* Copy text data. If it fails, this is a data-less record. */
+	if (!copy_data(&rb->text_data_ring, &desc.text_blk_lpos, info->text_len,
+		       r->text_buf, r->text_buf_size, line_count)) {
+		return -ENOENT;
+	}
+
+	/* Ensure the record is still finalized and has the same @seq. */
+	return desc_read_finalized_seq(desc_ring, id, seq, &desc);
+}
+
+/* Get the sequence number of the tail descriptor. */
+static u64 prb_first_seq(struct printk_ringbuffer *rb)
+{
+	struct prb_desc_ring *desc_ring = &rb->desc_ring;
+	enum desc_state d_state;
+	struct prb_desc desc;
+	unsigned long id;
+	u64 seq;
+
+	for (;;) {
+		id = atomic_long_read(&rb->desc_ring.tail_id); /* LMM(prb_first_seq:A) */
+
+		d_state = desc_read(desc_ring, id, &desc, &seq, NULL); /* LMM(prb_first_seq:B) */
+
+		/*
+		 * This loop will not be infinite because the tail is
+		 * _always_ in the finalized or reusable state.
+		 */
+		if (d_state == desc_finalized || d_state == desc_reusable)
+			break;
+
+		/*
+		 * Guarantee the last state load from desc_read() is before
+		 * reloading @tail_id in order to see a new tail in the case
+		 * that the descriptor has been recycled. This pairs with
+		 * desc_reserve:D.
+		 *
+		 * Memory barrier involvement:
+		 *
+		 * If prb_first_seq:B reads from desc_reserve:F, then
+		 * prb_first_seq:A reads from desc_push_tail:B.
+		 *
+		 * Relies on:
+		 *
+		 * MB from desc_push_tail:B to desc_reserve:F
+		 *    matching
+		 * RMB prb_first_seq:B to prb_first_seq:A
+		 */
+		smp_rmb(); /* LMM(prb_first_seq:C) */
+	}
+
+	return seq;
+}
+
+/*
+ * Non-blocking read of a record. Updates @seq to the last finalized record
+ * (which may have no data available).
+ *
+ * See the description of prb_read_valid() and prb_read_valid_info()
+ * for details.
+ */
+static bool _prb_read_valid(struct printk_ringbuffer *rb, u64 *seq,
+			    struct printk_record *r, unsigned int *line_count)
+{
+	u64 tail_seq;
+	int err;
+
+	while ((err = prb_read(rb, *seq, r, line_count))) {
+		tail_seq = prb_first_seq(rb);
+
+		if (*seq < tail_seq) {
+			/*
+			 * Behind the tail. Catch up and try again. This
+			 * can happen for -ENOENT and -EINVAL cases.
+			 */
+			*seq = tail_seq;
+
+		} else if (err == -ENOENT) {
+			/* Record exists, but no data available. Skip. */
+			(*seq)++;
+
+		} else {
+			/* Non-existent/non-finalized record. Must stop. */
+			return false;
+		}
+	}
+
+	return true;
+}
+
+/**
+ * prb_read_valid() - Non-blocking read of a requested record or (if gone)
+ *                    the next available record.
+ *
+ * @rb:  The ringbuffer to read from.
+ * @seq: The sequence number of the record to read.
+ * @r:   A record data buffer to store the read record to.
+ *
+ * This is the public function available to readers to read a record.
+ *
+ * The reader provides the @info and @text_buf buffers of @r to be
+ * filled in. Any of the buffer pointers can be set to NULL if the reader
+ * is not interested in that data. To ensure proper initialization of @r,
+ * prb_rec_init_rd() should be used.
+ *
+ * Context: Any context.
+ * Return: true if a record was read, otherwise false.
+ *
+ * On success, the reader must check r->info.seq to see which record was
+ * actually read. This allows the reader to detect dropped records.
+ *
+ * Failure means @seq refers to a not yet written record.
+ */
+bool prb_read_valid(struct printk_ringbuffer *rb, u64 seq,
+		    struct printk_record *r)
+{
+	return _prb_read_valid(rb, &seq, r, NULL);
+}
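
For illustration only (not part of the patch): a sketch of a single-record
read, assuming a ringbuffer `rb`; the 128-byte text buffer is an arbitrary
choice.

	struct printk_info info;
	struct printk_record r;
	char text[128];
	u64 seq = 0;

	prb_rec_init_rd(&r, &info, &text[0], sizeof(text));

	if (prb_read_valid(&rb, seq, &r)) {
		/* info.seq >= seq; a difference means records were dropped. */
		if (info.seq != seq)
			pr_info("records %llu-%llu were dropped\n", seq, info.seq - 1);

		/* The copied text is truncated if info.text_len > sizeof(text). */
	}
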
+
+/**
+ * prb_read_valid_info() - Non-blocking read of meta data for a requested
+ *                         record or (if gone) the next available record.
+ *
+ * @rb:         The ringbuffer to read from.
+ * @seq:        The sequence number of the record to read.
+ * @info:       A buffer to store the read record meta data to.
+ * @line_count: A buffer to store the number of lines in the record text.
+ *
+ * This is the public function available to readers to read only the
+ * meta data of a record.
+ *
+ * The reader provides the @info, @line_count buffers to be filled in.
+ * Either of the buffer pointers can be set to NULL if the reader is not
+ * interested in that data.
+ *
+ * Context: Any context.
+ * Return: true if a record's meta data was read, otherwise false.
+ *
+ * On success, the reader must check info->seq to see which record meta data
+ * was actually read. This allows the reader to detect dropped records.
+ *
+ * Failure means @seq refers to a not yet written record.
+ */
+bool prb_read_valid_info(struct printk_ringbuffer *rb, u64 seq,
+			 struct printk_info *info, unsigned int *line_count)
+{
+	struct printk_record r;
+
+	prb_rec_init_rd(&r, info, NULL, 0);
+
+	return _prb_read_valid(rb, &seq, &r, line_count);
+}
+
+/**
+ * prb_first_valid_seq() - Get the sequence number of the oldest available
+ *                         record.
+ *
+ * @rb: The ringbuffer to get the sequence number from.
+ *
+ * This is the public function available to readers to see what the
+ * first/oldest valid sequence number is.
+ *
+ * This provides readers a starting point to begin iterating the ringbuffer.
+ *
+ * Context: Any context.
+ * Return: The sequence number of the first/oldest record or, if the
+ *         ringbuffer is empty, 0 is returned.
+ */
+u64 prb_first_valid_seq(struct printk_ringbuffer *rb)
+{
+	u64 seq = 0;
+
+	if (!_prb_read_valid(rb, &seq, NULL, NULL))
+		return 0;
+
+	return seq;
+}
+
+/**
+ * prb_next_seq() - Get the sequence number after the last available record.
+ *
+ * @rb:  The ringbuffer to get the sequence number from.
+ *
+ * This is the public function available to readers to see what the next
+ * newest sequence number available to readers will be.
+ *
+ * This provides readers a sequence number to jump to if all currently
+ * available records should be skipped.
+ *
+ * Context: Any context.
+ * Return: The sequence number of the next newest (not yet available) record
+ *         for readers.
+ */
+u64 prb_next_seq(struct printk_ringbuffer *rb)
+{
+	u64 seq = 0;
+
+	/* Search forward from the oldest descriptor. */
+	while (_prb_read_valid(rb, &seq, NULL, NULL))
+		seq++;
+
+	return seq;
+}
+
+/**
+ * prb_init() - Initialize a ringbuffer to use provided external buffers.
+ *
+ * @rb:       The ringbuffer to initialize.
+ * @text_buf: The data buffer for text data.
+ * @textbits: The size of @text_buf as a power-of-2 value.
+ * @descs:    The descriptor buffer for ringbuffer records.
+ * @descbits: The count of @descs items as a power-of-2 value.
+ * @infos:    The printk_info buffer for ringbuffer records.
+ *
+ * This is the public function available to writers to setup a ringbuffer
+ * during runtime using provided buffers.
+ *
+ * This must match the initialization of DEFINE_PRINTKRB().
+ *
+ * Context: Any context.
+ */
+void prb_init(struct printk_ringbuffer *rb,
+	      char *text_buf, unsigned int textbits,
+	      struct prb_desc *descs, unsigned int descbits,
+	      struct printk_info *infos)
+{
+	memset(descs, 0, _DESCS_COUNT(descbits) * sizeof(descs[0]));
+	memset(infos, 0, _DESCS_COUNT(descbits) * sizeof(infos[0]));
+
+	rb->desc_ring.count_bits = descbits;
+	rb->desc_ring.descs = descs;
+	rb->desc_ring.infos = infos;
+	atomic_long_set(&rb->desc_ring.head_id, DESC0_ID(descbits));
+	atomic_long_set(&rb->desc_ring.tail_id, DESC0_ID(descbits));
+
+	rb->text_data_ring.size_bits = textbits;
+	rb->text_data_ring.data = text_buf;
+	atomic_long_set(&rb->text_data_ring.head_lpos, BLK0_LPOS(textbits));
+	atomic_long_set(&rb->text_data_ring.tail_lpos, BLK0_LPOS(textbits));
+
+	atomic_long_set(&rb->fail, 0);
+
+	atomic_long_set(&(descs[_DESCS_COUNT(descbits) - 1].state_var), DESC0_SV(descbits));
+	descs[_DESCS_COUNT(descbits) - 1].text_blk_lpos.begin = FAILED_LPOS;
+	descs[_DESCS_COUNT(descbits) - 1].text_blk_lpos.next = FAILED_LPOS;
+
+	infos[0].seq = -(u64)_DESCS_COUNT(descbits);
+	infos[_DESCS_COUNT(descbits) - 1].seq = 0;
+}
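
For illustration only (not part of the patch): runtime initialization with
externally allocated buffers. The sizes (2^14 descriptors, 2^20 text bytes)
and the names are made up for the example.

	static char big_text[1UL << 20] __aligned(__alignof__(unsigned long));
	static struct prb_desc big_descs[1U << 14];
	static struct printk_info big_infos[1U << 14];
	static struct printk_ringbuffer big_rb;

	prb_init(&big_rb, &big_text[0], 20, big_descs, 14, big_infos);
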
+
+/**
+ * prb_record_text_space() - Query the full actual used ringbuffer space for
+ *                           the text data of a reserved entry.
+ *
+ * @e: The successfully reserved entry to query.
+ *
+ * This is the public function available to writers to see how much actual
+ * space is used in the ringbuffer to store the text data of the specified
+ * entry.
+ *
+ * This function is only valid if @e has been successfully reserved using
+ * prb_reserve().
+ *
+ * Context: Any context.
+ * Return: The size in bytes used by the text data of the associated record.
+ */
+unsigned int prb_record_text_space(struct prb_reserved_entry *e)
+{
+	return e->text_space;
+}
diff --git a/kernel/printk/printk_ringbuffer.h b/kernel/printk/printk_ringbuffer.h
new file mode 100644
index 0000000000000..5dc9d022db070
--- /dev/null
+++ b/kernel/printk/printk_ringbuffer.h
@@ -0,0 +1,382 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _KERNEL_PRINTK_RINGBUFFER_H
+#define _KERNEL_PRINTK_RINGBUFFER_H
+
+#include <linux/atomic.h>
+#include <linux/dev_printk.h>
+
+/*
+ * Meta information about each stored message.
+ *
+ * All fields are set by the printk code except for @seq, which is
+ * set by the ringbuffer code.
+ */
+struct printk_info {
+	u64	seq;		/* sequence number */
+	u64	ts_nsec;	/* timestamp in nanoseconds */
+	u16	text_len;	/* length of text message */
+	u8	facility;	/* syslog facility */
+	u8	flags:5;	/* internal record flags */
+	u8	level:3;	/* syslog level */
+	u32	caller_id;	/* thread id or processor id */
+
+	struct dev_printk_info	dev_info;
+};
+
+/*
+ * A structure providing the buffers, used by writers and readers.
+ *
+ * Writers:
+ * Using prb_rec_init_wr(), a writer sets @text_buf_size before calling
+ * prb_reserve(). On success, prb_reserve() sets @info and @text_buf to
+ * buffers reserved for that writer.
+ *
+ * Readers:
+ * Using prb_rec_init_rd(), a reader sets all fields before calling
+ * prb_read_valid(). Note that the reader provides the @info and @text_buf
+ * buffers. On success, the struct pointed to by @info will be filled and
+ * the char array pointed to by @text_buf will be filled with text data.
+ */
+struct printk_record {
+	struct printk_info	*info;
+	char			*text_buf;
+	unsigned int		text_buf_size;
+};
+
+/* Specifies the logical position and span of a data block. */
+struct prb_data_blk_lpos {
+	unsigned long	begin;
+	unsigned long	next;
+};
+
+/*
+ * A descriptor: the complete meta-data for a record.
+ *
+ * @state_var: A bitwise combination of descriptor ID and descriptor state.
+ */
+struct prb_desc {
+	atomic_long_t			state_var;
+	struct prb_data_blk_lpos	text_blk_lpos;
+};
+
+/* A ringbuffer of "ID + data" elements. */
+struct prb_data_ring {
+	unsigned int	size_bits;
+	char		*data;
+	atomic_long_t	head_lpos;
+	atomic_long_t	tail_lpos;
+};
+
+/* A ringbuffer of "struct prb_desc" elements. */
+struct prb_desc_ring {
+	unsigned int		count_bits;
+	struct prb_desc		*descs;
+	struct printk_info	*infos;
+	atomic_long_t		head_id;
+	atomic_long_t		tail_id;
+};
+
+/*
+ * The high level structure representing the printk ringbuffer.
+ *
+ * @fail: Count of failed prb_reserve() calls where not even a data-less
+ *        record was created.
+ */
+struct printk_ringbuffer {
+	struct prb_desc_ring	desc_ring;
+	struct prb_data_ring	text_data_ring;
+	atomic_long_t		fail;
+};
+
+/*
+ * Used by writers as a reserve/commit handle.
+ *
+ * @rb:         Ringbuffer where the entry is reserved.
+ * @irqflags:   Saved irq flags to restore on entry commit.
+ * @id:         ID of the reserved descriptor.
+ * @text_space: Total occupied buffer space in the text data ring, including
+ *              ID, alignment padding, and wrapping data blocks.
+ *
+ * This structure is an opaque handle for writers. Its contents are only
+ * to be used by the ringbuffer implementation.
+ */
+struct prb_reserved_entry {
+	struct printk_ringbuffer	*rb;
+	unsigned long			irqflags;
+	unsigned long			id;
+	unsigned int			text_space;
+};
+
+/* The possible responses of a descriptor state-query. */
+enum desc_state {
+	desc_miss	=  -1,	/* ID mismatch (pseudo state) */
+	desc_reserved	= 0x0,	/* reserved, in use by writer */
+	desc_committed	= 0x1,	/* committed by writer, could get reopened */
+	desc_finalized	= 0x2,	/* committed, no further modification allowed */
+	desc_reusable	= 0x3,	/* free, not yet used by any writer */
+};
+
+#define _DATA_SIZE(sz_bits)	(1UL << (sz_bits))
+#define _DESCS_COUNT(ct_bits)	(1U << (ct_bits))
+#define DESC_SV_BITS		(sizeof(unsigned long) * 8)
+#define DESC_FLAGS_SHIFT	(DESC_SV_BITS - 2)
+#define DESC_FLAGS_MASK		(3UL << DESC_FLAGS_SHIFT)
+#define DESC_STATE(sv)		(3UL & (sv >> DESC_FLAGS_SHIFT))
+#define DESC_SV(id, state)	(((unsigned long)state << DESC_FLAGS_SHIFT) | id)
+#define DESC_ID_MASK		(~DESC_FLAGS_MASK)
+#define DESC_ID(sv)		((sv) & DESC_ID_MASK)
+#define FAILED_LPOS		0x1
+#define NO_LPOS			0x3
+
+#define FAILED_BLK_LPOS	\
+{				\
+	.begin	= FAILED_LPOS,	\
+	.next	= FAILED_LPOS,	\
+}
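
For illustration only (not part of the patch): how a @state_var value packs
the descriptor state into the top two bits and the ID into the remaining bits,
using the macros above.

	unsigned long id = 42;
	unsigned long sv = DESC_SV(id, desc_committed);

	WARN_ON(DESC_STATE(sv) != desc_committed);
	WARN_ON(DESC_ID(sv) != id);
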
+
+/*
+ * Descriptor Bootstrap
+ *
+ * The descriptor array is minimally initialized to allow immediate usage
+ * by readers and writers. The requirements that the descriptor array
+ * initialization must satisfy:
+ *
+ *   Req1
+ *     The tail must point to an existing (committed or reusable) descriptor.
+ *     This is required by the implementation of prb_first_seq().
+ *
+ *   Req2
+ *     Readers must see that the ringbuffer is initially empty.
+ *
+ *   Req3
+ *     The first record reserved by a writer is assigned sequence number 0.
+ *
+ * To satisfy Req1, the tail initially points to a descriptor that is
+ * minimally initialized (having no data block, i.e. data-less with the
+ * data block's lpos @begin and @next values set to FAILED_LPOS).
+ *
+ * To satisfy Req2, the initial tail descriptor is initialized to the
+ * reusable state. Readers recognize reusable descriptors as existing
+ * records, but skip over them.
+ *
+ * To satisfy Req3, the last descriptor in the array is used as the initial
+ * head (and tail) descriptor. This allows the first record reserved by a
+ * writer (head + 1) to be the first descriptor in the array. (Only the first
+ * descriptor in the array could have a valid sequence number of 0.)
+ *
+ * The first time a descriptor is reserved, it is assigned a sequence number
+ * with the value of the array index. A "first time reserved" descriptor can
+ * be recognized because it has a sequence number of 0 but does not have an
+ * index of 0. (Only the first descriptor in the array could have a valid
+ * sequence number of 0.) After the first reservation, all future reservations
+ * (recycling) simply involve incrementing the sequence number by the array
+ * count.
+ *
+ *   Hack #1
+ *     Only the first descriptor in the array is allowed to have the sequence
+ *     number 0. In this case it is not possible to recognize if it is being
+ *     reserved the first time (set to index value) or has been reserved
+ *     previously (increment by the array count). This is handled by _always_
+ *     incrementing the sequence number by the array count when reserving the
+ *     first descriptor in the array. In order to satisfy Req3, the sequence
+ *     number of the first descriptor in the array is initialized to minus
+ *     the array count. Then, upon the first reservation, it is incremented
+ *     to 0, thus satisfying Req3.
+ *
+ *   Hack #2
+ *     prb_first_seq() can be called at any time by readers to retrieve the
+ *     sequence number of the tail descriptor. However, due to Req2 and Req3,
+ *     initially there are no records to report the sequence number of
+ *     (sequence numbers are u64 and there is nothing less than 0). To handle
+ *     this, the sequence number of the initial tail descriptor is initialized
+ *     to 0. Technically this is incorrect, because there is no record with
+ *     sequence number 0 (yet) and the tail descriptor is not the first
+ *     descriptor in the array. But it allows prb_read_valid() to correctly
+ *     report the existence of a record for _any_ given sequence number at all
+ *     times. Bootstrapping is complete when the tail is pushed the first
+ *     time, thus finally pointing to the first descriptor reserved by a
+ *     writer, which has the assigned sequence number 0.
+ */
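
As a worked example with illustrative numbers only: for ct_bits = 2 there are
4 descriptors. infos[0].seq is initialized to -4 and infos[3] (the initial
head/tail) to 0. The first reservation uses descriptor 0, whose sequence
number becomes -4 + 4 = 0 (Hack #1). Descriptors 1, 2 and 3 are then first
reserved with sequence numbers 1, 2 and 3 (their index values), and every
later recycle of a descriptor adds 4 to its sequence number.
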
+
+/*
+ * Initiating Logical Value Overflows
+ *
+ * Both logical position (lpos) and ID values can be mapped to array indexes
+ * but may experience overflows during the lifetime of the system. To ensure
+ * that printk_ringbuffer can handle the overflows for these types, initial
+ * values are chosen that map to the correct initial array indexes, but will
+ * result in overflows soon.
+ *
+ *   BLK0_LPOS
+ *     The initial @head_lpos and @tail_lpos for data rings. It is at index
+ *     0 and the lpos value is such that it will overflow on the first wrap.
+ *
+ *   DESC0_ID
+ *     The initial @head_id and @tail_id for the desc ring. It is at the last
+ *     index of the descriptor array (see Req3 above) and the ID value is such
+ *     that it will overflow on the second wrap.
+ */
+#define BLK0_LPOS(sz_bits)	(-(_DATA_SIZE(sz_bits)))
+#define DESC0_ID(ct_bits)	DESC_ID(-(_DESCS_COUNT(ct_bits) + 1))
+#define DESC0_SV(ct_bits)	DESC_SV(DESC0_ID(ct_bits), desc_reusable)
+
+/*
+ * Define a ringbuffer with an external text data buffer. The same as
+ * DEFINE_PRINTKRB() but requires specifying an external buffer for the
+ * text data.
+ *
+ * Note: The specified external buffer must be of the size:
+ *       2 ^ (descbits + avgtextbits)
+ */
+#define _DEFINE_PRINTKRB(name, descbits, avgtextbits, text_buf)			\
+static struct prb_desc _##name##_descs[_DESCS_COUNT(descbits)] = {				\
+	/* the initial head and tail */								\
+	[_DESCS_COUNT(descbits) - 1] = {							\
+		/* reusable */									\
+		.state_var	= ATOMIC_INIT(DESC0_SV(descbits)),				\
+		/* no associated data block */							\
+		.text_blk_lpos	= FAILED_BLK_LPOS,						\
+	},											\
+};												\
+static struct printk_info _##name##_infos[_DESCS_COUNT(descbits)] = {				\
+	/* this will be the first record reserved by a writer */				\
+	[0] = {											\
+		/* will be incremented to 0 on the first reservation */				\
+		.seq = -(u64)_DESCS_COUNT(descbits),						\
+	},											\
+	/* the initial head and tail */								\
+	[_DESCS_COUNT(descbits) - 1] = {							\
+		/* reports the first seq value during the bootstrap phase */			\
+		.seq = 0,									\
+	},											\
+};												\
+static struct printk_ringbuffer name = {							\
+	.desc_ring = {										\
+		.count_bits	= descbits,							\
+		.descs		= &_##name##_descs[0],						\
+		.infos		= &_##name##_infos[0],						\
+		.head_id	= ATOMIC_INIT(DESC0_ID(descbits)),				\
+		.tail_id	= ATOMIC_INIT(DESC0_ID(descbits)),				\
+	},											\
+	.text_data_ring = {									\
+		.size_bits	= (avgtextbits) + (descbits),					\
+		.data		= text_buf,							\
+		.head_lpos	= ATOMIC_LONG_INIT(BLK0_LPOS((avgtextbits) + (descbits))),	\
+		.tail_lpos	= ATOMIC_LONG_INIT(BLK0_LPOS((avgtextbits) + (descbits))),	\
+	},											\
+	.fail			= ATOMIC_LONG_INIT(0),						\
+}
+
+/**
+ * DEFINE_PRINTKRB() - Define a ringbuffer.
+ *
+ * @name:        The name of the ringbuffer variable.
+ * @descbits:    The number of descriptors as a power-of-2 value.
+ * @avgtextbits: The average text data size per record as a power-of-2 value.
+ *
+ * This is a macro for defining a ringbuffer and all internal structures
+ * such that it is ready for immediate use. See _DEFINE_PRINTKRB() for a
+ * variant where the text data buffer can be specified externally.
+ */
+#define DEFINE_PRINTKRB(name, descbits, avgtextbits)				\
+static char _##name##_text[1U << ((avgtextbits) + (descbits))]			\
+			__aligned(__alignof__(unsigned long));			\
+_DEFINE_PRINTKRB(name, descbits, avgtextbits, &_##name##_text[0])
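
For illustration only (not part of the patch): defining a small ringbuffer;
the sizes are arbitrary.

/*
 * 2^7 = 128 descriptors, 2^5 = 32 bytes of average text per record,
 * giving a 2^(7+5) = 4 KiB text data ring.
 */
DEFINE_PRINTKRB(test_rb, 7, 5);
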
+
+/* Writer Interface */
+
+/**
+ * prb_rec_init_wr() - Initialize a buffer for writing records.
+ *
+ * @r:             The record to initialize.
+ * @text_buf_size: The needed text buffer size.
+ */
+static inline void prb_rec_init_wr(struct printk_record *r,
+				   unsigned int text_buf_size)
+{
+	r->info = NULL;
+	r->text_buf = NULL;
+	r->text_buf_size = text_buf_size;
+}
+
+bool prb_reserve(struct prb_reserved_entry *e, struct printk_ringbuffer *rb,
+		 struct printk_record *r);
+bool prb_reserve_in_last(struct prb_reserved_entry *e, struct printk_ringbuffer *rb,
+			 struct printk_record *r, u32 caller_id, unsigned int max_size);
+void prb_commit(struct prb_reserved_entry *e);
+void prb_final_commit(struct prb_reserved_entry *e);
+
+void prb_init(struct printk_ringbuffer *rb,
+	      char *text_buf, unsigned int text_buf_size,
+	      struct prb_desc *descs, unsigned int descs_count_bits,
+	      struct printk_info *infos);
+unsigned int prb_record_text_space(struct prb_reserved_entry *e);
+
+/* Reader Interface */
+
+/**
+ * prb_rec_init_rd() - Initialize a buffer for reading records.
+ *
+ * @r:             The record to initialize.
+ * @info:          A buffer to store record meta-data.
+ * @text_buf:      A buffer to store text data.
+ * @text_buf_size: The size of @text_buf.
+ *
+ * Initialize all the fields that a reader is interested in. All arguments
+ * (except @r) are optional. Only record data for arguments that are
+ * non-NULL or non-zero will be read.
+ */
+static inline void prb_rec_init_rd(struct printk_record *r,
+				   struct printk_info *info,
+				   char *text_buf, unsigned int text_buf_size)
+{
+	r->info = info;
+	r->text_buf = text_buf;
+	r->text_buf_size = text_buf_size;
+}
+
+/**
+ * prb_for_each_record() - Iterate over the records of a ringbuffer.
+ *
+ * @from: The sequence number to begin with.
+ * @rb:   The ringbuffer to iterate over.
+ * @s:    A u64 to store the sequence number on each iteration.
+ * @r:    A printk_record to store the record on each iteration.
+ *
+ * This is a macro for conveniently iterating over a ringbuffer.
+ * Note that @s may not be the sequence number of the record on each
+ * iteration. For the sequence number, @r->info->seq should be checked.
+ *
+ * Context: Any context.
+ */
+#define prb_for_each_record(from, rb, s, r) \
+for ((s) = from; prb_read_valid(rb, s, r); (s) = (r)->info->seq + 1)
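
For illustration only (not part of the patch): iterating all records of a
ringbuffer `rb` (a placeholder name) and detecting dropped records via the
returned sequence numbers.

	struct printk_info info;
	struct printk_record r;
	char text[64];
	u64 seq;

	prb_rec_init_rd(&r, &info, &text[0], sizeof(text));

	prb_for_each_record(0, &rb, seq, &r) {
		size_t len = min_t(size_t, info.text_len, sizeof(text) - 1);

		if (info.seq != seq)
			pr_warn("lost %llu records\n", info.seq - seq);

		text[len] = '\0';
		pr_info("%llu: %s\n", info.seq, text);
	}
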
+
+/**
+ * prb_for_each_info() - Iterate over the meta data of a ringbuffer.
+ *
+ * @from: The sequence number to begin with.
+ * @rb:   The ringbuffer to iterate over.
+ * @s:    A u64 to store the sequence number on each iteration.
+ * @i:    A printk_info to store the record meta data on each iteration.
+ * @lc:   An unsigned int to store the text line count of each record.
+ *
+ * This is a macro for conveniently iterating over a ringbuffer.
+ * Note that @s may not be the sequence number of the record on each
+ * iteration. For the sequence number, @i->seq should be checked.
+ *
+ * Context: Any context.
+ */
+#define prb_for_each_info(from, rb, s, i, lc) \
+for ((s) = from; prb_read_valid_info(rb, s, i, lc); (s) = (i)->seq + 1)
+
+bool prb_read_valid(struct printk_ringbuffer *rb, u64 seq,
+		    struct printk_record *r);
+bool prb_read_valid_info(struct printk_ringbuffer *rb, u64 seq,
+			 struct printk_info *info, unsigned int *line_count);
+
+u64 prb_first_valid_seq(struct printk_ringbuffer *rb);
+u64 prb_next_seq(struct printk_ringbuffer *rb);
+
+#endif /* _KERNEL_PRINTK_RINGBUFFER_H */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 29d8062ec4f5c..2d54f1e7ef867 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2103,7 +2103,75 @@ struct set_affinity_pending {
 };
 
 /*
- * This function is wildly self concurrent, consider at least 3 times.
+ * This function is wildly self concurrent; here be dragons.
+ *
+ *
+ * When given a valid mask, __set_cpus_allowed_ptr() must block until the
+ * designated task is enqueued on an allowed CPU. If that task is currently
+ * running, we have to kick it out using the CPU stopper.
+ *
+ * Migrate-Disable comes along and tramples all over our nice sandcastle.
+ * Consider:
+ *
+ *     Initial conditions: P0->cpus_mask = [0, 1]
+ *
+ *     P0@CPU0                  P1
+ *
+ *     migrate_disable();
+ *     <preempted>
+ *                              set_cpus_allowed_ptr(P0, [1]);
+ *
+ * P1 *cannot* return from this set_cpus_allowed_ptr() call until P0 executes
+ * its outermost migrate_enable() (i.e. it exits its Migrate-Disable region).
+ * This means we need the following scheme:
+ *
+ *     P0@CPU0                  P1
+ *
+ *     migrate_disable();
+ *     <preempted>
+ *                              set_cpus_allowed_ptr(P0, [1]);
+ *                                <blocks>
+ *     <resumes>
+ *     migrate_enable();
+ *       __set_cpus_allowed_ptr();
+ *       <wakes local stopper>
+ *                         `--> <woken on migration completion>
+ *
+ * Now the fun stuff: there may be several P1-like tasks, i.e. multiple
+ * concurrent set_cpus_allowed_ptr(P0, [*]) calls. CPU affinity changes of any
+ * task p are serialized by p->pi_lock, which we can leverage: the one that
+ * should come into effect at the end of the Migrate-Disable region is the last
+ * one. This means we only need to track a single cpumask (i.e. p->cpus_mask),
+ * but we still need to properly signal those waiting tasks at the appropriate
+ * moment.
+ *
+ * This is implemented using struct set_affinity_pending. The first
+ * __set_cpus_allowed_ptr() caller within a given Migrate-Disable region will
+ * setup an instance of that struct and install it on the targeted task_struct.
+ * Any and all further callers will reuse that instance. Those then wait for
+ * a completion signaled at the tail of the CPU stopper callback (1), triggered
+ * on the end of the Migrate-Disable region (i.e. outermost migrate_enable()).
+ *
+ *
+ * (1) In the cases covered above. There is one more case where the completion
+ * is signaled within affine_move_task() itself: when a subsequent affinity
+ * request cancels the need for an active migration. Consider:
+ *
+ *     Initial conditions: P0->cpus_mask = [0, 1]
+ *
+ *     P0@CPU0            P1                             P2
+ *
+ *     migrate_disable();
+ *     <preempted>
+ *                        set_cpus_allowed_ptr(P0, [1]);
+ *                          <blocks>
+ *                                                       set_cpus_allowed_ptr(P0, [0, 1]);
+ *                                                         <signal completion>
+ *                          <awakes>
+ *
+ * Note that the above is safe vs a concurrent migrate_enable(), as any
+ * pending affinity completion is preceded by an uninstallation of
+ * p->migration_pending done with p->pi_lock held.
  */
 static int affine_move_task(struct rq *rq, struct rq_flags *rf,
 			    struct task_struct *p, int dest_cpu, unsigned int flags)
@@ -2127,6 +2195,7 @@ static int affine_move_task(struct rq *rq, struct rq_flags *rf,
 
 		pending = p->migration_pending;
 		if (pending) {
+			refcount_inc(&pending->refs);
 			p->migration_pending = NULL;
 			complete = true;
 		}
@@ -2146,6 +2215,7 @@ static int affine_move_task(struct rq *rq, struct rq_flags *rf,
 	if (!(flags & SCA_MIGRATE_ENABLE)) {
 		/* serialized by p->pi_lock */
 		if (!p->migration_pending) {
+			/* Install the request */
 			refcount_set(&my_pending.refs, 1);
 			init_completion(&my_pending.done);
 			p->migration_pending = &my_pending;
@@ -2184,7 +2254,11 @@ static int affine_move_task(struct rq *rq, struct rq_flags *rf,
 	}
 
 	if (task_running(rq, p) || p->state == TASK_WAKING) {
-
+		/*
+		 * Lessen races (and headaches) by delegating
+		 * is_migration_disabled(p) checks to the stopper, which will
+		 * run on the same CPU as said p.
+		 */
 		task_rq_unlock(rq, p, rf);
 		stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
 
@@ -2209,6 +2283,10 @@ static int affine_move_task(struct rq *rq, struct rq_flags *rf,
 	if (refcount_dec_and_test(&pending->refs))
 		wake_up_var(&pending->refs);
 
+	/*
+	 * Block the original owner of &pending until all subsequent callers
+	 * have seen the completion and decremented the refcount
+	 */
 	wait_var_event(&my_pending.refs, !refcount_read(&my_pending.refs));
 
 	return 0;
@@ -2257,8 +2335,17 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
 		goto out;
 	}
 
-	if (!(flags & SCA_MIGRATE_ENABLE) && cpumask_equal(&p->cpus_mask, new_mask))
-		goto out;
+	if (!(flags & SCA_MIGRATE_ENABLE)) {
+		if (cpumask_equal(&p->cpus_mask, new_mask))
+			goto out;
+
+		if (WARN_ON_ONCE(p == current &&
+				 is_migration_disabled(p) &&
+				 !cpumask_test_cpu(task_cpu(p), new_mask))) {
+			ret = -EBUSY;
+			goto out;
+		}
+	}
 
 	/*
 	 * Picking a ~random cpu helps in cases where we are changing affinity
@@ -3960,20 +4047,19 @@ static inline void balance_callbacks(struct rq *rq, struct callback_head *head)
 	}
 }
 
-static bool balance_push(struct rq *rq);
+static void balance_push(struct rq *rq);
 
 static inline void balance_switch(struct rq *rq)
 {
-	if (unlikely(rq->balance_flags)) {
-		/*
-		 * Run the balance_callbacks, except on hotplug
-		 * when we need to push the current task away.
-		 */
-		if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) ||
-		    !(rq->balance_flags & BALANCE_PUSH) ||
-		    !balance_push(rq))
-			__balance_callbacks(rq);
+	if (likely(!rq->balance_flags))
+		return;
+
+	if (rq->balance_flags & BALANCE_PUSH) {
+		balance_push(rq);
+		return;
 	}
+
+	__balance_callbacks(rq);
 }
 
 #else
@@ -7233,7 +7319,7 @@ static DEFINE_PER_CPU(struct cpu_stop_work, push_work);
 /*
  * Ensure we only run per-cpu kthreads once the CPU goes !active.
  */
-static bool balance_push(struct rq *rq)
+static void balance_push(struct rq *rq)
 {
 	struct task_struct *push_task = rq->curr;
 
@@ -7262,7 +7348,7 @@ static bool balance_push(struct rq *rq)
 			rcuwait_wake_up(&rq->hotplug_wait);
 			raw_spin_lock(&rq->lock);
 		}
-		return false;
+		return;
 	}
 
 	get_task_struct(push_task);
@@ -7279,8 +7365,6 @@ static bool balance_push(struct rq *rq)
 	 * which is_per_cpu_kthread() and will push this task away.
 	 */
 	raw_spin_lock(&rq->lock);
-
-	return true;
 }
 
 static void balance_push_set(int cpu, bool on)
@@ -7313,12 +7397,11 @@ static void balance_hotplug_wait(void)
 
 #else
 
-static inline bool balance_push(struct rq *rq)
+static inline void balance_push(struct rq *rq)
 {
-	return false;
 }
 
-static void balance_push_set(int cpu, bool on)
+static inline void balance_push_set(int cpu, bool on)
 {
 }
 
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 15320ede2f456..6df71d487ed06 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1978,8 +1978,8 @@ static int find_later_rq(struct task_struct *task)
 				return this_cpu;
 			}
 
-			best_cpu = cpumask_first_and(later_mask,
-							sched_domain_span(sd));
+			best_cpu = cpumask_any_and_distribute(later_mask,
+							      sched_domain_span(sd));
 			/*
 			 * Last chance: if a CPU being in both later_mask
 			 * and current sd span is valid, that becomes our
@@ -2105,6 +2105,9 @@ static int push_dl_task(struct rq *rq)
 		return 0;
 
 retry:
+	if (is_migration_disabled(next_task))
+		return 0;
+
 	if (WARN_ON(next_task == rq->curr))
 		return 0;
 
@@ -2336,6 +2339,9 @@ static void rq_online_dl(struct rq *rq)
 /* Assumes rq->lock is held */
 static void rq_offline_dl(struct rq *rq)
 {
+	if (rq->dl.overloaded)
+		dl_clear_overload(rq);
+
 	cpudl_clear(&rq->rd->cpudl, rq->cpu);
 	cpudl_clear_freecpu(&rq->rd->cpudl, rq->cpu);
 }
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index e90a69b3e85c0..03f7b397716dd 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -2289,6 +2289,9 @@ static void rq_online_rt(struct rq *rq)
 /* Assumes rq->lock is held */
 static void rq_offline_rt(struct rq *rq)
 {
+	if (rq->rt.overloaded)
+		rt_clear_overload(rq);
+
 	__disable_runtime(rq);
 
 	cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_INVALID);
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index cd5f1440c5bea..16fcda68c2b6b 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -61,23 +61,6 @@ config CONSOLE_LOGLEVEL_QUIET
 	  will be used as the loglevel. IOW passing "quiet" will be the
 	  equivalent of passing "loglevel=<CONSOLE_LOGLEVEL_QUIET>"
 
-config CONSOLE_LOGLEVEL_EMERGENCY
-	int "Emergency console loglevel (1-15)"
-	range 1 15
-	default "5"
-	help
-	  The loglevel to determine if a console message is an emergency
-	  message.
-
-	  If supported by the console driver, emergency messages will be
-	  flushed to the console immediately. This can cause significant system
-	  latencies so the value should be set such that only significant
-	  messages are classified as emergency messages.
-
-	  Setting a default here is equivalent to passing in
-	  emergency_loglevel=<x> in the kernel bootargs. emergency_loglevel=<x>
-	  continues to override whatever value is specified here as well.
-
 config MESSAGE_LOGLEVEL_DEFAULT
 	int "Default message log level (1-7)"
 	range 1 7
diff --git a/lib/Makefile b/lib/Makefile
index e2822830764a1..a4a4c6864f518 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -32,7 +32,7 @@ KCSAN_SANITIZE_random32.o := n
 
 lib-y := ctype.o string.o vsprintf.o cmdline.o \
 	 rbtree.o radix-tree.o timerqueue.o xarray.o \
-	 idr.o extable.o sha1.o irq_regs.o argv_split.o printk_ringbuffer.o \
+	 idr.o extable.o sha1.o irq_regs.o argv_split.o \
 	 flex_proportions.o ratelimit.o show_mem.o \
 	 is_single_threaded.o plist.o decompress.o kobject_uevent.o \
 	 earlycpio.o seq_buf.o siphash.o dec_and_lock.o \
diff --git a/lib/bust_spinlocks.c b/lib/bust_spinlocks.c
index c6e083323d1b9..8be59f84eaeaf 100644
--- a/lib/bust_spinlocks.c
+++ b/lib/bust_spinlocks.c
@@ -26,6 +26,7 @@ void bust_spinlocks(int yes)
 		unblank_screen();
 #endif
 		console_unblank();
-		--oops_in_progress;
+		if (--oops_in_progress == 0)
+			wake_up_klogd();
 	}
 }
diff --git a/lib/printk_ringbuffer.c b/lib/printk_ringbuffer.c
deleted file mode 100644
index 9a31d7dbdc005..0000000000000
--- a/lib/printk_ringbuffer.c
+++ /dev/null
@@ -1,589 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <linux/sched.h>
-#include <linux/smp.h>
-#include <linux/string.h>
-#include <linux/errno.h>
-#include <linux/printk_ringbuffer.h>
-
-#define PRB_SIZE(rb) (1 << rb->size_bits)
-#define PRB_SIZE_BITMASK(rb) (PRB_SIZE(rb) - 1)
-#define PRB_INDEX(rb, lpos) (lpos & PRB_SIZE_BITMASK(rb))
-#define PRB_WRAPS(rb, lpos) (lpos >> rb->size_bits)
-#define PRB_WRAP_LPOS(rb, lpos, xtra) \
-	((PRB_WRAPS(rb, lpos) + xtra) << rb->size_bits)
-#define PRB_DATA_SIZE(e) (e->size - sizeof(struct prb_entry))
-#define PRB_DATA_ALIGN sizeof(long)
-
-static bool __prb_trylock(struct prb_cpulock *cpu_lock,
-			  unsigned int *cpu_store)
-{
-	unsigned long *flags;
-	unsigned int cpu;
-
-	cpu = get_cpu();
-
-	*cpu_store = atomic_read(&cpu_lock->owner);
-	/* memory barrier to ensure the current lock owner is visible */
-	smp_rmb();
-	if (*cpu_store == -1) {
-		flags = per_cpu_ptr(cpu_lock->irqflags, cpu);
-		local_irq_save(*flags);
-		if (atomic_try_cmpxchg_acquire(&cpu_lock->owner,
-					       cpu_store, cpu)) {
-			return true;
-		}
-		local_irq_restore(*flags);
-	} else if (*cpu_store == cpu) {
-		return true;
-	}
-
-	put_cpu();
-	return false;
-}
-
-/*
- * prb_lock: Perform a processor-reentrant spin lock.
- * @cpu_lock: A pointer to the lock object.
- * @cpu_store: A "flags" pointer to store lock status information.
- *
- * If no processor has the lock, the calling processor takes the lock and
- * becomes the owner. If the calling processor is already the owner of the
- * lock, this function succeeds immediately. If lock is locked by another
- * processor, this function spins until the calling processor becomes the
- * owner.
- *
- * It is safe to call this function from any context and state.
- */
-void prb_lock(struct prb_cpulock *cpu_lock, unsigned int *cpu_store)
-{
-	for (;;) {
-		if (__prb_trylock(cpu_lock, cpu_store))
-			break;
-		cpu_relax();
-	}
-}
-
-/*
- * prb_unlock: Perform a processor-reentrant spin unlock.
- * @cpu_lock: A pointer to the lock object.
- * @cpu_store: A "flags" object storing lock status information.
- *
- * Release the lock. The calling processor must be the owner of the lock.
- *
- * It is safe to call this function from any context and state.
- */
-void prb_unlock(struct prb_cpulock *cpu_lock, unsigned int cpu_store)
-{
-	unsigned long *flags;
-	unsigned int cpu;
-
-	cpu = atomic_read(&cpu_lock->owner);
-	atomic_set_release(&cpu_lock->owner, cpu_store);
-
-	if (cpu_store == -1) {
-		flags = per_cpu_ptr(cpu_lock->irqflags, cpu);
-		local_irq_restore(*flags);
-	}
-
-	put_cpu();
-}
-
-static struct prb_entry *to_entry(struct printk_ringbuffer *rb,
-				  unsigned long lpos)
-{
-	char *buffer = rb->buffer;
-	buffer += PRB_INDEX(rb, lpos);
-	return (struct prb_entry *)buffer;
-}
-
-static int calc_next(struct printk_ringbuffer *rb, unsigned long tail,
-		     unsigned long lpos, int size, unsigned long *calced_next)
-{
-	unsigned long next_lpos;
-	int ret = 0;
-again:
-	next_lpos = lpos + size;
-	if (next_lpos - tail > PRB_SIZE(rb))
-		return -1;
-
-	if (PRB_WRAPS(rb, lpos) != PRB_WRAPS(rb, next_lpos)) {
-		lpos = PRB_WRAP_LPOS(rb, next_lpos, 0);
-		ret |= 1;
-		goto again;
-	}
-
-	*calced_next = next_lpos;
-	return ret;
-}
-
-static bool push_tail(struct printk_ringbuffer *rb, unsigned long tail)
-{
-	unsigned long new_tail;
-	struct prb_entry *e;
-	unsigned long head;
-
-	if (tail != atomic_long_read(&rb->tail))
-		return true;
-
-	e = to_entry(rb, tail);
-	if (e->size != -1)
-		new_tail = tail + e->size;
-	else
-		new_tail = PRB_WRAP_LPOS(rb, tail, 1);
-
-	/* make sure the new tail does not overtake the head */
-	head = atomic_long_read(&rb->head);
-	if (head - new_tail > PRB_SIZE(rb))
-		return false;
-
-	atomic_long_cmpxchg(&rb->tail, tail, new_tail);
-	return true;
-}
-
-/*
- * prb_commit: Commit a reserved entry to the ring buffer.
- * @h: An entry handle referencing the data entry to commit.
- *
- * Commit data that has been reserved using prb_reserve(). Once the data
- * block has been committed, it can be invalidated at any time. If a writer
- * is interested in using the data after committing, the writer should make
- * its own copy first or use the prb_iter_ reader functions to access the
- * data in the ring buffer.
- *
- * It is safe to call this function from any context and state.
- */
-void prb_commit(struct prb_handle *h)
-{
-	struct printk_ringbuffer *rb = h->rb;
-	bool changed = false;
-	struct prb_entry *e;
-	unsigned long head;
-	unsigned long res;
-
-	for (;;) {
-		if (atomic_read(&rb->ctx) != 1) {
-			/* the interrupted context will fixup head */
-			atomic_dec(&rb->ctx);
-			break;
-		}
-		/* assign sequence numbers before moving head */
-		head = atomic_long_read(&rb->head);
-		res = atomic_long_read(&rb->reserve);
-		while (head != res) {
-			e = to_entry(rb, head);
-			if (e->size == -1) {
-				head = PRB_WRAP_LPOS(rb, head, 1);
-				continue;
-			}
-			while (atomic_long_read(&rb->lost)) {
-				atomic_long_dec(&rb->lost);
-				rb->seq++;
-			}
-			e->seq = ++rb->seq;
-			head += e->size;
-			changed = true;
-		}
-		atomic_long_set_release(&rb->head, res);
-
-		atomic_dec(&rb->ctx);
-
-		if (atomic_long_read(&rb->reserve) == res)
-			break;
-		atomic_inc(&rb->ctx);
-	}
-
-	prb_unlock(rb->cpulock, h->cpu);
-
-	if (changed) {
-		atomic_long_inc(&rb->wq_counter);
-		if (wq_has_sleeper(rb->wq)) {
-#ifdef CONFIG_IRQ_WORK
-			irq_work_queue(rb->wq_work);
-#else
-			if (!in_nmi())
-				wake_up_interruptible_all(rb->wq);
-#endif
-		}
-	}
-}
-
-/*
- * prb_reserve: Reserve an entry within a ring buffer.
- * @h: An entry handle to be setup and reference an entry.
- * @rb: A ring buffer to reserve data within.
- * @size: The number of bytes to reserve.
- *
- * Reserve an entry of at least @size bytes to be used by the caller. If
- * successful, the data region of the entry belongs to the caller and cannot
- * be invalidated by any other task/context. For this reason, the caller
- * should call prb_commit() as quickly as possible in order to avoid preventing
- * other tasks/contexts from reserving data in the case that the ring buffer
- * has wrapped.
- *
- * It is safe to call this function from any context and state.
- *
- * Returns a pointer to the reserved entry (and @h is setup to reference that
- * entry) or NULL if it was not possible to reserve data.
- */
-char *prb_reserve(struct prb_handle *h, struct printk_ringbuffer *rb,
-		  unsigned int size)
-{
-	unsigned long tail, res1, res2;
-	int ret;
-
-	if (size == 0)
-		return NULL;
-	size += sizeof(struct prb_entry);
-	size += PRB_DATA_ALIGN - 1;
-	size &= ~(PRB_DATA_ALIGN - 1);
-	if (size >= PRB_SIZE(rb))
-		return NULL;
-
-	h->rb = rb;
-	prb_lock(rb->cpulock, &h->cpu);
-
-	atomic_inc(&rb->ctx);
-
-	do {
-		for (;;) {
-			tail = atomic_long_read(&rb->tail);
-			res1 = atomic_long_read(&rb->reserve);
-			ret = calc_next(rb, tail, res1, size, &res2);
-			if (ret >= 0)
-				break;
-			if (!push_tail(rb, tail)) {
-				prb_commit(h);
-				return NULL;
-			}
-		}
-	} while (!atomic_long_try_cmpxchg_acquire(&rb->reserve, &res1, res2));
-
-	h->entry = to_entry(rb, res1);
-
-	if (ret) {
-		/* handle wrap */
-		h->entry->size = -1;
-		h->entry = to_entry(rb, PRB_WRAP_LPOS(rb, res2, 0));
-	}
-
-	h->entry->size = size;
-
-	return &h->entry->data[0];
-}
-
-/*
- * prb_iter_copy: Copy an iterator.
- * @dest: The iterator to copy to.
- * @src: The iterator to copy from.
- *
- * Make a deep copy of an iterator. This is particularly useful for making
- * backup copies of an iterator in case a form of rewinding is needed.
- *
- * It is safe to call this function from any context and state. But
- * note that this function is not atomic. Callers should not make copies
- * to/from iterators that can be accessed by other tasks/contexts.
- */
-void prb_iter_copy(struct prb_iterator *dest, struct prb_iterator *src)
-{
-	memcpy(dest, src, sizeof(*dest));
-}
-
-/*
- * prb_iter_init: Initialize an iterator for a ring buffer.
- * @iter: The iterator to initialize.
- * @rb: A ring buffer that @iter should iterate.
- * @seq: The sequence number of the position preceding the first record.
- *       May be NULL.
- *
- * Initialize an iterator to be used with a specified ring buffer. If @seq
- * is non-NULL, it will be set such that prb_iter_next() will provide a
- * sequence value of "@seq + 1" if no records were missed.
- *
- * It is safe to call this function from any context and state.
- */
-void prb_iter_init(struct prb_iterator *iter, struct printk_ringbuffer *rb,
-		   u64 *seq)
-{
-	memset(iter, 0, sizeof(*iter));
-	iter->rb = rb;
-	iter->lpos = PRB_INIT;
-
-	if (!seq)
-		return;
-
-	for (;;) {
-		struct prb_iterator tmp_iter;
-		int ret;
-
-		prb_iter_copy(&tmp_iter, iter);
-
-		ret = prb_iter_next(&tmp_iter, NULL, 0, seq);
-		if (ret < 0)
-			continue;
-
-		if (ret == 0)
-			*seq = 0;
-		else
-			(*seq)--;
-		break;
-	}
-}
-
-static bool is_valid(struct printk_ringbuffer *rb, unsigned long lpos)
-{
-	unsigned long head, tail;
-
-	tail = atomic_long_read(&rb->tail);
-	head = atomic_long_read(&rb->head);
-	head -= tail;
-	lpos -= tail;
-
-	if (lpos >= head)
-		return false;
-	return true;
-}
-
-/*
- * prb_iter_data: Retrieve the record data at the current position.
- * @iter: Iterator tracking the current position.
- * @buf: A buffer to store the data of the record. May be NULL.
- * @size: The size of @buf. (Ignored if @buf is NULL.)
- * @seq: The sequence number of the record. May be NULL.
- *
- * If @iter is at a record, provide the data and/or sequence number of that
- * record (if specified by the caller).
- *
- * It is safe to call this function from any context and state.
- *
- * Returns >=0 if the current record contains valid data (returns 0 if @buf
- * is NULL or returns the size of the data block if @buf is non-NULL) or
- * -EINVAL if @iter is now invalid.
- */
-int prb_iter_data(struct prb_iterator *iter, char *buf, int size, u64 *seq)
-{
-	struct printk_ringbuffer *rb = iter->rb;
-	unsigned long lpos = iter->lpos;
-	unsigned int datsize = 0;
-	struct prb_entry *e;
-
-	if (buf || seq) {
-		e = to_entry(rb, lpos);
-		if (!is_valid(rb, lpos))
-			return -EINVAL;
-		/* memory barrier to ensure valid lpos */
-		smp_rmb();
-		if (buf) {
-			datsize = PRB_DATA_SIZE(e);
-			/* memory barrier to ensure load of datsize */
-			smp_rmb();
-			if (!is_valid(rb, lpos))
-				return -EINVAL;
-			if (PRB_INDEX(rb, lpos) + datsize >
-			    PRB_SIZE(rb) - PRB_DATA_ALIGN) {
-				return -EINVAL;
-			}
-			if (size > datsize)
-				size = datsize;
-			memcpy(buf, &e->data[0], size);
-		}
-		if (seq)
-			*seq = e->seq;
-		/* memory barrier to ensure loads of entry data */
-		smp_rmb();
-	}
-
-	if (!is_valid(rb, lpos))
-		return -EINVAL;
-
-	return datsize;
-}
-
-/*
- * prb_iter_next: Advance to the next record.
- * @iter: Iterator tracking the current position.
- * @buf: A buffer to store the data of the next record. May be NULL.
- * @size: The size of @buf. (Ignored if @buf is NULL.)
- * @seq: The sequence number of the next record. May be NULL.
- *
- * If a next record is available, @iter is advanced and (if specified)
- * the data and/or sequence number of that record are provided.
- *
- * It is safe to call this function from any context and state.
- *
- * Returns 1 if @iter was advanced, 0 if @iter is at the end of the list, or
- * -EINVAL if @iter is now invalid.
- */
-int prb_iter_next(struct prb_iterator *iter, char *buf, int size, u64 *seq)
-{
-	struct printk_ringbuffer *rb = iter->rb;
-	unsigned long next_lpos;
-	struct prb_entry *e;
-	unsigned int esize;
-
-	if (iter->lpos == PRB_INIT) {
-		next_lpos = atomic_long_read(&rb->tail);
-	} else {
-		if (!is_valid(rb, iter->lpos))
-			return -EINVAL;
-		/* memory barrier to ensure valid lpos */
-		smp_rmb();
-		e = to_entry(rb, iter->lpos);
-		esize = e->size;
-		/* memory barrier to ensure load of size */
-		smp_rmb();
-		if (!is_valid(rb, iter->lpos))
-			return -EINVAL;
-		next_lpos = iter->lpos + esize;
-	}
-	if (next_lpos == atomic_long_read(&rb->head))
-		return 0;
-	if (!is_valid(rb, next_lpos))
-		return -EINVAL;
-	/* memory barrier to ensure valid lpos */
-	smp_rmb();
-
-	iter->lpos = next_lpos;
-	e = to_entry(rb, iter->lpos);
-	esize = e->size;
-	/* memory barrier to ensure load of size */
-	smp_rmb();
-	if (!is_valid(rb, iter->lpos))
-		return -EINVAL;
-	if (esize == -1)
-		iter->lpos = PRB_WRAP_LPOS(rb, iter->lpos, 1);
-
-	if (prb_iter_data(iter, buf, size, seq) < 0)
-		return -EINVAL;
-
-	return 1;
-}
-
-/*
- * prb_iter_wait_next: Advance to the next record, blocking if none available.
- * @iter: Iterator tracking the current position.
- * @buf: A buffer to store the data of the next record. May be NULL.
- * @size: The size of @buf. (Ignored if @buf is NULL.)
- * @seq: The sequence number of the next record. May be NULL.
- *
- * If a next record is already available, this function works like
- * prb_iter_next(). Otherwise block interruptible until a next record is
- * available.
- *
- * When a next record is available, @iter is advanced and (if specified)
- * the data and/or sequence number of that record are provided.
- *
- * This function might sleep.
- *
- * Returns 1 if @iter was advanced, -EINVAL if @iter is now invalid, or
- * -ERESTARTSYS if interrupted by a signal.
- */
-int prb_iter_wait_next(struct prb_iterator *iter, char *buf, int size, u64 *seq)
-{
-	unsigned long last_seen;
-	int ret;
-
-	for (;;) {
-		last_seen = atomic_long_read(&iter->rb->wq_counter);
-
-		ret = prb_iter_next(iter, buf, size, seq);
-		if (ret != 0)
-			break;
-
-		ret = wait_event_interruptible(*iter->rb->wq,
-			last_seen != atomic_long_read(&iter->rb->wq_counter));
-		if (ret < 0)
-			break;
-	}
-
-	return ret;
-}
-
-/*
- * prb_iter_seek: Seek forward to a specific record.
- * @iter: Iterator to advance.
- * @seq: Record number to advance to.
- *
- * Advance @iter such that a following call to prb_iter_data() will provide
- * the contents of the specified record. If a record is specified that does
- * not yet exist, advance @iter to the end of the record list.
- *
- * Note that iterators cannot be rewound. So if a record is requested that
- * exists but is previous to @iter in position, @iter is considered invalid.
- *
- * It is safe to call this function from any context and state.
- *
- * Returns 1 on success, 0 if specified record does not yet exist (@iter is
- * now at the end of the list), or -EINVAL if @iter is now invalid.
- */
-int prb_iter_seek(struct prb_iterator *iter, u64 seq)
-{
-	u64 cur_seq;
-	int ret;
-
-	/* first check if the iterator is already at the wanted seq */
-	if (seq == 0) {
-		if (iter->lpos == PRB_INIT)
-			return 1;
-		else
-			return -EINVAL;
-	}
-	if (iter->lpos != PRB_INIT) {
-		if (prb_iter_data(iter, NULL, 0, &cur_seq) >= 0) {
-			if (cur_seq == seq)
-				return 1;
-			if (cur_seq > seq)
-				return -EINVAL;
-		}
-	}
-
-	/* iterate to find the wanted seq */
-	for (;;) {
-		ret = prb_iter_next(iter, NULL, 0, &cur_seq);
-		if (ret <= 0)
-			break;
-
-		if (cur_seq == seq)
-			break;
-
-		if (cur_seq > seq) {
-			ret = -EINVAL;
-			break;
-		}
-	}
-
-	return ret;
-}
-
-/*
- * prb_buffer_size: Get the size of the ring buffer.
- * @rb: The ring buffer to get the size of.
- *
- * Return the number of bytes used for the ring buffer entry storage area.
- * Note that this area stores both entry header and entry data. Therefore
- * this represents an upper bound to the amount of data that can be stored
- * in the ring buffer.
- *
- * It is safe to call this function from any context and state.
- *
- * Returns the size in bytes of the entry storage area.
- */
-int prb_buffer_size(struct printk_ringbuffer *rb)
-{
-	return PRB_SIZE(rb);
-}
-
-/*
- * prb_inc_lost: Increment the seq counter to signal a lost record.
- * @rb: The ring buffer to increment the seq of.
- *
- * Increment the seq counter so that a seq number is intentionally missing
- * for the readers. This allows readers to identify that a record is
- * missing. A writer will typically use this function if prb_reserve()
- * fails.
- *
- * It is safe to call this function from any context and state.
- */
-void prb_inc_lost(struct printk_ringbuffer *rb)
-{
-	atomic_long_inc(&rb->lost);
-}
diff --git a/localversion-rt b/localversion-rt
index 1e584b47c987e..9e7cd66d9f44f 100644
--- a/localversion-rt
+++ b/localversion-rt
@@ -1 +1 @@
--rt17
+-rt18
diff --git a/scripts/gdb/linux/dmesg.py b/scripts/gdb/linux/dmesg.py
index 2fa7bb83885f0..a92c55bd8de54 100644
--- a/scripts/gdb/linux/dmesg.py
+++ b/scripts/gdb/linux/dmesg.py
@@ -16,8 +16,13 @@ import sys
 
 from linux import utils
 
-printk_log_type = utils.CachedType("struct printk_log")
-
+printk_info_type = utils.CachedType("struct printk_info")
+prb_data_blk_lpos_type = utils.CachedType("struct prb_data_blk_lpos")
+prb_desc_type = utils.CachedType("struct prb_desc")
+prb_desc_ring_type = utils.CachedType("struct prb_desc_ring")
+prb_data_ring_type = utils.CachedType("struct prb_data_ring")
+printk_ringbuffer_type = utils.CachedType("struct printk_ringbuffer")
+atomic_long_type = utils.CachedType("atomic_long_t")
 
 class LxDmesg(gdb.Command):
     """Print Linux kernel log buffer."""
@@ -26,44 +31,110 @@ printk_log_type = utils.CachedType("struct printk_log")
         super(LxDmesg, self).__init__("lx-dmesg", gdb.COMMAND_DATA)
 
     def invoke(self, arg, from_tty):
-        log_buf_addr = int(str(gdb.parse_and_eval(
-            "(void *)'printk.c'::log_buf")).split()[0], 16)
-        log_first_idx = int(gdb.parse_and_eval("'printk.c'::log_first_idx"))
-        log_next_idx = int(gdb.parse_and_eval("'printk.c'::log_next_idx"))
-        log_buf_len = int(gdb.parse_and_eval("'printk.c'::log_buf_len"))
-
         inf = gdb.inferiors()[0]
-        start = log_buf_addr + log_first_idx
-        if log_first_idx < log_next_idx:
-            log_buf_2nd_half = -1
-            length = log_next_idx - log_first_idx
-            log_buf = utils.read_memoryview(inf, start, length).tobytes()
-        else:
-            log_buf_2nd_half = log_buf_len - log_first_idx
-            a = utils.read_memoryview(inf, start, log_buf_2nd_half)
-            b = utils.read_memoryview(inf, log_buf_addr, log_next_idx)
-            log_buf = a.tobytes() + b.tobytes()
 
-        length_offset = printk_log_type.get_type()['len'].bitpos // 8
-        text_len_offset = printk_log_type.get_type()['text_len'].bitpos // 8
-        time_stamp_offset = printk_log_type.get_type()['ts_nsec'].bitpos // 8
-        text_offset = printk_log_type.get_type().sizeof
+        # read in prb structure
+        prb_addr = int(str(gdb.parse_and_eval("(void *)'printk.c'::prb")).split()[0], 16)
+        sz = printk_ringbuffer_type.get_type().sizeof
+        prb = utils.read_memoryview(inf, prb_addr, sz).tobytes()
 
-        pos = 0
-        while pos < log_buf.__len__():
-            length = utils.read_u16(log_buf, pos + length_offset)
-            if length == 0:
-                if log_buf_2nd_half == -1:
-                    gdb.write("Corrupted log buffer!\n")
+        # read in descriptor ring structure
+        off = printk_ringbuffer_type.get_type()['desc_ring'].bitpos // 8
+        addr = prb_addr + off
+        sz = prb_desc_ring_type.get_type().sizeof
+        desc_ring = utils.read_memoryview(inf, addr, sz).tobytes()
+
+        # read in descriptor array
+        off = prb_desc_ring_type.get_type()['count_bits'].bitpos // 8
+        desc_ring_count = 1 << utils.read_u32(desc_ring, off)
+        desc_sz = prb_desc_type.get_type().sizeof
+        off = prb_desc_ring_type.get_type()['descs'].bitpos // 8
+        addr = utils.read_ulong(desc_ring, off)
+        descs = utils.read_memoryview(inf, addr, desc_sz * desc_ring_count).tobytes()
+
+        # read in info array
+        info_sz = printk_info_type.get_type().sizeof
+        off = prb_desc_ring_type.get_type()['infos'].bitpos // 8
+        addr = utils.read_ulong(desc_ring, off)
+        infos = utils.read_memoryview(inf, addr, info_sz * desc_ring_count).tobytes()
+
+        # read in text data ring structure
+        off = printk_ringbuffer_type.get_type()['text_data_ring'].bitpos // 8
+        addr = prb_addr + off
+        sz = prb_data_ring_type.get_type().sizeof
+        text_data_ring = utils.read_memoryview(inf, addr, sz).tobytes()
+
+        # read in text data
+        off = prb_data_ring_type.get_type()['size_bits'].bitpos // 8
+        text_data_sz = 1 << utils.read_u32(text_data_ring, off)
+        off = prb_data_ring_type.get_type()['data'].bitpos // 8
+        addr = utils.read_ulong(text_data_ring, off)
+        text_data = utils.read_memoryview(inf, addr, text_data_sz).tobytes()
+
+        counter_off = atomic_long_type.get_type()['counter'].bitpos // 8
+
+        sv_off = prb_desc_type.get_type()['state_var'].bitpos // 8
+
+        off = prb_desc_type.get_type()['text_blk_lpos'].bitpos // 8
+        begin_off = off + (prb_data_blk_lpos_type.get_type()['begin'].bitpos // 8)
+        next_off = off + (prb_data_blk_lpos_type.get_type()['next'].bitpos // 8)
+
+        ts_off = printk_info_type.get_type()['ts_nsec'].bitpos // 8
+        len_off = printk_info_type.get_type()['text_len'].bitpos // 8
+
+        # definitions from kernel/printk/printk_ringbuffer.h
+        desc_committed = 1
+        desc_finalized = 2
+        desc_sv_bits = utils.get_long_type().sizeof * 8
+        desc_flags_shift = desc_sv_bits - 2
+        desc_flags_mask = 3 << desc_flags_shift
+        desc_id_mask = ~desc_flags_mask
+
+        # read in tail and head descriptor ids
+        off = prb_desc_ring_type.get_type()['tail_id'].bitpos // 8
+        tail_id = utils.read_u64(desc_ring, off + counter_off)
+        off = prb_desc_ring_type.get_type()['head_id'].bitpos // 8
+        head_id = utils.read_u64(desc_ring, off + counter_off)
+
+        did = tail_id
+        while True:
+            ind = did % desc_ring_count
+            desc_off = desc_sz * ind
+            info_off = info_sz * ind
+
+            # skip non-committed record
+            state = 3 & (utils.read_u64(descs, desc_off + sv_off +
+                                        counter_off) >> desc_flags_shift)
+            if state != desc_committed and state != desc_finalized:
+                if did == head_id:
                     break
-                pos = log_buf_2nd_half
+                did = (did + 1) & desc_id_mask
                 continue
 
-            text_len = utils.read_u16(log_buf, pos + text_len_offset)
-            text_start = pos + text_offset
-            text = log_buf[text_start:text_start + text_len].decode(
-                encoding='utf8', errors='replace')
-            time_stamp = utils.read_u64(log_buf, pos + time_stamp_offset)
+            begin = utils.read_ulong(descs, desc_off + begin_off) % text_data_sz
+            end = utils.read_ulong(descs, desc_off + next_off) % text_data_sz
+
+            # handle data-less record
+            if begin & 1 == 1:
+                text = ""
+            else:
+                # handle wrapping data block
+                if begin > end:
+                    begin = 0
+
+                # skip over descriptor id
+                text_start = begin + utils.get_long_type().sizeof
+
+                text_len = utils.read_u16(infos, info_off + len_off)
+
+                # handle truncated message
+                if end - text_start < text_len:
+                    text_len = end - text_start
+
+                text = text_data[text_start:text_start + text_len].decode(
+                    encoding='utf8', errors='replace')
+
+            time_stamp = utils.read_u64(infos, info_off + ts_off)
 
             for line in text.splitlines():
                 msg = u"[{time:12.6f}] {line}\n".format(
@@ -75,7 +146,9 @@ printk_log_type = utils.CachedType("struct printk_log")
                     msg = msg.encode(encoding='utf8', errors='replace')
                 gdb.write(msg)
 
-            pos += length
+            if did == head_id:
+                break
+            did = (did + 1) & desc_id_mask
 
 
 LxDmesg()
diff --git a/scripts/gdb/linux/utils.py b/scripts/gdb/linux/utils.py
index ea94221dbd392..ff7c1799d588f 100644
--- a/scripts/gdb/linux/utils.py
+++ b/scripts/gdb/linux/utils.py
@@ -123,6 +123,13 @@ target_endianness = None
         return read_u32(buffer, offset + 4) + (read_u32(buffer, offset) << 32)
 
 
+def read_ulong(buffer, offset):
+    if get_long_type().sizeof == 8:
+        return read_u64(buffer, offset)
+    else:
+        return read_u32(buffer, offset)
+
+
 target_arch = None
 
 


* Re: [ANNOUNCE] v5.9.1-rt18
  2020-10-21 12:53 [ANNOUNCE] v5.9.1-rt18 Sebastian Andrzej Siewior
@ 2020-10-21 13:14 ` Sebastian Andrzej Siewior
  2020-10-27  6:53   ` Fernando Lopez-Lezcano
  2020-10-22  5:21 ` ltp or kvm triggerable lockdep alloc_pid() deadlock gripe Mike Galbraith
  2020-10-22  5:28 ` kvm+nouveau induced lockdep gripe Mike Galbraith
  2 siblings, 1 reply; 29+ messages in thread
From: Sebastian Andrzej Siewior @ 2020-10-21 13:14 UTC (permalink / raw)
  To: Thomas Gleixner; +Cc: LKML, linux-rt-users, Steven Rostedt

On 2020-10-21 14:53:27 [+0200], To Thomas Gleixner wrote:
> Dear RT folks!
> 
> I'm pleased to announce the v5.9.1-rt18 patch set. 
> 
> Changes since v5.9.1-rt17:
> 
>   - Update the migrate-disable series by Peter Zijlstra to v3. Include
>     also fixes discussed in the thread.
> 
>   - UP builds did not boot since the replace of the migrate-disable
>     code. Reported by Christian Egger. Fixed as a part of v3 by Peter
>     Zijlstra.
> 
>   - Rebase the printk code on top of the ringer buffer designed for
>     printk which was merged in the v5.10 merge window. Patches by John
>     Ogness.
> 
> Known issues
>      - It has been pointed out that due to changes to the printk code the
>        internal buffer representation changed. This is only an issue if tools
>        like `crash' are used to extract the printk buffer from a kernel memory
>        image.
> 
> The delta patch against v5.9.1-rt17 is appended below and can be found here:
>  
>      https://cdn.kernel.org/pub/linux/kernel/projects/rt/5.9/incr/patch-5.9.1-rt17-rt18.patch.xz
> 
> You can get this release via the git tree at:
> 
>     git://git.kernel.org/pub/scm/linux/kernel/git/rt/linux-rt-devel.git v5.9.1-rt18
> 
> The RT patch against v5.9.1 can be found here:
> 
>     https://cdn.kernel.org/pub/linux/kernel/projects/rt/5.9/older/patch-5.9.1-rt18.patch.xz
> 
> The split quilt queue is available at:
> 
>     https://cdn.kernel.org/pub/linux/kernel/projects/rt/5.9/older/patches-5.9.1-rt18.tar.xz
> 

The attached diff was too large and the mail was dropped. It is
available at
	https://git.kernel.org/rt/linux-rt-devel/d/v5.9.1-rt18/v5.9.1-rt17

Sebastian


* ltp or kvm triggerable lockdep alloc_pid()  deadlock gripe
  2020-10-21 12:53 [ANNOUNCE] v5.9.1-rt18 Sebastian Andrzej Siewior
  2020-10-21 13:14 ` Sebastian Andrzej Siewior
@ 2020-10-22  5:21 ` Mike Galbraith
  2020-10-22 16:44   ` Sebastian Andrzej Siewior
  2020-10-22  5:28 ` kvm+nouveau induced lockdep gripe Mike Galbraith
  2 siblings, 1 reply; 29+ messages in thread
From: Mike Galbraith @ 2020-10-22  5:21 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior, Thomas Gleixner
  Cc: LKML, linux-rt-users, Steven Rostedt

[-- Attachment #1: Type: text/plain, Size: 5886 bytes --]

Greetings,

The gripe below is repeatable in two ways here: boot with nomodeset so
nouveau doesn't steal the lockdep show and then fire up one of my
(oink) full distro VMs, or run ./runltp -f cpuset from an ltp directory
with the attached subset of the controllers file placed in its ./runtest dir.

Method 2 may lead to a real-deal deadlock; I've got a crashdump of one,
with the stack traces of its uninterruptible sleepers attached.
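
Concretely, the ltp method boils down to something like this (sketch
only; substitute wherever your LTP tree lives for /opt/ltp):

    # drop the attached 'cpuset' scenario file into the runtest dir
    cp cpuset /opt/ltp/runtest/cpuset
    # run just that scenario
    cd /opt/ltp && ./runltp -f cpuset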

[  154.927302] ======================================================
[  154.927303] WARNING: possible circular locking dependency detected
[  154.927304] 5.9.1-rt18-rt #5 Tainted: G S          E
[  154.927305] ------------------------------------------------------
[  154.927306] cpuset_inherit_/4992 is trying to acquire lock:
[  154.927307] ffff9d334c5e64d8 (&s->seqcount){+.+.}-{0:0}, at: __slab_alloc.isra.87+0xad/0xc0
[  154.927317]
               but task is already holding lock:
[  154.927317] ffffffffac4052d0 (pidmap_lock){+.+.}-{2:2}, at: alloc_pid+0x1fb/0x510
[  154.927324]
               which lock already depends on the new lock.

[  154.927324]
               the existing dependency chain (in reverse order) is:
[  154.927325]
               -> #1 (pidmap_lock){+.+.}-{2:2}:
[  154.927328]        lock_acquire+0x92/0x410
[  154.927331]        rt_spin_lock+0x2b/0xc0
[  154.927335]        free_pid+0x27/0xc0
[  154.927338]        release_task+0x34a/0x640
[  154.927340]        do_exit+0x6e9/0xcf0
[  154.927342]        kthread+0x11c/0x190
[  154.927344]        ret_from_fork+0x1f/0x30
[  154.927347]
               -> #0 (&s->seqcount){+.+.}-{0:0}:
[  154.927350]        validate_chain+0x981/0x1250
[  154.927352]        __lock_acquire+0x86f/0xbd0
[  154.927354]        lock_acquire+0x92/0x410
[  154.927356]        ___slab_alloc+0x71b/0x820
[  154.927358]        __slab_alloc.isra.87+0xad/0xc0
[  154.927359]        kmem_cache_alloc+0x700/0x8c0
[  154.927361]        radix_tree_node_alloc.constprop.22+0xa2/0xf0
[  154.927365]        idr_get_free+0x207/0x2b0
[  154.927367]        idr_alloc_u32+0x54/0xa0
[  154.927369]        idr_alloc_cyclic+0x4f/0xa0
[  154.927370]        alloc_pid+0x22b/0x510
[  154.927372]        copy_process+0xeb5/0x1de0
[  154.927375]        _do_fork+0x52/0x750
[  154.927377]        __do_sys_clone+0x64/0x70
[  154.927379]        do_syscall_64+0x33/0x40
[  154.927382]        entry_SYSCALL_64_after_hwframe+0x44/0xa9
[  154.927384]
               other info that might help us debug this:

[  154.927384]  Possible unsafe locking scenario:

[  154.927385]        CPU0                    CPU1
[  154.927386]        ----                    ----
[  154.927386]   lock(pidmap_lock);
[  154.927388]                                lock(&s->seqcount);
[  154.927389]                                lock(pidmap_lock);
[  154.927391]   lock(&s->seqcount);
[  154.927392]
                *** DEADLOCK ***

[  154.927393] 4 locks held by cpuset_inherit_/4992:
[  154.927394]  #0: ffff9d33decea5b0 ((lock).lock){+.+.}-{2:2}, at: __radix_tree_preload+0x52/0x3b0
[  154.927399]  #1: ffffffffac598fa0 (rcu_read_lock){....}-{1:2}, at: rt_spin_lock+0x5/0xc0
[  154.927405]  #2: ffffffffac4052d0 (pidmap_lock){+.+.}-{2:2}, at: alloc_pid+0x1fb/0x510
[  154.927409]  #3: ffffffffac598fa0 (rcu_read_lock){....}-{1:2}, at: rt_spin_lock+0x5/0xc0
[  154.927414]
               stack backtrace:
[  154.927416] CPU: 3 PID: 4992 Comm: cpuset_inherit_ Kdump: loaded Tainted: G S          E     5.9.1-rt18-rt #5
[  154.927418] Hardware name: MEDION MS-7848/MS-7848, BIOS M7848W08.20C 09/23/2013
[  154.927419] Call Trace:
[  154.927422]  dump_stack+0x77/0x9b
[  154.927425]  check_noncircular+0x148/0x160
[  154.927432]  ? validate_chain+0x981/0x1250
[  154.927435]  validate_chain+0x981/0x1250
[  154.927441]  __lock_acquire+0x86f/0xbd0
[  154.927446]  lock_acquire+0x92/0x410
[  154.927449]  ? __slab_alloc.isra.87+0xad/0xc0
[  154.927452]  ? kmem_cache_alloc+0x648/0x8c0
[  154.927453]  ? lock_acquire+0x92/0x410
[  154.927458]  ___slab_alloc+0x71b/0x820
[  154.927460]  ? __slab_alloc.isra.87+0xad/0xc0
[  154.927463]  ? radix_tree_node_alloc.constprop.22+0xa2/0xf0
[  154.927468]  ? __slab_alloc.isra.87+0x83/0xc0
[  154.927472]  ? radix_tree_node_alloc.constprop.22+0xa2/0xf0
[  154.927474]  ? __slab_alloc.isra.87+0xad/0xc0
[  154.927476]  __slab_alloc.isra.87+0xad/0xc0
[  154.927480]  ? radix_tree_node_alloc.constprop.22+0xa2/0xf0
[  154.927482]  kmem_cache_alloc+0x700/0x8c0
[  154.927487]  radix_tree_node_alloc.constprop.22+0xa2/0xf0
[  154.927491]  idr_get_free+0x207/0x2b0
[  154.927495]  idr_alloc_u32+0x54/0xa0
[  154.927500]  idr_alloc_cyclic+0x4f/0xa0
[  154.927503]  alloc_pid+0x22b/0x510
[  154.927506]  ? copy_thread+0x88/0x200
[  154.927512]  copy_process+0xeb5/0x1de0
[  154.927520]  _do_fork+0x52/0x750
[  154.927523]  ? lock_acquire+0x92/0x410
[  154.927525]  ? __might_fault+0x3e/0x90
[  154.927530]  ? find_held_lock+0x2d/0x90
[  154.927535]  __do_sys_clone+0x64/0x70
[  154.927541]  do_syscall_64+0x33/0x40
[  154.927544]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
[  154.927546] RIP: 0033:0x7f0b357356e3
[  154.927548] Code: db 45 85 ed 0f 85 ad 01 00 00 64 4c 8b 04 25 10 00 00 00 31 d2 4d 8d 90 d0 02 00 00 31 f6 bf 11 00 20 01 b8 38 00 00 00 0f 05 <48> 3d 00 f0 ff ff 0f 87 f1 00 00 00 85 c0 41 89 c4 0f 85 fe 00 00
[  154.927550] RSP: 002b:00007ffdfd6d15f0 EFLAGS: 00000246 ORIG_RAX: 0000000000000038
[  154.927552] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f0b357356e3
[  154.927554] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000001200011
[  154.927555] RBP: 00007ffdfd6d1620 R08: 00007f0b36052b80 R09: 0000000000000072
[  154.927556] R10: 00007f0b36052e50 R11: 0000000000000246 R12: 0000000000000000
[  154.927557] R13: 0000000000000000 R14: 0000000000000000 R15: 00005614ef57ecf0

[-- Attachment #2: cpuset --]
[-- Type: text/plain, Size: 587 bytes --]

#DESCRIPTION:Resource Management testing

cpuset_base_ops	cpuset_base_ops_testset.sh
cpuset_inherit	cpuset_inherit_testset.sh
cpuset_exclusive	cpuset_exclusive_test.sh
cpuset_hierarchy	cpuset_hierarchy_test.sh
cpuset_syscall	cpuset_syscall_testset.sh
cpuset_sched_domains	cpuset_sched_domains_test.sh
cpuset_load_balance	cpuset_load_balance_test.sh
cpuset_hotplug	cpuset_hotplug_test.sh
cpuset_memory	cpuset_memory_testset.sh
cpuset_memory_pressure	cpuset_memory_pressure_testset.sh
cpuset_memory_spread	cpuset_memory_spread_testset.sh

cpuset_regression_test cpuset_regression_test.sh


[-- Attachment #3: deadlock-log --]
[-- Type: text/plain, Size: 14019 bytes --]

      1      0   2  ffffa09c87fd0000  UN   0.1  221032   9368  systemd
    627      1   0  ffffa09f733c0000  UN   0.0   68392   7004  systemd-udevd
   3322   3247   7  ffffa09f512051c0  UN   0.0  167624   2732  gpg-agent
   3841   3468   2  ffffa09f32250000  UN   0.1   19912   9828  bash
   4209   3468   2  ffffa09dd070d1c0  UN   0.1   19912   9796  bash
   4845   3335   3  ffffa09f2ffe1b40  UN   0.1  268172  24880  file.so
   4846   3335   5  ffffa09f2d418000  UN   0.1  268172  24880  file.so
   5657      1   3  ffffa09f222e3680  UN   1.5 2884604 260248  Thread (pooled)
   6716   5797   3  ffffa09f30da1b40  UN   0.0   14128   4168  cpuset_hotplug_
   6743      1   3  ffffa09f54151b40  UN   0.1  574864  18532  pool-/usr/lib/x
   6744      1   4  ffffa09f357f9b40  UN   0.0  489516   6096  pool-/usr/lib/x
PID: 1      TASK: ffffa09c87fd0000  CPU: 2   COMMAND: "systemd"
 #0 [ffffbfbb00033c50] __schedule+837 at ffffffffa59b15f5
 #1 [ffffbfbb00033cd8] schedule+86 at ffffffffa59b1d96
 #2 [ffffbfbb00033ce8] __rt_mutex_slowlock+56 at ffffffffa59b3868
 #3 [ffffbfbb00033d30] rt_mutex_slowlock_locked+207 at ffffffffa59b3abf
 #4 [ffffbfbb00033d88] rt_mutex_slowlock.constprop.30+90 at ffffffffa59b3d3a
 #5 [ffffbfbb00033e00] proc_cgroup_show+74 at ffffffffa5184a7a
 #6 [ffffbfbb00033e48] proc_single_show+84 at ffffffffa53cd524
 #7 [ffffbfbb00033e80] seq_read+206 at ffffffffa534c30e
 #8 [ffffbfbb00033ed8] vfs_read+209 at ffffffffa531d281
 #9 [ffffbfbb00033f08] ksys_read+135 at ffffffffa531d637
#10 [ffffbfbb00033f40] do_syscall_64+51 at ffffffffa59a35c3
#11 [ffffbfbb00033f50] entry_SYSCALL_64_after_hwframe+68 at ffffffffa5a0008c
    RIP: 00007f1bb97dd1d8  RSP: 00007ffd5f424ac0  RFLAGS: 00000246
    RAX: ffffffffffffffda  RBX: 000055e3b1224cc0  RCX: 00007f1bb97dd1d8
    RDX: 0000000000000400  RSI: 000055e3b1224cc0  RDI: 0000000000000055
    RBP: 0000000000000400   R8: 0000000000000000   R9: 0000000000000000
    R10: 00007f1bbb1f9940  R11: 0000000000000246  R12: 00007f1bb9aa57a0
    R13: 00007f1bb9aa62e0  R14: 0000000000000000  R15: 000055e3b1377f20
    ORIG_RAX: 0000000000000000  CS: 0033  SS: 002b
PID: 627    TASK: ffffa09f733c0000  CPU: 0   COMMAND: "systemd-udevd"
 #0 [ffffbfbb00b6fc38] __schedule+837 at ffffffffa59b15f5
 #1 [ffffbfbb00b6fcc0] schedule+86 at ffffffffa59b1d96
 #2 [ffffbfbb00b6fcc8] percpu_rwsem_wait+181 at ffffffffa5101b75
 #3 [ffffbfbb00b6fd28] __percpu_down_read+114 at ffffffffa5101ec2
 #4 [ffffbfbb00b6fd40] cgroup_can_fork+1321 at ffffffffa5185e69
 #5 [ffffbfbb00b6fd88] copy_process+4457 at ffffffffa508aab9
 #6 [ffffbfbb00b6fe30] _do_fork+82 at ffffffffa508b882
 #7 [ffffbfbb00b6fed0] __do_sys_clone+100 at ffffffffa508c054
 #8 [ffffbfbb00b6ff40] do_syscall_64+51 at ffffffffa59a35c3
 #9 [ffffbfbb00b6ff50] entry_SYSCALL_64_after_hwframe+68 at ffffffffa5a0008c
    RIP: 00007fa6261286e3  RSP: 00007ffc0a16daf0  RFLAGS: 00000246
    RAX: ffffffffffffffda  RBX: 00007ffc0a16daf0  RCX: 00007fa6261286e3
    RDX: 0000000000000000  RSI: 0000000000000000  RDI: 0000000001200011
    RBP: 00007ffc0a16db40   R8: 00007fa6272cfd40   R9: 0000000000000001
    R10: 00007fa6272d0010  R11: 0000000000000246  R12: 0000000000000000
    R13: 0000000000000000  R14: 0000000000000000  R15: 0000557a3d4a9a10
    ORIG_RAX: 0000000000000038  CS: 0033  SS: 002b
PID: 3322   TASK: ffffa09f512051c0  CPU: 7   COMMAND: "gpg-agent"
 #0 [ffffbfbb00ef7c38] __schedule+837 at ffffffffa59b15f5
 #1 [ffffbfbb00ef7cc0] schedule+86 at ffffffffa59b1d96
 #2 [ffffbfbb00ef7cc8] percpu_rwsem_wait+181 at ffffffffa5101b75
 #3 [ffffbfbb00ef7d28] __percpu_down_read+114 at ffffffffa5101ec2
 #4 [ffffbfbb00ef7d40] cgroup_can_fork+1321 at ffffffffa5185e69
 #5 [ffffbfbb00ef7d88] copy_process+4457 at ffffffffa508aab9
 #6 [ffffbfbb00ef7e30] _do_fork+82 at ffffffffa508b882
 #7 [ffffbfbb00ef7ed0] __do_sys_clone+100 at ffffffffa508c054
 #8 [ffffbfbb00ef7f40] do_syscall_64+51 at ffffffffa59a35c3
 #9 [ffffbfbb00ef7f50] entry_SYSCALL_64_after_hwframe+68 at ffffffffa5a0008c
    RIP: 00007f4355ee7fb1  RSP: 00007ffdf3130358  RFLAGS: 00000202
    RAX: ffffffffffffffda  RBX: 00007f4355be7700  RCX: 00007f4355ee7fb1
    RDX: 00007f4355be79d0  RSI: 00007f4355be6fb0  RDI: 00000000003d0f00
    RBP: 00007ffdf3130760   R8: 00007f4355be7700   R9: 00007f4355be7700
    R10: 00007f4355be79d0  R11: 0000000000000202  R12: 00007ffdf31303fe
    R13: 00007ffdf31303ff  R14: 000055667da0f9e0  R15: 00007ffdf3130760
    ORIG_RAX: 0000000000000038  CS: 0033  SS: 002b
PID: 3841   TASK: ffffa09f32250000  CPU: 2   COMMAND: "bash"
 #0 [ffffbfbb02d0fc38] __schedule+837 at ffffffffa59b15f5
 #1 [ffffbfbb02d0fcc0] schedule+86 at ffffffffa59b1d96
 #2 [ffffbfbb02d0fcc8] percpu_rwsem_wait+181 at ffffffffa5101b75
 #3 [ffffbfbb02d0fd28] __percpu_down_read+114 at ffffffffa5101ec2
 #4 [ffffbfbb02d0fd40] cgroup_can_fork+1321 at ffffffffa5185e69
 #5 [ffffbfbb02d0fd88] copy_process+4457 at ffffffffa508aab9
 #6 [ffffbfbb02d0fe30] _do_fork+82 at ffffffffa508b882
 #7 [ffffbfbb02d0fed0] __do_sys_clone+100 at ffffffffa508c054
 #8 [ffffbfbb02d0ff40] do_syscall_64+51 at ffffffffa59a35c3
 #9 [ffffbfbb02d0ff50] entry_SYSCALL_64_after_hwframe+68 at ffffffffa5a0008c
    RIP: 00007f023eaf36e3  RSP: 00007ffe80698a30  RFLAGS: 00000246
    RAX: ffffffffffffffda  RBX: 0000000000000000  RCX: 00007f023eaf36e3
    RDX: 0000000000000000  RSI: 0000000000000000  RDI: 0000000001200011
    RBP: 00007ffe80698a60   R8: 00007f023f410b80   R9: 0000000000000000
    R10: 00007f023f410e50  R11: 0000000000000246  R12: 0000000000000000
    R13: 0000000000000000  R14: 0000562cd4ee6c90  R15: 0000562cd4ee6c90
    ORIG_RAX: 0000000000000038  CS: 0033  SS: 002b
PID: 4209   TASK: ffffa09dd070d1c0  CPU: 2   COMMAND: "bash"
 #0 [ffffbfbb0376fc38] __schedule+837 at ffffffffa59b15f5
 #1 [ffffbfbb0376fcc0] schedule+86 at ffffffffa59b1d96
 #2 [ffffbfbb0376fcc8] percpu_rwsem_wait+181 at ffffffffa5101b75
 #3 [ffffbfbb0376fd28] __percpu_down_read+114 at ffffffffa5101ec2
 #4 [ffffbfbb0376fd40] cgroup_can_fork+1321 at ffffffffa5185e69
 #5 [ffffbfbb0376fd88] copy_process+4457 at ffffffffa508aab9
 #6 [ffffbfbb0376fe30] _do_fork+82 at ffffffffa508b882
 #7 [ffffbfbb0376fed0] __do_sys_clone+100 at ffffffffa508c054
 #8 [ffffbfbb0376ff40] do_syscall_64+51 at ffffffffa59a35c3
 #9 [ffffbfbb0376ff50] entry_SYSCALL_64_after_hwframe+68 at ffffffffa5a0008c
    RIP: 00007f81a96426e3  RSP: 00007ffd97e5ebc0  RFLAGS: 00000246
    RAX: ffffffffffffffda  RBX: 0000000000000000  RCX: 00007f81a96426e3
    RDX: 0000000000000000  RSI: 0000000000000000  RDI: 0000000001200011
    RBP: 00007ffd97e5ebf0   R8: 00007f81a9f5fb80   R9: 0000000000000000
    R10: 00007f81a9f5fe50  R11: 0000000000000246  R12: 0000000000000000
    R13: 0000000000000000  R14: 000055a56fcb9c90  R15: 000055a56fcb9c90
    ORIG_RAX: 0000000000000038  CS: 0033  SS: 002b
PID: 4845   TASK: ffffa09f2ffe1b40  CPU: 3   COMMAND: "file.so"
 #0 [ffffbfbb03223d88] __schedule+837 at ffffffffa59b15f5
 #1 [ffffbfbb03223e10] schedule+86 at ffffffffa59b1d96
 #2 [ffffbfbb03223e18] percpu_rwsem_wait+181 at ffffffffa5101b75
 #3 [ffffbfbb03223e78] __percpu_down_read+114 at ffffffffa5101ec2
 #4 [ffffbfbb03223e90] exit_signals+711 at ffffffffa50a2f27
 #5 [ffffbfbb03223ea8] do_exit+216 at ffffffffa5093ef8
 #6 [ffffbfbb03223f10] do_group_exit+71 at ffffffffa5094bb7
 #7 [ffffbfbb03223f38] __x64_sys_exit_group+20 at ffffffffa5094c34
 #8 [ffffbfbb03223f40] do_syscall_64+51 at ffffffffa59a35c3
 #9 [ffffbfbb03223f50] entry_SYSCALL_64_after_hwframe+68 at ffffffffa5a0008c
    RIP: 00007f0ca6b20998  RSP: 00007ffc5e1eef48  RFLAGS: 00000246
    RAX: ffffffffffffffda  RBX: 0000000000000000  RCX: 00007f0ca6b20998
    RDX: 0000000000000000  RSI: 000000000000003c  RDI: 0000000000000000
    RBP: 00007f0ca6e0d510   R8: 00000000000000e7   R9: ffffffffffffff60
    R10: 00007f0ca5fd10f8  R11: 0000000000000246  R12: 00007f0ca6e0d510
    R13: 00007f0ca6e0d8c0  R14: 00007ffc5e1eefe0  R15: 0000000000000020
    ORIG_RAX: 00000000000000e7  CS: 0033  SS: 002b
PID: 4846   TASK: ffffa09f2d418000  CPU: 5   COMMAND: "file.so"
 #0 [ffffbfbb0396fd88] __schedule+837 at ffffffffa59b15f5
 #1 [ffffbfbb0396fe10] schedule+86 at ffffffffa59b1d96
 #2 [ffffbfbb0396fe18] percpu_rwsem_wait+181 at ffffffffa5101b75
 #3 [ffffbfbb0396fe78] __percpu_down_read+114 at ffffffffa5101ec2
 #4 [ffffbfbb0396fe90] exit_signals+711 at ffffffffa50a2f27
 #5 [ffffbfbb0396fea8] do_exit+216 at ffffffffa5093ef8
 #6 [ffffbfbb0396ff10] do_group_exit+71 at ffffffffa5094bb7
 #7 [ffffbfbb0396ff38] __x64_sys_exit_group+20 at ffffffffa5094c34
 #8 [ffffbfbb0396ff40] do_syscall_64+51 at ffffffffa59a35c3
 #9 [ffffbfbb0396ff50] entry_SYSCALL_64_after_hwframe+68 at ffffffffa5a0008c
    RIP: 00007f0ca6b20998  RSP: 00007ffc5e1eef48  RFLAGS: 00000246
    RAX: ffffffffffffffda  RBX: 0000000000000000  RCX: 00007f0ca6b20998
    RDX: 0000000000000000  RSI: 000000000000003c  RDI: 0000000000000000
    RBP: 00007f0ca6e0d510   R8: 00000000000000e7   R9: ffffffffffffff60
    R10: 00007f0ca5fd10f8  R11: 0000000000000246  R12: 00007f0ca6e0d510
    R13: 00007f0ca6e0d8c0  R14: 00007ffc5e1eefe0  R15: 0000000000000020
    ORIG_RAX: 00000000000000e7  CS: 0033  SS: 002b
PID: 5657   TASK: ffffa09f222e3680  CPU: 3   COMMAND: "Thread (pooled)"
 #0 [ffffbfbb03a07db0] __schedule+837 at ffffffffa59b15f5
 #1 [ffffbfbb03a07e38] schedule+86 at ffffffffa59b1d96
 #2 [ffffbfbb03a07e40] percpu_rwsem_wait+181 at ffffffffa5101b75
 #3 [ffffbfbb03a07ea0] __percpu_down_read+114 at ffffffffa5101ec2
 #4 [ffffbfbb03a07eb8] exit_signals+711 at ffffffffa50a2f27
 #5 [ffffbfbb03a07ed0] do_exit+216 at ffffffffa5093ef8
 #6 [ffffbfbb03a07f38] __x64_sys_exit+23 at ffffffffa5094b67
 #7 [ffffbfbb03a07f40] do_syscall_64+51 at ffffffffa59a35c3
 #8 [ffffbfbb03a07f50] entry_SYSCALL_64_after_hwframe+68 at ffffffffa5a0008c
    RIP: 00007f1f2f9265b6  RSP: 00007f1e95057d50  RFLAGS: 00000246
    RAX: ffffffffffffffda  RBX: 00007f1e95058700  RCX: 00007f1f2f9265b6
    RDX: 000000000000003c  RSI: 00007f1f2fb38010  RDI: 0000000000000000
    RBP: 0000000000000000   R8: 00007f1e880029c0   R9: 0000000000000000
    R10: 0000000000000020  R11: 0000000000000246  R12: 00007ffd7582f27e
    R13: 00007ffd7582f27f  R14: 000055fce0818180  R15: 00007ffd7582f350
    ORIG_RAX: 000000000000003c  CS: 0033  SS: 002b
PID: 6716   TASK: ffffa09f30da1b40  CPU: 3   COMMAND: "cpuset_hotplug_"
 #0 [ffffbfbb03ac79d8] __schedule+837 at ffffffffa59b15f5
 #1 [ffffbfbb03ac7a60] schedule+86 at ffffffffa59b1d96
 #2 [ffffbfbb03ac7a70] schedule_timeout+495 at ffffffffa59b4cbf
 #3 [ffffbfbb03ac7b08] wait_for_completion+165 at ffffffffa59b2f25
 #4 [ffffbfbb03ac7b48] affine_move_task+705 at ffffffffa50cc231
 #5 [ffffbfbb03ac7c88] __set_cpus_allowed_ptr+274 at ffffffffa50cc562
 #6 [ffffbfbb03ac7cc8] cpuset_attach+195 at ffffffffa518df73
 #7 [ffffbfbb03ac7d00] cgroup_migrate_execute+1133 at ffffffffa518075d
 #8 [ffffbfbb03ac7d58] cgroup_attach_task+524 at ffffffffa5180abc
 #9 [ffffbfbb03ac7e28] __cgroup1_procs_write.constprop.21+243 at ffffffffa5187843
#10 [ffffbfbb03ac7e68] cgroup_file_write+126 at ffffffffa517b07e
#11 [ffffbfbb03ac7ea0] kernfs_fop_write+275 at ffffffffa53e1c13
#12 [ffffbfbb03ac7ed8] vfs_write+240 at ffffffffa531d470
#13 [ffffbfbb03ac7f08] ksys_write+135 at ffffffffa531d737
#14 [ffffbfbb03ac7f40] do_syscall_64+51 at ffffffffa59a35c3
#15 [ffffbfbb03ac7f50] entry_SYSCALL_64_after_hwframe+68 at ffffffffa5a0008c
    RIP: 00007f0176b84244  RSP: 00007ffffcf7e2b8  RFLAGS: 00000246
    RAX: ffffffffffffffda  RBX: 0000000000000005  RCX: 00007f0176b84244
    RDX: 0000000000000005  RSI: 000055b15203d890  RDI: 0000000000000001
    RBP: 000055b15203d890   R8: 000000000000000a   R9: 0000000000000000
    R10: 000000000000000a  R11: 0000000000000246  R12: 0000000000000005
    R13: 0000000000000001  R14: 00007f0176e4c5a0  R15: 0000000000000005
    ORIG_RAX: 0000000000000001  CS: 0033  SS: 002b
PID: 6743   TASK: ffffa09f54151b40  CPU: 3   COMMAND: "pool-/usr/lib/x"
 #0 [ffffbfbb08657db0] __schedule+837 at ffffffffa59b15f5
 #1 [ffffbfbb08657e38] schedule+86 at ffffffffa59b1d96
 #2 [ffffbfbb08657e40] percpu_rwsem_wait+181 at ffffffffa5101b75
 #3 [ffffbfbb08657ea0] __percpu_down_read+114 at ffffffffa5101ec2
 #4 [ffffbfbb08657eb8] exit_signals+711 at ffffffffa50a2f27
 #5 [ffffbfbb08657ed0] do_exit+216 at ffffffffa5093ef8
 #6 [ffffbfbb08657f38] __x64_sys_exit+23 at ffffffffa5094b67
 #7 [ffffbfbb08657f40] do_syscall_64+51 at ffffffffa59a35c3
 #8 [ffffbfbb08657f50] entry_SYSCALL_64_after_hwframe+68 at ffffffffa5a0008c
    RIP: 00007f0176d6b5b6  RSP: 00007f015d540dd0  RFLAGS: 00000246
    RAX: ffffffffffffffda  RBX: 00007f015d541700  RCX: 00007f0176d6b5b6
    RDX: 000000000000003c  RSI: 00007f0176f7d010  RDI: 0000000000000000
    RBP: 0000000000000000   R8: 00007f01500008c0   R9: 0000000000000004
    R10: 000055ee65f185d0  R11: 0000000000000246  R12: 00007ffc71eef56e
    R13: 00007ffc71eef56f  R14: 00007f01640038f0  R15: 00007ffc71eef600
    ORIG_RAX: 000000000000003c  CS: 0033  SS: 002b
PID: 6744   TASK: ffffa09f357f9b40  CPU: 4   COMMAND: "pool-/usr/lib/x"
 #0 [ffffbfbb0865fdb0] __schedule+837 at ffffffffa59b15f5
 #1 [ffffbfbb0865fe38] schedule+86 at ffffffffa59b1d96
 #2 [ffffbfbb0865fe40] percpu_rwsem_wait+181 at ffffffffa5101b75
 #3 [ffffbfbb0865fea0] __percpu_down_read+114 at ffffffffa5101ec2
 #4 [ffffbfbb0865feb8] exit_signals+711 at ffffffffa50a2f27
 #5 [ffffbfbb0865fed0] do_exit+216 at ffffffffa5093ef8
 #6 [ffffbfbb0865ff38] __x64_sys_exit+23 at ffffffffa5094b67
 #7 [ffffbfbb0865ff40] do_syscall_64+51 at ffffffffa59a35c3
 #8 [ffffbfbb0865ff50] entry_SYSCALL_64_after_hwframe+68 at ffffffffa5a0008c
    RIP: 00007ff49b0c85b6  RSP: 00007ff493ffedd0  RFLAGS: 00000246
    RAX: ffffffffffffffda  RBX: 00007ff493fff700  RCX: 00007ff49b0c85b6
    RDX: 000000000000003c  RSI: 00007ff49b2da010  RDI: 0000000000000000
    RBP: 0000000000000000   R8: 00007ff488001600   R9: 0000000000000000
    R10: 0000000000000050  R11: 0000000000000246  R12: 00007ffdc88c817e
    R13: 00007ffdc88c817f  R14: 00007ff48c0069e0  R15: 00007ffdc88c8210
    ORIG_RAX: 000000000000003c  CS: 0033  SS: 002b


* kvm+nouveau induced lockdep  gripe
  2020-10-21 12:53 [ANNOUNCE] v5.9.1-rt18 Sebastian Andrzej Siewior
  2020-10-21 13:14 ` Sebastian Andrzej Siewior
  2020-10-22  5:21 ` ltp or kvm triggerable lockdep alloc_pid() deadlock gripe Mike Galbraith
@ 2020-10-22  5:28 ` Mike Galbraith
  2020-10-23  9:01   ` Sebastian Andrzej Siewior
                     ` (4 more replies)
  2 siblings, 5 replies; 29+ messages in thread
From: Mike Galbraith @ 2020-10-22  5:28 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior, Thomas Gleixner
  Cc: LKML, linux-rt-users, Steven Rostedt

I've only as yet seen nouveau lockdep gripage when firing up one of my
full distro KVM's.

[   91.655613] ======================================================
[   91.655614] WARNING: possible circular locking dependency detected
[   91.655614] 5.9.1-rt18-rt #5 Tainted: G S          E
[   91.655615] ------------------------------------------------------
[   91.655615] libvirtd/1868 is trying to acquire lock:
[   91.655616] ffff918554b801c0 (&mm->mmap_lock#2){++++}-{0:0}, at: mpol_rebind_mm+0x1e/0x50
[   91.655622]
               but task is already holding lock:
[   91.655622] ffffffff995b6c80 (&cpuset_rwsem){++++}-{0:0}, at: cpuset_attach+0x38/0x390
[   91.655625]
               which lock already depends on the new lock.

[   91.655625]
               the existing dependency chain (in reverse order) is:
[   91.655625]
               -> #3 (&cpuset_rwsem){++++}-{0:0}:
[   91.655626]        lock_acquire+0x92/0x410
[   91.655629]        cpuset_read_lock+0x39/0xf0
[   91.655630]        __sched_setscheduler+0x4be/0xaf0
[   91.655632]        _sched_setscheduler+0x69/0x70
[   91.655633]        __kthread_create_on_node+0x114/0x170
[   91.655634]        kthread_create_on_node+0x37/0x40
[   91.655635]        setup_irq_thread+0x37/0x90
[   91.655637]        __setup_irq+0x4de/0x7b0
[   91.655637]        request_threaded_irq+0xf8/0x160
[   91.655638]        nvkm_pci_oneinit+0x4c/0x70 [nouveau]
[   91.655674]        nvkm_subdev_init+0x60/0x1e0 [nouveau]
[   91.655689]        nvkm_device_init+0x10b/0x240 [nouveau]
[   91.655716]        nvkm_udevice_init+0x47/0x70 [nouveau]
[   91.655742]        nvkm_object_init+0x3d/0x180 [nouveau]
[   91.655755]        nvkm_ioctl_new+0x1a1/0x260 [nouveau]
[   91.655768]        nvkm_ioctl+0x10a/0x240 [nouveau]
[   91.655779]        nvif_object_ctor+0xeb/0x150 [nouveau]
[   91.655790]        nvif_device_ctor+0x1f/0x60 [nouveau]
[   91.655801]        nouveau_cli_init+0x1dc/0x5c0 [nouveau]
[   91.655826]        nouveau_drm_device_init+0x66/0x810 [nouveau]
[   91.655850]        nouveau_drm_probe+0xfb/0x200 [nouveau]
[   91.655873]        local_pci_probe+0x42/0x90
[   91.655875]        pci_device_probe+0xe7/0x1a0
[   91.655876]        really_probe+0xf7/0x4d0
[   91.655877]        driver_probe_device+0x5d/0x140
[   91.655878]        device_driver_attach+0x4f/0x60
[   91.655879]        __driver_attach+0xa2/0x140
[   91.655880]        bus_for_each_dev+0x67/0x90
[   91.655881]        bus_add_driver+0x192/0x230
[   91.655882]        driver_register+0x5b/0xf0
[   91.655883]        do_one_initcall+0x56/0x3c4
[   91.655884]        do_init_module+0x5b/0x21c
[   91.655886]        load_module+0x1cc7/0x2430
[   91.655887]        __do_sys_finit_module+0xa7/0xe0
[   91.655888]        do_syscall_64+0x33/0x40
[   91.655889]        entry_SYSCALL_64_after_hwframe+0x44/0xa9
[   91.655890]
               -> #2 (&device->mutex){+.+.}-{0:0}:
[   91.655891]        lock_acquire+0x92/0x410
[   91.655893]        _mutex_lock+0x28/0x40
[   91.655895]        nvkm_udevice_fini+0x21/0x70 [nouveau]
[   91.655919]        nvkm_object_fini+0xb8/0x210 [nouveau]
[   91.655931]        nvkm_object_fini+0x73/0x210 [nouveau]
[   91.655943]        nvkm_ioctl_del+0x7e/0xa0 [nouveau]
[   91.655954]        nvkm_ioctl+0x10a/0x240 [nouveau]
[   91.655966]        nvif_object_dtor+0x4a/0x60 [nouveau]
[   91.655976]        nvif_client_dtor+0xe/0x40 [nouveau]
[   91.655986]        nouveau_cli_fini+0x78/0x90 [nouveau]
[   91.656010]        nouveau_drm_postclose+0xa6/0xe0 [nouveau]
[   91.656033]        drm_file_free.part.9+0x27e/0x2d0 [drm]
[   91.656045]        drm_release+0x6f/0xf0 [drm]
[   91.656052]        __fput+0xb2/0x260
[   91.656053]        task_work_run+0x73/0xc0
[   91.656055]        exit_to_user_mode_prepare+0x204/0x230
[   91.656056]        syscall_exit_to_user_mode+0x4a/0x330
[   91.656057]        entry_SYSCALL_64_after_hwframe+0x44/0xa9
[   91.656058]
               -> #1 (&cli->lock){+.+.}-{0:0}:
[   91.656059]        lock_acquire+0x92/0x410
[   91.656060]        _mutex_lock+0x28/0x40
[   91.656061]        nouveau_mem_fini+0x4a/0x70 [nouveau]
[   91.656086]        ttm_tt_destroy+0x22/0x70 [ttm]
[   91.656089]        ttm_bo_cleanup_memtype_use+0x32/0xa0 [ttm]
[   91.656091]        ttm_bo_put+0xe7/0x670 [ttm]
[   91.656093]        ttm_bo_vm_close+0x15/0x30 [ttm]
[   91.656096]        remove_vma+0x3e/0x70
[   91.656097]        __do_munmap+0x2b7/0x4f0
[   91.656098]        __vm_munmap+0x5b/0xa0
[   91.656098]        __x64_sys_munmap+0x27/0x30
[   91.656099]        do_syscall_64+0x33/0x40
[   91.656100]        entry_SYSCALL_64_after_hwframe+0x44/0xa9
[   91.656100]
               -> #0 (&mm->mmap_lock#2){++++}-{0:0}:
[   91.656102]        validate_chain+0x981/0x1250
[   91.656103]        __lock_acquire+0x86f/0xbd0
[   91.656104]        lock_acquire+0x92/0x410
[   91.656105]        down_write+0x3b/0x50
[   91.656106]        mpol_rebind_mm+0x1e/0x50
[   91.656108]        cpuset_attach+0x229/0x390
[   91.656109]        cgroup_migrate_execute+0x46d/0x490
[   91.656111]        cgroup_attach_task+0x20c/0x430
[   91.656112]        __cgroup1_procs_write.constprop.21+0xf3/0x150
[   91.656113]        cgroup_file_write+0x7e/0x1a0
[   91.656114]        kernfs_fop_write+0x113/0x1b0
[   91.656116]        vfs_write+0xf0/0x230
[   91.656116]        ksys_write+0x87/0xc0
[   91.656117]        do_syscall_64+0x33/0x40
[   91.656118]        entry_SYSCALL_64_after_hwframe+0x44/0xa9
[   91.656118]
               other info that might help us debug this:

[   91.656119] Chain exists of:
                 &mm->mmap_lock#2 --> &device->mutex --> &cpuset_rwsem

[   91.656120]  Possible unsafe locking scenario:

[   91.656120]        CPU0                    CPU1
[   91.656120]        ----                    ----
[   91.656121]   lock(&cpuset_rwsem);
[   91.656121]                                lock(&device->mutex);
[   91.656122]                                lock(&cpuset_rwsem);
[   91.656122]   lock(&mm->mmap_lock#2);
[   91.656123]
                *** DEADLOCK ***

[   91.656123] 6 locks held by libvirtd/1868:
[   91.656124]  #0: ffff9186df6f5f88 (&f->f_pos_lock){+.+.}-{0:0}, at: __fdget_pos+0x46/0x50
[   91.656127]  #1: ffff918553b695f8 (sb_writers#7){.+.+}-{0:0}, at: vfs_write+0x1c1/0x230
[   91.656128]  #2: ffff91873fb950a8 (&of->mutex){+.+.}-{0:0}, at: kernfs_fop_write+0xde/0x1b0
[   91.656130]  #3: ffffffff995b2cc8 (cgroup_mutex){+.+.}-{3:3}, at: cgroup_kn_lock_live+0xe8/0x1d0
[   91.656133]  #4: ffffffff995b2900 (cgroup_threadgroup_rwsem){++++}-{0:0}, at: cgroup_procs_write_start+0x6e/0x200
[   91.656135]  #5: ffffffff995b6c80 (&cpuset_rwsem){++++}-{0:0}, at: cpuset_attach+0x38/0x390
[   91.656137]
               stack backtrace:
[   91.656137] CPU: 6 PID: 1868 Comm: libvirtd Kdump: loaded Tainted: G S          E     5.9.1-rt18-rt #5
[   91.656138] Hardware name: MEDION MS-7848/MS-7848, BIOS M7848W08.20C 09/23/2013
[   91.656139] Call Trace:
[   91.656141]  dump_stack+0x77/0x9b
[   91.656143]  check_noncircular+0x148/0x160
[   91.656144]  ? validate_chain+0x9d6/0x1250
[   91.656146]  ? validate_chain+0x981/0x1250
[   91.656147]  validate_chain+0x981/0x1250
[   91.656150]  __lock_acquire+0x86f/0xbd0
[   91.656151]  lock_acquire+0x92/0x410
[   91.656152]  ? mpol_rebind_mm+0x1e/0x50
[   91.656155]  down_write+0x3b/0x50
[   91.656156]  ? mpol_rebind_mm+0x1e/0x50
[   91.656157]  mpol_rebind_mm+0x1e/0x50
[   91.656158]  cpuset_attach+0x229/0x390
[   91.656160]  cgroup_migrate_execute+0x46d/0x490
[   91.656162]  cgroup_attach_task+0x20c/0x430
[   91.656165]  ? __cgroup1_procs_write.constprop.21+0xf3/0x150
[   91.656166]  __cgroup1_procs_write.constprop.21+0xf3/0x150
[   91.656168]  cgroup_file_write+0x7e/0x1a0
[   91.656169]  kernfs_fop_write+0x113/0x1b0
[   91.656171]  vfs_write+0xf0/0x230
[   91.656172]  ksys_write+0x87/0xc0
[   91.656173]  ? lockdep_hardirqs_on+0x78/0x100
[   91.656174]  do_syscall_64+0x33/0x40
[   91.656175]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
[   91.656176] RIP: 0033:0x7fbfe3bc0deb
[   91.656178] Code: 53 48 89 d5 48 89 f3 48 83 ec 18 48 89 7c 24 08 e8 5a fd ff ff 48 89 ea 41 89 c0 48 89 de 48 8b 7c 24 08 b8 01 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 31 44 89 c7 48 89 44 24 08 e8 90 fd ff ff 48
[   91.656179] RSP: 002b:00007fbfd94f72f0 EFLAGS: 00000293 ORIG_RAX: 0000000000000001
[   91.656179] RAX: ffffffffffffffda RBX: 00007fbfc8048b20 RCX: 00007fbfe3bc0deb
[   91.656180] RDX: 0000000000000004 RSI: 00007fbfc8048b20 RDI: 000000000000001f
[   91.656180] RBP: 0000000000000004 R08: 0000000000000000 R09: 0000000000000000
[   91.656181] R10: 0000000000000000 R11: 0000000000000293 R12: 00007fbfc8048b20
[   91.656181] R13: 0000000000000000 R14: 000000000000001f R15: 0000000000000214



* Re: ltp or kvm triggerable lockdep alloc_pid()  deadlock gripe
  2020-10-22  5:21 ` ltp or kvm triggerable lockdep alloc_pid() deadlock gripe Mike Galbraith
@ 2020-10-22 16:44   ` Sebastian Andrzej Siewior
  0 siblings, 0 replies; 29+ messages in thread
From: Sebastian Andrzej Siewior @ 2020-10-22 16:44 UTC (permalink / raw)
  To: Mike Galbraith; +Cc: Thomas Gleixner, LKML, linux-rt-users, Steven Rostedt

On 2020-10-22 07:21:13 [+0200], Mike Galbraith wrote:
> Greetings,
Hi,

> The gripe below is repeatable in two ways here: boot with nomodeset so
> nouveau doesn't steal the lockdep show and then fire up one of my
> (oink) full distro VMs, or run ./runltp -f cpuset from an ltp directory
> with the attached subset of the controllers file placed in its ./runtest dir.
> 
> Method 2 may lead to a real-deal deadlock; I've got a crashdump of one,
> with the stack traces of its uninterruptible sleepers attached.

Just added commit
	267580db047ef ("seqlock: Unbreak lockdep")

and it is gone.
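
If you want to pick that up locally before the next release, something
like this on top of the rt18 tree should do (sketch; assumes the commit
is reachable from whatever upstream remote you fetch):

	# cherry-pick the upstream fix onto the local branch
	git cherry-pick 267580db047ef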

Sebastian


* Re: kvm+nouveau induced lockdep  gripe
  2020-10-22  5:28 ` kvm+nouveau induced lockdep gripe Mike Galbraith
@ 2020-10-23  9:01   ` Sebastian Andrzej Siewior
  2020-10-23 12:07     ` Mike Galbraith
       [not found]     ` <e4bf2fe3c5d2fdeded9b3d873a08094dbf145bf9.camel-Mmb7MZpHnFY@public.gmane.org>
  2021-09-17 13:17   ` [tip: irq/core] genirq: Move prio assignment into the newly created thread tip-bot2 for Thomas Gleixner
                     ` (3 subsequent siblings)
  4 siblings, 2 replies; 29+ messages in thread
From: Sebastian Andrzej Siewior @ 2020-10-23  9:01 UTC (permalink / raw)
  To: Mike Galbraith; +Cc: Thomas Gleixner, LKML, linux-rt-users, Steven Rostedt

On 2020-10-22 07:28:20 [+0200], Mike Galbraith wrote:
> I've only as yet seen nouveau lockdep gripage when firing up one of my
> full distro KVM's.

Could you please check !RT with the `threadirqs' command line option? I
don't think RT is doing anything different here (except for having
threaded interrupts enabled by default).
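
Something along these lines should be enough to test it (illustrative;
use whatever mechanism your distro provides for editing the kernel
command line):

    # add threadirqs to the command line, e.g. via /etc/default/grub
    GRUB_CMDLINE_LINUX_DEFAULT="... threadirqs"
    # regenerate the grub config (grub2-mkconfig or update-grub)
    grub2-mkconfig -o /boot/grub2/grub.cfg
    # after reboot, confirm the option is active
    cat /proc/cmdline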

Sebastian


* Re: kvm+nouveau induced lockdep  gripe
  2020-10-23  9:01   ` Sebastian Andrzej Siewior
@ 2020-10-23 12:07     ` Mike Galbraith
       [not found]     ` <e4bf2fe3c5d2fdeded9b3d873a08094dbf145bf9.camel-Mmb7MZpHnFY@public.gmane.org>
  1 sibling, 0 replies; 29+ messages in thread
From: Mike Galbraith @ 2020-10-23 12:07 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior
  Cc: Thomas Gleixner, LKML, linux-rt-users, Steven Rostedt,
	Ben Skeggs, nouveau

On Fri, 2020-10-23 at 11:01 +0200, Sebastian Andrzej Siewior wrote:
> On 2020-10-22 07:28:20 [+0200], Mike Galbraith wrote:
> > I've only as yet seen nouveau lockdep gripage when firing up one of my
> > full distro KVM's.
>
> Could you please check !RT with the `threadirqs' command line option? I
> don't think RT is doing anything different here (except for having
> threaded interrupts enabled by default).

Yup, you are correct, RT is innocent.


[   70.135201] ======================================================
[   70.135206] WARNING: possible circular locking dependency detected
[   70.135211] 5.9.0.gf989335-master #1 Tainted: G            E
[   70.135216] ------------------------------------------------------
[   70.135220] libvirtd/1838 is trying to acquire lock:
[   70.135225] ffff983590c2d5a8 (&mm->mmap_lock#2){++++}-{3:3}, at: mpol_rebind_mm+0x1e/0x50
[   70.135239]
               but task is already holding lock:
[   70.135244] ffffffff8a585410 (&cpuset_rwsem){++++}-{0:0}, at: cpuset_attach+0x38/0x390
[   70.135256]
               which lock already depends on the new lock.

[   70.135261]
               the existing dependency chain (in reverse order) is:
[   70.135266]
               -> #3 (&cpuset_rwsem){++++}-{0:0}:
[   70.135275]        cpuset_read_lock+0x39/0xd0
[   70.135282]        __sched_setscheduler+0x456/0xa90
[   70.135287]        _sched_setscheduler+0x69/0x70
[   70.135292]        __kthread_create_on_node+0x114/0x170
[   70.135297]        kthread_create_on_node+0x37/0x40
[   70.135306]        setup_irq_thread+0x37/0x90
[   70.135312]        __setup_irq+0x4e0/0x7c0
[   70.135318]        request_threaded_irq+0xf8/0x160
[   70.135371]        nvkm_pci_oneinit+0x4c/0x70 [nouveau]
[   70.135399]        nvkm_subdev_init+0x60/0x1e0 [nouveau]
[   70.135449]        nvkm_device_init+0x10b/0x240 [nouveau]
[   70.135506]        nvkm_udevice_init+0x49/0x70 [nouveau]
[   70.135531]        nvkm_object_init+0x3d/0x180 [nouveau]
[   70.135555]        nvkm_ioctl_new+0x1a1/0x260 [nouveau]
[   70.135578]        nvkm_ioctl+0x10a/0x240 [nouveau]
[   70.135600]        nvif_object_ctor+0xeb/0x150 [nouveau]
[   70.135622]        nvif_device_ctor+0x1f/0x60 [nouveau]
[   70.135668]        nouveau_cli_init+0x1ac/0x590 [nouveau]
[   70.135711]        nouveau_drm_device_init+0x68/0x800 [nouveau]
[   70.135753]        nouveau_drm_probe+0xfb/0x200 [nouveau]
[   70.135761]        local_pci_probe+0x42/0x90
[   70.135767]        pci_device_probe+0xe7/0x1a0
[   70.135773]        really_probe+0xf7/0x4d0
[   70.135779]        driver_probe_device+0x5d/0x140
[   70.135785]        device_driver_attach+0x4f/0x60
[   70.135790]        __driver_attach+0xa4/0x140
[   70.135796]        bus_for_each_dev+0x67/0x90
[   70.135801]        bus_add_driver+0x18c/0x230
[   70.135807]        driver_register+0x5b/0xf0
[   70.135813]        do_one_initcall+0x54/0x2f0
[   70.135819]        do_init_module+0x5b/0x21b
[   70.135825]        load_module+0x1e40/0x2370
[   70.135830]        __do_sys_finit_module+0x98/0xe0
[   70.135836]        do_syscall_64+0x33/0x40
[   70.135842]        entry_SYSCALL_64_after_hwframe+0x44/0xa9
[   70.135847]
               -> #2 (&device->mutex){+.+.}-{3:3}:
[   70.135857]        __mutex_lock+0x90/0x9c0
[   70.135902]        nvkm_udevice_fini+0x23/0x70 [nouveau]
[   70.135927]        nvkm_object_fini+0xb8/0x210 [nouveau]
[   70.135951]        nvkm_object_fini+0x73/0x210 [nouveau]
[   70.135974]        nvkm_ioctl_del+0x7e/0xa0 [nouveau]
[   70.135997]        nvkm_ioctl+0x10a/0x240 [nouveau]
[   70.136019]        nvif_object_dtor+0x4a/0x60 [nouveau]
[   70.136040]        nvif_client_dtor+0xe/0x40 [nouveau]
[   70.136085]        nouveau_cli_fini+0x7a/0x90 [nouveau]
[   70.136128]        nouveau_drm_postclose+0xaa/0xe0 [nouveau]
[   70.136150]        drm_file_free.part.7+0x273/0x2c0 [drm]
[   70.136165]        drm_release+0x6e/0xf0 [drm]
[   70.136171]        __fput+0xb2/0x260
[   70.136177]        task_work_run+0x73/0xc0
[   70.136183]        exit_to_user_mode_prepare+0x1a5/0x1d0
[   70.136189]        syscall_exit_to_user_mode+0x46/0x2a0
[   70.136195]        entry_SYSCALL_64_after_hwframe+0x44/0xa9
[   70.136200]
               -> #1 (&cli->lock){+.+.}-{3:3}:
[   70.136209]        __mutex_lock+0x90/0x9c0
[   70.136252]        nouveau_mem_fini+0x4c/0x70 [nouveau]
[   70.136294]        nouveau_sgdma_destroy+0x20/0x50 [nouveau]
[   70.136302]        ttm_bo_cleanup_memtype_use+0x3e/0x60 [ttm]
[   70.136310]        ttm_bo_release+0x29c/0x600 [ttm]
[   70.136317]        ttm_bo_vm_close+0x15/0x30 [ttm]
[   70.136324]        remove_vma+0x3e/0x70
[   70.136329]        __do_munmap+0x2b7/0x4f0
[   70.136333]        __vm_munmap+0x5b/0xa0
[   70.136338]        __x64_sys_munmap+0x27/0x30
[   70.136343]        do_syscall_64+0x33/0x40
[   70.136349]        entry_SYSCALL_64_after_hwframe+0x44/0xa9
[   70.136354]
               -> #0 (&mm->mmap_lock#2){++++}-{3:3}:
[   70.136365]        __lock_acquire+0x149d/0x1a70
[   70.136371]        lock_acquire+0x1a7/0x3b0
[   70.136376]        down_write+0x38/0x70
[   70.136382]        mpol_rebind_mm+0x1e/0x50
[   70.136387]        cpuset_attach+0x229/0x390
[   70.136393]        cgroup_migrate_execute+0x46d/0x490
[   70.136398]        cgroup_attach_task+0x20c/0x3b0
[   70.136404]        __cgroup1_procs_write.constprop.21+0xf3/0x150
[   70.136411]        cgroup_file_write+0x64/0x210
[   70.136416]        kernfs_fop_write+0x117/0x1b0
[   70.136422]        vfs_write+0xe8/0x240
[   70.136427]        ksys_write+0x87/0xc0
[   70.136432]        do_syscall_64+0x33/0x40
[   70.136438]        entry_SYSCALL_64_after_hwframe+0x44/0xa9
[   70.136443]
               other info that might help us debug this:

[   70.136450] Chain exists of:
                 &mm->mmap_lock#2 --> &device->mutex --> &cpuset_rwsem

[   70.136463]  Possible unsafe locking scenario:

[   70.136469]        CPU0                    CPU1
[   70.136473]        ----                    ----
[   70.136477]   lock(&cpuset_rwsem);
[   70.136483]                                lock(&device->mutex);
[   70.136489]                                lock(&cpuset_rwsem);
[   70.136495]   lock(&mm->mmap_lock#2);
[   70.136501]
                *** DEADLOCK ***

[   70.136508] 6 locks held by libvirtd/1838:
[   70.136512]  #0: ffff98359eb27af0 (&f->f_pos_lock){+.+.}-{3:3}, at: __fdget_pos+0x45/0x50
[   70.136524]  #1: ffff983591a58460 (sb_writers#7){.+.+}-{0:0}, at: vfs_write+0x1aa/0x240
[   70.136535]  #2: ffff9835bbf50488 (&of->mutex){+.+.}-{3:3}, at: kernfs_fop_write+0xe2/0x1b0
[   70.136545]  #3: ffffffff8a581848 (cgroup_mutex){+.+.}-{3:3}, at: cgroup_kn_lock_live+0xea/0x1d0
[   70.136556]  #4: ffffffff8a5816b0 (cgroup_threadgroup_rwsem){++++}-{0:0}, at: cgroup_procs_write_start+0x6e/0x200
[   70.136567]  #5: ffffffff8a585410 (&cpuset_rwsem){++++}-{0:0}, at: cpuset_attach+0x38/0x390
[   70.136579]
               stack backtrace:
[   70.136585] CPU: 2 PID: 1838 Comm: libvirtd Kdump: loaded Tainted: G            E     5.9.0.gf989335-master #1
[   70.136592] Hardware name: MEDION MS-7848/MS-7848, BIOS M7848W08.20C 09/23/2013
[   70.136598] Call Trace:
[   70.136605]  dump_stack+0x77/0x97
[   70.136611]  check_noncircular+0xe7/0x100
[   70.136618]  ? stack_trace_save+0x3b/0x50
[   70.136626]  ? __lock_acquire+0x149d/0x1a70
[   70.136631]  __lock_acquire+0x149d/0x1a70
[   70.136640]  lock_acquire+0x1a7/0x3b0
[   70.136645]  ? mpol_rebind_mm+0x1e/0x50
[   70.136652]  down_write+0x38/0x70
[   70.136657]  ? mpol_rebind_mm+0x1e/0x50
[   70.136663]  mpol_rebind_mm+0x1e/0x50
[   70.136669]  cpuset_attach+0x229/0x390
[   70.136675]  cgroup_migrate_execute+0x46d/0x490
[   70.136681]  ? _raw_spin_unlock_irq+0x2f/0x50
[   70.136688]  cgroup_attach_task+0x20c/0x3b0
[   70.136702]  ? __cgroup1_procs_write.constprop.21+0xf3/0x150
[   70.136712]  __cgroup1_procs_write.constprop.21+0xf3/0x150
[   70.136722]  cgroup_file_write+0x64/0x210
[   70.136728]  kernfs_fop_write+0x117/0x1b0
[   70.136735]  vfs_write+0xe8/0x240
[   70.136741]  ksys_write+0x87/0xc0
[   70.136746]  ? lockdep_hardirqs_on+0x85/0x110
[   70.136752]  do_syscall_64+0x33/0x40
[   70.136758]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
[   70.136764] RIP: 0033:0x7efc17533deb
[   70.136770] Code: 53 48 89 d5 48 89 f3 48 83 ec 18 48 89 7c 24 08 e8 5a fd ff ff 48 89 ea 41 89 c0 48 89 de 48 8b 7c 24 08 b8 01 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 31 44 89 c7 48 89 44 24 08 e8 90 fd ff ff 48
[   70.136781] RSP: 002b:00007efc0d66b2f0 EFLAGS: 00000293 ORIG_RAX: 0000000000000001
[   70.136788] RAX: ffffffffffffffda RBX: 00007efbf80500f0 RCX: 00007efc17533deb
[   70.136794] RDX: 0000000000000004 RSI: 00007efbf80500f0 RDI: 000000000000001f
[   70.136799] RBP: 0000000000000004 R08: 0000000000000000 R09: 0000000000000000
[   70.136805] R10: 0000000000000000 R11: 0000000000000293 R12: 00007efbf80500f0
[   70.136811] R13: 0000000000000000 R14: 000000000000001f R15: 0000000000000214


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: kvm+nouveau induced lockdep  gripe
       [not found]     ` <e4bf2fe3c5d2fdeded9b3d873a08094dbf145bf9.camel-Mmb7MZpHnFY@public.gmane.org>
@ 2020-10-24  2:22       ` Hillf Danton
  2020-10-24  3:38           ` Mike Galbraith
       [not found]         ` <20201024050000.8104-1-hdanton@sina.com>
  0 siblings, 2 replies; 29+ messages in thread
From: Hillf Danton @ 2020-10-24  2:22 UTC (permalink / raw)
  To: Mike Galbraith, Sebastian Andrzej Siewior
  Cc: Hillf Danton, linux-rt-users,
	nouveau-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW, LKML, Steven Rostedt,
	Ben Skeggs, Thomas Gleixner

On Fri, 23 Oct 2020 14:07:13 +0200 Mike Galbraith wrote:
> On Fri, 2020-10-23 at 11:01 +0200, Sebastian Andrzej Siewior wrote:
> > On 2020-10-22 07:28:20 [+0200], Mike Galbraith wrote:
> > > I've only as yet seen nouveau lockdep gripage when firing up one of my
> > > full distro KVM's.
> >
> > Could you please check !RT with the `threadirqs' command line option? I
> > don't think RT is doing here anything different (except for having
> > threaded interrupts enabled by default).
> 
> Yup, you are correct, RT is innocent.
> 
> 
> [   70.135201] ========================================================
> [   70.135206] WARNING: possible circular locking dependency detected
> [   70.135211] 5.9.0.gf989335-master #1 Tainted: G            E
> [   70.135216] ------------------------------------------------------
> [   70.135220] libvirtd/1838 is trying to acquire lock:
> [   70.135225] ffff983590c2d5a8 (&mm->mmap_lock#2){++++}-{3:3}, at: mpol_rebind_mm+0x1e/0x50
> [   70.135239]
>                but task is already holding lock:
> [   70.135244] ffffffff8a585410 (&cpuset_rwsem){++++}-{0:0}, at: cpuset_attach+0x38/0x390
> [   70.135256]
>                which lock already depends on the new lock.
> 
> [   70.135261]
>                the existing dependency chain (in reverse order) is:
> [   70.135266]
>                -> #3 (&cpuset_rwsem){++++}-{0:0}:
> [   70.135275]        cpuset_read_lock+0x39/0xd0
> [   70.135282]        __sched_setscheduler+0x456/0xa90
> [   70.135287]        _sched_setscheduler+0x69/0x70
> [   70.135292]        __kthread_create_on_node+0x114/0x170
> [   70.135297]        kthread_create_on_node+0x37/0x40
> [   70.135306]        setup_irq_thread+0x37/0x90
> [   70.135312]        __setup_irq+0x4e0/0x7c0
> [   70.135318]        request_threaded_irq+0xf8/0x160
> [   70.135371]        nvkm_pci_oneinit+0x4c/0x70 [nouveau]
> [   70.135399]        nvkm_subdev_init+0x60/0x1e0 [nouveau]
> [   70.135449]        nvkm_device_init+0x10b/0x240 [nouveau]
> [   70.135506]        nvkm_udevice_init+0x49/0x70 [nouveau]
> [   70.135531]        nvkm_object_init+0x3d/0x180 [nouveau]
> [   70.135555]        nvkm_ioctl_new+0x1a1/0x260 [nouveau]
> [   70.135578]        nvkm_ioctl+0x10a/0x240 [nouveau]
> [   70.135600]        nvif_object_ctor+0xeb/0x150 [nouveau]
> [   70.135622]        nvif_device_ctor+0x1f/0x60 [nouveau]
> [   70.135668]        nouveau_cli_init+0x1ac/0x590 [nouveau]
> [   70.135711]        nouveau_drm_device_init+0x68/0x800 [nouveau]
> [   70.135753]        nouveau_drm_probe+0xfb/0x200 [nouveau]
> [   70.135761]        local_pci_probe+0x42/0x90
> [   70.135767]        pci_device_probe+0xe7/0x1a0
> [   70.135773]        really_probe+0xf7/0x4d0
> [   70.135779]        driver_probe_device+0x5d/0x140
> [   70.135785]        device_driver_attach+0x4f/0x60
> [   70.135790]        __driver_attach+0xa4/0x140
> [   70.135796]        bus_for_each_dev+0x67/0x90
> [   70.135801]        bus_add_driver+0x18c/0x230
> [   70.135807]        driver_register+0x5b/0xf0
> [   70.135813]        do_one_initcall+0x54/0x2f0
> [   70.135819]        do_init_module+0x5b/0x21b
> [   70.135825]        load_module+0x1e40/0x2370
> [   70.135830]        __do_sys_finit_module+0x98/0xe0
> [   70.135836]        do_syscall_64+0x33/0x40
> [   70.135842]        entry_SYSCALL_64_after_hwframe+0x44/0xa9
> [   70.135847]
>                -> #2 (&device->mutex){+.+.}-{3:3}:
> [   70.135857]        __mutex_lock+0x90/0x9c0
> [   70.135902]        nvkm_udevice_fini+0x23/0x70 [nouveau]
> [   70.135927]        nvkm_object_fini+0xb8/0x210 [nouveau]
> [   70.135951]        nvkm_object_fini+0x73/0x210 [nouveau]
> [   70.135974]        nvkm_ioctl_del+0x7e/0xa0 [nouveau]
> [   70.135997]        nvkm_ioctl+0x10a/0x240 [nouveau]
> [   70.136019]        nvif_object_dtor+0x4a/0x60 [nouveau]
> [   70.136040]        nvif_client_dtor+0xe/0x40 [nouveau]
> [   70.136085]        nouveau_cli_fini+0x7a/0x90 [nouveau]
> [   70.136128]        nouveau_drm_postclose+0xaa/0xe0 [nouveau]
> [   70.136150]        drm_file_free.part.7+0x273/0x2c0 [drm]
> [   70.136165]        drm_release+0x6e/0xf0 [drm]
> [   70.136171]        __fput+0xb2/0x260
> [   70.136177]        task_work_run+0x73/0xc0
> [   70.136183]        exit_to_user_mode_prepare+0x1a5/0x1d0
> [   70.136189]        syscall_exit_to_user_mode+0x46/0x2a0
> [   70.136195]        entry_SYSCALL_64_after_hwframe+0x44/0xa9
> [   70.136200]
>                -> #1 (&cli->lock){+.+.}-{3:3}:
> [   70.136209]        __mutex_lock+0x90/0x9c0
> [   70.136252]        nouveau_mem_fini+0x4c/0x70 [nouveau]
> [   70.136294]        nouveau_sgdma_destroy+0x20/0x50 [nouveau]
> [   70.136302]        ttm_bo_cleanup_memtype_use+0x3e/0x60 [ttm]
> [   70.136310]        ttm_bo_release+0x29c/0x600 [ttm]

Looks like we can break the lock chain by moving ttm bo's release
method out of mmap_lock, see diff below.

> [   70.136317]        ttm_bo_vm_close+0x15/0x30 [ttm]
> [   70.136324]        remove_vma+0x3e/0x70
> [   70.136329]        __do_munmap+0x2b7/0x4f0
> [   70.136333]        __vm_munmap+0x5b/0xa0
> [   70.136338]        __x64_sys_munmap+0x27/0x30
> [   70.136343]        do_syscall_64+0x33/0x40
> [   70.136349]        entry_SYSCALL_64_after_hwframe+0x44/0xa9
> [   70.136354]
>                -> #0 (&mm->mmap_lock#2){++++}-{3:3}:
> [   70.136365]        __lock_acquire+0x149d/0x1a70
> [   70.136371]        lock_acquire+0x1a7/0x3b0
> [   70.136376]        down_write+0x38/0x70
> [   70.136382]        mpol_rebind_mm+0x1e/0x50
> [   70.136387]        cpuset_attach+0x229/0x390
> [   70.136393]        cgroup_migrate_execute+0x46d/0x490
> [   70.136398]        cgroup_attach_task+0x20c/0x3b0
> [   70.136404]        __cgroup1_procs_write.constprop.21+0xf3/0x150
> [   70.136411]        cgroup_file_write+0x64/0x210
> [   70.136416]        kernfs_fop_write+0x117/0x1b0
> [   70.136422]        vfs_write+0xe8/0x240
> [   70.136427]        ksys_write+0x87/0xc0
> [   70.136432]        do_syscall_64+0x33/0x40
> [   70.136438]        entry_SYSCALL_64_after_hwframe+0x44/0xa9
> [   70.136443]
>                other info that might help us debug this:
> 
> [   70.136450] Chain exists of:
>                  &mm->mmap_lock#2 --> &device->mutex --> &cpuset_rwsem
> 
> [   70.136463]  Possible unsafe locking scenario:
> 
> [   70.136469]        CPU0                    CPU1
> [   70.136473]        ----                    ----
> [   70.136477]   lock(&cpuset_rwsem);
> [   70.136483]                                lock(&device->mutex);
> [   70.136489]                                lock(&cpuset_rwsem);
> [   70.136495]   lock(&mm->mmap_lock#2);
> [   70.136501]
>                 *** DEADLOCK ***
> 
> [   70.136508] 6 locks held by libvirtd/1838:
> [   70.136512]  #0: ffff98359eb27af0 (&f->f_pos_lock){+.+.}-{3:3}, at: __fdget_pos+0x45/0x50
> [   70.136524]  #1: ffff983591a58460 (sb_writers#7){.+.+}-{0:0}, at: vfs_write+0x1aa/0x240
> [   70.136535]  #2: ffff9835bbf50488 (&of->mutex){+.+.}-{3:3}, at: kernfs_fop_write+0xe2/0x1b0
> [   70.136545]  #3: ffffffff8a581848 (cgroup_mutex){+.+.}-{3:3}, at: cgroup_kn_lock_live+0xea/0x1d0
> [   70.136556]  #4: ffffffff8a5816b0 (cgroup_threadgroup_rwsem){++++}-{0:0}, at: cgroup_procs_write_start+0x6e/0x200
> [   70.136567]  #5: ffffffff8a585410 (&cpuset_rwsem){++++}-{0:0}, at: cpuset_attach+0x38/0x390
> [   70.136579]
>                stack backtrace:
> [   70.136585] CPU: 2 PID: 1838 Comm: libvirtd Kdump: loaded Tainted: G   E     5.9.0.gf989335-master #1
> [   70.136592] Hardware name: MEDION MS-7848/MS-7848, BIOS M7848W08.20C 09/23/2013
> [   70.136598] Call Trace:
> [   70.136605]  dump_stack+0x77/0x97
> [   70.136611]  check_noncircular+0xe7/0x100
> [   70.136618]  ? stack_trace_save+0x3b/0x50
> [   70.136626]  ? __lock_acquire+0x149d/0x1a70
> [   70.136631]  __lock_acquire+0x149d/0x1a70
> [   70.136640]  lock_acquire+0x1a7/0x3b0
> [   70.136645]  ? mpol_rebind_mm+0x1e/0x50
> [   70.136652]  down_write+0x38/0x70
> [   70.136657]  ? mpol_rebind_mm+0x1e/0x50
> [   70.136663]  mpol_rebind_mm+0x1e/0x50
> [   70.136669]  cpuset_attach+0x229/0x390
> [   70.136675]  cgroup_migrate_execute+0x46d/0x490
> [   70.136681]  ? _raw_spin_unlock_irq+0x2f/0x50
> [   70.136688]  cgroup_attach_task+0x20c/0x3b0
> [   70.136702]  ? __cgroup1_procs_write.constprop.21+0xf3/0x150
> [   70.136712]  __cgroup1_procs_write.constprop.21+0xf3/0x150
> [   70.136722]  cgroup_file_write+0x64/0x210
> [   70.136728]  kernfs_fop_write+0x117/0x1b0
> [   70.136735]  vfs_write+0xe8/0x240
> [   70.136741]  ksys_write+0x87/0xc0
> [   70.136746]  ? lockdep_hardirqs_on+0x85/0x110
> [   70.136752]  do_syscall_64+0x33/0x40
> [   70.136758]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
> [   70.136764] RIP: 0033:0x7efc17533deb
> [   70.136770] Code: 53 48 89 d5 48 89 f3 48 83 ec 18 48 89 7c 24 08 e8 5a
>  fd ff ff 48 89 ea 41 89 c0 48 89 de 48 8b 7c 24 08 b8 01 00 00 00 0f 05 <48>
>  3d 00 f0 ff ff 77 31 44 89 c7 48 89 44 24 08 e8 90 fd ff ff 48
> [   70.136781] RSP: 002b:00007efc0d66b2f0 EFLAGS: 00000293 ORIG_RAX: 0000000000000001
> [   70.136788] RAX: ffffffffffffffda RBX: 00007efbf80500f0 RCX: 00007efc17533deb
> [   70.136794] RDX: 0000000000000004 RSI: 00007efbf80500f0 RDI: 000000000000001f
> [   70.136799] RBP: 0000000000000004 R08: 0000000000000000 R09: 0000000000000000
> [   70.136805] R10: 0000000000000000 R11: 0000000000000293 R12: 00007efbf80500f0
> [   70.136811] R13: 0000000000000000 R14: 000000000000001f R15: 0000000000000214

Embed an rcu_head in the ttm BO so the bo can be released outside of
mmap_lock, and have the release method defer the actual teardown to
call_rcu().

--- a/include/drm/ttm/ttm_bo_api.h
+++ b/include/drm/ttm/ttm_bo_api.h
@@ -180,6 +180,7 @@ struct ttm_buffer_object {
 	* Members not needing protection.
 	*/
 	struct kref kref;
+	struct rcu_head rcu;
 
 	/**
 	 * Members protected by the bo::resv::reserved lock.
--- a/drivers/gpu/drm/ttm/ttm_bo.c
+++ b/drivers/gpu/drm/ttm/ttm_bo.c
@@ -547,10 +547,10 @@ static void ttm_bo_delayed_workqueue(str
 				      ((HZ / 100) < 1) ? 1 : HZ / 100);
 }
 
-static void ttm_bo_release(struct kref *kref)
+static void ttm_bo_call_rcu_fn(struct rcu_head *rcu)
 {
-	struct ttm_buffer_object *bo =
-	    container_of(kref, struct ttm_buffer_object, kref);
+	struct ttm_buffer_object *bo = container_of(rcu,
+						struct ttm_buffer_object, rcu);
 	struct ttm_bo_device *bdev = bo->bdev;
 	struct ttm_mem_type_manager *man = &bdev->man[bo->mem.mem_type];
 	size_t acc_size = bo->acc_size;
@@ -619,6 +619,13 @@ static void ttm_bo_release(struct kref *
 	ttm_mem_global_free(&ttm_mem_glob, acc_size);
 }
 
+static void ttm_bo_release(struct kref *kref)
+{
+	struct ttm_buffer_object *bo = container_of(kref,
+					struct ttm_buffer_object, kref);
+	call_rcu(&bo->rcu, ttm_bo_call_rcu_fn);
+}
+
 void ttm_bo_put(struct ttm_buffer_object *bo)
 {
 	kref_put(&bo->kref, ttm_bo_release);

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: kvm+nouveau induced lockdep  gripe
@ 2020-10-24  3:38           ` Mike Galbraith
  0 siblings, 0 replies; 29+ messages in thread
From: Mike Galbraith @ 2020-10-24  3:38 UTC (permalink / raw)
  To: Hillf Danton, Sebastian Andrzej Siewior
  Cc: Thomas Gleixner, LKML, linux-rt-users, Steven Rostedt,
	Ben Skeggs, nouveau

On Sat, 2020-10-24 at 10:22 +0800, Hillf Danton wrote:
>
> Looks like we can break the lock chain by moving ttm bo's release
> method out of mmap_lock, see diff below.

Ah, the perfect complement to morning java, a patchlet to wedge in and
see what happens.

wedge/build/boot <schlurp... ahhh>

Mmm, box says no banana... a lot.

[   30.456921] ================================
[   30.456924] WARNING: inconsistent lock state
[   30.456928] 5.9.0.gf11901e-master #2 Tainted: G S          E
[   30.456932] --------------------------------
[   30.456935] inconsistent {SOFTIRQ-ON-W} -> {IN-SOFTIRQ-W} usage.
[   30.456940] ksoftirqd/4/36 [HC0[0]:SC1[1]:HE1:SE0] takes:
[   30.456944] ffff8e2c8bde9e40 (&mgr->vm_lock){++?+}-{2:2}, at: drm_vma_offset_remove+0x14/0x70 [drm]
[   30.456976] {SOFTIRQ-ON-W} state was registered at:
[   30.456982]   lock_acquire+0x1a7/0x3b0
[   30.456987]   _raw_write_lock+0x2f/0x40
[   30.457006]   drm_vma_offset_add+0x1c/0x60 [drm]
[   30.457013]   ttm_bo_init_reserved+0x28b/0x460 [ttm]
[   30.457020]   ttm_bo_init+0x57/0x110 [ttm]
[   30.457066]   nouveau_bo_init+0xb0/0xc0 [nouveau]
[   30.457108]   nouveau_bo_new+0x4d/0x60 [nouveau]
[   30.457145]   nv84_fence_create+0xb9/0x130 [nouveau]
[   30.457180]   nvc0_fence_create+0xe/0x47 [nouveau]
[   30.457221]   nouveau_drm_device_init+0x3d9/0x800 [nouveau]
[   30.457262]   nouveau_drm_probe+0xfb/0x200 [nouveau]
[   30.457268]   local_pci_probe+0x42/0x90
[   30.457272]   pci_device_probe+0xe7/0x1a0
[   30.457276]   really_probe+0xf7/0x4d0
[   30.457280]   driver_probe_device+0x5d/0x140
[   30.457284]   device_driver_attach+0x4f/0x60
[   30.457288]   __driver_attach+0xa4/0x140
[   30.457292]   bus_for_each_dev+0x67/0x90
[   30.457296]   bus_add_driver+0x18c/0x230
[   30.457299]   driver_register+0x5b/0xf0
[   30.457304]   do_one_initcall+0x54/0x2f0
[   30.457309]   do_init_module+0x5b/0x21b
[   30.457314]   load_module+0x1e40/0x2370
[   30.457317]   __do_sys_finit_module+0x98/0xe0
[   30.457321]   do_syscall_64+0x33/0x40
[   30.457326]   entry_SYSCALL_64_after_hwframe+0x44/0xa9
[   30.457329] irq event stamp: 366850
[   30.457335] hardirqs last  enabled at (366850): [<ffffffffa11312ff>] rcu_nocb_unlock_irqrestore+0x4f/0x60
[   30.457342] hardirqs last disabled at (366849): [<ffffffffa11384ef>] rcu_do_batch+0x59f/0x990
[   30.457347] softirqs last  enabled at (366834): [<ffffffffa1c002d7>] __do_softirq+0x2d7/0x4a4
[   30.457357] softirqs last disabled at (366839): [<ffffffffa10928c2>] run_ksoftirqd+0x32/0x60
[   30.457363]
               other info that might help us debug this:
[   30.457369]  Possible unsafe locking scenario:

[   30.457375]        CPU0
[   30.457378]        ----
[   30.457381]   lock(&mgr->vm_lock);
[   30.457386]   <Interrupt>
[   30.457389]     lock(&mgr->vm_lock);
[   30.457394]
                *** DEADLOCK ***

<snips 999 lockdep lines and zillion ATOMIC_SLEEP gripes>


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: kvm+nouveau induced lockdep  gripe
       [not found]         ` <20201024050000.8104-1-hdanton@sina.com>
@ 2020-10-24  5:25           ` Mike Galbraith
       [not found]           ` <20201024094224.2804-1-hdanton@sina.com>
  2020-10-26 17:31           ` Sebastian Andrzej Siewior
  2 siblings, 0 replies; 29+ messages in thread
From: Mike Galbraith @ 2020-10-24  5:25 UTC (permalink / raw)
  To: Hillf Danton
  Cc: Sebastian Andrzej Siewior, Thomas Gleixner, LKML, linux-rt-users,
	Steven Rostedt, Ben Skeggs

On Sat, 2020-10-24 at 13:00 +0800, Hillf Danton wrote:
> On Sat, 24 Oct 2020 05:38:23 +0200 Mike Galbraith wrote:
> > On Sat, 2020-10-24 at 10:22 +0800, Hillf Danton wrote:
> > >
> > > Looks like we can break the lock chain by moving ttm bo's release
> > > method out of mmap_lock, see diff below.
> >
> > Ah, the perfect complement to morning java, a patchlet to wedge in and
> > see what happens.
> >
> > wedge/build/boot <schlurp... ahhh>
> >
> > Mmm, box says no banana... a lot.
>
> Hmm...curious how that word went into your mind. And when?

There's a colloquial expression "close, but no cigar", and a variant
"close, but no banana".  The intended communication being "not quite".

	-Mike


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: kvm+nouveau induced lockdep  gripe
       [not found]           ` <20201024094224.2804-1-hdanton@sina.com>
@ 2020-10-26 17:26             ` Sebastian Andrzej Siewior
  0 siblings, 0 replies; 29+ messages in thread
From: Sebastian Andrzej Siewior @ 2020-10-26 17:26 UTC (permalink / raw)
  To: Hillf Danton
  Cc: Mike Galbraith, Thomas Gleixner, LKML, linux-rt-users,
	Steven Rostedt, Ben Skeggs

On 2020-10-24 17:42:24 [+0800], Hillf Danton wrote:
> Hmm...sounds like you learned English neither at high school nor
> college. When did you start learning it?

He learned enough to submit a valid bug report where you could reply with
	Thank you Mike!

Sebastian

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: kvm+nouveau induced lockdep  gripe
       [not found]         ` <20201024050000.8104-1-hdanton@sina.com>
  2020-10-24  5:25           ` Mike Galbraith
       [not found]           ` <20201024094224.2804-1-hdanton@sina.com>
@ 2020-10-26 17:31           ` Sebastian Andrzej Siewior
  2020-10-26 19:15             ` Mike Galbraith
  2 siblings, 1 reply; 29+ messages in thread
From: Sebastian Andrzej Siewior @ 2020-10-26 17:31 UTC (permalink / raw)
  To: Hillf Danton
  Cc: Mike Galbraith, Thomas Gleixner, LKML, linux-rt-users,
	Steven Rostedt, Ben Skeggs

On 2020-10-24 13:00:00 [+0800], Hillf Danton wrote:
> 
> Hmm...curious how that word went into your mind. And when?
> > [   30.457363]
> >                other info that might help us debug this:
> > [   30.457369]  Possible unsafe locking scenario:
> > 
> > [   30.457375]        CPU0
> > [   30.457378]        ----
> > [   30.457381]   lock(&mgr->vm_lock);
> > [   30.457386]   <Interrupt>
> > [   30.457389]     lock(&mgr->vm_lock);
> > [   30.457394]
> >                 *** DEADLOCK ***
> > 
> > <snips 999 lockdep lines and zillion ATOMIC_SLEEP gripes>

The backtrace contained the "normal" vm_lock. What should follow is the
backtrace of the in-softirq usage.

> 
> Dunno if blocking softint is a right cure.
> 
> --- a/drivers/gpu/drm/drm_vma_manager.c
> +++ b/drivers/gpu/drm/drm_vma_manager.c
> @@ -229,6 +229,7 @@ EXPORT_SYMBOL(drm_vma_offset_add);
>  void drm_vma_offset_remove(struct drm_vma_offset_manager *mgr,
>  			   struct drm_vma_offset_node *node)
>  {
> +	local_bh_disable();

There is write_lock_bh(). However, changing only one user will produce
the same backtrace somewhere else unless all the other users already run
in a BH-disabled region.
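
For illustration only -- a minimal, untested sketch of that _bh variant,
assuming the drm_vma_offset_remove() body quoted in the patch above (and,
as noted, every other vm_lock user would need the same treatment):

	void drm_vma_offset_remove(struct drm_vma_offset_manager *mgr,
				   struct drm_vma_offset_node *node)
	{
		/* _bh so the same rwlock can also be taken from an RCU
		 * callback running in softirq context */
		write_lock_bh(&mgr->vm_lock);

		if (drm_mm_node_allocated(&node->vm_node)) {
			drm_mm_remove_node(&node->vm_node);
			memset(&node->vm_node, 0, sizeof(node->vm_node));
		}

		write_unlock_bh(&mgr->vm_lock);
	}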

>  	write_lock(&mgr->vm_lock);
>  
>  	if (drm_mm_node_allocated(&node->vm_node)) {

Sebastian

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: kvm+nouveau induced lockdep  gripe
  2020-10-26 17:31           ` Sebastian Andrzej Siewior
@ 2020-10-26 19:15             ` Mike Galbraith
  2020-10-26 19:53               ` Sebastian Andrzej Siewior
  0 siblings, 1 reply; 29+ messages in thread
From: Mike Galbraith @ 2020-10-26 19:15 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior, Hillf Danton
  Cc: Thomas Gleixner, LKML, linux-rt-users, Steven Rostedt, Ben Skeggs

On Mon, 2020-10-26 at 18:31 +0100, Sebastian Andrzej Siewior wrote:
> On 2020-10-24 13:00:00 [+0800], Hillf Danton wrote:
> >
> > Hmm...curious how that word went into your mind. And when?
> > > [   30.457363]
> > >                other info that might help us debug this:
> > > [   30.457369]  Possible unsafe locking scenario:
> > >
> > > [   30.457375]        CPU0
> > > [   30.457378]        ----
> > > [   30.457381]   lock(&mgr->vm_lock);
> > > [   30.457386]   <Interrupt>
> > > [   30.457389]     lock(&mgr->vm_lock);
> > > [   30.457394]
> > >                 *** DEADLOCK ***
> > >
> > > <snips 999 lockdep lines and zillion ATOMIC_SLEEP gripes>
>
> The backtrace contained the "normal" vm_lock. What should follow is the
> backtrace of the in-softirq usage.
>
> >
> > Dunno if blocking softint is a right cure.
> >
> > --- a/drivers/gpu/drm/drm_vma_manager.c
> > +++ b/drivers/gpu/drm/drm_vma_manager.c
> > @@ -229,6 +229,7 @@ EXPORT_SYMBOL(drm_vma_offset_add);
> >  void drm_vma_offset_remove(struct drm_vma_offset_manager *mgr,
> >  			   struct drm_vma_offset_node *node)
> >  {
> > +	local_bh_disable();
>
> There is write_lock_bh(). However, changing only one user will produce
> the same backtrace somewhere else unless all the other users already run
> in a BH-disabled region.

Since there doesn't _seem_ to be a genuine deadlock lurking, I just
asked lockdep to please not log the annoying initialization-time
chain.

--- a/drivers/gpu/drm/nouveau/nvkm/subdev/pci/base.c
+++ b/drivers/gpu/drm/nouveau/nvkm/subdev/pci/base.c
@@ -116,7 +116,17 @@ nvkm_pci_oneinit(struct nvkm_subdev *sub
 			return ret;
 	}

+	/*
+	 * Scheduler code taking cpuset_rwsem during irq thread initialization sets
+	 * up a cpuset_rwsem vs mm->mmap_lock circular dependency gripe upon later
+	 * cpuset usage. It's harmless, tell lockdep there's nothing to see here.
+	 */
+	if (force_irqthreads)
+		lockdep_off();
 	ret = request_irq(pdev->irq, nvkm_pci_intr, IRQF_SHARED, "nvkm", pci);
+	if (force_irqthreads)
+		lockdep_on();
+
 	if (ret)
 		return ret;



^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: kvm+nouveau induced lockdep  gripe
  2020-10-26 19:15             ` Mike Galbraith
@ 2020-10-26 19:53               ` Sebastian Andrzej Siewior
  2020-10-27  6:03                 ` Mike Galbraith
  0 siblings, 1 reply; 29+ messages in thread
From: Sebastian Andrzej Siewior @ 2020-10-26 19:53 UTC (permalink / raw)
  To: Mike Galbraith
  Cc: Hillf Danton, Thomas Gleixner, LKML, linux-rt-users,
	Steven Rostedt, Ben Skeggs

On 2020-10-26 20:15:23 [+0100], Mike Galbraith wrote:
> --- a/drivers/gpu/drm/nouveau/nvkm/subdev/pci/base.c
> +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/pci/base.c
> @@ -116,7 +116,17 @@ nvkm_pci_oneinit(struct nvkm_subdev *sub
>  			return ret;
>  	}
> 
> +	/*
> +	 * Scheduler code taking cpuset_rwsem during irq thread initialization sets
> +	 * up a cpuset_rwsem vs mm->mmap_lock circular dependency gripe upon later
> +	 * cpuset usage. It's harmless, tell lockdep there's nothing to see here.
> +	 */
> +	if (force_irqthreads)
> +		lockdep_off();
>  	ret = request_irq(pdev->irq, nvkm_pci_intr, IRQF_SHARED, "nvkm", pci);
> +	if (force_irqthreads)
> +		lockdep_on();
> +
>  	if (ret)
>  		return ret;
> 
Could you try this, please?

--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -1155,6 +1155,8 @@ static int irq_thread(void *data)
 	irqreturn_t (*handler_fn)(struct irq_desc *desc,
 			struct irqaction *action);
 
+	sched_set_fifo(current);
+
 	if (force_irqthreads && test_bit(IRQTF_FORCED_THREAD,
 					&action->thread_flags))
 		handler_fn = irq_forced_thread_fn;
@@ -1320,8 +1322,6 @@ setup_irq_thread(struct irqaction *new,
 	if (IS_ERR(t))
 		return PTR_ERR(t);
 
-	sched_set_fifo(t);
-
 	/*
 	 * We keep the reference to the task struct even if
 	 * the thread dies to avoid that the interrupt code


Sebastian

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: kvm+nouveau induced lockdep  gripe
  2020-10-26 19:53               ` Sebastian Andrzej Siewior
@ 2020-10-27  6:03                 ` Mike Galbraith
  2020-10-27  9:00                   ` Sebastian Andrzej Siewior
  0 siblings, 1 reply; 29+ messages in thread
From: Mike Galbraith @ 2020-10-27  6:03 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior
  Cc: Hillf Danton, Thomas Gleixner, LKML, linux-rt-users,
	Steven Rostedt, Ben Skeggs

On Mon, 2020-10-26 at 20:53 +0100, Sebastian Andrzej Siewior wrote:
>
> Could you try this, please?

Nogo, first call of sched_setscheduler() is via kthread_create().  I
confirmed that nuking that (gratuitous user foot saving override) call
on top of moving sched_set_fifo() does shut it up, but that won't fly.

> --- a/kernel/irq/manage.c
> +++ b/kernel/irq/manage.c
> @@ -1155,6 +1155,8 @@ static int irq_thread(void *data)
>  	irqreturn_t (*handler_fn)(struct irq_desc *desc,
>  			struct irqaction *action);
>
> +	sched_set_fifo(current);
> +
>  	if (force_irqthreads && test_bit(IRQTF_FORCED_THREAD,
>  					&action->thread_flags))
>  		handler_fn = irq_forced_thread_fn;
> @@ -1320,8 +1322,6 @@ setup_irq_thread(struct irqaction *new,
>  	if (IS_ERR(t))
>  		return PTR_ERR(t);
>
> -	sched_set_fifo(t);
> -
>  	/*
>  	 * We keep the reference to the task struct even if
>  	 * the thread dies to avoid that the interrupt code
>

[  150.926954] ======================================================
[  150.926967] WARNING: possible circular locking dependency detected
[  150.926981] 5.10.0.g3650b22-master #13 Tainted: G S          E
[  150.926993] ------------------------------------------------------
[  150.927005] libvirtd/1833 is trying to acquire lock:
[  150.927016] ffff921a45ed55a8 (&mm->mmap_lock#2){++++}-{3:3}, at: mpol_rebind_mm+0x1e/0x50
[  150.927052]
               but task is already holding lock:
[  150.927064] ffffffffad585410 (&cpuset_rwsem){++++}-{0:0}, at: cpuset_attach+0x38/0x390
[  150.927094]
               which lock already depends on the new lock.

[  150.927108]
               the existing dependency chain (in reverse order) is:
[  150.927122]
               -> #3 (&cpuset_rwsem){++++}-{0:0}:
[  150.927145]        cpuset_read_lock+0x39/0xd0
[  150.927160]        __sched_setscheduler+0x456/0xa90
[  150.927173]        _sched_setscheduler+0x69/0x70
[  150.927187]        __kthread_create_on_node+0x114/0x170
[  150.927205]        kthread_create_on_node+0x37/0x40
[  150.927223]        setup_irq_thread+0x33/0xb0
[  150.927238]        __setup_irq+0x4e0/0x7c0
[  150.927254]        request_threaded_irq+0xf8/0x160
[  150.927393]        nvkm_pci_oneinit+0x4c/0x70 [nouveau]
[  150.927469]        nvkm_subdev_init+0x60/0x1e0 [nouveau]
[  150.927603]        nvkm_device_init+0x10b/0x240 [nouveau]
[  150.927733]        nvkm_udevice_init+0x49/0x70 [nouveau]
[  150.927804]        nvkm_object_init+0x3d/0x180 [nouveau]
[  150.927871]        nvkm_ioctl_new+0x1a1/0x260 [nouveau]
[  150.927938]        nvkm_ioctl+0x10a/0x240 [nouveau]
[  150.928001]        nvif_object_ctor+0xeb/0x150 [nouveau]
[  150.928069]        nvif_device_ctor+0x1f/0x60 [nouveau]
[  150.928201]        nouveau_cli_init+0x1ac/0x590 [nouveau]
[  150.928332]        nouveau_drm_device_init+0x68/0x800 [nouveau]
[  150.928462]        nouveau_drm_probe+0xfb/0x200 [nouveau]
[  150.928483]        local_pci_probe+0x42/0x90
[  150.928501]        pci_device_probe+0xe7/0x1a0
[  150.928519]        really_probe+0xf7/0x4d0
[  150.928536]        driver_probe_device+0x5d/0x140
[  150.928552]        device_driver_attach+0x4f/0x60
[  150.928569]        __driver_attach+0xa4/0x140
[  150.928584]        bus_for_each_dev+0x67/0x90
[  150.928600]        bus_add_driver+0x18c/0x230
[  150.928616]        driver_register+0x5b/0xf0
[  150.928633]        do_one_initcall+0x54/0x2f0
[  150.928650]        do_init_module+0x5b/0x21b
[  150.928667]        load_module+0x1e40/0x2370
[  150.928682]        __do_sys_finit_module+0x98/0xe0
[  150.928698]        do_syscall_64+0x33/0x40
[  150.928716]        entry_SYSCALL_64_after_hwframe+0x44/0xa9
[  150.928731]
               -> #2 (&device->mutex){+.+.}-{3:3}:
[  150.928761]        __mutex_lock+0x90/0x9c0
[  150.928895]        nvkm_udevice_fini+0x23/0x70 [nouveau]
[  150.928975]        nvkm_object_fini+0xb8/0x210 [nouveau]
[  150.929048]        nvkm_object_fini+0x73/0x210 [nouveau]
[  150.929119]        nvkm_ioctl_del+0x7e/0xa0 [nouveau]
[  150.929191]        nvkm_ioctl+0x10a/0x240 [nouveau]
[  150.929258]        nvif_object_dtor+0x4a/0x60 [nouveau]
[  150.929326]        nvif_client_dtor+0xe/0x40 [nouveau]
[  150.929455]        nouveau_cli_fini+0x7a/0x90 [nouveau]
[  150.929586]        nouveau_drm_postclose+0xaa/0xe0 [nouveau]
[  150.929638]        drm_file_free.part.7+0x273/0x2c0 [drm]
[  150.929680]        drm_release+0x6e/0xf0 [drm]
[  150.929697]        __fput+0xb2/0x260
[  150.929714]        task_work_run+0x73/0xc0
[  150.929732]        exit_to_user_mode_prepare+0x1a5/0x1d0
[  150.929749]        syscall_exit_to_user_mode+0x46/0x2a0
[  150.929767]        entry_SYSCALL_64_after_hwframe+0x44/0xa9
[  150.929782]
               -> #1 (&cli->lock){+.+.}-{3:3}:
[  150.929811]        __mutex_lock+0x90/0x9c0
[  150.929936]        nouveau_mem_fini+0x4c/0x70 [nouveau]
[  150.930062]        nouveau_sgdma_destroy+0x20/0x50 [nouveau]
[  150.930086]        ttm_bo_cleanup_memtype_use+0x3e/0x60 [ttm]
[  150.930109]        ttm_bo_release+0x29c/0x600 [ttm]
[  150.930130]        ttm_bo_vm_close+0x15/0x30 [ttm]
[  150.930150]        remove_vma+0x3e/0x70
[  150.930166]        __do_munmap+0x2b7/0x4f0
[  150.930181]        __vm_munmap+0x5b/0xa0
[  150.930195]        __x64_sys_munmap+0x27/0x30
[  150.930210]        do_syscall_64+0x33/0x40
[  150.930227]        entry_SYSCALL_64_after_hwframe+0x44/0xa9
[  150.930241]
               -> #0 (&mm->mmap_lock#2){++++}-{3:3}:
[  150.930275]        __lock_acquire+0x149d/0x1a70
[  150.930292]        lock_acquire+0x1a7/0x3b0
[  150.930307]        down_write+0x38/0x70
[  150.930323]        mpol_rebind_mm+0x1e/0x50
[  150.930340]        cpuset_attach+0x229/0x390
[  150.930355]        cgroup_migrate_execute+0x46d/0x490
[  150.930371]        cgroup_attach_task+0x20c/0x3b0
[  150.930388]        __cgroup1_procs_write.constprop.21+0xf3/0x150
[  150.930407]        cgroup_file_write+0x64/0x210
[  150.930423]        kernfs_fop_write+0x117/0x1b0
[  150.930440]        vfs_write+0xe8/0x240
[  150.930456]        ksys_write+0x87/0xc0
[  150.930471]        do_syscall_64+0x33/0x40
[  150.930487]        entry_SYSCALL_64_after_hwframe+0x44/0xa9
[  150.930501]
               other info that might help us debug this:

[  150.930522] Chain exists of:
                 &mm->mmap_lock#2 --> &device->mutex --> &cpuset_rwsem

[  150.930561]  Possible unsafe locking scenario:

[  150.930577]        CPU0                    CPU1
[  150.930589]        ----                    ----
[  150.930602]   lock(&cpuset_rwsem);
[  150.930617]                                lock(&device->mutex);
[  150.930635]                                lock(&cpuset_rwsem);
[  150.930653]   lock(&mm->mmap_lock#2);
[  150.930671]
                *** DEADLOCK ***

[  150.930690] 6 locks held by libvirtd/1833:
[  150.930703]  #0: ffff921a6f1690f0 (&f->f_pos_lock){+.+.}-{3:3}, at: __fdget_pos+0x45/0x50
[  150.930736]  #1: ffff921c88c1d460 (sb_writers#7){.+.+}-{0:0}, at: vfs_write+0x1aa/0x240
[  150.930771]  #2: ffff921a48734488 (&of->mutex){+.+.}-{3:3}, at: kernfs_fop_write+0xe2/0x1b0
[  150.930801]  #3: ffffffffad581848 (cgroup_mutex){+.+.}-{3:3}, at: cgroup_kn_lock_live+0xea/0x1d0
[  150.930833]  #4: ffffffffad5816b0 (cgroup_threadgroup_rwsem){++++}-{0:0}, at: cgroup_procs_write_start+0x6e/0x200
[  150.930866]  #5: ffffffffad585410 (&cpuset_rwsem){++++}-{0:0}, at: cpuset_attach+0x38/0x390
[  150.930899]
               stack backtrace:
[  150.930918] CPU: 3 PID: 1833 Comm: libvirtd Kdump: loaded Tainted: G S          E     5.10.0.g3650b22-master #13
[  150.930938] Hardware name: MEDION MS-7848/MS-7848, BIOS M7848W08.20C 09/23/2013
[  150.930955] Call Trace:
[  150.930974]  dump_stack+0x77/0x97
[  150.930993]  check_noncircular+0xe7/0x100
[  150.931012]  ? stack_trace_save+0x3b/0x50
[  150.931036]  ? __lock_acquire+0x149d/0x1a70
[  150.931053]  __lock_acquire+0x149d/0x1a70
[  150.931077]  lock_acquire+0x1a7/0x3b0
[  150.931093]  ? mpol_rebind_mm+0x1e/0x50
[  150.931114]  down_write+0x38/0x70
[  150.931129]  ? mpol_rebind_mm+0x1e/0x50
[  150.931144]  mpol_rebind_mm+0x1e/0x50
[  150.931162]  cpuset_attach+0x229/0x390
[  150.931180]  cgroup_migrate_execute+0x46d/0x490
[  150.931199]  ? _raw_spin_unlock_irq+0x2f/0x50
[  150.931217]  cgroup_attach_task+0x20c/0x3b0
[  150.931245]  ? __cgroup1_procs_write.constprop.21+0xf3/0x150
[  150.931263]  __cgroup1_procs_write.constprop.21+0xf3/0x150
[  150.931286]  cgroup_file_write+0x64/0x210
[  150.931304]  kernfs_fop_write+0x117/0x1b0
[  150.931323]  vfs_write+0xe8/0x240
[  150.931341]  ksys_write+0x87/0xc0
[  150.931357]  ? lockdep_hardirqs_on+0x85/0x110
[  150.931374]  do_syscall_64+0x33/0x40
[  150.931391]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
[  150.931409] RIP: 0033:0x7fcdc56a6deb
[  150.931425] Code: 53 48 89 d5 48 89 f3 48 83 ec 18 48 89 7c 24 08 e8 5a fd ff ff 48 89 ea 41 89 c0 48 89 de 48 8b 7c 24 08 b8 01 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 31 44 89 c7 48 89 44 24 08 e8 90 fd ff ff 48
[  150.931456] RSP: 002b:00007fcdbcfdc2f0 EFLAGS: 00000293 ORIG_RAX: 0000000000000001
[  150.931476] RAX: ffffffffffffffda RBX: 00007fcdb403ca00 RCX: 00007fcdc56a6deb
[  150.931493] RDX: 0000000000000004 RSI: 00007fcdb403ca00 RDI: 000000000000001f
[  150.931510] RBP: 0000000000000004 R08: 0000000000000000 R09: 0000000000000000
[  150.931526] R10: 0000000000000000 R11: 0000000000000293 R12: 00007fcdb403ca00
[  150.931543] R13: 0000000000000000 R14: 000000000000001f R15: 0000000000000214





^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [ANNOUNCE] v5.9.1-rt18
  2020-10-21 13:14 ` Sebastian Andrzej Siewior
@ 2020-10-27  6:53   ` Fernando Lopez-Lezcano
  2020-10-27  8:22     ` Sebastian Andrzej Siewior
  0 siblings, 1 reply; 29+ messages in thread
From: Fernando Lopez-Lezcano @ 2020-10-27  6:53 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior, Thomas Gleixner
  Cc: nando, LKML, linux-rt-users, Steven Rostedt

On 10/21/20 6:14 AM, Sebastian Andrzej Siewior wrote:
> On 2020-10-21 14:53:27 [+0200], To Thomas Gleixner wrote:
>> Dear RT folks!
>>
>> I'm pleased to announce the v5.9.1-rt18 patch set.

Maybe I'm doing something wrong but I get a compilation error (see 
below) when trying to do a debug build (building rpm packages for 
Fedora). 5.9.1 + rt19...

Builds fine otherwise...
Thanks,
-- Fernando



+ make -s 'HOSTCFLAGS=-O2 -g -pipe -Wall -Werror=format-security 
-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fexceptions 
-fstack-protector-strong -grecord-gcc-switches 
-specs=/usr/lib/rpm/redhat/redhat-hardened-cc1 
-specs=/usr/lib/rpm/redhat/redhat-annobin-cc1 -fcommon -m64 
-mtune=generic -fasynchronous-unwind-tables -fstack-clash-protection 
-fcf-protection' 'HOSTLDFLAGS=-Wl,-z,relro -Wl,--as-needed  -Wl,-z,now 
-specs=/usr/lib/rpm/redhat/redhat-hardened-ld' ARCH=x86_64 KCFLAGS= 
WITH_GCOV=0 -j4 modules
BUILDSTDERR: In file included from <command-line>:
BUILDSTDERR: lib/test_lockup.c: In function 'test_lockup_init':
BUILDSTDERR: lib/test_lockup.c:484:31: error: 'spinlock_t' {aka 'struct spinlock'} has no member named 'rlock'; did you mean 'lock'?
BUILDSTDERR:   484 |          offsetof(spinlock_t, rlock.magic),
BUILDSTDERR:       |                               ^~~~~
BUILDSTDERR: ././include/linux/compiler_types.h:135:57: note: in definition of macro '__compiler_offsetof'
BUILDSTDERR:   135 | #define __compiler_offsetof(a, b) __builtin_offsetof(a, b)
BUILDSTDERR:       |                                                         ^
BUILDSTDERR: lib/test_lockup.c:484:10: note: in expansion of macro 'offsetof'
BUILDSTDERR:   484 |          offsetof(spinlock_t, rlock.magic),
BUILDSTDERR:       |          ^~~~~~~~
BUILDSTDERR: ././include/linux/compiler_types.h:135:35: error: 'rwlock_t' {aka 'struct rt_rw_lock'} has no member named 'magic'
BUILDSTDERR:   135 | #define __compiler_offsetof(a, b) __builtin_offsetof(a, b)
BUILDSTDERR:       |                                   ^~~~~~~~~~~~~~~~~~
BUILDSTDERR: ./include/linux/stddef.h:17:32: note: in expansion of macro '__compiler_offsetof'
BUILDSTDERR:    17 | #define offsetof(TYPE, MEMBER) __compiler_offsetof(TYPE, MEMBER)
BUILDSTDERR:       |                                ^~~~~~~~~~~~~~~~~~~
BUILDSTDERR: lib/test_lockup.c:487:10: note: in expansion of macro 'offsetof'
BUILDSTDERR:   487 |          offsetof(rwlock_t, magic),
BUILDSTDERR:       |          ^~~~~~~~
BUILDSTDERR: lib/test_lockup.c:488:10: error: 'RWLOCK_MAGIC' undeclared (first use in this function); did you mean 'STACK_MAGIC'?
BUILDSTDERR:   488 |          RWLOCK_MAGIC) ||
BUILDSTDERR:       |          ^~~~~~~~~~~~
BUILDSTDERR:       |          STACK_MAGIC
BUILDSTDERR: lib/test_lockup.c:488:10: note: each undeclared identifier is reported only once for each function it appears in
BUILDSTDERR: In file included from <command-line>:
BUILDSTDERR: ././include/linux/compiler_types.h:135:35: error: 'struct mutex' has no member named 'wait_lock'
BUILDSTDERR:   135 | #define __compiler_offsetof(a, b) __builtin_offsetof(a, b)
BUILDSTDERR:       |                                   ^~~~~~~~~~~~~~~~~~
BUILDSTDERR: ./include/linux/stddef.h:17:32: note: in expansion of macro '__compiler_offsetof'
BUILDSTDERR:    17 | #define offsetof(TYPE, MEMBER) __compiler_offsetof(TYPE, MEMBER)
BUILDSTDERR:       |                                ^~~~~~~~~~~~~~~~~~~
BUILDSTDERR: lib/test_lockup.c:490:10: note: in expansion of macro 'offsetof'
BUILDSTDERR:   490 |          offsetof(struct mutex, wait_lock.rlock.magic),
BUILDSTDERR:       |          ^~~~~~~~
BUILDSTDERR: ././include/linux/compiler_types.h:135:35: error: 'struct rw_semaphore' has no member named 'wait_lock'
BUILDSTDERR:   135 | #define __compiler_offsetof(a, b) __builtin_offsetof(a, b)
BUILDSTDERR:       |                                   ^~~~~~~~~~~~~~~~~~
BUILDSTDERR: ./include/linux/stddef.h:17:32: note: in expansion of macro '__compiler_offsetof'
BUILDSTDERR:    17 | #define offsetof(TYPE, MEMBER) __compiler_offsetof(TYPE, MEMBER)
BUILDSTDERR:       |                                ^~~~~~~~~~~~~~~~~~~
BUILDSTDERR: lib/test_lockup.c:493:10: note: in expansion of macro 'offsetof'
BUILDSTDERR:   493 |          offsetof(struct rw_semaphore, wait_lock.magic),
BUILDSTDERR:       |          ^~~~~~~~
BUILDSTDERR: make[1]: *** [scripts/Makefile.build:283: lib/test_lockup.o] Error 1
BUILDSTDERR: make: *** [Makefile:1784: lib] Error 2
BUILDSTDERR: make: *** Waiting for unfinished jobs....



>>
>> Changes since v5.9.1-rt17:
>>
>>    - Update the migrate-disable series by Peter Zijlstra to v3. Include
>>      also fixes discussed in the thread.
>>
>>    - UP builds did not boot since the replace of the migrate-disable
>>      code. Reported by Christian Egger. Fixed as a part of v3 by Peter
>>      Zijlstra.
>>
>>    - Rebase the printk code on top of the ringer buffer designed for
>>      printk which was merged in the v5.10 merge window. Patches by John
>>      Ogness.
>>
>> Known issues
>>       - It has been pointed out that due to changes to the printk code the
>>         internal buffer representation changed. This is only an issue if tools
>>         like `crash' are used to extract the printk buffer from a kernel memory
>>         image.
>>
>> The delta patch against v5.9.1-rt17 is appended below and can be found here:
>>   
>>       https://cdn.kernel.org/pub/linux/kernel/projects/rt/5.9/incr/patch-5.9.1-rt17-rt18.patch.xz
>>
>> You can get this release via the git tree at:
>>
>>      git://git.kernel.org/pub/scm/linux/kernel/git/rt/linux-rt-devel.git v5.9.1-rt18
>>
>> The RT patch against v5.9.1 can be found here:
>>
>>      https://cdn.kernel.org/pub/linux/kernel/projects/rt/5.9/older/patch-5.9.1-rt18.patch.xz
>>
>> The split quilt queue is available at:
>>
>>      https://cdn.kernel.org/pub/linux/kernel/projects/rt/5.9/older/patches-5.9.1-rt18.tar.xz
>>
> 
> The attached diff was too large and the mail was dropped. It is
> available at
> 	https://git.kernel.org/rt/linux-rt-devel/d/v5.9.1-rt18/v5.9.1-rt17
> 
> Sebastian
> 


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [ANNOUNCE] v5.9.1-rt18
  2020-10-27  6:53   ` Fernando Lopez-Lezcano
@ 2020-10-27  8:22     ` Sebastian Andrzej Siewior
  2020-10-27 17:07       ` Fernando Lopez-Lezcano
  0 siblings, 1 reply; 29+ messages in thread
From: Sebastian Andrzej Siewior @ 2020-10-27  8:22 UTC (permalink / raw)
  To: Fernando Lopez-Lezcano
  Cc: Thomas Gleixner, LKML, linux-rt-users, Steven Rostedt

On 2020-10-26 23:53:20 [-0700], Fernando Lopez-Lezcano wrote:
> Maybe I'm doing something wrong but I get a compilation error (see below)
> when trying to do a debug build (building rpm packages for Fedora). 5.9.1 +
> rt19...
> 
> Builds fine otherwise...

If you could remove CONFIG_TEST_LOCKUP then it should work. I will think
of something.

> Thanks,
> -- Fernando

Sebastian

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: kvm+nouveau induced lockdep  gripe
  2020-10-27  6:03                 ` Mike Galbraith
@ 2020-10-27  9:00                   ` Sebastian Andrzej Siewior
  2020-10-27  9:49                     ` Mike Galbraith
  2020-10-27 10:14                     ` Mike Galbraith
  0 siblings, 2 replies; 29+ messages in thread
From: Sebastian Andrzej Siewior @ 2020-10-27  9:00 UTC (permalink / raw)
  To: Mike Galbraith
  Cc: Hillf Danton, Thomas Gleixner, LKML, linux-rt-users,
	Steven Rostedt, Ben Skeggs

On 2020-10-27 07:03:38 [+0100], Mike Galbraith wrote:
> On Mon, 2020-10-26 at 20:53 +0100, Sebastian Andrzej Siewior wrote:
> >
> > Could you try this, please?
> 
> Nogo, first call of sched_setscheduler() is via kthread_create().  I
> confirmed that nuking that (gratuitous user foot saving override) call
> on top of moving sched_set_fifo() does shut it up, but that won't fly.

mkay. but this then, too. Let me try if I can figure out when this
broke.

diff --git a/kernel/kthread.c b/kernel/kthread.c
index 3edaa380dc7b4..64d6afb127239 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -244,6 +244,7 @@ EXPORT_SYMBOL_GPL(kthread_parkme);
 static int kthread(void *_create)
 {
 	/* Copy data: it's on kthread's stack */
+	static const struct sched_param param = { .sched_priority = 0 };
 	struct kthread_create_info *create = _create;
 	int (*threadfn)(void *data) = create->threadfn;
 	void *data = create->data;
@@ -273,6 +274,13 @@ static int kthread(void *_create)
 	init_completion(&self->parked);
 	current->vfork_done = &self->exited;
 
+	/*
+	 * root may have changed our (kthreadd's) priority or CPU mask.
+	 * The kernel thread should not inherit these properties.
+	 */
+	sched_setscheduler_nocheck(current, SCHED_NORMAL, &param);
+	set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_KTHREAD));
+
 	/* OK, tell user we're spawned, wait for stop or wakeup */
 	__set_current_state(TASK_UNINTERRUPTIBLE);
 	create->result = current;
@@ -370,7 +378,6 @@ struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data),
 	}
 	task = create->result;
 	if (!IS_ERR(task)) {
-		static const struct sched_param param = { .sched_priority = 0 };
 		char name[TASK_COMM_LEN];
 
 		/*
@@ -379,13 +386,6 @@ struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data),
 		 */
 		vsnprintf(name, sizeof(name), namefmt, args);
 		set_task_comm(task, name);
-		/*
-		 * root may have changed our (kthreadd's) priority or CPU mask.
-		 * The kernel thread should not inherit these properties.
-		 */
-		sched_setscheduler_nocheck(task, SCHED_NORMAL, &param);
-		set_cpus_allowed_ptr(task,
-				     housekeeping_cpumask(HK_FLAG_KTHREAD));
 	}
 	kfree(create);
 	return task;

Sebastian

^ permalink raw reply related	[flat|nested] 29+ messages in thread

* Re: kvm+nouveau induced lockdep  gripe
  2020-10-27  9:00                   ` Sebastian Andrzej Siewior
@ 2020-10-27  9:49                     ` Mike Galbraith
  2020-10-27 10:14                     ` Mike Galbraith
  1 sibling, 0 replies; 29+ messages in thread
From: Mike Galbraith @ 2020-10-27  9:49 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior
  Cc: Hillf Danton, Thomas Gleixner, LKML, linux-rt-users,
	Steven Rostedt, Ben Skeggs

On Tue, 2020-10-27 at 10:00 +0100, Sebastian Andrzej Siewior wrote:
> On 2020-10-27 07:03:38 [+0100], Mike Galbraith wrote:
> > On Mon, 2020-10-26 at 20:53 +0100, Sebastian Andrzej Siewior wrote:
> > >
> > > Could you try this, please?
> >
> > Nogo, first call of sched_setscheduler() is via kthread_create().  I
> > confirmed that nuking that (gratuitous user foot saving override) call
> > on top of moving sched_set_fifo() does shut it up, but that won't fly.
>
> mkay. but this then, too.

Yup, might even fly.

>  Let me try if I can figure out when this
> broke.
>
> diff --git a/kernel/kthread.c b/kernel/kthread.c
> index 3edaa380dc7b4..64d6afb127239 100644
> --- a/kernel/kthread.c
> +++ b/kernel/kthread.c
> @@ -244,6 +244,7 @@ EXPORT_SYMBOL_GPL(kthread_parkme);
>  static int kthread(void *_create)
>  {
>  	/* Copy data: it's on kthread's stack */
> +	static const struct sched_param param = { .sched_priority = 0 };
>  	struct kthread_create_info *create = _create;
>  	int (*threadfn)(void *data) = create->threadfn;
>  	void *data = create->data;
> @@ -273,6 +274,13 @@ static int kthread(void *_create)
>  	init_completion(&self->parked);
>  	current->vfork_done = &self->exited;
>
> +	/*
> +	 * root may have changed our (kthreadd's) priority or CPU mask.
> +	 * The kernel thread should not inherit these properties.
> +	 */
> +	sched_setscheduler_nocheck(current, SCHED_NORMAL, &param);
> +	set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_KTHREAD));
> +
>  	/* OK, tell user we're spawned, wait for stop or wakeup */
>  	__set_current_state(TASK_UNINTERRUPTIBLE);
>  	create->result = current;
> @@ -370,7 +378,6 @@ struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data),
>  	}
>  	task = create->result;
>  	if (!IS_ERR(task)) {
> -		static const struct sched_param param = { .sched_priority = 0 };
>  		char name[TASK_COMM_LEN];
>
>  		/*
> @@ -379,13 +386,6 @@ struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data),
>  		 */
>  		vsnprintf(name, sizeof(name), namefmt, args);
>  		set_task_comm(task, name);
> -		/*
> -		 * root may have changed our (kthreadd's) priority or CPU mask.
> -		 * The kernel thread should not inherit these properties.
> -		 */
> -		sched_setscheduler_nocheck(task, SCHED_NORMAL, &param);
> -		set_cpus_allowed_ptr(task,
> -				     housekeeping_cpumask(HK_FLAG_KTHREAD));
>  	}
>  	kfree(create);
>  	return task;
>
> Sebastian


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: kvm+nouveau induced lockdep gripe
  2020-10-27  9:00                   ` Sebastian Andrzej Siewior
  2020-10-27  9:49                     ` Mike Galbraith
@ 2020-10-27 10:14                     ` Mike Galbraith
  2020-10-27 10:18                       ` Sebastian Andrzej Siewior
  1 sibling, 1 reply; 29+ messages in thread
From: Mike Galbraith @ 2020-10-27 10:14 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior
  Cc: Hillf Danton, Thomas Gleixner, LKML, linux-rt-users,
	Steven Rostedt, Ben Skeggs

On Tue, 2020-10-27 at 10:00 +0100, Sebastian Andrzej Siewior wrote:
> Let me see if I can figure out when this broke.

My money is on...
710da3c8ea7df (Juri Lelli 2019-07-19 16:00:00 +0200 5317)       if (pi)
710da3c8ea7df (Juri Lelli 2019-07-19 16:00:00 +0200 5318)               cpuset_read_lock();
...having just had an unnoticed consequence for nouveau.
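
(Roughly how that cpuset_read_lock() ends up nested inside nouveau's
device mutex, going by the gripe; a sketch of the call chain, not
verified line by line:

    nvkm_pci_oneinit()                   /* nouveau probe path, nvkm_device::mutex held */
      request_threaded_irq()
        __setup_irq()
          setup_irq_thread()
            kthread_create()
              __kthread_create_on_node()
                sched_setscheduler_nocheck(task, SCHED_NORMAL, &param)
                  __sched_setscheduler()
                    cpuset_read_lock()   /* device->mutex ordered before cpuset_rwsem */

so anything held around kthread_create() ends up ordered before
cpuset_rwsem.)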

	-Mike


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: kvm+nouveau induced lockdep gripe
  2020-10-27 10:14                     ` Mike Galbraith
@ 2020-10-27 10:18                       ` Sebastian Andrzej Siewior
  2020-10-27 11:13                         ` Mike Galbraith
  0 siblings, 1 reply; 29+ messages in thread
From: Sebastian Andrzej Siewior @ 2020-10-27 10:18 UTC (permalink / raw)
  To: Mike Galbraith
  Cc: Hillf Danton, Thomas Gleixner, LKML, linux-rt-users,
	Steven Rostedt, Ben Skeggs

On 2020-10-27 11:14:34 [+0100], Mike Galbraith wrote:
> On Tue, 2020-10-27 at 10:00 +0100, Sebastian Andrzej Siewior wrote:
> > Let me see if I can figure out when this broke.
> 
> My money is on...
> 710da3c8ea7df (Juri Lelli 2019-07-19 16:00:00 +0200 5317)       if (pi)
> 710da3c8ea7df (Juri Lelli 2019-07-19 16:00:00 +0200 5318)               cpuset_read_lock();
> ...having just had an unnoticed consequence for nouveau.

but that is over a year old and should have been noticed in v5.4-RT.

> 	-Mike

Sebastian

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: kvm+nouveau induced lockdep gripe
  2020-10-27 10:18                       ` Sebastian Andrzej Siewior
@ 2020-10-27 11:13                         ` Mike Galbraith
  0 siblings, 0 replies; 29+ messages in thread
From: Mike Galbraith @ 2020-10-27 11:13 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior
  Cc: Hillf Danton, Thomas Gleixner, LKML, linux-rt-users,
	Steven Rostedt, Ben Skeggs

On Tue, 2020-10-27 at 11:18 +0100, Sebastian Andrzej Siewior wrote:
> On 2020-10-27 11:14:34 [+0100], Mike Galbraith wrote:
> > On Tue, 2020-10-27 at 10:00 +0100, Sebastian Andrzej Siewior wrote:
> > > Let me see if I can figure out when this broke.
> >
> > My money is on...
> > 710da3c8ea7df (Juri Lelli 2019-07-19 16:00:00 +0200 5317)       if (pi)
> > 710da3c8ea7df (Juri Lelli 2019-07-19 16:00:00 +0200 5318)               cpuset_read_lock();
> > ...having just had an unnoticed consequence for nouveau.
>
> but that is over a year old and should have been noticed in v5.4-RT.

Yup.  Dang lazy sod nouveau users haven't been doing their broad
spectrum RT or threadirqs testing.  This one hasn't anyway ;-)

[   73.087508] ======================================================
[   73.087508] WARNING: possible circular locking dependency detected
[   73.087509] 5.4.72-rt40-rt #1 Tainted: G            E
[   73.087510] ------------------------------------------------------
[   73.087510] libvirtd/1902 is trying to acquire lock:
[   73.087511] ffff8b0f54b2f8d8 (&mm->mmap_sem#2){++++}, at: mpol_rebind_mm+0x1e/0x50
[   73.087517]
               but task is already holding lock:
[   73.087518] ffffffff9d2a0430 (&cpuset_rwsem){++++}, at: cpuset_attach+0x38/0x390
[   73.087520]
               which lock already depends on the new lock.

[   73.087521]
               the existing dependency chain (in reverse order) is:
[   73.087521]
               -> #3 (&cpuset_rwsem){++++}:
[   73.087523]        cpuset_read_lock+0x39/0x100
[   73.087524]        __sched_setscheduler+0x476/0xb00
[   73.087526]        _sched_setscheduler+0x69/0x70
[   73.087527]        __kthread_create_on_node+0x122/0x180
[   73.087531]        kthread_create_on_node+0x37/0x40
[   73.087532]        setup_irq_thread+0x3c/0xa0
[   73.087533]        __setup_irq+0x4c3/0x760
[   73.087535]        request_threaded_irq+0xf8/0x160
[   73.087535]        nvkm_pci_oneinit+0x4c/0x70 [nouveau]
[   73.087569]        nvkm_subdev_init+0x60/0x1e0 [nouveau]
[   73.087586]        nvkm_device_init+0x10b/0x240 [nouveau]
[   73.087611]        nvkm_udevice_init+0x47/0x70 [nouveau]
[   73.087636]        nvkm_object_init+0x3d/0x180 [nouveau]
[   73.087652]        nvkm_ioctl_new+0x1a1/0x260 [nouveau]
[   73.087667]        nvkm_ioctl+0x10a/0x240 [nouveau]
[   73.087682]        nvif_object_init+0xbf/0x110 [nouveau]
[   73.087697]        nvif_device_init+0xe/0x50 [nouveau]
[   73.087713]        nouveau_cli_init+0x1ce/0x5a0 [nouveau]
[   73.087739]        nouveau_drm_device_init+0x54/0x7e0 [nouveau]
[   73.087765]        nouveau_drm_probe+0x1da/0x330 [nouveau]
[   73.087791]        local_pci_probe+0x42/0x90
[   73.087793]        pci_device_probe+0xe7/0x1a0
[   73.087794]        really_probe+0xf7/0x460
[   73.087796]        driver_probe_device+0x5d/0x130
[   73.087797]        device_driver_attach+0x4f/0x60
[   73.087798]        __driver_attach+0xa2/0x140
[   73.087798]        bus_for_each_dev+0x67/0x90
[   73.087800]        bus_add_driver+0x192/0x230
[   73.087801]        driver_register+0x5b/0xf0
[   73.087801]        do_one_initcall+0x56/0x3c4
[   73.087803]        do_init_module+0x5b/0x21d
[   73.087805]        load_module+0x1cd4/0x2340
[   73.087806]        __do_sys_finit_module+0xa7/0xe0
[   73.087807]        do_syscall_64+0x6c/0x270
[   73.087808]        entry_SYSCALL_64_after_hwframe+0x49/0xbe
[   73.087810]
               -> #2 (&device->mutex){+.+.}:
[   73.087811]        _mutex_lock+0x28/0x40
[   73.087813]        nvkm_udevice_fini+0x21/0x70 [nouveau]
[   73.087839]        nvkm_object_fini+0xb8/0x210 [nouveau]
[   73.087854]        nvkm_object_fini+0x73/0x210 [nouveau]
[   73.087869]        nvkm_ioctl_del+0x7e/0xa0 [nouveau]
[   73.087884]        nvkm_ioctl+0x10a/0x240 [nouveau]
[   73.087898]        nvif_object_fini+0x49/0x60 [nouveau]
[   73.087914]        nvif_client_fini+0xe/0x40 [nouveau]
[   73.087930]        nouveau_cli_fini+0x78/0x90 [nouveau]
[   73.087955]        nouveau_drm_postclose+0xa3/0xd0 [nouveau]
[   73.087981]        drm_file_free.part.5+0x20c/0x2c0 [drm]
[   73.087991]        drm_release+0x4b/0x80 [drm]
[   73.087997]        __fput+0xd5/0x280
[   73.087999]        task_work_run+0x87/0xb0
[   73.088001]        exit_to_usermode_loop+0x13b/0x160
[   73.088002]        do_syscall_64+0x1be/0x270
[   73.088003]        entry_SYSCALL_64_after_hwframe+0x49/0xbe
[   73.088004]
               -> #1 (&cli->lock){+.+.}:
[   73.088006]        _mutex_lock+0x28/0x40
[   73.088007]        nouveau_mem_fini+0x4a/0x70 [nouveau]
[   73.088033]        nv04_sgdma_unbind+0xe/0x20 [nouveau]
[   73.088058]        ttm_tt_unbind+0x1d/0x30 [ttm]
[   73.088061]        ttm_tt_destroy+0x13/0x60 [ttm]
[   73.088063]        ttm_bo_cleanup_memtype_use+0x32/0x80 [ttm]
[   73.088066]        ttm_bo_release+0x264/0x460 [ttm]
[   73.088068]        ttm_bo_vm_close+0x15/0x30 [ttm]
[   73.088070]        remove_vma+0x3e/0x70
[   73.088072]        __do_munmap+0x2d7/0x510
[   73.088073]        __vm_munmap+0x5b/0xa0
[   73.088074]        __x64_sys_munmap+0x27/0x30
[   73.088075]        do_syscall_64+0x6c/0x270
[   73.088076]        entry_SYSCALL_64_after_hwframe+0x49/0xbe
[   73.088077]
               -> #0 (&mm->mmap_sem#2){++++}:
[   73.088078]        __lock_acquire+0x113f/0x1410
[   73.088080]        lock_acquire+0x93/0x230
[   73.088081]        down_write+0x3b/0x50
[   73.088082]        mpol_rebind_mm+0x1e/0x50
[   73.088083]        cpuset_attach+0x229/0x390
[   73.088084]        cgroup_migrate_execute+0x42c/0x450
[   73.088086]        cgroup_attach_task+0x267/0x3f0
[   73.088086]        __cgroup1_procs_write.constprop.20+0xe8/0x140
[   73.088088]        cgroup_file_write+0x7e/0x1a0
[   73.088089]        kernfs_fop_write+0x113/0x1b0
[   73.088091]        vfs_write+0xc1/0x1d0
[   73.088092]        ksys_write+0x87/0xc0
[   73.088093]        do_syscall_64+0x6c/0x270
[   73.088094]        entry_SYSCALL_64_after_hwframe+0x49/0xbe
[   73.088095]
               other info that might help us debug this:

[   73.088095] Chain exists of:
                 &mm->mmap_sem#2 --> &device->mutex --> &cpuset_rwsem

[   73.088097]  Possible unsafe locking scenario:

[   73.088097]        CPU0                    CPU1
[   73.088098]        ----                    ----
[   73.088098]   lock(&cpuset_rwsem);
[   73.088099]                                lock(&device->mutex);
[   73.088099]                                lock(&cpuset_rwsem);
[   73.088100]   lock(&mm->mmap_sem#2);
[   73.088101]
                *** DEADLOCK ***

[   73.088101] 6 locks held by libvirtd/1902:
[   73.088102]  #0: ffff8b0f771f1ba0 (&f->f_pos_lock){+.+.}, at: __fdget_pos+0x46/0x50
[   73.088105]  #1: ffff8b0cc7790658 (sb_writers#7){.+.+}, at: vfs_write+0x1af/0x1d0
[   73.088107]  #2: ffff8b0fa42762b8 (&of->mutex){+.+.}, at: kernfs_fop_write+0xde/0x1b0
[   73.088110]  #3: ffffffff9d29c578 (cgroup_mutex){+.+.}, at: cgroup_kn_lock_live+0xed/0x1d0
[   73.088112]  #4: ffffffff9d29c1b0 (cgroup_threadgroup_rwsem){++++}, at: cgroup_procs_write_start+0x4c/0x190
[   73.088114]  #5: ffffffff9d2a0430 (&cpuset_rwsem){++++}, at: cpuset_attach+0x38/0x390
[   73.088115]
               stack backtrace:
[   73.088116] CPU: 6 PID: 1902 Comm: libvirtd Kdump: loaded Tainted: G            E     5.4.72-rt40-rt #1
[   73.088117] Hardware name: MEDION MS-7848/MS-7848, BIOS M7848W08.20C 09/23/2013
[   73.088118] Call Trace:
[   73.088120]  dump_stack+0x71/0x9b
[   73.088122]  check_noncircular+0x155/0x170
[   73.088125]  ? __lock_acquire+0x113f/0x1410
[   73.088127]  __lock_acquire+0x113f/0x1410
[   73.088129]  lock_acquire+0x93/0x230
[   73.088130]  ? mpol_rebind_mm+0x1e/0x50
[   73.088133]  down_write+0x3b/0x50
[   73.088134]  ? mpol_rebind_mm+0x1e/0x50
[   73.088135]  mpol_rebind_mm+0x1e/0x50
[   73.088137]  cpuset_attach+0x229/0x390
[   73.088138]  cgroup_migrate_execute+0x42c/0x450
[   73.088140]  ? rt_spin_unlock+0x5b/0xa0
[   73.088142]  cgroup_attach_task+0x267/0x3f0
[   73.088145]  ? __cgroup1_procs_write.constprop.20+0xe8/0x140
[   73.088146]  __cgroup1_procs_write.constprop.20+0xe8/0x140
[   73.088148]  cgroup_file_write+0x7e/0x1a0
[   73.088150]  kernfs_fop_write+0x113/0x1b0
[   73.088152]  vfs_write+0xc1/0x1d0
[   73.088153]  ksys_write+0x87/0xc0
[   73.088155]  do_syscall_64+0x6c/0x270
[   73.088156]  entry_SYSCALL_64_after_hwframe+0x49/0xbe
[   73.088157] RIP: 0033:0x7f6be8550deb
[   73.088159] Code: 53 48 89 d5 48 89 f3 48 83 ec 18 48 89 7c 24 08 e8 5a fd ff ff 48 89 ea 41 89 c0 48 89 de 48 8b 7c 24 08 b8 01 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 31 44 89 c7 48 89 44 24 08 e8 90 fd ff ff 48
[   73.088160] RSP: 002b:00007f6bde6832f0 EFLAGS: 00000293 ORIG_RAX: 0000000000000001
[   73.088161] RAX: ffffffffffffffda RBX: 00007f6bc80388b0 RCX: 00007f6be8550deb
[   73.088161] RDX: 0000000000000004 RSI: 00007f6bc80388b0 RDI: 000000000000001f
[   73.088162] RBP: 0000000000000004 R08: 0000000000000000 R09: 0000000000000000
[   73.088162] R10: 0000000000000000 R11: 0000000000000293 R12: 00007f6bc80388b0
[   73.088163] R13: 0000000000000000 R14: 000000000000001f R15: 0000000000000214


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [ANNOUNCE] v5.9.1-rt18
  2020-10-27  8:22     ` Sebastian Andrzej Siewior
@ 2020-10-27 17:07       ` Fernando Lopez-Lezcano
  2020-10-28 20:24         ` Sebastian Andrzej Siewior
  0 siblings, 1 reply; 29+ messages in thread
From: Fernando Lopez-Lezcano @ 2020-10-27 17:07 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior, Fernando Lopez-Lezcano
  Cc: Thomas Gleixner, LKML, linux-rt-users, Steven Rostedt

On 10/27/20 1:22 AM, Sebastian Andrzej Siewior wrote:
> On 2020-10-26 23:53:20 [-0700], Fernando Lopez-Lezcano wrote:
>> Maybe I'm doing something wrong but I get a compilation error (see below)
>> when trying to do a debug build (building rpm packages for Fedora). 5.9.1 +
>> rt19...
>>
>> Builds fine otherwise...
> 
> If you could remove CONFIG_TEST_LOCKUP then it should work. I will think
> of something.

Thanks much, I should have figured this out for myself :-( Just toooo 
busy. The compilation process went ahead (not finished yet), let me know 
if there is a proper patch. No hurry...

Thanks!
-- Fernando


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [ANNOUNCE] v5.9.1-rt18
  2020-10-27 17:07       ` Fernando Lopez-Lezcano
@ 2020-10-28 20:24         ` Sebastian Andrzej Siewior
  0 siblings, 0 replies; 29+ messages in thread
From: Sebastian Andrzej Siewior @ 2020-10-28 20:24 UTC (permalink / raw)
  To: Fernando Lopez-Lezcano
  Cc: Thomas Gleixner, LKML, linux-rt-users, Steven Rostedt

On 2020-10-27 10:07:35 [-0700], Fernando Lopez-Lezcano wrote:
> The compilation process went ahead (not finished yet), let me know if there
> is a proper patch. No hurry...

I just released -rt20 and it compiles now. I looked at the code and I
wouldn't recommend using it unless you know exactly what you are doing.

> Thanks!
> -- Fernando

Sebastian

^ permalink raw reply	[flat|nested] 29+ messages in thread

* [tip: irq/core] genirq: Move prio assignment into the newly created thread
  2020-10-22  5:28 ` kvm+nouveau induced lockdep gripe Mike Galbraith
  2020-10-23  9:01   ` Sebastian Andrzej Siewior
@ 2021-09-17 13:17   ` tip-bot2 for Thomas Gleixner
  2021-09-17 13:17   ` [tip: sched/core] kthread: Move prio/affinite change " tip-bot2 for Sebastian Andrzej Siewior
                     ` (2 subsequent siblings)
  4 siblings, 0 replies; 29+ messages in thread
From: tip-bot2 for Thomas Gleixner @ 2021-09-17 13:17 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: Mike Galbraith, Thomas Gleixner, Sebastian Andrzej Siewior,
	Peter Zijlstra (Intel),
	x86, linux-kernel, maz

The following commit has been merged into the irq/core branch of tip:

Commit-ID:     e739f98b4b11337a4e3865364b8922a9e5ad32b6
Gitweb:        https://git.kernel.org/tip/e739f98b4b11337a4e3865364b8922a9e5ad32b6
Author:        Thomas Gleixner <tglx@linutronix.de>
AuthorDate:    Tue, 10 Nov 2020 12:38:48 +01:00
Committer:     Peter Zijlstra <peterz@infradead.org>
CommitterDate: Fri, 17 Sep 2021 15:08:22 +02:00

genirq: Move prio assignment into the newly created thread

With enabled threaded interrupts the nouveau driver reported the
following:

| Chain exists of:
|   &mm->mmap_lock#2 --> &device->mutex --> &cpuset_rwsem
|
|  Possible unsafe locking scenario:
|
|        CPU0                    CPU1
|        ----                    ----
|   lock(&cpuset_rwsem);
|                                lock(&device->mutex);
|                                lock(&cpuset_rwsem);
|   lock(&mm->mmap_lock#2);

The device->mutex is nvkm_device::mutex.

Unblocking the lockchain at `cpuset_rwsem' is probably the easiest
thing to do.  Move the priority assignment to the start of the newly
created thread.

Fixes: 710da3c8ea7df ("sched/core: Prevent race condition between cpuset and __sched_setscheduler()")
Reported-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
[bigeasy: Patch description]
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/a23a826af7c108ea5651e73b8fbae5e653f16e86.camel@gmx.de
---
 kernel/irq/manage.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index b392483..7405e38 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -1259,6 +1259,8 @@ static int irq_thread(void *data)
 	irqreturn_t (*handler_fn)(struct irq_desc *desc,
 			struct irqaction *action);
 
+	sched_set_fifo(current);
+
 	if (force_irqthreads() && test_bit(IRQTF_FORCED_THREAD,
 					   &action->thread_flags))
 		handler_fn = irq_forced_thread_fn;
@@ -1424,8 +1426,6 @@ setup_irq_thread(struct irqaction *new, unsigned int irq, bool secondary)
 	if (IS_ERR(t))
 		return PTR_ERR(t);
 
-	sched_set_fifo(t);
-
 	/*
 	 * We keep the reference to the task struct even if
 	 * the thread dies to avoid that the interrupt code

^ permalink raw reply related	[flat|nested] 29+ messages in thread

* [tip: sched/core] kthread: Move prio/affinite change into the newly created thread
  2020-10-22  5:28 ` kvm+nouveau induced lockdep gripe Mike Galbraith
  2020-10-23  9:01   ` Sebastian Andrzej Siewior
  2021-09-17 13:17   ` [tip: irq/core] genirq: Move prio assignment into the newly created thread tip-bot2 for Thomas Gleixner
@ 2021-09-17 13:17   ` tip-bot2 for Sebastian Andrzej Siewior
  2021-09-17 18:36   ` tip-bot2 for Sebastian Andrzej Siewior
  2021-10-05 14:12   ` tip-bot2 for Sebastian Andrzej Siewior
  4 siblings, 0 replies; 29+ messages in thread
From: tip-bot2 for Sebastian Andrzej Siewior @ 2021-09-17 13:17 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: Mike Galbraith, Sebastian Andrzej Siewior, Peter Zijlstra (Intel),
	x86, linux-kernel

The following commit has been merged into the sched/core branch of tip:

Commit-ID:     2b214a488b2c83d63c99c71d054273c1c2c07027
Gitweb:        https://git.kernel.org/tip/2b214a488b2c83d63c99c71d054273c1c2c07027
Author:        Sebastian Andrzej Siewior <bigeasy@linutronix.de>
AuthorDate:    Tue, 10 Nov 2020 12:38:47 +01:00
Committer:     Peter Zijlstra <peterz@infradead.org>
CommitterDate: Fri, 17 Sep 2021 15:08:18 +02:00

kthread: Move prio/affinite change into the newly created thread

With enabled threaded interrupts the nouveau driver reported the
following:

| Chain exists of:
|   &mm->mmap_lock#2 --> &device->mutex --> &cpuset_rwsem
|
|  Possible unsafe locking scenario:
|
|        CPU0                    CPU1
|        ----                    ----
|   lock(&cpuset_rwsem);
|                                lock(&device->mutex);
|                                lock(&cpuset_rwsem);
|   lock(&mm->mmap_lock#2);

The device->mutex is nvkm_device::mutex.

Unblocking the lockchain at `cpuset_rwsem' is probably the easiest
thing to do.  Move the priority reset to the start of the newly
created thread.

Fixes: 710da3c8ea7df ("sched/core: Prevent race condition between cpuset and __sched_setscheduler()")
Reported-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/a23a826af7c108ea5651e73b8fbae5e653f16e86.camel@gmx.de
---
 kernel/kthread.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/kernel/kthread.c b/kernel/kthread.c
index 7bbfeeb..6f59e34 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -270,6 +270,7 @@ EXPORT_SYMBOL_GPL(kthread_parkme);
 
 static int kthread(void *_create)
 {
+	static const struct sched_param param = { .sched_priority = 0 };
 	/* Copy data: it's on kthread's stack */
 	struct kthread_create_info *create = _create;
 	int (*threadfn)(void *data) = create->threadfn;
@@ -300,6 +301,13 @@ static int kthread(void *_create)
 	init_completion(&self->parked);
 	current->vfork_done = &self->exited;
 
+	/*
+	 * The new thread inherited kthreadd's priority and CPU mask. Reset
+	 * back to default in case they have been changed.
+	 */
+	sched_setscheduler_nocheck(current, SCHED_NORMAL, &param);
+	set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_KTHREAD));
+
 	/* OK, tell user we're spawned, wait for stop or wakeup */
 	__set_current_state(TASK_UNINTERRUPTIBLE);
 	create->result = current;
@@ -397,7 +405,6 @@ struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data),
 	}
 	task = create->result;
 	if (!IS_ERR(task)) {
-		static const struct sched_param param = { .sched_priority = 0 };
 		char name[TASK_COMM_LEN];
 
 		/*
@@ -406,13 +413,6 @@ struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data),
 		 */
 		vsnprintf(name, sizeof(name), namefmt, args);
 		set_task_comm(task, name);
-		/*
-		 * root may have changed our (kthreadd's) priority or CPU mask.
-		 * The kernel thread should not inherit these properties.
-		 */
-		sched_setscheduler_nocheck(task, SCHED_NORMAL, &param);
-		set_cpus_allowed_ptr(task,
-				     housekeeping_cpumask(HK_FLAG_KTHREAD));
 	}
 	kfree(create);
 	return task;

^ permalink raw reply related	[flat|nested] 29+ messages in thread

* [tip: sched/core] kthread: Move prio/affinite change into the newly created thread
  2020-10-22  5:28 ` kvm+nouveau induced lockdep gripe Mike Galbraith
                     ` (2 preceding siblings ...)
  2021-09-17 13:17   ` [tip: sched/core] kthread: Move prio/affinite change " tip-bot2 for Sebastian Andrzej Siewior
@ 2021-09-17 18:36   ` tip-bot2 for Sebastian Andrzej Siewior
  2021-10-05 14:12   ` tip-bot2 for Sebastian Andrzej Siewior
  4 siblings, 0 replies; 29+ messages in thread
From: tip-bot2 for Sebastian Andrzej Siewior @ 2021-09-17 18:36 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: Mike Galbraith, Sebastian Andrzej Siewior, Peter Zijlstra (Intel),
	x86, linux-kernel

The following commit has been merged into the sched/core branch of tip:

Commit-ID:     4212bade2e86198ec84f1d65cbfb9023460f01bb
Gitweb:        https://git.kernel.org/tip/4212bade2e86198ec84f1d65cbfb9023460f01bb
Author:        Sebastian Andrzej Siewior <bigeasy@linutronix.de>
AuthorDate:    Tue, 10 Nov 2020 12:38:47 +01:00
Committer:     Peter Zijlstra <peterz@infradead.org>
CommitterDate: Fri, 17 Sep 2021 20:32:27 +02:00

kthread: Move prio/affinite change into the newly created thread

With enabled threaded interrupts the nouveau driver reported the
following:

| Chain exists of:
|   &mm->mmap_lock#2 --> &device->mutex --> &cpuset_rwsem
|
|  Possible unsafe locking scenario:
|
|        CPU0                    CPU1
|        ----                    ----
|   lock(&cpuset_rwsem);
|                                lock(&device->mutex);
|                                lock(&cpuset_rwsem);
|   lock(&mm->mmap_lock#2);

The device->mutex is nvkm_device::mutex.

Unblocking the lockchain at `cpuset_rwsem' is probably the easiest
thing to do.  Move the priority reset to the start of the newly
created thread.

Fixes: 710da3c8ea7df ("sched/core: Prevent race condition between cpuset and __sched_setscheduler()")
Reported-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/a23a826af7c108ea5651e73b8fbae5e653f16e86.camel@gmx.de
---
 kernel/kthread.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/kernel/kthread.c b/kernel/kthread.c
index 7bbfeeb..6f59e34 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -270,6 +270,7 @@ EXPORT_SYMBOL_GPL(kthread_parkme);
 
 static int kthread(void *_create)
 {
+	static const struct sched_param param = { .sched_priority = 0 };
 	/* Copy data: it's on kthread's stack */
 	struct kthread_create_info *create = _create;
 	int (*threadfn)(void *data) = create->threadfn;
@@ -300,6 +301,13 @@ static int kthread(void *_create)
 	init_completion(&self->parked);
 	current->vfork_done = &self->exited;
 
+	/*
+	 * The new thread inherited kthreadd's priority and CPU mask. Reset
+	 * back to default in case they have been changed.
+	 */
+	sched_setscheduler_nocheck(current, SCHED_NORMAL, &param);
+	set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_KTHREAD));
+
 	/* OK, tell user we're spawned, wait for stop or wakeup */
 	__set_current_state(TASK_UNINTERRUPTIBLE);
 	create->result = current;
@@ -397,7 +405,6 @@ struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data),
 	}
 	task = create->result;
 	if (!IS_ERR(task)) {
-		static const struct sched_param param = { .sched_priority = 0 };
 		char name[TASK_COMM_LEN];
 
 		/*
@@ -406,13 +413,6 @@ struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data),
 		 */
 		vsnprintf(name, sizeof(name), namefmt, args);
 		set_task_comm(task, name);
-		/*
-		 * root may have changed our (kthreadd's) priority or CPU mask.
-		 * The kernel thread should not inherit these properties.
-		 */
-		sched_setscheduler_nocheck(task, SCHED_NORMAL, &param);
-		set_cpus_allowed_ptr(task,
-				     housekeeping_cpumask(HK_FLAG_KTHREAD));
 	}
 	kfree(create);
 	return task;

^ permalink raw reply related	[flat|nested] 29+ messages in thread

* [tip: sched/core] kthread: Move prio/affinite change into the newly created thread
  2020-10-22  5:28 ` kvm+nouveau induced lockdep gripe Mike Galbraith
                     ` (3 preceding siblings ...)
  2021-09-17 18:36   ` tip-bot2 for Sebastian Andrzej Siewior
@ 2021-10-05 14:12   ` tip-bot2 for Sebastian Andrzej Siewior
  4 siblings, 0 replies; 29+ messages in thread
From: tip-bot2 for Sebastian Andrzej Siewior @ 2021-10-05 14:12 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: Mike Galbraith, Sebastian Andrzej Siewior, Peter Zijlstra (Intel),
	x86, linux-kernel

The following commit has been merged into the sched/core branch of tip:

Commit-ID:     1a7243ca4074beed97b68d7235a6e34862fc2cd6
Gitweb:        https://git.kernel.org/tip/1a7243ca4074beed97b68d7235a6e34862fc2cd6
Author:        Sebastian Andrzej Siewior <bigeasy@linutronix.de>
AuthorDate:    Tue, 10 Nov 2020 12:38:47 +01:00
Committer:     Peter Zijlstra <peterz@infradead.org>
CommitterDate: Tue, 05 Oct 2021 15:51:58 +02:00

kthread: Move prio/affinite change into the newly created thread

With enabled threaded interrupts the nouveau driver reported the
following:

| Chain exists of:
|   &mm->mmap_lock#2 --> &device->mutex --> &cpuset_rwsem
|
|  Possible unsafe locking scenario:
|
|        CPU0                    CPU1
|        ----                    ----
|   lock(&cpuset_rwsem);
|                                lock(&device->mutex);
|                                lock(&cpuset_rwsem);
|   lock(&mm->mmap_lock#2);

The device->mutex is nvkm_device::mutex.

Unblocking the lockchain at `cpuset_rwsem' is probably the easiest
thing to do.  Move the priority reset to the start of the newly
created thread.

Fixes: 710da3c8ea7df ("sched/core: Prevent race condition between cpuset and __sched_setscheduler()")
Reported-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/a23a826af7c108ea5651e73b8fbae5e653f16e86.camel@gmx.de
---
 kernel/kthread.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/kernel/kthread.c b/kernel/kthread.c
index 5b37a85..4a4d709 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -270,6 +270,7 @@ EXPORT_SYMBOL_GPL(kthread_parkme);
 
 static int kthread(void *_create)
 {
+	static const struct sched_param param = { .sched_priority = 0 };
 	/* Copy data: it's on kthread's stack */
 	struct kthread_create_info *create = _create;
 	int (*threadfn)(void *data) = create->threadfn;
@@ -300,6 +301,13 @@ static int kthread(void *_create)
 	init_completion(&self->parked);
 	current->vfork_done = &self->exited;
 
+	/*
+	 * The new thread inherited kthreadd's priority and CPU mask. Reset
+	 * back to default in case they have been changed.
+	 */
+	sched_setscheduler_nocheck(current, SCHED_NORMAL, &param);
+	set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_KTHREAD));
+
 	/* OK, tell user we're spawned, wait for stop or wakeup */
 	__set_current_state(TASK_UNINTERRUPTIBLE);
 	create->result = current;
@@ -397,7 +405,6 @@ struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data),
 	}
 	task = create->result;
 	if (!IS_ERR(task)) {
-		static const struct sched_param param = { .sched_priority = 0 };
 		char name[TASK_COMM_LEN];
 
 		/*
@@ -406,13 +413,6 @@ struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data),
 		 */
 		vsnprintf(name, sizeof(name), namefmt, args);
 		set_task_comm(task, name);
-		/*
-		 * root may have changed our (kthreadd's) priority or CPU mask.
-		 * The kernel thread should not inherit these properties.
-		 */
-		sched_setscheduler_nocheck(task, SCHED_NORMAL, &param);
-		set_cpus_allowed_ptr(task,
-				     housekeeping_cpumask(HK_FLAG_KTHREAD));
 	}
 	kfree(create);
 	return task;

^ permalink raw reply related	[flat|nested] 29+ messages in thread

end of thread, other threads:[~2021-10-05 14:12 UTC | newest]

Thread overview: 29+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-10-21 12:53 [ANNOUNCE] v5.9.1-rt18 Sebastian Andrzej Siewior
2020-10-21 13:14 ` Sebastian Andrzej Siewior
2020-10-27  6:53   ` Fernando Lopez-Lezcano
2020-10-27  8:22     ` Sebastian Andrzej Siewior
2020-10-27 17:07       ` Fernando Lopez-Lezcano
2020-10-28 20:24         ` Sebastian Andrzej Siewior
2020-10-22  5:21 ` ltp or kvm triggerable lockdep alloc_pid() deadlock gripe Mike Galbraith
2020-10-22 16:44   ` Sebastian Andrzej Siewior
2020-10-22  5:28 ` kvm+nouveau induced lockdep gripe Mike Galbraith
2020-10-23  9:01   ` Sebastian Andrzej Siewior
2020-10-23 12:07     ` Mike Galbraith
     [not found]     ` <e4bf2fe3c5d2fdeded9b3d873a08094dbf145bf9.camel-Mmb7MZpHnFY@public.gmane.org>
2020-10-24  2:22       ` Hillf Danton
2020-10-24  3:38         ` Mike Galbraith
2020-10-24  3:38           ` Mike Galbraith
     [not found]         ` <20201024050000.8104-1-hdanton@sina.com>
2020-10-24  5:25           ` Mike Galbraith
     [not found]           ` <20201024094224.2804-1-hdanton@sina.com>
2020-10-26 17:26             ` Sebastian Andrzej Siewior
2020-10-26 17:31           ` Sebastian Andrzej Siewior
2020-10-26 19:15             ` Mike Galbraith
2020-10-26 19:53               ` Sebastian Andrzej Siewior
2020-10-27  6:03                 ` Mike Galbraith
2020-10-27  9:00                   ` Sebastian Andrzej Siewior
2020-10-27  9:49                     ` Mike Galbraith
2020-10-27 10:14                     ` Mike Galbraith
2020-10-27 10:18                       ` Sebastian Andrzej Siewior
2020-10-27 11:13                         ` Mike Galbraith
2021-09-17 13:17   ` [tip: irq/core] genirq: Move prio assignment into the newly created thread tip-bot2 for Thomas Gleixner
2021-09-17 13:17   ` [tip: sched/core] kthread: Move prio/affinite change " tip-bot2 for Sebastian Andrzej Siewior
2021-09-17 18:36   ` tip-bot2 for Sebastian Andrzej Siewior
2021-10-05 14:12   ` tip-bot2 for Sebastian Andrzej Siewior
