[v3] x86/gart/kcore: Exclude GART aperture from kcore
diff mbox series

Message ID 20190213082800.4400-1-kasong@redhat.com
State Superseded
Headers show
Series
  • [v3] x86/gart/kcore: Exclude GART aperture from kcore
Related show

Commit Message

Kairui Song Feb. 13, 2019, 8:28 a.m. UTC
On machines where the GART aperture is mapped over physical RAM,
/proc/kcore contains the GART aperture range and reading it may lead
to kernel panic.

In 'commit 2a3e83c6f96c ("x86/gart: Exclude GART aperture from vmcore")',
a workaround is applied for vmcore to let /proc/vmcore return zeroes
when attempting to read the GART region, as vmcore have the same issue,
and after 'commit 707d4eefbdb3 ("Revert "[PATCH] Insert GART region
into resource map"")', userspace tools won't be able to detect GART
region so have to avoid it from being reading in kernel.

This patch applies a similar workaround for kcore. Let /proc/kcore
return zeroes for GART aperture.

Both vmcore and kcore maintain a memory mapping list, in the vmcore
workaround we exclude the GART region by registering a hook for checking
if PFN is valid before reading, because vmcore's memory mapping could
be generated by userspace which doesn't know about GART. But for kcore
it will be simpler to just alter the memory area list, kcore's area list
is always generated by kernel on init.

Kcore's memory area list is generated very late so can't exclude the
overlapped area when GART is initialized, instead simply introduce a
new area enum type KCORE_NORAM, register GART aperture as KCORE_NORAM
and let kcore return zeros for all KCORE_NORAM area. This fixes the
problem well with minor code changes.

---
Update from V2:
Instead of repeating the same hook infrastructure for kcore, introduce
a new kcore area type to avoid reading from, and let kcore always bypass
this kind of area.

Update from V1:
Fix a complie error when CONFIG_PROC_KCORE is not set

 arch/x86/kernel/aperture_64.c | 14 ++++++++++++++
 fs/proc/kcore.c               | 13 +++++++++++++
 include/linux/kcore.h         |  1 +
 3 files changed, 28 insertions(+)

Comments

Kairui Song Feb. 25, 2019, 7:42 a.m. UTC | #1
On Wed, Feb 13, 2019 at 4:28 PM Kairui Song <kasong@redhat.com> wrote:
>
> On machines where the GART aperture is mapped over physical RAM,
> /proc/kcore contains the GART aperture range and reading it may lead
> to kernel panic.
>
> In 'commit 2a3e83c6f96c ("x86/gart: Exclude GART aperture from vmcore")',
> a workaround is applied for vmcore to let /proc/vmcore return zeroes
> when attempting to read the GART region, as vmcore have the same issue,
> and after 'commit 707d4eefbdb3 ("Revert "[PATCH] Insert GART region
> into resource map"")', userspace tools won't be able to detect GART
> region so have to avoid it from being reading in kernel.
>
> This patch applies a similar workaround for kcore. Let /proc/kcore
> return zeroes for GART aperture.
>
> Both vmcore and kcore maintain a memory mapping list, in the vmcore
> workaround we exclude the GART region by registering a hook for checking
> if PFN is valid before reading, because vmcore's memory mapping could
> be generated by userspace which doesn't know about GART. But for kcore
> it will be simpler to just alter the memory area list, kcore's area list
> is always generated by kernel on init.
>
> Kcore's memory area list is generated very late so can't exclude the
> overlapped area when GART is initialized, instead simply introduce a
> new area enum type KCORE_NORAM, register GART aperture as KCORE_NORAM
> and let kcore return zeros for all KCORE_NORAM area. This fixes the
> problem well with minor code changes.
>
> ---
> Update from V2:
> Instead of repeating the same hook infrastructure for kcore, introduce
> a new kcore area type to avoid reading from, and let kcore always bypass
> this kind of area.
>
> Update from V1:
> Fix a complie error when CONFIG_PROC_KCORE is not set
>
>  arch/x86/kernel/aperture_64.c | 14 ++++++++++++++
>  fs/proc/kcore.c               | 13 +++++++++++++
>  include/linux/kcore.h         |  1 +
>  3 files changed, 28 insertions(+)
>
> diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
> index 58176b56354e..5fb04bdd3221 100644
> --- a/arch/x86/kernel/aperture_64.c
> +++ b/arch/x86/kernel/aperture_64.c
> @@ -31,6 +31,7 @@
>  #include <asm/amd_nb.h>
>  #include <asm/x86_init.h>
>  #include <linux/crash_dump.h>
> +#include <linux/kcore.h>
>
>  /*
>   * Using 512M as goal, in case kexec will load kernel_big
> @@ -84,6 +85,17 @@ static void exclude_from_vmcore(u64 aper_base, u32 aper_order)
>  }
>  #endif
>
> +#ifdef CONFIG_PROC_KCORE
> +static struct kcore_list kcore_gart;
> +
> +static void __init exclude_from_kcore(u64 aper_base, u32 aper_order) {
> +       u32 aper_size = (32 * 1024 * 1024) << aper_order;
> +       kclist_add(&kcore_gart, __va(aper_base), aper_size, KCORE_NORAM);
> +}
> +#else
> +static inline void __init exclude_from_kcore(u64 aper_base, u32 aper_order) { }
> +#endif
> +
>  /* This code runs before the PCI subsystem is initialized, so just
>     access the northbridge directly. */
>
> @@ -475,6 +487,7 @@ int __init gart_iommu_hole_init(void)
>                          * and fixed up the northbridge
>                          */
>                         exclude_from_vmcore(last_aper_base, last_aper_order);
> +                       exclude_from_kcore(last_aper_base, last_aper_order);
>
>                         return 1;
>                 }
> @@ -521,6 +534,7 @@ int __init gart_iommu_hole_init(void)
>          * range through vmcore even though it should be part of the dump.
>          */
>         exclude_from_vmcore(aper_alloc, aper_order);
> +       exclude_from_kcore(aper_alloc, aper_order);
>
>         /* Fix up the north bridges */
>         for (i = 0; i < amd_nb_bus_dev_ranges[i].dev_limit; i++) {
> diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
> index bbcc185062bb..15e0d74d7c56 100644
> --- a/fs/proc/kcore.c
> +++ b/fs/proc/kcore.c
> @@ -75,6 +75,8 @@ static size_t get_kcore_size(int *nphdr, size_t *phdrs_len, size_t *notes_len,
>         size = 0;
>
>         list_for_each_entry(m, &kclist_head, list) {
> +               if (m->type == KCORE_NORAM)
> +                       continue;
>                 try = kc_vaddr_to_offset((size_t)m->addr + m->size);
>                 if (try > size)
>                         size = try;
> @@ -256,6 +258,9 @@ static int kcore_update_ram(void)
>         list_for_each_entry_safe(pos, tmp, &kclist_head, list) {
>                 if (pos->type == KCORE_RAM || pos->type == KCORE_VMEMMAP)
>                         list_move(&pos->list, &garbage);
> +               /* Move NORAM area to head of the new list */
> +               if (pos->type == KCORE_NORAM)
> +                       list_move(&pos->list, &list);
>         }
>         list_splice_tail(&list, &kclist_head);
>
> @@ -356,6 +361,8 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
>
>                 phdr = &phdrs[1];
>                 list_for_each_entry(m, &kclist_head, list) {
> +                       if (m->type == KCORE_NORAM)
> +                               continue;
>                         phdr->p_type = PT_LOAD;
>                         phdr->p_flags = PF_R | PF_W | PF_X;
>                         phdr->p_offset = kc_vaddr_to_offset(m->addr) + data_offset;
> @@ -465,6 +472,12 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
>                                 goto out;
>                         }
>                         m = NULL;       /* skip the list anchor */
> +               } else if (m->type == KCORE_NORAM) {
> +                       /* for NORAM area just fill zero */
> +                       if (clear_user(buffer, tsz)) {
> +                               ret = -EFAULT;
> +                               goto out;
> +                       }
>                 } else if (m->type == KCORE_VMALLOC) {
>                         vread(buf, (char *)start, tsz);
>                         /* we have to zero-fill user buffer even if no read */
> diff --git a/include/linux/kcore.h b/include/linux/kcore.h
> index 8c3f8c14eeaa..372a4093f794 100644
> --- a/include/linux/kcore.h
> +++ b/include/linux/kcore.h
> @@ -13,6 +13,7 @@ enum kcore_type {
>         KCORE_USER,
>         KCORE_OTHER,
>         KCORE_REMAP,
> +       KCORE_NORAM,
>  };
>
>  struct kcore_list {
> --
> 2.20.1
>

Hi Jiri,

Could you help have a look of this fix too? I saw your reviewed V1,
and this V3 changed the approach a lot. Thanks!
Jiri Bohac Feb. 28, 2019, 11:12 p.m. UTC | #2
On Wed, Feb 13, 2019 at 04:28:00PM +0800, Kairui Song wrote:
> @@ -465,6 +472,12 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
>  				goto out;
>  			}
>  			m = NULL;	/* skip the list anchor */
> +		} else if (m->type == KCORE_NORAM) {
> +			/* for NORAM area just fill zero */
> +			if (clear_user(buffer, tsz)) {
> +				ret = -EFAULT;
> +				goto out;
> +			}

I don't think this works reliably. The loop filling the buffer
has this logic at the top:

        while (buflen) {
                /*
                 * If this is the first iteration or the address is not within
                 * the previous entry, search for a matching entry.
                 */
                if (!m || start < m->addr || start >= m->addr + m->size) {
                        list_for_each_entry(m, &kclist_head, list) {
                                if (start >= m->addr &&
                                    start < m->addr + m->size)
                                        break;
                        }
                }

This sets m to the kclist entry that contains the memory being
read. But if we do a large read that starts in valid KCORE_RAM
memory below the GART overlap and extends into the overlap, m
will not be set to the KCORE_NORAM entry. It will keep pointing
to the KCORE_RAM entry and the patch will have no effect.

But this seems already broken in existing cases as well, various
KCORE_* types overlap with KCORE_RAM, don't they?  So maybe
bf991c2231117d50a7645792b514354fc8d19dae ("proc/kcore: optimize
multiple page reads") broke this and once fixed, this KCORE_NORAM
approach will work. Omar?

Regards,
Kairui Song March 6, 2019, 8:20 a.m. UTC | #3
On Fri, Mar 1, 2019 at 7:12 AM Jiri Bohac <jbohac@suse.cz> wrote:
>
> On Wed, Feb 13, 2019 at 04:28:00PM +0800, Kairui Song wrote:
> > @@ -465,6 +472,12 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
> >                               goto out;
> >                       }
> >                       m = NULL;       /* skip the list anchor */
> > +             } else if (m->type == KCORE_NORAM) {
> > +                     /* for NORAM area just fill zero */
> > +                     if (clear_user(buffer, tsz)) {
> > +                             ret = -EFAULT;
> > +                             goto out;
> > +                     }
>
> I don't think this works reliably. The loop filling the buffer
> has this logic at the top:
>
>         while (buflen) {
>                 /*
>                  * If this is the first iteration or the address is not within
>                  * the previous entry, search for a matching entry.
>                  */
>                 if (!m || start < m->addr || start >= m->addr + m->size) {
>                         list_for_each_entry(m, &kclist_head, list) {
>                                 if (start >= m->addr &&
>                                     start < m->addr + m->size)
>                                         break;
>                         }
>                 }
>
> This sets m to the kclist entry that contains the memory being
> read. But if we do a large read that starts in valid KCORE_RAM
> memory below the GART overlap and extends into the overlap, m
> will not be set to the KCORE_NORAM entry. It will keep pointing
> to the KCORE_RAM entry and the patch will have no effect.
>
> But this seems already broken in existing cases as well, various
> KCORE_* types overlap with KCORE_RAM, don't they?  So maybe
> bf991c2231117d50a7645792b514354fc8d19dae ("proc/kcore: optimize
> multiple page reads") broke this and once fixed, this KCORE_NORAM
> approach will work. Omar?
>

Thanks for the review! You are right, although I hid the NORAM region
from the elf header, but didn't notice this potential risk of having
overlapped region.
I didn't see other kcore regions overlap for now, if so the
optimization should be totally good.
Better to keep using a hook just like what we did in vmcore or we will
have a performance drop for "fixing" this.
Will send V4 using the previous approach if there are no further comments.

Patch
diff mbox series

diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index 58176b56354e..5fb04bdd3221 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -31,6 +31,7 @@ 
 #include <asm/amd_nb.h>
 #include <asm/x86_init.h>
 #include <linux/crash_dump.h>
+#include <linux/kcore.h>
 
 /*
  * Using 512M as goal, in case kexec will load kernel_big
@@ -84,6 +85,17 @@  static void exclude_from_vmcore(u64 aper_base, u32 aper_order)
 }
 #endif
 
+#ifdef CONFIG_PROC_KCORE
+static struct kcore_list kcore_gart;
+
+static void __init exclude_from_kcore(u64 aper_base, u32 aper_order) {
+	u32 aper_size = (32 * 1024 * 1024) << aper_order;
+	kclist_add(&kcore_gart, __va(aper_base), aper_size, KCORE_NORAM);
+}
+#else
+static inline void __init exclude_from_kcore(u64 aper_base, u32 aper_order) { }
+#endif
+
 /* This code runs before the PCI subsystem is initialized, so just
    access the northbridge directly. */
 
@@ -475,6 +487,7 @@  int __init gart_iommu_hole_init(void)
 			 * and fixed up the northbridge
 			 */
 			exclude_from_vmcore(last_aper_base, last_aper_order);
+			exclude_from_kcore(last_aper_base, last_aper_order);
 
 			return 1;
 		}
@@ -521,6 +534,7 @@  int __init gart_iommu_hole_init(void)
 	 * range through vmcore even though it should be part of the dump.
 	 */
 	exclude_from_vmcore(aper_alloc, aper_order);
+	exclude_from_kcore(aper_alloc, aper_order);
 
 	/* Fix up the north bridges */
 	for (i = 0; i < amd_nb_bus_dev_ranges[i].dev_limit; i++) {
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index bbcc185062bb..15e0d74d7c56 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -75,6 +75,8 @@  static size_t get_kcore_size(int *nphdr, size_t *phdrs_len, size_t *notes_len,
 	size = 0;
 
 	list_for_each_entry(m, &kclist_head, list) {
+		if (m->type == KCORE_NORAM)
+			continue;
 		try = kc_vaddr_to_offset((size_t)m->addr + m->size);
 		if (try > size)
 			size = try;
@@ -256,6 +258,9 @@  static int kcore_update_ram(void)
 	list_for_each_entry_safe(pos, tmp, &kclist_head, list) {
 		if (pos->type == KCORE_RAM || pos->type == KCORE_VMEMMAP)
 			list_move(&pos->list, &garbage);
+		/* Move NORAM area to head of the new list */
+		if (pos->type == KCORE_NORAM)
+			list_move(&pos->list, &list);
 	}
 	list_splice_tail(&list, &kclist_head);
 
@@ -356,6 +361,8 @@  read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
 
 		phdr = &phdrs[1];
 		list_for_each_entry(m, &kclist_head, list) {
+			if (m->type == KCORE_NORAM)
+				continue;
 			phdr->p_type = PT_LOAD;
 			phdr->p_flags = PF_R | PF_W | PF_X;
 			phdr->p_offset = kc_vaddr_to_offset(m->addr) + data_offset;
@@ -465,6 +472,12 @@  read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
 				goto out;
 			}
 			m = NULL;	/* skip the list anchor */
+		} else if (m->type == KCORE_NORAM) {
+			/* for NORAM area just fill zero */
+			if (clear_user(buffer, tsz)) {
+				ret = -EFAULT;
+				goto out;
+			}
 		} else if (m->type == KCORE_VMALLOC) {
 			vread(buf, (char *)start, tsz);
 			/* we have to zero-fill user buffer even if no read */
diff --git a/include/linux/kcore.h b/include/linux/kcore.h
index 8c3f8c14eeaa..372a4093f794 100644
--- a/include/linux/kcore.h
+++ b/include/linux/kcore.h
@@ -13,6 +13,7 @@  enum kcore_type {
 	KCORE_USER,
 	KCORE_OTHER,
 	KCORE_REMAP,
+	KCORE_NORAM,
 };
 
 struct kcore_list {