linux-audit.redhat.com archive mirror
 help / color / mirror / Atom feed
* [PATCH] audit: optionally print warning after waiting to enqueue record
@ 2020-06-16  4:58 Max Englander
  2020-06-17 18:47 ` Paul Moore
  0 siblings, 1 reply; 13+ messages in thread
From: Max Englander @ 2020-06-16  4:58 UTC (permalink / raw)
  To: Paul Moore, Eric Paris, linux-audit

In environments where security is prioritized, users may set
--backlog_wait_time to a high value in order to reduce the likelihood
that any audit event is lost, even though doing so may result in
unpredictable performance if the kernel schedules a timeout when the
backlog limit is exceeded. For these users, the next best thing to
predictable performance is the ability to quickly detect and react to
degraded performance. This patch proposes to aid the detection of kernel
audit subsystem pauses through the following changes:

Add a variable named audit_backlog_warn_time. Enforce the value of this
variable to be no less than zero, and no more than the value of
audit_backlog_wait_time.

If audit_backlog_warn_time is greater than zero and if the total time
spent waiting to enqueue an audit record is greater than or equal to
audit_backlog_warn_time, then print a warning with the total time
spent waiting.

An example configuration:

	auditctl --backlog_warn_time 50

An example warning message:

	audit: sleep_time=52 >= audit_backlog_warn_time=50

Tested on Ubuntu 18.04.04 using complementary changes to the audit
userspace: https://github.com/linux-audit/audit-userspace/pull/131.

Signed-off-by: Max Englander <max.englander@gmail.com>
---
 include/uapi/linux/audit.h |  7 ++++++-
 kernel/audit.c             | 35 +++++++++++++++++++++++++++++++++++
 2 files changed, 41 insertions(+), 1 deletion(-)

diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h
index a534d71e689a..e3e021047fdc 100644
--- a/include/uapi/linux/audit.h
+++ b/include/uapi/linux/audit.h
@@ -340,6 +340,7 @@ enum {
 #define AUDIT_STATUS_BACKLOG_LIMIT	0x0010
 #define AUDIT_STATUS_BACKLOG_WAIT_TIME	0x0020
 #define AUDIT_STATUS_LOST		0x0040
+#define AUDIT_STATUS_BACKLOG_WARN_TIME	0x0080
 
 #define AUDIT_FEATURE_BITMAP_BACKLOG_LIMIT	0x00000001
 #define AUDIT_FEATURE_BITMAP_BACKLOG_WAIT_TIME	0x00000002
@@ -348,6 +349,7 @@ enum {
 #define AUDIT_FEATURE_BITMAP_SESSIONID_FILTER	0x00000010
 #define AUDIT_FEATURE_BITMAP_LOST_RESET		0x00000020
 #define AUDIT_FEATURE_BITMAP_FILTER_FS		0x00000040
+#define AUDIT_FEATURE_BITMAP_BACKLOG_WARN_TIME	0x00000080
 
 #define AUDIT_FEATURE_BITMAP_ALL (AUDIT_FEATURE_BITMAP_BACKLOG_LIMIT | \
 				  AUDIT_FEATURE_BITMAP_BACKLOG_WAIT_TIME | \
@@ -355,12 +357,14 @@ enum {
 				  AUDIT_FEATURE_BITMAP_EXCLUDE_EXTEND | \
 				  AUDIT_FEATURE_BITMAP_SESSIONID_FILTER | \
 				  AUDIT_FEATURE_BITMAP_LOST_RESET | \
-				  AUDIT_FEATURE_BITMAP_FILTER_FS)
+				  AUDIT_FEATURE_BITMAP_FILTER_FS | \
+				  AUDIT_FEATURE_BITMAP_BACKLOG_WARN_TIME)
 
 /* deprecated: AUDIT_VERSION_* */
 #define AUDIT_VERSION_LATEST 		AUDIT_FEATURE_BITMAP_ALL
 #define AUDIT_VERSION_BACKLOG_LIMIT	AUDIT_FEATURE_BITMAP_BACKLOG_LIMIT
 #define AUDIT_VERSION_BACKLOG_WAIT_TIME	AUDIT_FEATURE_BITMAP_BACKLOG_WAIT_TIME
+#define AUDIT_VERSION_BACKLOG_WARN_TIME	AUDIT_FEATURE_BITMAP_BACKLOG_WARN_TIME
 
 				/* Failure-to-log actions */
 #define AUDIT_FAIL_SILENT	0
@@ -466,6 +470,7 @@ struct audit_status {
 		__u32	feature_bitmap;	/* bitmap of kernel audit features */
 	};
 	__u32		backlog_wait_time;/* message queue wait timeout */
+	__u32		backlog_warn_time;/* message queue warn threshold */
 };
 
 struct audit_features {
diff --git a/kernel/audit.c b/kernel/audit.c
index 87f31bf1f0a0..4a5437cfe61f 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -122,6 +122,12 @@ static u32	audit_backlog_limit = 64;
 #define AUDIT_BACKLOG_WAIT_TIME (60 * HZ)
 static u32	audit_backlog_wait_time = AUDIT_BACKLOG_WAIT_TIME;
 
+/* If audit_backlog_wait_time is non-zero, and the kernel waits
+ * for audit_backlog_warn_time or more to enqueue audit record,
+ * a warning will be printed with the duration of the wait
+ */
+static u32	audit_backlog_warn_time;
+
 /* The identity of the user shutting down the audit system. */
 kuid_t		audit_sig_uid = INVALID_UID;
 pid_t		audit_sig_pid = -1;
@@ -439,6 +445,12 @@ static int audit_set_backlog_wait_time(u32 timeout)
 				      &audit_backlog_wait_time, timeout);
 }
 
+static int audit_set_backlog_warn_time(u32 warn_time)
+{
+	return audit_do_config_change("audit_backlog_warn_time",
+				      &audit_backlog_warn_time, warn_time);
+}
+
 static int audit_set_enabled(u32 state)
 {
 	int rc;
@@ -1204,6 +1216,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 		s.backlog		= skb_queue_len(&audit_queue);
 		s.feature_bitmap	= AUDIT_FEATURE_BITMAP_ALL;
 		s.backlog_wait_time	= audit_backlog_wait_time;
+		s.backlog_warn_time	= audit_backlog_warn_time;
 		audit_send_reply(skb, seq, AUDIT_GET, 0, 0, &s, sizeof(s));
 		break;
 	}
@@ -1297,10 +1310,21 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 				return -EINVAL;
 			if (s.backlog_wait_time > 10*AUDIT_BACKLOG_WAIT_TIME)
 				return -EINVAL;
+			if (s.backlog_wait_time < audit_backlog_warn_time)
+				return -EINVAL;
 			err = audit_set_backlog_wait_time(s.backlog_wait_time);
 			if (err < 0)
 				return err;
 		}
+		if (s.mask & AUDIT_STATUS_BACKLOG_WARN_TIME) {
+			if (sizeof(s) > (size_t)nlh->nlmsg_len)
+				return -EINVAL;
+			if (s.backlog_warn_time > audit_backlog_wait_time)
+				return -EINVAL;
+			err = audit_set_backlog_warn_time(s.backlog_warn_time);
+			if (err < 0)
+				return err;
+		}
 		if (s.mask == AUDIT_STATUS_LOST) {
 			u32 lost = atomic_xchg(&audit_lost, 0);
 
@@ -1794,6 +1818,17 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
 				return NULL;
 			}
 		}
+
+		/* Print a warning if current task slept for at least audit_backlog_warn_time
+		 * for audit queue length to be less than the audit_backlog_limit.
+		 */
+		if (audit_backlog_wait_time > 0 &&
+		    audit_backlog_warn_time > 0 &&
+		    audit_backlog_wait_time - stime >= audit_backlog_warn_time) {
+			pr_warn("sleep_time=%li >= audit_backlog_warn_time=%u\n",
+				audit_backlog_wait_time - stime,
+				audit_backlog_warn_time);
+		}
 	}
 
 	ab = audit_buffer_alloc(ctx, gfp_mask, type);
-- 
2.17.1

--
Linux-audit mailing list
Linux-audit@redhat.com
https://www.redhat.com/mailman/listinfo/linux-audit


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* Re: [PATCH] audit: optionally print warning after waiting to enqueue record
  2020-06-16  4:58 [PATCH] audit: optionally print warning after waiting to enqueue record Max Englander
@ 2020-06-17 18:47 ` Paul Moore
  2020-06-17 22:54   ` Max Englander
  0 siblings, 1 reply; 13+ messages in thread
From: Paul Moore @ 2020-06-17 18:47 UTC (permalink / raw)
  To: Max Englander; +Cc: linux-audit

On Tue, Jun 16, 2020 at 12:58 AM Max Englander <max.englander@gmail.com> wrote:
>
> In environments where security is prioritized, users may set
> --backlog_wait_time to a high value in order to reduce the likelihood
> that any audit event is lost, even though doing so may result in
> unpredictable performance if the kernel schedules a timeout when the
> backlog limit is exceeded. For these users, the next best thing to
> predictable performance is the ability to quickly detect and react to
> degraded performance. This patch proposes to aid the detection of kernel
> audit subsystem pauses through the following changes:
>
> Add a variable named audit_backlog_warn_time. Enforce the value of this
> variable to be no less than zero, and no more than the value of
> audit_backlog_wait_time.
>
> If audit_backlog_warn_time is greater than zero and if the total time
> spent waiting to enqueue an audit record is greater than or equal to
> audit_backlog_warn_time, then print a warning with the total time
> spent waiting.
>
> An example configuration:
>
>         auditctl --backlog_warn_time 50
>
> An example warning message:
>
>         audit: sleep_time=52 >= audit_backlog_warn_time=50
>
> Tested on Ubuntu 18.04.04 using complementary changes to the audit
> userspace: https://github.com/linux-audit/audit-userspace/pull/131.
>
> Signed-off-by: Max Englander <max.englander@gmail.com>
> ---
>  include/uapi/linux/audit.h |  7 ++++++-
>  kernel/audit.c             | 35 +++++++++++++++++++++++++++++++++++
>  2 files changed, 41 insertions(+), 1 deletion(-)

If an admin is prioritizing security, aka don't loose any audit
records, and there is a concern over variable system latency due to an
audit queue backlog, why not simply disable the backlog limit?

-- 
paul moore
www.paul-moore.com

--
Linux-audit mailing list
Linux-audit@redhat.com
https://www.redhat.com/mailman/listinfo/linux-audit


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] audit: optionally print warning after waiting to enqueue record
  2020-06-17 18:47 ` Paul Moore
@ 2020-06-17 22:54   ` Max Englander
  2020-06-18  1:06     ` Paul Moore
  2020-06-18 13:39     ` Steve Grubb
  0 siblings, 2 replies; 13+ messages in thread
From: Max Englander @ 2020-06-17 22:54 UTC (permalink / raw)
  To: Paul Moore; +Cc: linux-audit

On Wed, Jun 17, 2020 at 02:47:19PM -0400, Paul Moore wrote:
> On Tue, Jun 16, 2020 at 12:58 AM Max Englander <max.englander@gmail.com> wrote:
> >
> > In environments where security is prioritized, users may set
> > --backlog_wait_time to a high value in order to reduce the likelihood
> > that any audit event is lost, even though doing so may result in
> > unpredictable performance if the kernel schedules a timeout when the
> > backlog limit is exceeded. For these users, the next best thing to
> > predictable performance is the ability to quickly detect and react to
> > degraded performance. This patch proposes to aid the detection of kernel
> > audit subsystem pauses through the following changes:
> >
> > Add a variable named audit_backlog_warn_time. Enforce the value of this
> > variable to be no less than zero, and no more than the value of
> > audit_backlog_wait_time.
> >
> > If audit_backlog_warn_time is greater than zero and if the total time
> > spent waiting to enqueue an audit record is greater than or equal to
> > audit_backlog_warn_time, then print a warning with the total time
> > spent waiting.
> >
> > An example configuration:
> >
> >         auditctl --backlog_warn_time 50
> >
> > An example warning message:
> >
> >         audit: sleep_time=52 >= audit_backlog_warn_time=50
> >
> > Tested on Ubuntu 18.04.04 using complementary changes to the audit
> > userspace: https://github.com/linux-audit/audit-userspace/pull/131.
> >
> > Signed-off-by: Max Englander <max.englander@gmail.com>
> > ---
> >  include/uapi/linux/audit.h |  7 ++++++-
> >  kernel/audit.c             | 35 +++++++++++++++++++++++++++++++++++
> >  2 files changed, 41 insertions(+), 1 deletion(-)
> 
> If an admin is prioritizing security, aka don't loose any audit
> records, and there is a concern over variable system latency due to an
> audit queue backlog, why not simply disable the backlog limit?
> 
> -- 
> paul moore
> www.paul-moore.com

That’s good in some cases, but in other cases unbounded growth of the
backlog could result in memory issues. If the kernel runs out of memory
it would drop the audit event or possibly have other problems. It could
also also consume memory in a way that starves user workloads or causes
them to be killed by the OOMKiller.

To refine my motivating use case a bit, if a Kubernetes admin wants to
prioritize security, and also avoid unbounded growth of the audit
backlog, they may set -b and --backlog_wait_time in a way that limits
kernel memory usage and reduces the likelihood that any audit event is
lost. Occasional performance degradation may be acceptable to the admin,
but they would like a way to be alerted to prolonged kernel pauses, so
that they can investigate and take corrective action (increase backlog,
increase server capacity, move some workloads to other servers, etc.).

To state another way. The kernel currently can be configured to print a
message when the backlog limit is exceeded and it must discard the audit
event. This is a useful message for admins, which they can address with
corrective action. I think a message similar to the one proposed by this
patch would be equally useful when the backlog limit is exceeded and the
kernel is configured to wait for the backlog to drain. Admins could
address that message in the same way, but without the cost of lost audit
events.

--
Linux-audit mailing list
Linux-audit@redhat.com
https://www.redhat.com/mailman/listinfo/linux-audit

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] audit: optionally print warning after waiting to enqueue record
  2020-06-17 22:54   ` Max Englander
@ 2020-06-18  1:06     ` Paul Moore
  2020-06-18 23:48       ` Max Englander
  2020-06-18 13:39     ` Steve Grubb
  1 sibling, 1 reply; 13+ messages in thread
From: Paul Moore @ 2020-06-18  1:06 UTC (permalink / raw)
  To: Max Englander; +Cc: linux-audit

On Wed, Jun 17, 2020 at 6:54 PM Max Englander <max.englander@gmail.com> wrote:
> On Wed, Jun 17, 2020 at 02:47:19PM -0400, Paul Moore wrote:
> > On Tue, Jun 16, 2020 at 12:58 AM Max Englander <max.englander@gmail.com> wrote:
> > >
> > > In environments where security is prioritized, users may set
> > > --backlog_wait_time to a high value in order to reduce the likelihood
> > > that any audit event is lost, even though doing so may result in
> > > unpredictable performance if the kernel schedules a timeout when the
> > > backlog limit is exceeded. For these users, the next best thing to
> > > predictable performance is the ability to quickly detect and react to
> > > degraded performance. This patch proposes to aid the detection of kernel
> > > audit subsystem pauses through the following changes:
> > >
> > > Add a variable named audit_backlog_warn_time. Enforce the value of this
> > > variable to be no less than zero, and no more than the value of
> > > audit_backlog_wait_time.
> > >
> > > If audit_backlog_warn_time is greater than zero and if the total time
> > > spent waiting to enqueue an audit record is greater than or equal to
> > > audit_backlog_warn_time, then print a warning with the total time
> > > spent waiting.
> > >
> > > An example configuration:
> > >
> > >         auditctl --backlog_warn_time 50
> > >
> > > An example warning message:
> > >
> > >         audit: sleep_time=52 >= audit_backlog_warn_time=50
> > >
> > > Tested on Ubuntu 18.04.04 using complementary changes to the audit
> > > userspace: https://github.com/linux-audit/audit-userspace/pull/131.
> > >
> > > Signed-off-by: Max Englander <max.englander@gmail.com>
> > > ---
> > >  include/uapi/linux/audit.h |  7 ++++++-
> > >  kernel/audit.c             | 35 +++++++++++++++++++++++++++++++++++
> > >  2 files changed, 41 insertions(+), 1 deletion(-)
> >
> > If an admin is prioritizing security, aka don't loose any audit
> > records, and there is a concern over variable system latency due to an
> > audit queue backlog, why not simply disable the backlog limit?
> >
> > --
> > paul moore
> > www.paul-moore.com
>
> That’s good in some cases, but in other cases unbounded growth of the
> backlog could result in memory issues. If the kernel runs out of memory
> it would drop the audit event or possibly have other problems. It could
> also also consume memory in a way that starves user workloads or causes
> them to be killed by the OOMKiller.
>
> To refine my motivating use case a bit, if a Kubernetes admin wants to
> prioritize security, and also avoid unbounded growth of the audit
> backlog, they may set -b and --backlog_wait_time in a way that limits
> kernel memory usage and reduces the likelihood that any audit event is
> lost. Occasional performance degradation may be acceptable to the admin,
> but they would like a way to be alerted to prolonged kernel pauses, so
> that they can investigate and take corrective action (increase backlog,
> increase server capacity, move some workloads to other servers, etc.).
>
> To state another way. The kernel currently can be configured to print a
> message when the backlog limit is exceeded and it must discard the audit
> event. This is a useful message for admins, which they can address with
> corrective action. I think a message similar to the one proposed by this
> patch would be equally useful when the backlog limit is exceeded and the
> kernel is configured to wait for the backlog to drain. Admins could
> address that message in the same way, but without the cost of lost audit
> events.

I'm still struggling to understand how this is any better than
disabling the backlog limit, or setting it very high, and simply
monitoring the audit size of the audit backlog.  This way the admin
doesn't have to worry about the latency issues of a full backlog,
while still being able to trigger actions based on the state of the
backlog.  The userspace tooling/scripting to watch the backlog size
would be trivial, and would arguably provide much better visibility
into the backlog state than a single warning threshold in the kernel.

-- 
paul moore
www.paul-moore.com


--
Linux-audit mailing list
Linux-audit@redhat.com
https://www.redhat.com/mailman/listinfo/linux-audit

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] audit: optionally print warning after waiting to enqueue record
  2020-06-17 22:54   ` Max Englander
  2020-06-18  1:06     ` Paul Moore
@ 2020-06-18 13:39     ` Steve Grubb
  2020-06-18 13:46       ` Paul Moore
  2020-06-18 22:57       ` Max Englander
  1 sibling, 2 replies; 13+ messages in thread
From: Steve Grubb @ 2020-06-18 13:39 UTC (permalink / raw)
  To: linux-audit

On Wednesday, June 17, 2020 6:54:16 PM EDT Max Englander wrote:
> On Wed, Jun 17, 2020 at 02:47:19PM -0400, Paul Moore wrote:
> > On Tue, Jun 16, 2020 at 12:58 AM Max Englander <max.englander@gmail.com> 
wrote:
> > > In environments where security is prioritized, users may set
> > > --backlog_wait_time to a high value in order to reduce the likelihood
> > > that any audit event is lost, even though doing so may result in
> > > unpredictable performance if the kernel schedules a timeout when the
> > > backlog limit is exceeded. For these users, the next best thing to
> > > predictable performance is the ability to quickly detect and react to
> > > degraded performance. This patch proposes to aid the detection of
> > > kernel
> > > audit subsystem pauses through the following changes:
> > > 
> > > Add a variable named audit_backlog_warn_time. Enforce the value of this
> > > variable to be no less than zero, and no more than the value of
> > > audit_backlog_wait_time.
> > > 
> > > If audit_backlog_warn_time is greater than zero and if the total time
> > > spent waiting to enqueue an audit record is greater than or equal to
> > > audit_backlog_warn_time, then print a warning with the total time
> > > spent waiting.
> > > 
> > > An example configuration:
> > >         auditctl --backlog_warn_time 50
> > > 
> > > An example warning message:
> > >         audit: sleep_time=52 >= audit_backlog_warn_time=50
> > > 
> > > Tested on Ubuntu 18.04.04 using complementary changes to the audit
> > > userspace: https://github.com/linux-audit/audit-userspace/pull/131.
> > > 
> > > Signed-off-by: Max Englander <max.englander@gmail.com>
> > > ---
> > > 
> > >  include/uapi/linux/audit.h |  7 ++++++-
> > >  kernel/audit.c             | 35 +++++++++++++++++++++++++++++++++++
> > >  2 files changed, 41 insertions(+), 1 deletion(-)
> > 
> > If an admin is prioritizing security, aka don't loose any audit
> > records, and there is a concern over variable system latency due to an
> > audit queue backlog, why not simply disable the backlog limit?
> 
> That’s good in some cases, but in other cases unbounded growth of the
> backlog could result in memory issues. If the kernel runs out of memory
> it would drop the audit event or possibly have other problems. It could
> also also consume memory in a way that starves user workloads or causes
> them to be killed by the OOMKiller.

The kernel cannot grow the backlog unbounded. If you do nothing, the backlog 
is 64 - which is too small to really use. Otherwise, you set the backlog to a 
finite number with the -b option.

> To refine my motivating use case a bit, if a Kubernetes admin wants to
> prioritize security, and also avoid unbounded growth of the audit
> backlog, they may set -b and --backlog_wait_time in a way that limits
> kernel memory usage and reduces the likelihood that any audit event is
> lost. Occasional performance degradation may be acceptable to the admin,
> but they would like a way to be alerted to prolonged kernel pauses, so
> that they can investigate and take corrective action (increase backlog,
> increase server capacity, move some workloads to other servers, etc.).
> 
> To state another way. The kernel currently can be configured to print a
> message when the backlog limit is exceeded and it must discard the audit
> event. This is a useful message for admins, which they can address with
> corrective action. I think a message similar to the one proposed by this
> patch would be equally useful when the backlog limit is exceeded and the
> kernel is configured to wait for the backlog to drain. Admins could
> address that message in the same way, but without the cost of lost audit
> events.

If backlog wait time is exceeded, that could be a useful warning if that does 
not exist. I don't know how often that could happen...and of course without a 
warning we don't know if it happens or why it happens.

I also wished we had metrics on the backlog such as max used. That might help 
admins tune the size of the backlog.

-Steve



--
Linux-audit mailing list
Linux-audit@redhat.com
https://www.redhat.com/mailman/listinfo/linux-audit

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] audit: optionally print warning after waiting to enqueue record
  2020-06-18 13:39     ` Steve Grubb
@ 2020-06-18 13:46       ` Paul Moore
  2020-06-18 14:36         ` Steve Grubb
  2020-06-18 22:57       ` Max Englander
  1 sibling, 1 reply; 13+ messages in thread
From: Paul Moore @ 2020-06-18 13:46 UTC (permalink / raw)
  To: Steve Grubb; +Cc: linux-audit

On Thu, Jun 18, 2020 at 9:39 AM Steve Grubb <sgrubb@redhat.com> wrote:
> The kernel cannot grow the backlog unbounded. If you do nothing, the backlog
> is 64 - which is too small to really use. Otherwise, you set the backlog to a
> finite number with the -b option.

If one were to set the backlog limit to 0, it is effectively disabled
allowing the backlog to grow without any restrictions placed on it by
the audit subsystem.

-- 
paul moore
www.paul-moore.com

--
Linux-audit mailing list
Linux-audit@redhat.com
https://www.redhat.com/mailman/listinfo/linux-audit


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] audit: optionally print warning after waiting to enqueue record
  2020-06-18 13:46       ` Paul Moore
@ 2020-06-18 14:36         ` Steve Grubb
  2020-06-18 16:29           ` Paul Moore
  0 siblings, 1 reply; 13+ messages in thread
From: Steve Grubb @ 2020-06-18 14:36 UTC (permalink / raw)
  To: Paul Moore; +Cc: linux-audit

On Thursday, June 18, 2020 9:46:54 AM EDT Paul Moore wrote:
> On Thu, Jun 18, 2020 at 9:39 AM Steve Grubb <sgrubb@redhat.com> wrote:
> > The kernel cannot grow the backlog unbounded. If you do nothing, the
> > backlog is 64 - which is too small to really use. Otherwise, you set the
> > backlog to a finite number with the -b option.
> 
> If one were to set the backlog limit to 0, it is effectively disabled
> allowing the backlog to grow without any restrictions placed on it by
> the audit subsystem.

Then I'd say you asked for it. The cure is setting a number. But regardless, 
we could use some metrics on the backlog and visibility into the time it 
takes to dequeue. That might signal problems with plugins or overly agressive 
rules.

-Steve



--
Linux-audit mailing list
Linux-audit@redhat.com
https://www.redhat.com/mailman/listinfo/linux-audit


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] audit: optionally print warning after waiting to enqueue record
  2020-06-18 14:36         ` Steve Grubb
@ 2020-06-18 16:29           ` Paul Moore
  0 siblings, 0 replies; 13+ messages in thread
From: Paul Moore @ 2020-06-18 16:29 UTC (permalink / raw)
  To: Steve Grubb; +Cc: linux-audit

On Thu, Jun 18, 2020 at 10:36 AM Steve Grubb <sgrubb@redhat.com> wrote:
> On Thursday, June 18, 2020 9:46:54 AM EDT Paul Moore wrote:
> > On Thu, Jun 18, 2020 at 9:39 AM Steve Grubb <sgrubb@redhat.com> wrote:
> > > The kernel cannot grow the backlog unbounded. If you do nothing, the
> > > backlog is 64 - which is too small to really use. Otherwise, you set the
> > > backlog to a finite number with the -b option.
> >
> > If one were to set the backlog limit to 0, it is effectively disabled
> > allowing the backlog to grow without any restrictions placed on it by
> > the audit subsystem.
>
> Then I'd say you asked for it. The cure is setting a number.

I wasn't commenting on if it was wise or not, that is going to depend
on the goals of the admin.  I just wanted to correct some bad
information you provided so those reading the mailing list were not
ill-informed.

-- 
paul moore
www.paul-moore.com

--
Linux-audit mailing list
Linux-audit@redhat.com
https://www.redhat.com/mailman/listinfo/linux-audit


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] audit: optionally print warning after waiting to enqueue record
  2020-06-18 13:39     ` Steve Grubb
  2020-06-18 13:46       ` Paul Moore
@ 2020-06-18 22:57       ` Max Englander
  1 sibling, 0 replies; 13+ messages in thread
From: Max Englander @ 2020-06-18 22:57 UTC (permalink / raw)
  To: Steve Grubb; +Cc: linux-audit

On Thu, Jun 18, 2020 at 09:39:08AM -0400, Steve Grubb wrote:
> On Wednesday, June 17, 2020 6:54:16 PM EDT Max Englander wrote:
> > On Wed, Jun 17, 2020 at 02:47:19PM -0400, Paul Moore wrote:
> > > On Tue, Jun 16, 2020 at 12:58 AM Max Englander <max.englander@gmail.com> 
> wrote:
> > > > In environments where security is prioritized, users may set
> > > > --backlog_wait_time to a high value in order to reduce the likelihood
> > > > that any audit event is lost, even though doing so may result in
> > > > unpredictable performance if the kernel schedules a timeout when the
> > > > backlog limit is exceeded. For these users, the next best thing to
> > > > predictable performance is the ability to quickly detect and react to
> > > > degraded performance. This patch proposes to aid the detection of
> > > > kernel
> > > > audit subsystem pauses through the following changes:
> > > > 
> > > > Add a variable named audit_backlog_warn_time. Enforce the value of this
> > > > variable to be no less than zero, and no more than the value of
> > > > audit_backlog_wait_time.
> > > > 
> > > > If audit_backlog_warn_time is greater than zero and if the total time
> > > > spent waiting to enqueue an audit record is greater than or equal to
> > > > audit_backlog_warn_time, then print a warning with the total time
> > > > spent waiting.
> > > > 
> > > > An example configuration:
> > > >         auditctl --backlog_warn_time 50
> > > > 
> > > > An example warning message:
> > > >         audit: sleep_time=52 >= audit_backlog_warn_time=50
> > > > 
> > > > Tested on Ubuntu 18.04.04 using complementary changes to the audit
> > > > userspace: https://github.com/linux-audit/audit-userspace/pull/131.
> > > > 
> > > > Signed-off-by: Max Englander <max.englander@gmail.com>
> > > > ---
> > > > 
> > > >  include/uapi/linux/audit.h |  7 ++++++-
> > > >  kernel/audit.c             | 35 +++++++++++++++++++++++++++++++++++
> > > >  2 files changed, 41 insertions(+), 1 deletion(-)
> > > 
> > > If an admin is prioritizing security, aka don't loose any audit
> > > records, and there is a concern over variable system latency due to an
> > > audit queue backlog, why not simply disable the backlog limit?
> > 
> > That’s good in some cases, but in other cases unbounded growth of the
> > backlog could result in memory issues. If the kernel runs out of memory
> > it would drop the audit event or possibly have other problems. It could
> > also also consume memory in a way that starves user workloads or causes
> > them to be killed by the OOMKiller.
> 
> The kernel cannot grow the backlog unbounded. If you do nothing, the backlog 
> is 64 - which is too small to really use. Otherwise, you set the backlog to a 
> finite number with the -b option.
> 
> > To refine my motivating use case a bit, if a Kubernetes admin wants to
> > prioritize security, and also avoid unbounded growth of the audit
> > backlog, they may set -b and --backlog_wait_time in a way that limits
> > kernel memory usage and reduces the likelihood that any audit event is
> > lost. Occasional performance degradation may be acceptable to the admin,
> > but they would like a way to be alerted to prolonged kernel pauses, so
> > that they can investigate and take corrective action (increase backlog,
> > increase server capacity, move some workloads to other servers, etc.).
> > 
> > To state another way. The kernel currently can be configured to print a
> > message when the backlog limit is exceeded and it must discard the audit
> > event. This is a useful message for admins, which they can address with
> > corrective action. I think a message similar to the one proposed by this
> > patch would be equally useful when the backlog limit is exceeded and the
> > kernel is configured to wait for the backlog to drain. Admins could
> > address that message in the same way, but without the cost of lost audit
> > events.
> 
> If backlog wait time is exceeded, that could be a useful warning if that does 
> not exist. I don't know how often that could happen...and of course without a 
> warning we don't know if it happens or why it happens.
  
What you’re describing already exists, if I’m reading your words right.
In the event that the backlog wait time limit is exceeded, the -f flag
is consulted, and, if the value of -f is 1, then an error message
stating that the backlog limit is exceeded is printed. This is also true
when the backlog wait time is zero.

What I am suggesting is that even if the the backlog wait time is not
exceeded, it would be useful for the kernel to report when backlog
waiting occurs as a way to help identify degraded kernel performance.

> I also wished we had metrics on the backlog such as max used. That might help 
> admins tune the size of the backlog.
> 
> -Steve
> 
> 

--
Linux-audit mailing list
Linux-audit@redhat.com
https://www.redhat.com/mailman/listinfo/linux-audit

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] audit: optionally print warning after waiting to enqueue record
  2020-06-18  1:06     ` Paul Moore
@ 2020-06-18 23:48       ` Max Englander
  2020-06-19  0:30         ` Richard Guy Briggs
  0 siblings, 1 reply; 13+ messages in thread
From: Max Englander @ 2020-06-18 23:48 UTC (permalink / raw)
  To: Paul Moore; +Cc: linux-audit

On Wed, Jun 17, 2020 at 09:06:27PM -0400, Paul Moore wrote:
> On Wed, Jun 17, 2020 at 6:54 PM Max Englander <max.englander@gmail.com> wrote:
> > On Wed, Jun 17, 2020 at 02:47:19PM -0400, Paul Moore wrote:
> > > On Tue, Jun 16, 2020 at 12:58 AM Max Englander <max.englander@gmail.com> wrote:
> > > >
> > > > In environments where security is prioritized, users may set
> > > > --backlog_wait_time to a high value in order to reduce the likelihood
> > > > that any audit event is lost, even though doing so may result in
> > > > unpredictable performance if the kernel schedules a timeout when the
> > > > backlog limit is exceeded. For these users, the next best thing to
> > > > predictable performance is the ability to quickly detect and react to
> > > > degraded performance. This patch proposes to aid the detection of kernel
> > > > audit subsystem pauses through the following changes:
> > > >
> > > > Add a variable named audit_backlog_warn_time. Enforce the value of this
> > > > variable to be no less than zero, and no more than the value of
> > > > audit_backlog_wait_time.
> > > >
> > > > If audit_backlog_warn_time is greater than zero and if the total time
> > > > spent waiting to enqueue an audit record is greater than or equal to
> > > > audit_backlog_warn_time, then print a warning with the total time
> > > > spent waiting.
> > > >
> > > > An example configuration:
> > > >
> > > >         auditctl --backlog_warn_time 50
> > > >
> > > > An example warning message:
> > > >
> > > >         audit: sleep_time=52 >= audit_backlog_warn_time=50
> > > >
> > > > Tested on Ubuntu 18.04.04 using complementary changes to the audit
> > > > userspace: https://github.com/linux-audit/audit-userspace/pull/131.
> > > >
> > > > Signed-off-by: Max Englander <max.englander@gmail.com>
> > > > ---
> > > >  include/uapi/linux/audit.h |  7 ++++++-
> > > >  kernel/audit.c             | 35 +++++++++++++++++++++++++++++++++++
> > > >  2 files changed, 41 insertions(+), 1 deletion(-)
> > >
> > > If an admin is prioritizing security, aka don't loose any audit
> > > records, and there is a concern over variable system latency due to an
> > > audit queue backlog, why not simply disable the backlog limit?
> > >
> > > --
> > > paul moore
> > > www.paul-moore.com
> >
> > That’s good in some cases, but in other cases unbounded growth of the
> > backlog could result in memory issues. If the kernel runs out of memory
> > it would drop the audit event or possibly have other problems. It could
> > also also consume memory in a way that starves user workloads or causes
> > them to be killed by the OOMKiller.
> >
> > To refine my motivating use case a bit, if a Kubernetes admin wants to
> > prioritize security, and also avoid unbounded growth of the audit
> > backlog, they may set -b and --backlog_wait_time in a way that limits
> > kernel memory usage and reduces the likelihood that any audit event is
> > lost. Occasional performance degradation may be acceptable to the admin,
> > but they would like a way to be alerted to prolonged kernel pauses, so
> > that they can investigate and take corrective action (increase backlog,
> > increase server capacity, move some workloads to other servers, etc.).
> >
> > To state another way. The kernel currently can be configured to print a
> > message when the backlog limit is exceeded and it must discard the audit
> > event. This is a useful message for admins, which they can address with
> > corrective action. I think a message similar to the one proposed by this
> > patch would be equally useful when the backlog limit is exceeded and the
> > kernel is configured to wait for the backlog to drain. Admins could
> > address that message in the same way, but without the cost of lost audit
> > events.
> 
> I'm still struggling to understand how this is any better than
> disabling the backlog limit, or setting it very high, and simply
> monitoring the audit size of the audit backlog.  This way the admin
> doesn't have to worry about the latency issues of a full backlog,
> while still being able to trigger actions based on the state of the
> backlog.  The userspace tooling/scripting to watch the backlog size
> would be trivial, and would arguably provide much better visibility
> into the backlog state than a single warning threshold in the kernel.
> 
> -- 
> paul moore
> www.paul-moore.com

Removing the backlog limit entirely could lead to the memory issues I
mentioned above (lost audit events, out-of-memory errors), and would
effectively make the backlog limit a function of free memory. Setting
the backlog limit higher won’t necessarily prevent it from being
exceeded on very busy systems where the rate of audit data generation
can, for long periods of time, outpace the ability of auditd or a
drop-in replacement to consume it. 

The combination of backlog limit and wait time, on the other hand, sets
a bound on memory while all but ensuring the preservation of audit
events. The fact that latency can arise from using this combination is,
for me, an acceptable cost for the predictable use of OS resources and
reduced probability of lost events. I’m not trying to eliminate the
possibility of latency, but rather find good means to monitor and
quickly identify its source when it does occur.

Watching the backlog limit with a userspace program, as you suggest, is
easy enough and a valuable tool for monitoring the audit system. Even
so, a full backlog may not always indicate long wait times. The backlog
may fill up 100 times in a second, but drain so quickly as to have
little impact on kernel performance. On the other hand, a specific
warning that reports backlog wait times would directly implicate or rule
out audit backlog waiting as the cause of degraded kernel performance,
and lead to faster debugging and resolution.

In case you’re any more receptive to the idea, I thought I’d mention
that the need this patch addresses would be just as well fulfilled if
wait times were reported in the audit status response along with other
currently reported metrics like backlog length and lost events. Wait
times could be reported as a cumulative sum, a moving average, or in
some other way, and would help directly implicate or rule out backlog
waiting as the cause in the event that an admin is faced with debugging
degraded kernel performance. It would eliminate the need for a new flag,
and fit well with the userspace tooling approach you suggested above.

--
Linux-audit mailing list
Linux-audit@redhat.com
https://www.redhat.com/mailman/listinfo/linux-audit

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] audit: optionally print warning after waiting to enqueue record
  2020-06-18 23:48       ` Max Englander
@ 2020-06-19  0:30         ` Richard Guy Briggs
  2020-06-24  0:15           ` Paul Moore
  0 siblings, 1 reply; 13+ messages in thread
From: Richard Guy Briggs @ 2020-06-19  0:30 UTC (permalink / raw)
  To: Max Englander; +Cc: linux-audit

On 2020-06-18 23:48, Max Englander wrote:
> On Wed, Jun 17, 2020 at 09:06:27PM -0400, Paul Moore wrote:
> > On Wed, Jun 17, 2020 at 6:54 PM Max Englander <max.englander@gmail.com> wrote:
> > > On Wed, Jun 17, 2020 at 02:47:19PM -0400, Paul Moore wrote:
> > > > On Tue, Jun 16, 2020 at 12:58 AM Max Englander <max.englander@gmail.com> wrote:
> > > > >
> > > > > In environments where security is prioritized, users may set
> > > > > --backlog_wait_time to a high value in order to reduce the likelihood
> > > > > that any audit event is lost, even though doing so may result in
> > > > > unpredictable performance if the kernel schedules a timeout when the
> > > > > backlog limit is exceeded. For these users, the next best thing to
> > > > > predictable performance is the ability to quickly detect and react to
> > > > > degraded performance. This patch proposes to aid the detection of kernel
> > > > > audit subsystem pauses through the following changes:
> > > > >
> > > > > Add a variable named audit_backlog_warn_time. Enforce the value of this
> > > > > variable to be no less than zero, and no more than the value of
> > > > > audit_backlog_wait_time.
> > > > >
> > > > > If audit_backlog_warn_time is greater than zero and if the total time
> > > > > spent waiting to enqueue an audit record is greater than or equal to
> > > > > audit_backlog_warn_time, then print a warning with the total time
> > > > > spent waiting.
> > > > >
> > > > > An example configuration:
> > > > >
> > > > >         auditctl --backlog_warn_time 50
> > > > >
> > > > > An example warning message:
> > > > >
> > > > >         audit: sleep_time=52 >= audit_backlog_warn_time=50
> > > > >
> > > > > Tested on Ubuntu 18.04.04 using complementary changes to the audit
> > > > > userspace: https://github.com/linux-audit/audit-userspace/pull/131.
> > > > >
> > > > > Signed-off-by: Max Englander <max.englander@gmail.com>
> > > > > ---
> > > > >  include/uapi/linux/audit.h |  7 ++++++-
> > > > >  kernel/audit.c             | 35 +++++++++++++++++++++++++++++++++++
> > > > >  2 files changed, 41 insertions(+), 1 deletion(-)
> > > >
> > > > If an admin is prioritizing security, aka don't loose any audit
> > > > records, and there is a concern over variable system latency due to an
> > > > audit queue backlog, why not simply disable the backlog limit?
> > > >
> > > > --
> > > > paul moore
> > > > www.paul-moore.com
> > >
> > > That’s good in some cases, but in other cases unbounded growth of the
> > > backlog could result in memory issues. If the kernel runs out of memory
> > > it would drop the audit event or possibly have other problems. It could
> > > also also consume memory in a way that starves user workloads or causes
> > > them to be killed by the OOMKiller.
> > >
> > > To refine my motivating use case a bit, if a Kubernetes admin wants to
> > > prioritize security, and also avoid unbounded growth of the audit
> > > backlog, they may set -b and --backlog_wait_time in a way that limits
> > > kernel memory usage and reduces the likelihood that any audit event is
> > > lost. Occasional performance degradation may be acceptable to the admin,
> > > but they would like a way to be alerted to prolonged kernel pauses, so
> > > that they can investigate and take corrective action (increase backlog,
> > > increase server capacity, move some workloads to other servers, etc.).
> > >
> > > To state another way. The kernel currently can be configured to print a
> > > message when the backlog limit is exceeded and it must discard the audit
> > > event. This is a useful message for admins, which they can address with
> > > corrective action. I think a message similar to the one proposed by this
> > > patch would be equally useful when the backlog limit is exceeded and the
> > > kernel is configured to wait for the backlog to drain. Admins could
> > > address that message in the same way, but without the cost of lost audit
> > > events.
> > 
> > I'm still struggling to understand how this is any better than
> > disabling the backlog limit, or setting it very high, and simply
> > monitoring the audit size of the audit backlog.  This way the admin
> > doesn't have to worry about the latency issues of a full backlog,
> > while still being able to trigger actions based on the state of the
> > backlog.  The userspace tooling/scripting to watch the backlog size
> > would be trivial, and would arguably provide much better visibility
> > into the backlog state than a single warning threshold in the kernel.
> > 
> > -- 
> > paul moore
> > www.paul-moore.com
> 
> Removing the backlog limit entirely could lead to the memory issues I
> mentioned above (lost audit events, out-of-memory errors), and would
> effectively make the backlog limit a function of free memory. Setting
> the backlog limit higher won’t necessarily prevent it from being
> exceeded on very busy systems where the rate of audit data generation
> can, for long periods of time, outpace the ability of auditd or a
> drop-in replacement to consume it. 
> 
> The combination of backlog limit and wait time, on the other hand, sets
> a bound on memory while all but ensuring the preservation of audit
> events. The fact that latency can arise from using this combination is,
> for me, an acceptable cost for the predictable use of OS resources and
> reduced probability of lost events. I’m not trying to eliminate the
> possibility of latency, but rather find good means to monitor and
> quickly identify its source when it does occur.
> 
> Watching the backlog limit with a userspace program, as you suggest, is
> easy enough and a valuable tool for monitoring the audit system. Even
> so, a full backlog may not always indicate long wait times. The backlog
> may fill up 100 times in a second, but drain so quickly as to have
> little impact on kernel performance. On the other hand, a specific
> warning that reports backlog wait times would directly implicate or rule
> out audit backlog waiting as the cause of degraded kernel performance,
> and lead to faster debugging and resolution.
> 
> In case you’re any more receptive to the idea, I thought I’d mention
> that the need this patch addresses would be just as well fulfilled if
> wait times were reported in the audit status response along with other
> currently reported metrics like backlog length and lost events. Wait
> times could be reported as a cumulative sum, a moving average, or in
> some other way, and would help directly implicate or rule out backlog
> waiting as the cause in the event that an admin is faced with debugging
> degraded kernel performance. It would eliminate the need for a new flag,
> and fit well with the userspace tooling approach you suggested above.

Such as is captured in this upstream issue from 3 years ago:

	https://github.com/linux-audit/audit-kernel/issues/63
	"RFE: add kernel audit queue statistics"

- RGB

--
Richard Guy Briggs <rgb@redhat.com>
Sr. S/W Engineer, Kernel Security, Base Operating Systems
Remote, Ottawa, Red Hat Canada
IRC: rgb, SunRaycer
Voice: +1.647.777.2635, Internal: (81) 32635

--
Linux-audit mailing list
Linux-audit@redhat.com
https://www.redhat.com/mailman/listinfo/linux-audit

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] audit: optionally print warning after waiting to enqueue record
  2020-06-19  0:30         ` Richard Guy Briggs
@ 2020-06-24  0:15           ` Paul Moore
  2020-06-25  3:34             ` Max Englander
  0 siblings, 1 reply; 13+ messages in thread
From: Paul Moore @ 2020-06-24  0:15 UTC (permalink / raw)
  To: Richard Guy Briggs; +Cc: linux-audit

On Thu, Jun 18, 2020 at 8:30 PM Richard Guy Briggs <rgb@redhat.com> wrote:
> On 2020-06-18 23:48, Max Englander wrote:
> > In case you’re any more receptive to the idea, I thought I’d mention
> > that the need this patch addresses would be just as well fulfilled if
> > wait times were reported in the audit status response along with other
> > currently reported metrics like backlog length and lost events. Wait
> > times could be reported as a cumulative sum, a moving average, or in
> > some other way, and would help directly implicate or rule out backlog
> > waiting as the cause in the event that an admin is faced with debugging
> > degraded kernel performance. It would eliminate the need for a new flag,
> > and fit well with the userspace tooling approach you suggested above.
>
> Such as is captured in this upstream issue from 3 years ago:
>
>         https://github.com/linux-audit/audit-kernel/issues/63
>         "RFE: add kernel audit queue statistics"

I would be more open to the idea of reporting queue statistics as part
of the audit status information, or similar.

-- 
paul moore
www.paul-moore.com


--
Linux-audit mailing list
Linux-audit@redhat.com
https://www.redhat.com/mailman/listinfo/linux-audit

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] audit: optionally print warning after waiting to enqueue record
  2020-06-24  0:15           ` Paul Moore
@ 2020-06-25  3:34             ` Max Englander
  0 siblings, 0 replies; 13+ messages in thread
From: Max Englander @ 2020-06-25  3:34 UTC (permalink / raw)
  To: Paul Moore; +Cc: Richard Guy Briggs, linux-audit

On Tue, Jun 23, 2020 at 08:15:59PM -0400, Paul Moore wrote:
> On Thu, Jun 18, 2020 at 8:30 PM Richard Guy Briggs <rgb@redhat.com> wrote:
> > On 2020-06-18 23:48, Max Englander wrote:
> > > In case you’re any more receptive to the idea, I thought I’d mention
> > > that the need this patch addresses would be just as well fulfilled if
> > > wait times were reported in the audit status response along with other
> > > currently reported metrics like backlog length and lost events. Wait
> > > times could be reported as a cumulative sum, a moving average, or in
> > > some other way, and would help directly implicate or rule out backlog
> > > waiting as the cause in the event that an admin is faced with debugging
> > > degraded kernel performance. It would eliminate the need for a new flag,
> > > and fit well with the userspace tooling approach you suggested above.
> >
> > Such as is captured in this upstream issue from 3 years ago:
> >
> >         https://github.com/linux-audit/audit-kernel/issues/63
> >         "RFE: add kernel audit queue statistics"
> 
> I would be more open to the idea of reporting queue statistics as part
> of the audit status information, or similar.
> 
> -- 
> paul moore
> www.paul-moore.com

Excellent, I'll send a v2 patch.

--
Linux-audit mailing list
Linux-audit@redhat.com
https://www.redhat.com/mailman/listinfo/linux-audit

^ permalink raw reply	[flat|nested] 13+ messages in thread

end of thread, other threads:[~2020-06-25  3:35 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-06-16  4:58 [PATCH] audit: optionally print warning after waiting to enqueue record Max Englander
2020-06-17 18:47 ` Paul Moore
2020-06-17 22:54   ` Max Englander
2020-06-18  1:06     ` Paul Moore
2020-06-18 23:48       ` Max Englander
2020-06-19  0:30         ` Richard Guy Briggs
2020-06-24  0:15           ` Paul Moore
2020-06-25  3:34             ` Max Englander
2020-06-18 13:39     ` Steve Grubb
2020-06-18 13:46       ` Paul Moore
2020-06-18 14:36         ` Steve Grubb
2020-06-18 16:29           ` Paul Moore
2020-06-18 22:57       ` Max Englander

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).