Hi Paul, There are some questions about this patch and hope it helps. > /** > * kauditd_retry_skb - Queue an audit record, attempt to send again to auditd > * @skb: audit record > + * @error: error code (unused) > * > * Description: > * Not as serious as kauditd_hold_skb() as we still have a connected auditd, > * but for some reason we are having problems sending it audit records so > * queue the given record and attempt to resend. > */ > -static void kauditd_retry_skb(struct sk_buff *skb) > +static void kauditd_retry_skb(struct sk_buff *skb, __always_unused int error) > { > - /* NOTE: because records should only live in the retry queue for a > - * short period of time, before either being sent or moved to the hold > - * queue, we don't currently enforce a limit on this queue */ > - skb_queue_tail(&audit_retry_queue, skb); > + if (!audit_backlog_limit || > + skb_queue_len(&audit_retry_queue) < audit_backlog_limit) { > + skb_queue_tail(&audit_retry_queue, skb); > + return; > + } > + > + audit_log_lost("kauditd retry queue overflow"); > + kfree_skb(skb); > } When we process the main queue, should we printk the skb when audit_log_lost be call ? > /** > * kauditd_hold_skb - Queue an audit record, waiting for auditd > * @skb: audit record > + * @error: error code > * > * Description: > * Queue the audit record, waiting for an instance of auditd. When this > @@ -564,19 +566,31 @@ static void kauditd_rehold_skb(struct sk_buff *skb) > * and queue it, if we have room. If we want to hold on to the record, but we > * don't have room, record a record lost message. > */ > -static void kauditd_hold_skb(struct sk_buff *skb) > +static void kauditd_hold_skb(struct sk_buff *skb, int error) > { > /* at this point it is uncertain if we will ever send this to auditd so > * try to send the message via printk before we go any further */ > kauditd_printk_skb(skb); > > /* can we just silently drop the message? */ > - if (!audit_default) { > - kfree_skb(skb); > - return; > + if (!audit_default) > + goto drop; > + > + /* the hold queue is only for when the daemon goes away completely, > + * not -EAGAIN failures; if we are in a -EAGAIN state requeue the > + * record on the retry queue unless it's full, in which case drop it > + */ > + if (error == -EAGAIN) { > + if (!audit_backlog_limit || > + skb_queue_len(&audit_retry_queue) < audit_backlog_limit) { > + skb_queue_tail(&audit_retry_queue, skb); > + return; > + } > + audit_log_lost("kauditd retry queue overflow"); > + goto drop; > } > > - /* if we have room, queue the message */ > + /* if we have room in the hold queue, queue the message */ > if (!audit_backlog_limit || > skb_queue_len(&audit_hold_queue) < audit_backlog_limit) { > skb_queue_tail(&audit_hold_queue, skb); > @@ -585,24 +599,30 @@ static void kauditd_hold_skb(struct sk_buff *skb) > > /* we have no other options - drop the message */ > audit_log_lost("kauditd hold queue overflow"); > +drop: > kfree_skb(skb); > } If we move skbs from audit_hold_queue to audit_retry_queue, will these skbs be printed more than once by printk? Thanks. 在 2022/1/19 9:26, Paul Moore 写道: > When an admin enables audit at early boot via the "audit=1" kernel > command line the audit queue behavior is slightly different; the > audit subsystem goes to greater lengths to avoid dropping records, > which unfortunately can result in problems when the audit daemon is > forcibly stopped for an extended period of time. > > This patch makes a number of changes designed to improve the audit > queuing behavior so that leaving the audit daemon in a stopped state > for an extended period does not cause a significant impact to the > system. > > - kauditd_send_queue() is now limited to looping through the > passed queue only once per call. This not only prevents the > function from looping indefinitely when records are returned > to the current queue, it also allows any recovery handling in > kauditd_thread() to take place when kauditd_send_queue() > returns. > > - Transient netlink send errors seen as -EAGAIN now cause the > record to be returned to the retry queue instead of going to > the hold queue. The intention of the hold queue is to store, > perhaps for an extended period of time, the events which led > up to the audit daemon going offline. The retry queue remains > a temporary queue intended to protect against transient issues > between the kernel and the audit daemon. > > - The retry queue is now limited by the audit_backlog_limit > setting, the same as the other queues. This allows admins > to bound the size of all of the audit queues on the system. > > - kauditd_rehold_skb() now returns records to the end of the > hold queue to ensure ordering is preserved in the face of > recent changes to kauditd_send_queue(). > > Cc: stable@vger.kernel.org > Fixes: 5b52330bbfe63 ("audit: fix auditd/kernel connection state tracking") > Fixes: f4b3ee3c85551 ("audit: improve robustness of the audit queue handling") > Reported-by: Gaosheng Cui > Signed-off-by: Paul Moore > --- > kernel/audit.c | 60 ++++++++++++++++++++++++++++++++++++++------------------ > 1 file changed, 41 insertions(+), 19 deletions(-) > > diff --git a/kernel/audit.c b/kernel/audit.c > index e4bbe2c70c26..c45d3fe61466 100644 > --- a/kernel/audit.c > +++ b/kernel/audit.c > @@ -541,20 +541,22 @@ static void kauditd_printk_skb(struct sk_buff *skb) > /** > * kauditd_rehold_skb - Handle a audit record send failure in the hold queue > * @skb: audit record > + * @error: error code (unused) > * > * Description: > * This should only be used by the kauditd_thread when it fails to flush the > * hold queue. > */ > -static void kauditd_rehold_skb(struct sk_buff *skb) > +static void kauditd_rehold_skb(struct sk_buff *skb, __always_unused int error) > { > - /* put the record back in the queue at the same place */ > - skb_queue_head(&audit_hold_queue, skb); > + /* put the record back in the queue */ > + skb_queue_tail(&audit_hold_queue, skb); > } > > /** > * kauditd_hold_skb - Queue an audit record, waiting for auditd > * @skb: audit record > + * @error: error code > * > * Description: > * Queue the audit record, waiting for an instance of auditd. When this > @@ -564,19 +566,31 @@ static void kauditd_rehold_skb(struct sk_buff *skb) > * and queue it, if we have room. If we want to hold on to the record, but we > * don't have room, record a record lost message. > */ > -static void kauditd_hold_skb(struct sk_buff *skb) > +static void kauditd_hold_skb(struct sk_buff *skb, int error) > { > /* at this point it is uncertain if we will ever send this to auditd so > * try to send the message via printk before we go any further */ > kauditd_printk_skb(skb); > > /* can we just silently drop the message? */ > - if (!audit_default) { > - kfree_skb(skb); > - return; > + if (!audit_default) > + goto drop; > + > + /* the hold queue is only for when the daemon goes away completely, > + * not -EAGAIN failures; if we are in a -EAGAIN state requeue the > + * record on the retry queue unless it's full, in which case drop it > + */ > + if (error == -EAGAIN) { > + if (!audit_backlog_limit || > + skb_queue_len(&audit_retry_queue) < audit_backlog_limit) { > + skb_queue_tail(&audit_retry_queue, skb); > + return; > + } > + audit_log_lost("kauditd retry queue overflow"); > + goto drop; > } > > - /* if we have room, queue the message */ > + /* if we have room in the hold queue, queue the message */ > if (!audit_backlog_limit || > skb_queue_len(&audit_hold_queue) < audit_backlog_limit) { > skb_queue_tail(&audit_hold_queue, skb); > @@ -585,24 +599,30 @@ static void kauditd_hold_skb(struct sk_buff *skb) > > /* we have no other options - drop the message */ > audit_log_lost("kauditd hold queue overflow"); > +drop: > kfree_skb(skb); > } > > /** > * kauditd_retry_skb - Queue an audit record, attempt to send again to auditd > * @skb: audit record > + * @error: error code (unused) > * > * Description: > * Not as serious as kauditd_hold_skb() as we still have a connected auditd, > * but for some reason we are having problems sending it audit records so > * queue the given record and attempt to resend. > */ > -static void kauditd_retry_skb(struct sk_buff *skb) > +static void kauditd_retry_skb(struct sk_buff *skb, __always_unused int error) > { > - /* NOTE: because records should only live in the retry queue for a > - * short period of time, before either being sent or moved to the hold > - * queue, we don't currently enforce a limit on this queue */ > - skb_queue_tail(&audit_retry_queue, skb); > + if (!audit_backlog_limit || > + skb_queue_len(&audit_retry_queue) < audit_backlog_limit) { > + skb_queue_tail(&audit_retry_queue, skb); > + return; > + } > + > + audit_log_lost("kauditd retry queue overflow"); > + kfree_skb(skb); > } > > /** > @@ -640,7 +660,7 @@ static void auditd_reset(const struct auditd_connection *ac) > /* flush the retry queue to the hold queue, but don't touch the main > * queue since we need to process that normally for multicast */ > while ((skb = skb_dequeue(&audit_retry_queue))) > - kauditd_hold_skb(skb); > + kauditd_hold_skb(skb, -ECONNREFUSED); > } > > /** > @@ -714,16 +734,18 @@ static int kauditd_send_queue(struct sock *sk, u32 portid, > struct sk_buff_head *queue, > unsigned int retry_limit, > void (*skb_hook)(struct sk_buff *skb), > - void (*err_hook)(struct sk_buff *skb)) > + void (*err_hook)(struct sk_buff *skb, int error)) > { > int rc = 0; > - struct sk_buff *skb; > + struct sk_buff *skb = NULL; > + struct sk_buff *skb_tail; > unsigned int failed = 0; > > /* NOTE: kauditd_thread takes care of all our locking, we just use > * the netlink info passed to us (e.g. sk and portid) */ > > - while ((skb = skb_dequeue(queue))) { > + skb_tail = skb_peek_tail(queue); > + while ((skb != skb_tail) && (skb = skb_dequeue(queue))) { > /* call the skb_hook for each skb we touch */ > if (skb_hook) > (*skb_hook)(skb); > @@ -731,7 +753,7 @@ static int kauditd_send_queue(struct sock *sk, u32 portid, > /* can we send to anyone via unicast? */ > if (!sk) { > if (err_hook) > - (*err_hook)(skb); > + (*err_hook)(skb, -ECONNREFUSED); > continue; > } > > @@ -745,7 +767,7 @@ static int kauditd_send_queue(struct sock *sk, u32 portid, > rc == -ECONNREFUSED || rc == -EPERM) { > sk = NULL; > if (err_hook) > - (*err_hook)(skb); > + (*err_hook)(skb, rc); > if (rc == -EAGAIN) > rc = 0; > /* continue to drain the queue */ > > .