All of lore.kernel.org
 help / color / mirror / Atom feed
From: "Darrick J. Wong" <djwong@kernel.org>
To: cem@kernel.org, kent.overstreet@linux.dev, djwong@kernel.org
Cc: linux-xfs@vger.kernel.org
Subject: [PATCH 5/7] xfs_scrubbed: create daemon to listen for health events
Date: Fri, 23 Feb 2024 17:35:28 -0800	[thread overview]
Message-ID: <170873836626.1902540.6213624466323713329.stgit@frogsfrogsfrogs> (raw)
In-Reply-To: <170873836546.1902540.13109376239205481967.stgit@frogsfrogsfrogs>

From: Darrick J. Wong <djwong@kernel.org>

Create a daemon program that can listen for and log health events.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 scrub/Makefile        |   15 +++
 scrub/xfs_scrubbed.in |  217 +++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 230 insertions(+), 2 deletions(-)
 create mode 100644 scrub/xfs_scrubbed.in


diff --git a/scrub/Makefile b/scrub/Makefile
index c0fc927f4278..cf112018376b 100644
--- a/scrub/Makefile
+++ b/scrub/Makefile
@@ -18,6 +18,7 @@ XFS_SCRUB_ALL_PROG = xfs_scrub_all
 XFS_SCRUB_FAIL_PROG = xfs_scrub_fail
 XFS_SCRUB_ARGS = -p
 XFS_SCRUB_SERVICE_ARGS = -b
+XFS_SCRUBBED_PROG = xfs_scrubbed
 ifeq ($(HAVE_SYSTEMD),yes)
 INSTALL_SCRUB += install-systemd
 SYSTEMD_SERVICES=\
@@ -124,9 +125,9 @@ endif
 # Automatically trigger a media scan once per month
 XFS_SCRUB_ALL_AUTO_MEDIA_SCAN_INTERVAL=1mo
 
-LDIRT = $(XFS_SCRUB_ALL_PROG) $(XFS_SCRUB_FAIL_PROG) *.service *.cron
+LDIRT = $(XFS_SCRUB_ALL_PROG) $(XFS_SCRUB_FAIL_PROG) $(XFS_SCRUBBED_PROG) *.service *.cron
 
-default: depend $(LTCOMMAND) $(XFS_SCRUB_ALL_PROG) $(XFS_SCRUB_FAIL_PROG) $(OPTIONAL_TARGETS)
+default: depend $(LTCOMMAND) $(XFS_SCRUB_ALL_PROG) $(XFS_SCRUB_FAIL_PROG) $(XFS_SCRUBBED_PROG) $(OPTIONAL_TARGETS)
 
 xfs_scrub_all: xfs_scrub_all.in $(builddefs)
 	@echo "    [SED]    $@"
@@ -139,6 +140,14 @@ xfs_scrub_all: xfs_scrub_all.in $(builddefs)
 		   -e "s|@scrub_args@|$(XFS_SCRUB_ARGS)|g" < $< > $@
 	$(Q)chmod a+x $@
 
+xfs_scrubbed: xfs_scrubbed.in $(builddefs)
+	@echo "    [SED]    $@"
+	$(Q)$(SED) -e "s|@sbindir@|$(PKG_SBIN_DIR)|g" \
+		   -e "s|@scrub_svcname@|$(scrub_svcname)|g" \
+		   -e "s|@pkg_version@|$(PKG_VERSION)|g" \
+		   < $< > $@
+	$(Q)chmod a+x $@
+
 xfs_scrub_fail: xfs_scrub_fail.in $(builddefs)
 	@echo "    [SED]    $@"
 	$(Q)$(SED) -e "s|@sbindir@|$(PKG_SBIN_DIR)|g" \
@@ -182,6 +191,8 @@ install-scrub: default
 	$(INSTALL) -m 755 -d $(PKG_SBIN_DIR)
 	$(LTINSTALL) -m 755 $(LTCOMMAND) $(PKG_SBIN_DIR)
 	$(INSTALL) -m 755 $(XFS_SCRUB_ALL_PROG) $(PKG_SBIN_DIR)
+	$(INSTALL) -m 755 -d $(PKG_LIBEXEC_DIR)
+	$(INSTALL) -m 755 $(XFS_SCRUBBED_PROG) $(PKG_LIBEXEC_DIR)
 	$(INSTALL) -m 755 -d $(PKG_STATE_DIR)
 
 install-udev: $(UDEV_RULES)
diff --git a/scrub/xfs_scrubbed.in b/scrub/xfs_scrubbed.in
new file mode 100644
index 000000000000..0c72f5c54a78
--- /dev/null
+++ b/scrub/xfs_scrubbed.in
@@ -0,0 +1,217 @@
+#!/usr/bin/python3
+
+# SPDX-License-Identifier: GPL-2.0-or-later
+# Copyright (C) 2024 Oracle.  All rights reserved.
+#
+# Author: Darrick J. Wong <djwong@kernel.org>
+
+# Daemon to listen for and react to filesystem health events
+
+import sys
+import os
+import argparse
+import fcntl
+import struct
+import json
+import datetime
+import errno
+
+debug = False
+log = False
+everything = False
+printf_prefix = ''
+
+# ioctl request code layout: bit-field widths, shifts and direction values
+_IOC_NRBITS   =  8
+_IOC_TYPEBITS =  8
+_IOC_SIZEBITS = 14
+_IOC_DIRBITS  =  2
+
+_IOC_NRSHIFT   = 0
+_IOC_TYPESHIFT = (_IOC_NRSHIFT   + _IOC_NRBITS)
+_IOC_SIZESHIFT = (_IOC_TYPESHIFT + _IOC_TYPEBITS)
+_IOC_DIRSHIFT  = (_IOC_SIZESHIFT + _IOC_SIZEBITS)
+
+_IOC_NONE  = 0
+_IOC_WRITE = 1
+_IOC_READ  = 2
+
+def _IOC(direction, type, nr, size):
+	'''Combine direction, type, number and size into one request code.'''
+	return ((direction << _IOC_DIRSHIFT) | (size << _IOC_SIZESHIFT) |
+		(type << _IOC_TYPESHIFT) | (nr << _IOC_NRSHIFT))
+
+def _IOR(type, number, size):
+	'''Build a request code whose direction field is _IOC_READ.'''
+	return _IOC(_IOC_READ, type, number, size)
+
+# xfs health monitoring ioctl: format/flag values, argument struct, request code
+XFS_HEALTH_MONITOR_FMT_JSON = 1
+XFS_HEALTH_MONITOR_VERBOSE = 1 << 0
+xfs_health_monitor = struct.Struct('QB' + ('x' * 23))
+XFS_IOC_HEALTH_MONITOR = _IOR(0x58, 48, xfs_health_monitor.size)
+
+def open_health_monitor(fd, verbose = False):
+	'''
+	Return a health monitoring fd obtained by an ioctl on fd.
+
+	If verbose is set, XFS_HEALTH_MONITOR_VERBOSE is passed in the flags.
+	The caller keeps ownership of fd; it is not closed here.
+	'''
+	assert xfs_health_monitor.size == 32
+
+	# The event format requested is always JSON.
+	flags = XFS_HEALTH_MONITOR_VERBOSE if verbose else 0
+	packed = xfs_health_monitor.pack(flags, XFS_HEALTH_MONITOR_FMT_JSON)
+
+	# pack() returns immutable bytes.  Handing fcntl.ioctl() a mutable
+	# bytearray copy instead makes it return the syscall's return value
+	# (the new kernel fd) rather than the post-syscall buffer contents.
+	return fcntl.ioctl(fd, XFS_IOC_HEALTH_MONITOR, bytearray(packed))
+
+# main program
+
+def health_reports(mon_fp):
+	'''Generate python objects describing health events.'''
+	global debug, printf_prefix
+
+	# Fragments of the event currently being assembled.
+	pending = []
+	# readline() returns '' at EOF, which ends the iteration.
+	for buf in iter(mon_fp.readline, ''):
+		for chunk in buf.split('\0'):
+			chunk = chunk.strip()
+			if debug:
+				print(f'new line: {chunk}')
+			if chunk == '':
+				continue
+
+			pending.append(chunk)
+			# A closing brace is treated as the end of an event.
+			if '}' not in chunk:
+				continue
+
+			text = ''.join(pending)
+			pending = []
+			if debug:
+				print(f'new event: {text}')
+			try:
+				yield json.loads(text)
+			except json.decoder.JSONDecodeError as e:
+				print(f"{printf_prefix}: {e} from {text}",
+						file = sys.stderr)
+
+def log_event(event):
+	'''Print the event after the global prefix, then flush stdout.'''
+	global printf_prefix
+
+	print(f"{printf_prefix}: {event}", flush = True)
+
+def report_lost(event):
+	'''Report that the kernel lost events.'''
+	global printf_prefix
+
+	# The event contents are not examined; only its arrival is reported.
+	print(f"{printf_prefix}: Events were lost.", flush = True)
+
+def report_shutdown(event):
+	'''
+	Report an abortive shutdown of the filesystem.
+
+	event['reasons'] lists the reason codes to report.
+	'''
+	global printf_prefix
+	REASONS = {
+		"meta_ioerr":		"metadata IO error",
+		"log_ioerr":		"log IO error",
+		"force_umount":		"forced unmount",
+		"corrupt_incore":	"in-memory state corruption",
+		"corrupt_ondisk":	"ondisk metadata corruption",
+		"device_removed":	"device removal",
+	}
+
+	# Fall back to the raw code when there is no description for it.
+	reasons = [REASONS.get(code, code) for code in event['reasons']]
+
+	print(f"{printf_prefix}: Filesystem shut down due to {', '.join(reasons)}.")
+	sys.stdout.flush()
+
+def monitor(mountpoint):
+	'''Monitor the given mountpoint for health events; return an exit status.'''
+	global log, everything
+
+	fd = os.open(mountpoint, os.O_RDONLY)
+	try:
+		mon_fd = open_health_monitor(fd, verbose = everything)
+	except OSError as e:
+		if e.errno != errno.ENOTTY:
+			raise
+		print(f"{mountpoint}: XFS health monitoring not supported.",
+				file = sys.stderr)
+		return 1
+	finally:
+		# The mountpoint fd is only needed for the ioctl, so always close it
+		os.close(fd)
+
+	# Ownership of mon_fd (and hence responsibility for closing it) is
+	# transferred to the mon_fp object.
+	with os.fdopen(mon_fd) as mon_fp:
+		for event in health_reports(mon_fp):
+			try:
+				ts = datetime.datetime.fromtimestamp(event['time_ns'] / 1e9).astimezone()
+				event['time'] = str(ts)
+				del event['time_ns']
+			except Exception as e:
+				# Best effort: report the problem and keep the raw event
+				print(f"{mountpoint}: {e}", file = sys.stderr)
+			if log:
+				log_event(event)
+			if event.get('type') == 'lost':
+				report_lost(event)
+			elif event.get('type') == 'shutdown':
+				report_shutdown(event)
+
+	return 0
+
+def main():
+	'''Parse the command line and monitor the requested mountpoint.'''
+	global debug, log, printf_prefix, everything
+
+	parser = argparse.ArgumentParser(
+			description = "XFS filesystem health monitoring daemon.")
+	parser.add_argument("--debug", help = "Enable debugging messages.",
+			action = "store_true")
+	parser.add_argument("--log", help = "Log health events to stdout.",
+			action = "store_true")
+	parser.add_argument("--everything", help = "Capture all events.",
+			action = "store_true")
+	parser.add_argument("-V", help = "Report version and exit.",
+			action = "store_true")
+	# mountpoint is optional to argparse so that -V works without one.
+	parser.add_argument('mountpoint', default = None, nargs = '?',
+			help = 'XFS filesystem mountpoint to target.')
+	args = parser.parse_args()
+
+	if args.V:
+		print("xfs_scrubbed version @pkg_version@")
+		sys.exit(0)
+
+	if args.mountpoint is None:
+		# parser.error() prints usage and exits with status 2.
+		parser.error("the following arguments are required: mountpoint")
+
+	# Publish the options as the module globals the helpers read.
+	debug = args.debug
+	log = args.log
+	everything = args.everything
+
+	printf_prefix = args.mountpoint
+	try:
+		ret = monitor(args.mountpoint)
+	except KeyboardInterrupt:
+		# Treat ^C as a clean shutdown.
+		ret = 0
+	sys.exit(ret)
+
+if __name__ == '__main__':
+	main()


  parent reply	other threads:[~2024-02-24  1:35 UTC|newest]

Thread overview: 59+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-02-24  1:00 [PATCHBOMB] time_stats, thread_with_file: lifting generic code to lib Darrick J. Wong
2024-02-24  1:07 ` [PATCHSET 1/6] time_stats: promote to lib/ Darrick J. Wong
2024-02-24  1:09   ` [PATCH 1/4] mean and variance: Promote to lib/math Darrick J. Wong
2024-02-24  1:09   ` [PATCH 2/4] eytzinger: Promote to include/linux/ Darrick J. Wong
2024-02-24  1:09   ` [PATCH 3/4] bcachefs: bch2_time_stats_to_seq_buf() Darrick J. Wong
2024-02-24  1:10   ` [PATCH 4/4] time_stats: Promote to lib/ Darrick J. Wong
2024-02-24  1:08 ` [PATCHSET 2/6] time_stats: cleanups and fixes Darrick J. Wong
2024-02-24  1:10   ` [PATCH 01/10] time_stats: report lifetime of the stats object Darrick J. Wong
2024-02-24  1:10   ` [PATCH 02/10] time_stats: split stats-with-quantiles into a separate structure Darrick J. Wong
2024-02-24  1:10   ` [PATCH 03/10] time_stats: fix struct layout bloat Darrick J. Wong
2024-02-24  1:11   ` [PATCH 04/10] time_stats: add larger units Darrick J. Wong
2024-02-24  1:11   ` [PATCH 05/10] time_stats: don't print any output if event count is zero Darrick J. Wong
2024-02-24  1:11   ` [PATCH 06/10] time_stats: allow custom epoch names Darrick J. Wong
2024-02-24  1:11   ` [PATCH 07/10] mean_and_variance: put struct mean_and_variance_weighted on a diet Darrick J. Wong
2024-02-24  1:12   ` [PATCH 08/10] time_stats: shrink time_stat_buffer for better alignment Darrick J. Wong
2024-02-24  1:12   ` [PATCH 09/10] time_stats: report information in json format Darrick J. Wong
2024-02-24  4:15     ` Darrick J. Wong
2024-02-24  5:10       ` Kent Overstreet
2024-02-24  6:02         ` Darrick J. Wong
2024-02-24  1:12   ` [PATCH 10/10] time_stats: Kill TIME_STATS_HAVE_QUANTILES Darrick J. Wong
2024-02-24  1:08 ` [PATCHSET RFC 3/6] xfs: capture statistics about wait times Darrick J. Wong
2024-02-24  1:12   ` [PATCH 1/4] xfs: present wait time statistics Darrick J. Wong
2024-02-24  1:13   ` [PATCH 2/4] xfs: present time stats for scrubbers Darrick J. Wong
2024-02-24  1:13   ` [PATCH 3/4] xfs: present timestats in json format Darrick J. Wong
2024-02-24  1:13   ` [PATCH 4/4] xfs: create debugfs uuid aliases Darrick J. Wong
2024-02-24  1:08 ` [PATCHSET 4/6] thread_with_file: promote to lib/ Darrick J. Wong
2024-02-24  1:14   ` [PATCH 01/10] bcachefs: thread_with_stdio: eliminate double buffering Darrick J. Wong
2024-02-24  1:14   ` [PATCH 02/10] bcachefs: thread_with_stdio: convert to darray Darrick J. Wong
2024-02-24  1:14   ` [PATCH 03/10] bcachefs: thread_with_stdio: kill thread_with_stdio_done() Darrick J. Wong
2024-02-24  1:14   ` [PATCH 04/10] bcachefs: thread_with_stdio: fix bch2_stdio_redirect_readline() Darrick J. Wong
2024-02-24  1:15   ` [PATCH 05/10] bcachefs: Thread with file documentation Darrick J. Wong
2024-02-24  1:15   ` [PATCH 06/10] darray: lift from bcachefs Darrick J. Wong
2024-02-24  1:15   ` [PATCH 07/10] thread_with_file: Lift " Darrick J. Wong
2024-02-24  1:15   ` [PATCH 08/10] thread_with_stdio: Mark completed in ->release() Darrick J. Wong
2024-02-24  1:16   ` [PATCH 09/10] kernel/hung_task.c: export sysctl_hung_task_timeout_secs Darrick J. Wong
2024-02-24  1:16   ` [PATCH 10/10] thread_with_stdio: suppress hung task warning Darrick J. Wong
2024-02-24  1:08 ` [PATCHSET 5/6] thread_with_file: cleanups and fixes Darrick J. Wong
2024-02-24  1:16   ` [PATCH 1/5] thread_with_file: allow creation of readonly files Darrick J. Wong
2024-02-24  1:16   ` [PATCH 2/5] thread_with_file: fix various printf problems Darrick J. Wong
2024-02-24  1:17   ` [PATCH 3/5] thread_with_file: create ops structure for thread_with_stdio Darrick J. Wong
2024-02-24  1:17   ` [PATCH 4/5] thread_with_file: allow ioctls against these files Darrick J. Wong
2024-02-24  1:17   ` [PATCH 5/5] thread_with_file: Fix missing va_end() Darrick J. Wong
2024-02-24  1:09 ` [PATCHSET RFC 6/6] xfs: live health monitoring of filesystems Darrick J. Wong
2024-02-24  1:17   ` [PATCH 1/8] xfs: use thread_with_file to create a monitoring file Darrick J. Wong
2024-02-24  1:18   ` [PATCH 2/8] xfs: create hooks for monitoring health updates Darrick J. Wong
2024-02-24  1:18   ` [PATCH 3/8] xfs: create a filesystem shutdown hook Darrick J. Wong
2024-02-24  1:18   ` [PATCH 4/8] xfs: report shutdown events through healthmon Darrick J. Wong
2024-02-24  1:18   ` [PATCH 5/8] xfs: report metadata health " Darrick J. Wong
2024-02-24  1:19   ` [PATCH 6/8] xfs: report media errors " Darrick J. Wong
2024-02-24  1:19   ` [PATCH 7/8] xfs: allow reconfiguration of the health monitoring device Darrick J. Wong
2024-02-24  1:19   ` [PATCH 8/8] xfs: send uevents when mounting and unmounting a filesystem Darrick J. Wong
2024-02-24  1:34 ` [PATCHSET RFC] xfsprogs: live health monitoring of filesystems Darrick J. Wong
2024-02-24  1:34   ` [PATCH 1/7] xfs: use thread_with_file to create a monitoring file Darrick J. Wong
2024-02-24  1:34   ` [PATCH 2/7] xfs: create hooks for monitoring health updates Darrick J. Wong
2024-02-24  1:34   ` [PATCH 3/7] xfs: report shutdown events through healthmon Darrick J. Wong
2024-02-24  1:35   ` [PATCH 4/7] xfs_io: monitor filesystem health events Darrick J. Wong
2024-02-24  1:35   ` Darrick J. Wong [this message]
2024-02-24  1:35   ` [PATCH 6/7] xfs_scrubbed: enable repairing filesystems Darrick J. Wong
2024-02-24  1:36   ` [PATCH 7/7] xfs_scrubbed: create a background monitoring service Darrick J. Wong

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=170873836626.1902540.6213624466323713329.stgit@frogsfrogsfrogs \
    --to=djwong@kernel.org \
    --cc=cem@kernel.org \
    --cc=kent.overstreet@linux.dev \
    --cc=linux-xfs@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.