linux-erofs.lists.ozlabs.org archive mirror
 help / color / mirror / Atom feed
From: Guo Xuenan <guoxuenan@huawei.com>
To: <linux-erofs@lists.ozlabs.org>
Cc: mpiglet@outlook.com
Subject: [PATCH v2 3/5] dump.erofs: add -S options for collecting statistics of the whole filesystem
Date: Tue, 14 Sep 2021 15:44:22 +0800	[thread overview]
Message-ID: <20210914074424.1875409-3-guoxuenan@huawei.com> (raw)
In-Reply-To: <20210914074424.1875409-1-guoxuenan@huawei.com>

From: Wang Qi <mpiglet@outlook.com>

Signed-off-by: Guo Xuenan <guoxuenan@huawei.com>
Signed-off-by: Wang Qi <mpiglet@outlook.com>
---
 dump/main.c | 364 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 361 insertions(+), 3 deletions(-)

diff --git a/dump/main.c b/dump/main.c
index 33521bf..0778354 100644
--- a/dump/main.c
+++ b/dump/main.c
@@ -21,10 +21,63 @@
 
 struct dumpcfg {
 	bool print_superblock;
+	bool print_statistic;	/* set by -S; makes main() call dumpfs_print_statistic() */
 	bool print_version;
 };
 static struct dumpcfg dumpcfg;
 
+static const char chart_format[] = "%-16s	%-11d %8.2f%% |%-50s|\n";	/* chart row: label, count, ratio, bar */
+static const char header_format[] = "%-16s %11s %16s |%-50s|\n";	/* chart header row */
+static char *file_types[] = {	/* extensions matched by get_file_type() */
+	".so", ".png", ".jpg", ".xml", ".html", ".odex",
+	".vdex", ".apk", ".ttf", ".jar", ".json", ".ogg",
+	".oat", ".art", ".rc", ".otf", ".txt", "others",
+};
+#define OTHERFILETYPE	ARRAY_SIZE(file_types)	/* entry count, "others" included */
+/* sizes whose KB value needs FILE_MAX_SIZE_BITS bits or more share the last bucket */
+#define	FILE_MAX_SIZE_BITS	16
+
+static const char * const file_category_types[] = {	/* names printed per EROFS_FT_* index */
+	[EROFS_FT_UNKNOWN] = "unknown type",
+	[EROFS_FT_REG_FILE] = "regular file",
+	[EROFS_FT_DIR] = "directory",
+	[EROFS_FT_CHRDEV] = "char dev",
+	[EROFS_FT_BLKDEV] = "block dev",
+	[EROFS_FT_FIFO] = "FIFO file",
+	[EROFS_FT_SOCK] = "SOCK file",
+	[EROFS_FT_SYMLINK] = "symlink file",
+};
+
+struct statistics {
+	unsigned long blocks;	/* copied from sbi.blocks */
+	unsigned long files;	/* dirents whose nid is neither the directory nor its parent */
+	unsigned long files_total_size;	/* sum of the occupied sizes of regular files */
+	unsigned long files_total_origin_size;	/* sum of i_size of regular files */
+	double compress_rate;	/* files_total_size as a percentage of files_total_origin_size */
+	unsigned long compressed_files;	/* regular files with a compressed data layout */
+	unsigned long uncompressed_files;	/* flat-layout regular files, directories, special files */
+
+	unsigned long regular_files;	/* NOTE(review): this and the six counters below look unused */
+	unsigned long dir_files;
+	unsigned long chardev_files;
+	unsigned long blkdev_files;
+	unsigned long fifo_files;
+	unsigned long sock_files;
+	unsigned long symlink_files;
+
+	/* number of dirents per EROFS_FT_* type, indexed by de->file_type */
+	unsigned long file_category_stat[EROFS_FT_MAX];
+	/* number of regular files per file name extension, see file_types[] */
+	unsigned int file_type_stat[OTHERFILETYPE];
+	/* number of regular files per original size bucket */
+	unsigned int file_original_size[FILE_MAX_SIZE_BITS + 1];
+	/* number of regular files per on-disk size bucket, i.e. the
+	 * size each file occupies in the image
+	 */
+	unsigned int file_comp_size[FILE_MAX_SIZE_BITS + 1];
+};
+static struct statistics stats;
+
 static struct option long_options[] = {
 	{"help", no_argument, 0, 1},
 	{0, 0, 0, 0},
@@ -57,24 +110,29 @@ static void usage(void)
 		"Dump erofs layout from erofs-image, and [options] are:\n"
 		"--help  display this help and exit.\n"
 		"-s          print information about superblock\n"
+		"-S      print statistic information of the erofs-image\n"
 		"-V      print the version number of dump.erofs and exit.\n",
-		stderr);
+		stdout);
 }
+
 static void dumpfs_print_version(void)
 {
-	fprintf(stderr, "dump.erofs %s\n", cfg.c_version);
+	fprintf(stdout, "dump.erofs %s\n", cfg.c_version);
 }
 
 static int dumpfs_parse_options_cfg(int argc, char **argv)
 {
 	int opt;
 
-	while ((opt = getopt_long(argc, argv, "sV",
+	while ((opt = getopt_long(argc, argv, "sSV",
 					long_options, NULL)) != -1) {
 		switch (opt) {
 		case 's':
 			dumpcfg.print_superblock = true;
 			break;
+		case 'S':
+			dumpcfg.print_statistic = true;
+			break;
 		case 'V':
 			dumpfs_print_version();
 			exit(0);
@@ -100,6 +158,303 @@ static int dumpfs_parse_options_cfg(int argc, char **argv)
 	return 0;
 }
 
+/* store @inode's on-disk size in @size; 0 on success, -1 on unknown layout */
+static int get_file_compressed_size(struct erofs_inode *inode,
+		erofs_off_t *size)
+{
+	*size = 0;
+	if (inode->datalayout == EROFS_INODE_FLAT_INLINE ||
+	    inode->datalayout == EROFS_INODE_FLAT_PLAIN) {
+		*size = inode->i_size;
+		stats.uncompressed_files++;
+		return 0;
+	}
+	if (inode->datalayout == EROFS_INODE_FLAT_COMPRESSION_LEGACY ||
+	    inode->datalayout == EROFS_INODE_FLAT_COMPRESSION) {
+		/* compressed data is accounted in whole blocks */
+		*size = inode->u.i_blocks * EROFS_BLKSIZ;
+		stats.compressed_files++;
+		return 0;
+	}
+	erofs_err("unknown datalayout");
+	return -1;
+}
+
+/* index in file_types[] of @filename's extension, or of the "others" slot */
+static int get_file_type(const char *filename)
+{
+	const int others = OTHERFILETYPE - 1;
+	const char *ext = strrchr(filename, '.');
+	int idx;
+
+	if (!ext)
+		return others;
+	for (idx = 0; idx < others; idx++)
+		if (!strcmp(ext, file_types[idx]))
+			return idx;
+	return others;
+}
+
+/*
+ * Account one regular file in both size histograms.
+ *
+ * A size is scaled down to KB and its bucket is the number of significant
+ * bits of that value: bucket 0 holds files below 1KB, bucket k holds
+ * [2^(k-1), 2^k) KB.  Anything that would land at or past
+ * FILE_MAX_SIZE_BITS shares the last bucket.
+ */
+static void update_file_size_statatics(erofs_off_t occupied_size,
+		erofs_off_t original_size)
+{
+	erofs_off_t kb;
+	int original_bits = 0;
+	int occupied_bits = 0;
+
+	/* bit length of the original size, in KB */
+	for (kb = original_size >> 10; kb; kb >>= 1)
+		original_bits++;
+
+	/* bit length of the occupied (on-disk) size, in KB */
+	for (kb = occupied_size >> 10; kb; kb >>= 1)
+		occupied_bits++;
+
+	/* oversized files are folded into the last bucket */
+	if (original_bits > FILE_MAX_SIZE_BITS)
+		original_bits = FILE_MAX_SIZE_BITS;
+	if (occupied_bits > FILE_MAX_SIZE_BITS)
+		occupied_bits = FILE_MAX_SIZE_BITS;
+
+	stats.file_original_size[original_bits]++;
+	stats.file_comp_size[occupied_bits]++;
+}
+
+/*
+ * Walk directory @nid recursively and fold each entry into the global stats.
+ * Entries whose nid is @nid or @parent_nid (presumably "." and "..") are
+ * neither counted nor descended.  Returns 0 or a negative error code.
+ */
+static int erofs_read_dir(erofs_nid_t nid, erofs_nid_t parent_nid)
+{
+	struct erofs_inode vi = { .nid = nid};
+	int err;
+	char buf[EROFS_BLKSIZ];
+	char filename[EROFS_NAME_LEN + 1];
+	erofs_off_t offset = 0;
+
+	err = erofs_read_inode_from_disk(&vi);
+	if (err)
+		return err;
+
+	while (offset < vi.i_size) {
+		erofs_off_t maxsize = min_t(erofs_off_t,
+			vi.i_size - offset, EROFS_BLKSIZ);
+		struct erofs_dirent *de = (void *)buf;
+		struct erofs_dirent *end;
+		unsigned int nameoff;
+
+		err = erofs_pread(&vi, buf, maxsize, offset);
+		if (err)
+			return err;
+		nameoff = le16_to_cpu(de->nameoff);
+		/* de[0].nameoff delimits the dirent array: keep it in buf */
+		if (nameoff < sizeof(struct erofs_dirent) ||
+		    nameoff > maxsize) {
+			erofs_err("invalid de[0].nameoff %u @ nid %llu",
+				  nameoff, nid | 0ULL);
+			return -EFSCORRUPTED;
+		}
+		end = (void *)buf + nameoff;
+		/* stepping in the loop header lets "continue" make progress */
+		for (; de < end; ++de) {
+			const erofs_nid_t de_nid = le64_to_cpu(de->nid);
+			const bool skip = de_nid == nid || de_nid == parent_nid;
+			struct erofs_inode inode = { .nid = de_nid };
+			erofs_off_t occupied_size = 0;
+			unsigned int dname_len = 0;
+			bool bogus;
+
+			nameoff = le16_to_cpu(de->nameoff);
+			/* check the offset before strnlen() starts from it */
+			bogus = nameoff > maxsize;
+			if (!bogus && de + 1 >= end)
+				dname_len = strnlen(buf + nameoff,
+						    maxsize - nameoff);
+			else if (!bogus)
+				dname_len =
+					le16_to_cpu(de[1].nameoff) - nameoff;
+			/* a corrupted entry is found */
+			if (bogus || nameoff + dname_len > maxsize ||
+				dname_len > EROFS_NAME_LEN) {
+				erofs_err("bogus dirent @ nid %llu", de_nid | 0ULL);
+				DBG_BUGON(1);
+				return -EFSCORRUPTED;
+			}
+			if (!skip)
+				stats.files++;
+			if (de->file_type >= EROFS_FT_MAX) {
+				erofs_err("invalid file type %llu", de_nid | 0ULL);
+				continue;
+			}
+			if (de->file_type != EROFS_FT_DIR)
+				stats.file_category_stat[de->file_type]++;
+			switch (de->file_type) {
+			case EROFS_FT_REG_FILE:
+				err = erofs_read_inode_from_disk(&inode);
+				if (err) {
+					erofs_err("read file inode from disk failed!");
+					return err;
+				}
+				memcpy(filename, buf + nameoff, dname_len);
+				filename[dname_len] = '\0';
+				stats.files_total_origin_size += inode.i_size;
+				stats.file_type_stat[get_file_type(filename)]++;
+				err = get_file_compressed_size(&inode,
+						&occupied_size);
+				if (err) {
+					erofs_err("get file size failed\n");
+					return err;
+				}
+				stats.files_total_size += occupied_size;
+				update_file_size_statatics(occupied_size, inode.i_size);
+				break;
+			case EROFS_FT_DIR:
+				if (skip)
+					break;
+				stats.uncompressed_files++;
+				err = erofs_read_dir(de_nid, nid);
+				if (err) {
+					erofs_err("parse dir nid %llu error occurred",
+						  de_nid | 0ULL);
+					return err;
+				}
+				stats.file_category_stat[EROFS_FT_DIR]++;
+				break;
+			case EROFS_FT_UNKNOWN:
+			case EROFS_FT_CHRDEV:
+			case EROFS_FT_BLKDEV:
+			case EROFS_FT_FIFO:
+			case EROFS_FT_SOCK:
+			case EROFS_FT_SYMLINK:
+				stats.uncompressed_files++;
+				break;
+			default:
+				erofs_err("%d file type not exists", de->file_type);
+			}
+		}
+		offset += maxsize;
+	}
+	return 0;
+}
+
+/* print the total file count, then one count line per EROFS_FT_* type */
+static void dumpfs_print_statistic_of_filetype(void)
+{
+	fprintf(stdout, "Filesystem total file count:		%lu\n", stats.files);
+	for (int ft = 0; ft != EROFS_FT_MAX; ++ft)
+		fprintf(stdout, "Filesystem %s count:		%lu\n",
+			file_category_types[ft], stats.file_category_stat[ft]);
+}
+
+static void dumpfs_print_chart_row(char *col1, unsigned int col2,
+		double col3, char *col4)
+{
+	/* print one chart row; feed chart_format directly, as a pre-rendered
+	 * row holds a literal '%' that must never be parsed as a format
+	 */
+	fprintf(stdout, chart_format, col1, col2, col3, col4);
+}
+
+/* print the @len-bucket size histogram @file_counts as a text chart */
+static void dumpfs_print_chart_of_file(unsigned int *file_counts,
+		unsigned int len)
+{
+	const unsigned long total = stats.file_category_stat[EROFS_FT_REG_FILE];
+	char col1[30];
+	char col4[51];
+	unsigned int lowerbound = 0;
+	unsigned int upperbound = 1;
+
+	fprintf(stdout, header_format, ">=(KB) .. <(KB) ", "count",
+			"ratio", "distribution");
+	for (unsigned int i = 0; i < len; i++) {
+		const unsigned int count = file_counts[i];
+		/* no regular file at all: show 0% rather than divide by 0 */
+		double ratio = total ? 100.0 * count / (double)total : 0.0;
+		unsigned int bar = ratio < 100 ? ratio / 2 : 50;	/* '#' per 2% */
+
+		if (i == len - 1)	/* the last bucket has no upper bound */
+			snprintf(col1, sizeof(col1), "%6u ..", lowerbound);
+		else
+			snprintf(col1, sizeof(col1), "%6u .. %-6u",
+				 lowerbound, upperbound);
+		memset(col4, '#', bar);
+		col4[bar] = '\0';
+		dumpfs_print_chart_row(col1, count, ratio, col4);
+		lowerbound = upperbound;
+		upperbound <<= 1;
+	}
+}
+
+/* print the file-extension chart; every ratio is 0% without regular files */
+static void dumpfs_print_chart_of_file_type(char **file_types, unsigned int len)
+{
+	const unsigned long total = stats.file_category_stat[EROFS_FT_REG_FILE];
+	char col1[30];
+	char col4[51];
+
+	fprintf(stdout, header_format, "type", "count", "ratio", "distribution");
+	for (unsigned int i = 0; i < len; i++) {
+		const unsigned int count = stats.file_type_stat[i];
+		double ratio = total ? 100.0 * count / (double)total : 0.0;
+		unsigned int bar = ratio < 100 ? ratio / 2 : 50;	/* '#' per 2% */
+
+		snprintf(col1, sizeof(col1), "%-17s", file_types[i]);
+		memset(col4, '#', bar);
+		col4[bar] = '\0';
+		dumpfs_print_chart_row(col1, count, ratio, col4);
+	}
+}
+
+/* print compression counters; rate = on-disk / original size, 0 if empty */
+static void dumpfs_print_statistic_of_compression(void)
+{
+	stats.compress_rate = !stats.files_total_origin_size ? 0.0 :
+		100.0 * stats.files_total_size / stats.files_total_origin_size;
+	fprintf(stdout, "Filesystem compressed files:            %lu\n"
+			"Filesystem uncompressed files:          %lu\n",
+			stats.compressed_files, stats.uncompressed_files);
+	fprintf(stdout, "Filesystem total original file size:    %lu Bytes\n",
+			stats.files_total_origin_size);
+	fprintf(stdout, "Filesystem total file size:             %lu Bytes\n",
+			stats.files_total_size);
+	fprintf(stdout, "Filesystem compress rate:               %.2f%%\n",
+			stats.compress_rate);
+}
+
+/* -S: walk the whole tree from the root directory, then print the report */
+static void dumpfs_print_statistic(void)
+{
+	stats.blocks = sbi.blocks;
+	if (erofs_read_dir(sbi.root_nid, sbi.root_nid)) {
+		erofs_err("read dir failed");
+		return;
+	}
+
+	/* plain counters first ... */
+	dumpfs_print_statistic_of_filetype();
+	dumpfs_print_statistic_of_compression();
+
+	/* ... then the three distribution charts */
+	fputs("\nOriginal file size distribution:\n", stdout);
+	dumpfs_print_chart_of_file(stats.file_original_size,
+			ARRAY_SIZE(stats.file_original_size));
+	fputs("\nOn-Disk file size distribution:\n", stdout);
+	dumpfs_print_chart_of_file(stats.file_comp_size,
+			ARRAY_SIZE(stats.file_comp_size));
+	fputs("\nFile type distribution:\n", stdout);
+	dumpfs_print_chart_of_file_type(file_types, OTHERFILETYPE);
+}
+
 static void dumpfs_print_superblock(void)
 {
 	time_t time = sbi.build_time;
@@ -156,6 +511,9 @@ int main(int argc, char **argv)
 
 	if (dumpcfg.print_superblock)
 		dumpfs_print_superblock();
+
+	if (dumpcfg.print_statistic)
+		dumpfs_print_statistic();
 out:
 	if (cfg.c_img_path)
 		free(cfg.c_img_path);
-- 
2.25.4


  parent reply	other threads:[~2021-09-14  7:37 UTC|newest]

Thread overview: 10+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-09-14  7:44 [PATCH v2 1/5] erofs-utils: introduce dump.erofs Guo Xuenan
2021-09-14  7:44 ` [PATCH v2 2/5] dump.erofs: add "-s" option to dump superblock information Guo Xuenan
2021-09-14 13:04   ` Gao Xiang
2021-09-14  7:44 ` Guo Xuenan [this message]
2021-09-14 13:16   ` [PATCH v2 3/5] dump.erofs: add -S options for collecting statistics of the whole filesystem Gao Xiang
2021-09-14  7:44 ` [PATCH v2 4/5] dump.erofs: add -i options to dump file information of specific inode number Guo Xuenan
2021-09-14 13:20   ` Gao Xiang
2021-09-15  1:27     ` Guo Xuenan
2021-09-14  7:44 ` [PATCH v2 5/5] dump.erofs: add -I options to dump the layout of a particular inode on disk Guo Xuenan
2021-09-14 12:53 ` [PATCH v2 1/5] erofs-utils: introduce dump.erofs Gao Xiang

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20210914074424.1875409-3-guoxuenan@huawei.com \
    --to=guoxuenan@huawei.com \
    --cc=linux-erofs@lists.ozlabs.org \
    --cc=mpiglet@outlook.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).