All of lore.kernel.org
 help / color / mirror / Atom feed
From: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
To: Linux Doc Mailing List <linux-doc@vger.kernel.org>,
	Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>,
	"Jonathan Corbet" <corbet@lwn.net>,
	linux-kernel@vger.kernel.org
Subject: [PATCH 07/13] scripts: get_abi.pl: add a graph to speedup the undefined algorithm
Date: Thu, 23 Sep 2021 15:30:05 +0200	[thread overview]
Message-ID: <d074a28c6cf91f6451ea957a0141601c744d973c.1632402570.git.mchehab+huawei@kernel.org> (raw)
In-Reply-To: <cover.1632402570.git.mchehab+huawei@kernel.org>

Searching for symlinks is an expensive operation with the current
logic, as it is at the order of O(n^3). In practice, running the
check spends 2-3 minutes to check all symbols.

Fix it by storing the directory tree into a graph, and using
a Breadth First Search (BFS) to find the links for each sysfs node.

With such improvement, it can now report issues with ~11 seconds
on my machine.

It comes with a price, though: there are more symbols reported
as undefined after this change. I suspect it is due to some
sysfs circular loops that are dropped by BFS. Despite such
increase, it seems that the reports are now more coherent.

Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
---
 scripts/get_abi.pl | 189 ++++++++++++++++++++++++++++++---------------
 1 file changed, 127 insertions(+), 62 deletions(-)

diff --git a/scripts/get_abi.pl b/scripts/get_abi.pl
index 41a49ae31c25..9eb8a033d363 100755
--- a/scripts/get_abi.pl
+++ b/scripts/get_abi.pl
@@ -547,6 +547,73 @@ sub dont_parse_special_attributes {
 my %leaf;
 my %aliases;
 my @files;
+my %root;
+
+sub graph_add_file {
+	my $file = shift;
+	my $type = shift;
+
+	my $dir = $file;
+	$dir =~ s,^(.*/).*,$1,;
+	$file =~ s,.*/,,;
+
+	my $name;
+	my $file_ref = \%root;
+	foreach my $edge(split "/", $dir) {
+		$name .= "$edge/";
+		if (!defined ${$file_ref}{$edge}) {
+			${$file_ref}{$edge} = { };
+		}
+		$file_ref = \%{$$file_ref{$edge}};
+		${$file_ref}{"__name"} = [ $name ];
+	}
+	$name .= "$file";
+	${$file_ref}{$file} = {
+		"__name" => [ $name ]
+	};
+
+	return \%{$$file_ref{$file}};
+}
+
+sub graph_add_link {
+	my $file = shift;
+	my $link = shift;
+
+	# Traverse graph to find the reference
+	my $file_ref = \%root;
+	foreach my $edge(split "/", $file) {
+		$file_ref = \%{$$file_ref{$edge}} || die "Missing node!";
+	}
+
+	# do a BFS
+
+	my @queue;
+	my %seen;
+	my $base_name;
+	my $st;
+
+	push @queue, $file_ref;
+	$seen{$start}++;
+
+	while (@queue) {
+		my $v = shift @queue;
+		my @child = keys(%{$v});
+
+		foreach my $c(@child) {
+			next if $seen{$$v{$c}};
+			next if ($c eq "__name");
+
+			# Add new name
+			my $name = @{$$v{$c}{"__name"}}[0];
+			if ($name =~ s#^$file/#$link/#) {
+				push @{$$v{$c}{"__name"}}, $name;
+			}
+			# Add child to the queue and mark as seen
+			push @queue, $$v{$c};
+			$seen{$c}++;
+		}
+	}
+}
 
 my $escape_symbols = qr { ([\x01-\x08\x0e-\x1f\x21-\x29\x2b-\x2d\x3a-\x40\x7b-\xfe]) }x;
 sub parse_existing_sysfs {
@@ -569,19 +636,50 @@ sub parse_existing_sysfs {
 	return if (defined($data{$file}));
 	return if (defined($data{$abs_file}));
 
-	push @files, $abs_file;
+	push @files, graph_add_file($abs_file, "file");
+}
+
+sub get_leave($)
+{
+	my $what = shift;
+	my $leave;
+
+	my $l = $what;
+	my $stop = 1;
+
+	$leave = $l;
+	$leave =~ s,/$,,;
+	$leave =~ s,.*/,,;
+	$leave =~ s/[\(\)]//g;
+
+	# $leave is used to improve search performance at
+	# check_undefined_symbols, as the algorithm there can seek
+	# for a small number of "what". It also allows giving a
+	# hint about a leave with the same name somewhere else.
+	# However, there are a few occurences where the leave is
+	# either a wildcard or a number. Just group such cases
+	# altogether.
+	if ($leave =~ m/^\.\*/ || $leave eq "" || $leave =~ /^\d+$/) {
+		$leave = "others";
+	}
+
+	return $leave;
 }
 
 sub check_undefined_symbols {
-	foreach my $file (sort @files) {
+	foreach my $file_ref (sort @files) {
+		my @names = @{$$file_ref{"__name"}};
+		my $file = $names[0];
 
 		my $defined = 0;
 		my $exact = 0;
-		my $whats = "";
 		my $found_string;
 
-		my $leave = $file;
-		$leave =~ s,.*/,,;
+		my $leave = get_leave($file);
+		if (!defined($leaf{$leave})) {
+			$leave = "others";
+		}
+		my $what = $leaf{$leave};
 
 		my $path = $file;
 		$path =~ s,(.*/).*,$1,;
@@ -591,41 +689,12 @@ sub check_undefined_symbols {
 			$found_string = 1;
 		}
 
-		if ($leave =~ /^\d+$/ || !defined($leaf{$leave})) {
-			$leave = "others";
-		}
-
-		print "--> $file\n" if ($found_string && $hint);
-		my $what = $leaf{$leave};
-		$whats .= " $what" if (!($whats =~ m/$what/));
-
-		foreach my $w (split / /, $what) {
-			if ($file =~ m#^$w$#) {
-				$exact = 1;
-				last;
-			}
-		}
-		# Check for aliases
-		#
-		# TODO: this algorithm is O(w * n²). It can be
-		# improved in the future in order to handle it
-		# faster, by changing parse_existing_sysfs to
-		# store the sysfs inside a tree, at the expense
-		# on making the code less readable and/or using some
-		# additional perl library.
-		foreach my $a (keys %aliases) {
-			my $new = $aliases{$a};
-			my $len = length($new);
-
-			if (substr($file, 0, $len) eq $new) {
-				my $newf = $a . substr($file, $len);
-
-				print "    $newf\n" if ($found_string && $hint);
-				foreach my $w (split / /, $what) {
-					if ($newf =~ m#^$w$#) {
-						$exact = 1;
-						last;
-					}
+		foreach my $a (@names) {
+			print "--> $a\n" if ($found_string && $hint);
+			foreach my $w (split /\xac/, $what) {
+				if ($a =~ m#^$w$#) {
+					$exact = 1;
+					last;
 				}
 			}
 		}
@@ -642,8 +711,13 @@ sub check_undefined_symbols {
 		# is not easily parseable.
 		next if ($file =~ m#/parameters/#);
 
-		if ($hint && $defined && $leave ne "others") {
-			print "$leave at $path might be one of:$whats\n"  if (!$search_string || $found_string);
+		if ($hint && $defined && (!$search_string || $found_string)) {
+			$what =~ s/\xac/\n\t/g;
+			if ($leave ne "others") {
+				print "    more likely regexes:\n\t$what\n";
+			} else {
+				print "    tested regexes:\n\t$what\n";
+			}
 			next;
 		}
 		print "$file not found.\n" if (!$search_string || $found_string);
@@ -657,8 +731,10 @@ sub undefined_symbols {
 		no_chdir => 1
 	     }, $sysfs_prefix);
 
+	$leaf{"others"} = "";
+
 	foreach my $w (sort keys %data) {
-		foreach my $what (split /\xac /,$w) {
+		foreach my $what (split /\xac/,$w) {
 			next if (!($what =~ m/^$sysfs_prefix/));
 
 			# Convert what into regular expressions
@@ -701,20 +777,6 @@ sub undefined_symbols {
 			# (this happens on a few IIO definitions)
 			$what =~ s,\s*\=.*$,,;
 
-			my $leave = $what;
-			$leave =~ s,.*/,,;
-
-			# $leave is used to improve search performance at
-			# check_undefined_symbols, as the algorithm there can seek
-			# for a small number of "what". It also allows giving a
-			# hint about a leave with the same name somewhere else.
-			# However, there are a few occurences where the leave is
-			# either a wildcard or a number. Just group such cases
-			# altogether.
-			if ($leave =~ m/^\.\*/ || $leave eq "" || $leave =~ /^\d+$/) {
-				$leave = "others" ;
-			}
-
 			# Escape all other symbols
 			$what =~ s/$escape_symbols/\\$1/g;
 			$what =~ s/\\\\/\\/g;
@@ -723,17 +785,15 @@ sub undefined_symbols {
 
 			$what =~ s/\xff/\\d+/g;
 
-
 			# Special case: IIO ABI which a parenthesis.
 			$what =~ s/sqrt(.*)/sqrt\(.*\)/;
 
-			$leave =~ s/[\(\)]//g;
-
+			my $leave = get_leave($what);
 			my $added = 0;
 			foreach my $l (split /\|/, $leave) {
 				if (defined($leaf{$l})) {
-					next if ($leaf{$l} =~ m/$what/);
-					$leaf{$l} .= " " . $what;
+					next if ($leaf{$l} =~ m/\b$what\b/);
+					$leaf{$l} .= "\xac" . $what;
 					$added = 1;
 				} else {
 					$leaf{$l} = $what;
@@ -746,6 +806,11 @@ sub undefined_symbols {
 
 		}
 	}
+	# Take links into account
+	foreach my $link (keys %aliases) {
+		my $abs_file = $aliases{$link};
+		graph_add_link($abs_file, $link);
+	}
 	check_undefined_symbols;
 }
 
-- 
2.31.1


  parent reply	other threads:[~2021-09-23 13:30 UTC|newest]

Thread overview: 31+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-09-23 13:29 [PATCH 00/13] get_abi.pl undefined: improve precision and performance Mauro Carvalho Chehab
2021-09-23 13:29 ` [PATCH 01/13] scripts: get_abi.pl: Better handle multiple What parameters Mauro Carvalho Chehab
2021-09-23 13:30 ` [PATCH 02/13] scripts: get_abi.pl: Check for missing symbols at the ABI specs Mauro Carvalho Chehab
2021-09-23 13:30 ` [PATCH 03/13] scripts: get_abi.pl: detect softlinks Mauro Carvalho Chehab
2021-09-23 13:30 ` [PATCH 04/13] scripts: get_abi.pl: add an option to filter undefined results Mauro Carvalho Chehab
2021-09-23 13:30 ` [PATCH 05/13] scripts: get_abi.pl: don't skip what that ends with wildcards Mauro Carvalho Chehab
2021-09-23 13:30 ` [PATCH 06/13] scripts: get_abi.pl: Ignore fs/cgroup sysfs nodes earlier Mauro Carvalho Chehab
2021-09-23 13:30 ` Mauro Carvalho Chehab [this message]
2021-09-23 13:30 ` [PATCH 08/13] scripts: get_abi.pl: improve debug logic Mauro Carvalho Chehab
2021-09-23 13:30 ` [PATCH 09/13] scripts: get_abi.pl: Better handle leaves with wildcards Mauro Carvalho Chehab
2021-09-23 13:30 ` [PATCH 10/13] scripts: get_abi.pl: ignore some sysfs nodes earlier Mauro Carvalho Chehab
2021-09-23 13:30 ` [PATCH 11/13] scripts: get_abi.pl: stop check loop earlier when regex is found Mauro Carvalho Chehab
2021-09-23 13:30 ` [PATCH 12/13] scripts: get_abi.pl: precompile what match regexes Mauro Carvalho Chehab
2021-09-23 13:30 ` [PATCH 13/13] scripts: get_abi.pl: ensure that "others" regex will be parsed Mauro Carvalho Chehab
2021-09-23 13:58 ` [PATCH 00/13] get_abi.pl undefined: improve precision and performance Greg Kroah-Hartman
2021-09-23 15:41   ` [PATCH 0/8] (REBASED) " Mauro Carvalho Chehab
2021-09-23 15:41     ` [PATCH 1/8] scripts: get_abi.pl: Fix get_abi.pl search output Mauro Carvalho Chehab
2021-09-23 15:41     ` [PATCH 2/8] scripts: get_abi.pl: call get_leave() a little late Mauro Carvalho Chehab
2021-09-23 15:41     ` [PATCH 3/8] scripts: get_abi.pl: improve debug logic Mauro Carvalho Chehab
2021-09-23 15:41     ` [PATCH 4/8] scripts: get_abi.pl: Better handle leaves with wildcards Mauro Carvalho Chehab
2021-09-23 15:41     ` [PATCH 5/8] scripts: get_abi.pl: ignore some sysfs nodes earlier Mauro Carvalho Chehab
2021-09-23 15:41     ` [PATCH 6/8] scripts: get_abi.pl: stop check loop earlier when regex is found Mauro Carvalho Chehab
2021-09-23 15:41     ` [PATCH 7/8] scripts: get_abi.pl: precompile what match regexes Mauro Carvalho Chehab
2021-09-23 15:41     ` [PATCH 8/8] scripts: get_abi.pl: ensure that "others" regex will be parsed Mauro Carvalho Chehab
2021-09-23 17:13     ` [PATCH 0/8] (REBASED) get_abi.pl undefined: improve precision and performance Greg Kroah-Hartman
2021-09-27  8:55       ` Mauro Carvalho Chehab
2021-09-27  9:23         ` Greg Kroah-Hartman
2021-09-27 13:39           ` Mauro Carvalho Chehab
2021-09-27 15:48             ` Greg Kroah-Hartman
2021-09-28 10:03               ` Mauro Carvalho Chehab
2021-09-28 10:43                 ` Greg Kroah-Hartman

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=d074a28c6cf91f6451ea957a0141601c744d973c.1632402570.git.mchehab+huawei@kernel.org \
    --to=mchehab+huawei@kernel.org \
    --cc=corbet@lwn.net \
    --cc=gregkh@linuxfoundation.org \
    --cc=linux-doc@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.