All of lore.kernel.org
 help / color / mirror / Atom feed
* [Buildroot] [PATCH 1/1] ppd-merge: speed up per-package-rsync
@ 2023-11-27 22:41 Brandon Maier via buildroot
  2023-11-28 15:49 ` Herve Codina via buildroot
                   ` (2 more replies)
  0 siblings, 3 replies; 10+ messages in thread
From: Brandon Maier via buildroot @ 2023-11-27 22:41 UTC (permalink / raw)
  To: buildroot; +Cc: Herve Codina, Yann E . MORIN, Brandon Maier

The per-package-rsync stage can add a significant amount of time to
builds. They can also be annoying as the slowest rsyncs, the final
target-finalize and host-finalize stages, run on `make all` which we use
frequently during development.

The per-package-rsync is slow because it launches a new rsync for each
source tree, and each rsync must rescan the destination directory and
potentially overwrite files multiple times. So instead we manually walk
the source trees to find only the files that should be written to the
destination, then feed that to a single instance of rsync.

Below is a benchmark running just the host-finalize merge for a system
with 200 host packages, running in both hardlink and full copy mode.

Benchmark 1: ppd-merge-copy
  Time (mean ± σ):      5.332 s ±  0.098 s    [User: 5.300 s, System: 3.926 s]
  Range (min … max):    5.099 s …  5.468 s    10 runs

Benchmark 2: ppd-merge-hardlink
  Time (mean ± σ):      2.067 s ±  0.027 s    [User: 1.218 s, System: 1.614 s]
  Range (min … max):    2.037 s …  2.114 s    10 runs

Benchmark 3: rsync-copy
  Time (mean ± σ):     25.539 s ±  0.233 s    [User: 11.705 s, System: 10.862 s]
  Range (min … max):   25.215 s … 25.966 s    10 runs

Benchmark 4: rsync-hardlink
  Time (mean ± σ):     24.074 s ±  0.271 s    [User: 3.451 s, System: 10.802 s]
  Range (min … max):   23.672 s … 24.522 s    10 runs

Summary
  'ppd-merge-hardlink' ran
    2.58 ± 0.06 times faster than 'ppd-merge-copy'
   11.65 ± 0.20 times faster than 'rsync-hardlink'
   12.36 ± 0.20 times faster than 'rsync-copy'

Signed-off-by: Brandon Maier <brandon.maier@collins.com>
---
 package/pkg-utils.mk         |  16 +---
 support/scripts/ppd-merge.sh | 154 +++++++++++++++++++++++++++++++++++
 2 files changed, 157 insertions(+), 13 deletions(-)
 create mode 100755 support/scripts/ppd-merge.sh

diff --git a/package/pkg-utils.mk b/package/pkg-utils.mk
index 059e86ae0a..3968cabebd 100644
--- a/package/pkg-utils.mk
+++ b/package/pkg-utils.mk
@@ -216,19 +216,9 @@ ifeq ($(BR2_PER_PACKAGE_DIRECTORIES),y)
 # $3: destination directory
 # $4: literal "copy" or "hardlink" to copy or hardlink files from src to dest
 define per-package-rsync
-	mkdir -p $(3)
-	$(foreach pkg,$(1),\
-		rsync -a \
-			--hard-links \
-			$(if $(filter hardlink,$(4)), \
-				--link-dest=$(PER_PACKAGE_DIR)/$(pkg)/$(2)/, \
-				$(if $(filter copy,$(4)), \
-					$(empty), \
-					$(error per-package-rsync can only "copy" or "hardlink", not "$(4)") \
-				) \
-			) \
-			$(PER_PACKAGE_DIR)/$(pkg)/$(2)/ \
-			$(3)$(sep))
+	PER_PACKAGE_DIR=$(PER_PACKAGE_DIR) \
+		$(TOPDIR)/support/scripts/ppd-merge.sh \
+		$(2) $(3) $(4) $(1)
 endef
 
 # prepares the per-package HOST_DIR and TARGET_DIR of the current
diff --git a/support/scripts/ppd-merge.sh b/support/scripts/ppd-merge.sh
new file mode 100755
index 0000000000..eb1caf9e52
--- /dev/null
+++ b/support/scripts/ppd-merge.sh
@@ -0,0 +1,154 @@
+#!/bin/bash
+# Merge a set of Buildroot per-package-directories (PPD) into a single
+# destination directory.
+#
+# ppd-merge scans through all the PPD and builds a table of which files from
+# each source directory will be written to the destination directory. It feeds
+# that table into rsync to tell it exactly which files to copy.
+#
+# Example:
+#   ppd-merge replaces the original method of merging PPD that ran many rsync
+#   commands like below.
+#
+#   > rsync -a --link-dest=$PER_PACKAGE_DIR/pkg-1/host/ \
+#   >     $PER_PACKAGE_DIR/pkg-1/host/ dest/
+#   > rsync -a --link-dest=$PER_PACKAGE_DIR/pkg-2/host/ \
+#   >     $PER_PACKAGE_DIR/pkg-2/host/ dest/
+#   ...
+#   > rsync -a --link-dest=$PER_PACKAGE_DIR/pkg-n/host/ \
+#   >     $PER_PACKAGE_DIR/pkg-n/host/ dest/
+#
+#   The equivalent command with ppd-merge is
+#
+#   > PER_PACKAGE_DIR=$PER_PACKAGE_DIR \
+#   >     ppd-merge.sh host dest/ hardlink \
+#   >     pkg-1 pkg-2 ... pkg-n
+
+set -euo pipefail
+
+usage() {
+    cat <<EOF >&2
+Usage:
+  ppd-merge.sh <host|target> <destination> <hardlink|copy> <pkg-name>...
+
+  PER_PACKAGE_DIR must be defined in the environment.
+
+  Merges each directory for <pkg-name> at
+  "\$PER_PACKAGE_DIR/<pkg-name>/<host|target>/" into <destination>.
+EOF
+    echo "Error: $*" >&2
+    exit 1
+}
+
+search_subtrees() {
+    # Use `find` to walk all subtrees and print their contents in `rsync`
+    # '--relative' path syntax. Filter paths through awk so we discard any
+    # duplicate files found while walking subtrees.
+    (cd "$PER_PACKAGE_DIR" && find "${subtrees[@]}" -mindepth 1 -printf "%H/./%P\0") \
+        | awk '
+        BEGIN {
+            RS="\0";
+            ORS="\0";
+            FS="/\\./";
+        }
+        {
+            if (!($2 in found)) {
+                found[$2]=1;
+                print;
+            }
+        }'
+}
+
+rsync_hardlink() {
+    # rsync hardlinking with --link-dest hard caps at 20 subtrees. Batch up the
+    # input files in $tmpdir until we have 20 subtrees, then flush them with an
+    # rsync call.
+    tmpdir=$(mktemp -d)
+    awk -v tmpdir="$tmpdir" '
+        function mkfiles() {
+            out_file=tmpdir "/" i ".files";
+            subtrees_file=tmpdir "/" i ".subtrees";
+        }
+        function batch_out() {
+            close(out_file);
+            close(subtrees_file);
+            delete subtrees;
+            print i;
+            i+=1;
+            mkfiles();
+        }
+        BEGIN {
+            RS="\0";
+            ORS="\0";
+            FS="/\\./";
+            i=0;
+            mkfiles();
+        }
+        !($1 in subtrees) && (length(subtrees) >= 20) {
+            batch_out();
+        }
+        {
+            print > out_file;
+            if (!($1 in subtrees)) {
+                subtrees[$1]=1;
+                print $1 > subtrees_file;
+            }
+        }
+        END {
+            if (length(subtrees) > 0) {
+                batch_out();
+            }
+        }' | while IFS= read -r -d $'\0' i; do
+            rsync_cmd_batch=( "${rsync_cmd[@]}" )
+            while IFS= read -r -d $'\0' subtree; do
+                rsync_cmd_batch+=( --link-dest="$PER_PACKAGE_DIR/$subtree/" )
+            done <"$tmpdir/$i.subtrees"
+            "${rsync_cmd_batch[@]}" "$PER_PACKAGE_DIR/" "$dest/" <"$tmpdir/$i.files"
+        done
+    rm -rf "$tmpdir"
+}
+
+if [ "$#" -lt 3 ]; then
+    usage "Invalid number of arguments"
+fi
+type=$1
+dest=$2
+link=$3
+shift 3
+
+case "$type" in
+host|target)
+    ;;
+*)
+    usage "Unknown type '$type'"
+    ;;
+esac
+
+subtrees=()
+for ((i = $#; i > 0; i--)); do
+    subtrees+=( "${!i}/$type" )
+done
+
+if [ -z "${PER_PACKAGE_DIR:-}" ]; then
+    usage "PER_PACKAGE_DIR must be defined"
+fi
+
+rsync_cmd=( rsync -a --hard-links --files-from=- --from0 )
+
+mkdir -p "$dest"
+
+if [ "${#subtrees[@]}" -eq 0 ]; then
+    exit 0
+fi
+
+case "$link" in
+hardlink)
+    search_subtrees | rsync_hardlink
+    ;;
+copy)
+    search_subtrees | "${rsync_cmd[@]}" "$PER_PACKAGE_DIR/" "$dest/"
+    ;;
+*)
+    usage "Unknown link command '$link'"
+    ;;
+esac
-- 
2.43.0

_______________________________________________
buildroot mailing list
buildroot@buildroot.org
https://lists.buildroot.org/mailman/listinfo/buildroot

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [Buildroot] [PATCH 1/1] ppd-merge: speed up per-package-rsync
  2023-11-27 22:41 [Buildroot] [PATCH 1/1] ppd-merge: speed up per-package-rsync Brandon Maier via buildroot
@ 2023-11-28 15:49 ` Herve Codina via buildroot
  2023-11-28 17:21   ` [Buildroot] [External] " Maier, Brandon L Collins via buildroot
  2023-11-28 21:07 ` [Buildroot] " Yann E. MORIN
  2023-12-01 17:05 ` [Buildroot] [PATCH v2 " Brandon Maier via buildroot
  2 siblings, 1 reply; 10+ messages in thread
From: Herve Codina via buildroot @ 2023-11-28 15:49 UTC (permalink / raw)
  To: Brandon Maier; +Cc: Yann E . MORIN, buildroot

Hi Brandon,

On Mon, 27 Nov 2023 22:41:39 +0000
Brandon Maier <brandon.maier@collins.com> wrote:

[...]

> diff --git a/package/pkg-utils.mk b/package/pkg-utils.mk
> index 059e86ae0a..3968cabebd 100644
> --- a/package/pkg-utils.mk
> +++ b/package/pkg-utils.mk
> @@ -216,19 +216,9 @@ ifeq ($(BR2_PER_PACKAGE_DIRECTORIES),y)
>  # $3: destination directory
>  # $4: literal "copy" or "hardlink" to copy or hardlink files from src to dest
>  define per-package-rsync
> -	mkdir -p $(3)

Shouldn't the mkdir be kept here ?

> -	$(foreach pkg,$(1),\
> -		rsync -a \
> -			--hard-links \
> -			$(if $(filter hardlink,$(4)), \
> -				--link-dest=$(PER_PACKAGE_DIR)/$(pkg)/$(2)/, \
> -				$(if $(filter copy,$(4)), \
> -					$(empty), \
> -					$(error per-package-rsync can only "copy" or "hardlink", not "$(4)") \
> -				) \
> -			) \
> -			$(PER_PACKAGE_DIR)/$(pkg)/$(2)/ \
> -			$(3)$(sep))
> +	PER_PACKAGE_DIR=$(PER_PACKAGE_DIR) \
> +		$(TOPDIR)/support/scripts/ppd-merge.sh \
> +		$(2) $(3) $(4) $(1)

What do you think about passing the PER_PACKAGE_DIR by parameter instead of
mixing environment values and parameters ?

>  endef
>  
>  # prepares the per-package HOST_DIR and TARGET_DIR of the current
> diff --git a/support/scripts/ppd-merge.sh b/support/scripts/ppd-merge.sh
> new file mode 100755
> index 0000000000..eb1caf9e52
> --- /dev/null
> +++ b/support/scripts/ppd-merge.sh
[...]


Best regards,
Hervé
_______________________________________________
buildroot mailing list
buildroot@buildroot.org
https://lists.buildroot.org/mailman/listinfo/buildroot

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [Buildroot] [External] Re: [PATCH 1/1] ppd-merge: speed up per-package-rsync
  2023-11-28 15:49 ` Herve Codina via buildroot
@ 2023-11-28 17:21   ` Maier, Brandon L Collins via buildroot
  2023-11-28 17:54     ` Herve Codina via buildroot
  0 siblings, 1 reply; 10+ messages in thread
From: Maier, Brandon L Collins via buildroot @ 2023-11-28 17:21 UTC (permalink / raw)
  To: Herve Codina; +Cc: Yann E . MORIN, buildroot

Hi Herve,

> -----Original Message-----
> From: Herve Codina <herve.codina@bootlin.com>
> Sent: Tuesday, November 28, 2023 9:49 AM
> To: Maier, Brandon L Collins <Brandon.Maier@collins.com>
> Cc: buildroot@buildroot.org; Yann E . MORIN <yann.morin.1998@free.fr>
> Subject: [External] Re: [PATCH 1/1] ppd-merge: speed up per-package-rsync
>
> Hi Brandon,
>
> On Mon, 27 Nov 2023 22:41:39 +0000
> Brandon Maier <brandon.maier@collins.com> wrote:
>
> [...]
>
> > diff --git a/package/pkg-utils.mk b/package/pkg-utils.mk
> > index 059e86ae0a..3968cabebd 100644
> > --- a/package/pkg-utils.mk
> > +++ b/package/pkg-utils.mk
> > @@ -216,19 +216,9 @@ ifeq ($(BR2_PER_PACKAGE_DIRECTORIES),y)
> >  # $3: destination directory
> >  # $4: literal "copy" or "hardlink" to copy or hardlink files from src to dest
> >  define per-package-rsync
> > -   mkdir -p $(3)
>
> Shouldn't the mkdir be kept here ?

I moved it into ppd-merge.sh so that the script would be handling all the details of per-package-rsync.

>
> > -   $(foreach pkg,$(1),\
> > -           rsync -a \
> > -                   --hard-links \
> > -                   $(if $(filter hardlink,$(4)), \
> > -                           --link-
> dest=$(PER_PACKAGE_DIR)/$(pkg)/$(2)/, \
> > -                           $(if $(filter copy,$(4)), \
> > -                                   $(empty), \
> > -                                   $(error per-package-rsync can only
> "copy" or "hardlink", not "$(4)") \
> > -                           ) \
> > -                   ) \
> > -                   $(PER_PACKAGE_DIR)/$(pkg)/$(2)/ \
> > -                   $(3)$(sep))
> > +   PER_PACKAGE_DIR=$(PER_PACKAGE_DIR) \
> > +           $(TOPDIR)/support/scripts/ppd-merge.sh \
> > +           $(2) $(3) $(4) $(1)
>
> What do you think about passing the PER_PACKAGE_DIR by parameter
> instead of
> mixing environment values and parameters ?

I was trying to copy the style of support/scripts/fix-rpath, which takes PER_PACKAGE_DIR from the environment. For example, see ./Makefile "prepare-sdk:" target.

>
> >  endef
> >
> >  # prepares the per-package HOST_DIR and TARGET_DIR of the current
> > diff --git a/support/scripts/ppd-merge.sh b/support/scripts/ppd-merge.sh
> > new file mode 100755
> > index 0000000000..eb1caf9e52
> > --- /dev/null
> > +++ b/support/scripts/ppd-merge.sh
> [...]
>
>
> Best regards,
> Hervé

Thanks!
Brandon Maier
_______________________________________________
buildroot mailing list
buildroot@buildroot.org
https://lists.buildroot.org/mailman/listinfo/buildroot

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [Buildroot] [External] Re: [PATCH 1/1] ppd-merge: speed up per-package-rsync
  2023-11-28 17:21   ` [Buildroot] [External] " Maier, Brandon L Collins via buildroot
@ 2023-11-28 17:54     ` Herve Codina via buildroot
  2023-11-28 18:04       ` Maier, Brandon L Collins via buildroot
  0 siblings, 1 reply; 10+ messages in thread
From: Herve Codina via buildroot @ 2023-11-28 17:54 UTC (permalink / raw)
  To: Maier, Brandon L                            Collins, Yann E . MORIN
  Cc: buildroot

Hi Brandon, Yann

On Tue, 28 Nov 2023 17:21:25 +0000
"Maier, Brandon L                            Collins" <Brandon.Maier@collins.com> wrote:

[...]

> > >  define per-package-rsync
> > > -   mkdir -p $(3)  
> >
> > Shouldn't the mkdir be kept here ?  
> 
> I moved it into ppd-merge.sh so that the script would be handling all the details of per-package-rsync.

Oops, my bad I missed it!

[...]
> > > +   PER_PACKAGE_DIR=$(PER_PACKAGE_DIR) \
> > > +           $(TOPDIR)/support/scripts/ppd-merge.sh \
> > > +           $(2) $(3) $(4) $(1)  
> >
> > What do you think about passing the PER_PACKAGE_DIR by parameter
> > instead of
> > mixing environment values and parameters ?  
> 
> I was trying to copy the style of support/scripts/fix-rpath, which takes PER_PACKAGE_DIR from the environment. For example, see ./Makefile "prepare-sdk:" target.
> 

Makes sense.
On the other hand, support/scripts/check-host-rpath takes PER_PACKAGE_DIR from parameters.
  https://gitlab.com/buildroot.org/buildroot/-/blob/master/package/pkg-generic.mk?ref_type=heads#L62

I have no opinion about what is the best to do.
Maybe Yann ?

Best regards,
Hervé
_______________________________________________
buildroot mailing list
buildroot@buildroot.org
https://lists.buildroot.org/mailman/listinfo/buildroot

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [Buildroot] [External] Re: [PATCH 1/1] ppd-merge: speed up per-package-rsync
  2023-11-28 17:54     ` Herve Codina via buildroot
@ 2023-11-28 18:04       ` Maier, Brandon L Collins via buildroot
  0 siblings, 0 replies; 10+ messages in thread
From: Maier, Brandon L Collins via buildroot @ 2023-11-28 18:04 UTC (permalink / raw)
  To: Herve Codina, Yann E . MORIN; +Cc: buildroot

Hi Herve, Yann

> [...]
> > > > +   PER_PACKAGE_DIR=$(PER_PACKAGE_DIR) \
> > > > +           $(TOPDIR)/support/scripts/ppd-merge.sh \
> > > > +           $(2) $(3) $(4) $(1)
> > >
> > > What do you think about passing the PER_PACKAGE_DIR by parameter
> > > instead of
> > > mixing environment values and parameters ?
> >
> > I was trying to copy the style of support/scripts/fix-rpath, which takes
> PER_PACKAGE_DIR from the environment. For example, see ./Makefile
> "prepare-sdk:" target.
> >
>
> Makes sense.
> On the other hand, support/scripts/check-host-rpath takes
> PER_PACKAGE_DIR from parameters.
>   https://urldefense.com/v3/__https://gitlab.com/buildroot.org/buildroot/-
> /blob/master/package/pkg-
> generic.mk?ref_type=heads*L62__;Iw!!MvWE!E_uYw1n1D_rdclSbh2DWEtXp
> kOR6peTRWGpvjNiTFyvSz_0PFA-
> zD4uc1xXoqQl0T4Cp_wjJ6O6kYHkOqHoURX7ebRA$
>
> I have no opinion about what is the best to do.
> Maybe Yann ?

And oops, I missed check-host-rpath :). It's a trivial change so I can send a v2 with that if it's preferred.

> Best regards,
> Hervé
_______________________________________________
buildroot mailing list
buildroot@buildroot.org
https://lists.buildroot.org/mailman/listinfo/buildroot

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [Buildroot] [PATCH 1/1] ppd-merge: speed up per-package-rsync
  2023-11-27 22:41 [Buildroot] [PATCH 1/1] ppd-merge: speed up per-package-rsync Brandon Maier via buildroot
  2023-11-28 15:49 ` Herve Codina via buildroot
@ 2023-11-28 21:07 ` Yann E. MORIN
  2023-11-29  0:00   ` [Buildroot] [External] " Maier, Brandon L Collins via buildroot
  2023-11-29  8:17   ` [Buildroot] " Herve Codina via buildroot
  2023-12-01 17:05 ` [Buildroot] [PATCH v2 " Brandon Maier via buildroot
  2 siblings, 2 replies; 10+ messages in thread
From: Yann E. MORIN @ 2023-11-28 21:07 UTC (permalink / raw)
  To: Brandon Maier; +Cc: Herve Codina, Thomas Petazzoni, buildroot

Brandon, All,

+Thomas for the intial work on this topic

On 2023-11-27 22:41 +0000, Brandon Maier spake thusly:
> The per-package-rsync stage can add a significant amount of time to
> builds. They can also be annoying as the slowest rsyncs, the final
> target-finalize and host-finalize stages, run on `make all` which we use
> frequently during development.
> 
> The per-package-rsync is slow because it launches a new rsync for each
> source tree, and each rsync must rescan the destination directory and
> potentially overwrite files multiple times. So instead we manually walk
> the source trees to find only the files that should be written to the
> destination, then feed that to a single instance of rsync.

Sorry, I haven't looked into the script in details, but why can't we
just do something like:

    rsync -a \
        --hard-links \
        $(if $(filter hardlink,$(4)), \
            --link-dest=$(PER_PACKAGE_DIR)/$(pkg)/$(2)/, \
            $(if $(filter copy,$(4)), \
                $(empty), \
                $(error per-package-rsync can only "copy" or "hardlink", not "$(4)") \
            ) \
        ) \
        $(patsubst %, $(PER_PACKAGE_DIR)/%/$(2)/, $(1)) \
        $(3)

rsync can take multiple sources, and is usually pretty fast, so I would
very much prefer that we do not invent our own tooling if a single rsync
can do the job.

Also, the copy situation is now only ever needed when we aggregate the
final target and host directories, while the hard-linking case is used
when assembling each per-package; so there is no point in benchmarking
the copy mode for the per-package preparation, or the hard-linking for
the final target and host assembling.

Hervé, Thomas, do you remember if we ever addressed using a single rsync
rather than multiple ones in your previous work on the topic, and if so,
why we did not use that?

Regards,
Yann E. MORIN.

>  create mode 100755 support/scripts/ppd-merge.sh
> 
> diff --git a/package/pkg-utils.mk b/package/pkg-utils.mk
> index 059e86ae0a..3968cabebd 100644
> --- a/package/pkg-utils.mk
> +++ b/package/pkg-utils.mk
> @@ -216,19 +216,9 @@ ifeq ($(BR2_PER_PACKAGE_DIRECTORIES),y)
>  # $3: destination directory
>  # $4: literal "copy" or "hardlink" to copy or hardlink files from src to dest
>  define per-package-rsync
> -	mkdir -p $(3)
> -	$(foreach pkg,$(1),\
> -		rsync -a \
> -			--hard-links \
> -			$(if $(filter hardlink,$(4)), \
> -				--link-dest=$(PER_PACKAGE_DIR)/$(pkg)/$(2)/, \
> -				$(if $(filter copy,$(4)), \
> -					$(empty), \
> -					$(error per-package-rsync can only "copy" or "hardlink", not "$(4)") \
> -				) \
> -			) \
> -			$(PER_PACKAGE_DIR)/$(pkg)/$(2)/ \
> -			$(3)$(sep))
> +	PER_PACKAGE_DIR=$(PER_PACKAGE_DIR) \
> +		$(TOPDIR)/support/scripts/ppd-merge.sh \
> +		$(2) $(3) $(4) $(1)
>  endef
>  
>  # prepares the per-package HOST_DIR and TARGET_DIR of the current
> diff --git a/support/scripts/ppd-merge.sh b/support/scripts/ppd-merge.sh
> new file mode 100755
> index 0000000000..eb1caf9e52
> --- /dev/null
> +++ b/support/scripts/ppd-merge.sh
> @@ -0,0 +1,154 @@
> +#!/bin/bash
> +# Merge a set of Buildroot per-package-directories (PPD) into a single
> +# destination directory.
> +#
> +# ppd-merge scans through all the PPD and builds a table of which files from
> +# each source directory will be written to the destination directory. It feeds
> +# that table into rsync to tell it exactly which files to copy.
> +#
> +# Example:
> +#   ppd-merge replaces the original method of merging PPD that ran many rsync
> +#   commands like below.
> +#
> +#   > rsync -a --link-dest=$PER_PACKAGE_DIR/pkg-1/host/ \
> +#   >     $PER_PACKAGE_DIR/pkg-1/host/ dest/
> +#   > rsync -a --link-dest=$PER_PACKAGE_DIR/pkg-2/host/ \
> +#   >     $PER_PACKAGE_DIR/pkg-2/host/ dest/
> +#   ...
> +#   > rsync -a --link-dest=$PER_PACKAGE_DIR/pkg-n/host/ \
> +#   >     $PER_PACKAGE_DIR/pkg-n/host/ dest/
> +#
> +#   The equivalent command with ppd-merge is
> +#
> +#   > PER_PACKAGE_DIR=$PER_PACKAGE_DIR \
> +#   >     ppd-merge.sh host dest/ hardlink \
> +#   >     pkg-1 pkg-2 ... pkg-n
> +
> +set -euo pipefail
> +
> +usage() {
> +    cat <<EOF >&2
> +Usage:
> +  ppd-merge.sh <host|target> <destination> <hardlink|copy> <pkg-name>...
> +
> +  PER_PACKAGE_DIR must be defined in the environment.
> +
> +  Merges each directory for <pkg-name> at
> +  "\$PER_PACKAGE_DIR/<pkg-name>/<host|target>/" into <destination>.
> +EOF
> +    echo "Error: $*" >&2
> +    exit 1
> +}
> +
> +search_subtrees() {
> +    # Use `find` to walk all subtrees and print their contents in `rsync`
> +    # '--relative' path syntax. Filter paths through awk so we discard any
> +    # duplicate files found while walking subtrees.
> +    (cd "$PER_PACKAGE_DIR" && find "${subtrees[@]}" -mindepth 1 -printf "%H/./%P\0") \
> +        | awk '
> +        BEGIN {
> +            RS="\0";
> +            ORS="\0";
> +            FS="/\\./";
> +        }
> +        {
> +            if (!($2 in found)) {
> +                found[$2]=1;
> +                print;
> +            }
> +        }'
> +}
> +
> +rsync_hardlink() {
> +    # rsync hardlinking with --link-dest hard caps at 20 subtrees. Batch up the
> +    # input files in $tmpdir until we have 20 subtrees, then flush them with an
> +    # rsync call.
> +    tmpdir=$(mktemp -d)
> +    awk -v tmpdir="$tmpdir" '
> +        function mkfiles() {
> +            out_file=tmpdir "/" i ".files";
> +            subtrees_file=tmpdir "/" i ".subtrees";
> +        }
> +        function batch_out() {
> +            close(out_file);
> +            close(subtrees_file);
> +            delete subtrees;
> +            print i;
> +            i+=1;
> +            mkfiles();
> +        }
> +        BEGIN {
> +            RS="\0";
> +            ORS="\0";
> +            FS="/\\./";
> +            i=0;
> +            mkfiles();
> +        }
> +        !($1 in subtrees) && (length(subtrees) >= 20) {
> +            batch_out();
> +        }
> +        {
> +            print > out_file;
> +            if (!($1 in subtrees)) {
> +                subtrees[$1]=1;
> +                print $1 > subtrees_file;
> +            }
> +        }
> +        END {
> +            if (length(subtrees) > 0) {
> +                batch_out();
> +            }
> +        }' | while IFS= read -r -d $'\0' i; do
> +            rsync_cmd_batch=( "${rsync_cmd[@]}" )
> +            while IFS= read -r -d $'\0' subtree; do
> +                rsync_cmd_batch+=( --link-dest="$PER_PACKAGE_DIR/$subtree/" )
> +            done <"$tmpdir/$i.subtrees"
> +            "${rsync_cmd_batch[@]}" "$PER_PACKAGE_DIR/" "$dest/" <"$tmpdir/$i.files"
> +        done
> +    rm -rf "$tmpdir"
> +}
> +
> +if [ "$#" -lt 3 ]; then
> +    usage "Invalid number of arguments"
> +fi
> +type=$1
> +dest=$2
> +link=$3
> +shift 3
> +
> +case "$type" in
> +host|target)
> +    ;;
> +*)
> +    usage "Unknown type '$type'"
> +    ;;
> +esac
> +
> +subtrees=()
> +for ((i = $#; i > 0; i--)); do
> +    subtrees+=( "${!i}/$type" )
> +done
> +
> +if [ -z "${PER_PACKAGE_DIR:-}" ]; then
> +    usage "PER_PACKAGE_DIR must be defined"
> +fi
> +
> +rsync_cmd=( rsync -a --hard-links --files-from=- --from0 )
> +
> +mkdir -p "$dest"
> +
> +if [ "${#subtrees[@]}" -eq 0 ]; then
> +    exit 0
> +fi
> +
> +case "$link" in
> +hardlink)
> +    search_subtrees | rsync_hardlink
> +    ;;
> +copy)
> +    search_subtrees | "${rsync_cmd[@]}" "$PER_PACKAGE_DIR/" "$dest/"
> +    ;;
> +*)
> +    usage "Unknown link command '$link'"
> +    ;;
> +esac
> -- 
> 2.43.0
> 

-- 
.-----------------.--------------------.------------------.--------------------.
|  Yann E. MORIN  | Real-Time Embedded | /"\ ASCII RIBBON | Erics' conspiracy: |
| +33 662 376 056 | Software  Designer | \ / CAMPAIGN     |  ___               |
| +33 561 099 427 `------------.-------:  X  AGAINST      |  \e/  There is no  |
| http://ymorin.is-a-geek.org/ | _/*\_ | / \ HTML MAIL    |   v   conspiracy.  |
'------------------------------^-------^------------------^--------------------'
_______________________________________________
buildroot mailing list
buildroot@buildroot.org
https://lists.buildroot.org/mailman/listinfo/buildroot

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [Buildroot] [External] Re: [PATCH 1/1] ppd-merge: speed up per-package-rsync
  2023-11-28 21:07 ` [Buildroot] " Yann E. MORIN
@ 2023-11-29  0:00   ` Maier, Brandon L Collins via buildroot
  2023-11-29  8:17   ` [Buildroot] " Herve Codina via buildroot
  1 sibling, 0 replies; 10+ messages in thread
From: Maier, Brandon L Collins via buildroot @ 2023-11-29  0:00 UTC (permalink / raw)
  To: Yann E. MORIN; +Cc: Herve Codina, Thomas Petazzoni, buildroot

Hi Yann,

> -----Original Message-----
> From: Yann E. MORIN <yann.morin.1998@free.fr>
> Sent: Tuesday, November 28, 2023 3:07 PM
> To: Maier, Brandon L Collins <Brandon.Maier@collins.com>
> Cc: buildroot@buildroot.org; Herve Codina <herve.codina@bootlin.com>;
> Thomas Petazzoni <thomas.petazzoni@bootlin.com>
> Subject: [External] Re: [PATCH 1/1] ppd-merge: speed up per-package-rsync
>
> Brandon, All,
>
> +Thomas for the intial work on this topic
>
> On 2023-11-27 22:41 +0000, Brandon Maier spake thusly:
> > The per-package-rsync stage can add a significant amount of time to
> > builds. They can also be annoying as the slowest rsyncs, the final
> > target-finalize and host-finalize stages, run on `make all` which we use
> > frequently during development.
> >
> > The per-package-rsync is slow because it launches a new rsync for each
> > source tree, and each rsync must rescan the destination directory and
> > potentially overwrite files multiple times. So instead we manually walk
> > the source trees to find only the files that should be written to the
> > destination, then feed that to a single instance of rsync.
>
> Sorry, I haven't looked into the script in details, but why can't we
> just do something like:
>
>     rsync -a \
>         --hard-links \
>         $(if $(filter hardlink,$(4)), \
>             --link-dest=$(PER_PACKAGE_DIR)/$(pkg)/$(2)/, \
>             $(if $(filter copy,$(4)), \
>                 $(empty), \
>                 $(error per-package-rsync can only "copy" or "hardlink", not "$(4)") \
>             ) \
>         ) \
>         $(patsubst %, $(PER_PACKAGE_DIR)/%/$(2)/, $(1)) \
>         $(3)
>
> rsync can take multiple sources, and is usually pretty fast, so I would
> very much prefer that we do not invent our own tooling if a single rsync
> can do the job.

I originally thought this wasn't possible, because "--dest-link" only supports up to 20 input trees. However it looks like batching up the rsync's into groups of 20, walking the trees backwards, and using "--ignore-existing" makes it possible to recreate the original behavior with much less code. Some preliminary benchmarks it runs only about 20% slower, which is still significantly better then the original version.

I will make some updates and submit a v2.

>
> Also, the copy situation is now only ever needed when we aggregate the
> final target and host directories, while the hard-linking case is used
> when assembling each per-package; so there is no point in benchmarking
> the copy mode for the per-package preparation, or the hard-linking for
> the final target and host assembling.

I was just using it as a static benchmark to compare the performance of the new vs old hardlinking and new vs old copying. It is not meant to reflect real world improvements, as those will vary wildly with which packages a defconfig is using.

> Hervé, Thomas, do you remember if we ever addressed using a single rsync
> rather than multiple ones in your previous work on the topic, and if so,
> why we did not use that?
>
> Regards,
> Yann E. MORIN.
>
> >  create mode 100755 support/scripts/ppd-merge.sh
> >
> > diff --git a/package/pkg-utils.mk b/package/pkg-utils.mk
> > index 059e86ae0a..3968cabebd 100644
> > --- a/package/pkg-utils.mk
> > +++ b/package/pkg-utils.mk
> > @@ -216,19 +216,9 @@ ifeq ($(BR2_PER_PACKAGE_DIRECTORIES),y)
> >  # $3: destination directory
> >  # $4: literal "copy" or "hardlink" to copy or hardlink files from src to dest
> >  define per-package-rsync
> > -   mkdir -p $(3)
> > -   $(foreach pkg,$(1),\
> > -           rsync -a \
> > -                   --hard-links \
> > -                   $(if $(filter hardlink,$(4)), \
> > -                           --link-
> dest=$(PER_PACKAGE_DIR)/$(pkg)/$(2)/, \
> > -                           $(if $(filter copy,$(4)), \
> > -                                   $(empty), \
> > -                                   $(error per-package-rsync can only
> "copy" or "hardlink", not "$(4)") \
> > -                           ) \
> > -                   ) \
> > -                   $(PER_PACKAGE_DIR)/$(pkg)/$(2)/ \
> > -                   $(3)$(sep))
> > +   PER_PACKAGE_DIR=$(PER_PACKAGE_DIR) \
> > +           $(TOPDIR)/support/scripts/ppd-merge.sh \
> > +           $(2) $(3) $(4) $(1)
> >  endef
> >
> >  # prepares the per-package HOST_DIR and TARGET_DIR of the current
> > diff --git a/support/scripts/ppd-merge.sh b/support/scripts/ppd-merge.sh
> > new file mode 100755
> > index 0000000000..eb1caf9e52
> > --- /dev/null
> > +++ b/support/scripts/ppd-merge.sh
> > @@ -0,0 +1,154 @@
> > +#!/bin/bash
> > +# Merge a set of Buildroot per-package-directories (PPD) into a single
> > +# destination directory.
> > +#
> > +# ppd-merge scans through all the PPD and builds a table of which files
> from
> > +# each source directory will be written to the destination directory. It feeds
> > +# that table into rsync to tell it exactly which files to copy.
> > +#
> > +# Example:
> > +#   ppd-merge replaces the original method of merging PPD that ran many
> rsync
> > +#   commands like below.
> > +#
> > +#   > rsync -a --link-dest=$PER_PACKAGE_DIR/pkg-1/host/ \
> > +#   >     $PER_PACKAGE_DIR/pkg-1/host/ dest/
> > +#   > rsync -a --link-dest=$PER_PACKAGE_DIR/pkg-2/host/ \
> > +#   >     $PER_PACKAGE_DIR/pkg-2/host/ dest/
> > +#   ...
> > +#   > rsync -a --link-dest=$PER_PACKAGE_DIR/pkg-n/host/ \
> > +#   >     $PER_PACKAGE_DIR/pkg-n/host/ dest/
> > +#
> > +#   The equivalent command with ppd-merge is
> > +#
> > +#   > PER_PACKAGE_DIR=$PER_PACKAGE_DIR \
> > +#   >     ppd-merge.sh host dest/ hardlink \
> > +#   >     pkg-1 pkg-2 ... pkg-n
> > +
> > +set -euo pipefail
> > +
> > +usage() {
> > +    cat <<EOF >&2
> > +Usage:
> > +  ppd-merge.sh <host|target> <destination> <hardlink|copy> <pkg-
> name>...
> > +
> > +  PER_PACKAGE_DIR must be defined in the environment.
> > +
> > +  Merges each directory for <pkg-name> at
> > +  "\$PER_PACKAGE_DIR/<pkg-name>/<host|target>/" into <destination>.
> > +EOF
> > +    echo "Error: $*" >&2
> > +    exit 1
> > +}
> > +
> > +search_subtrees() {
> > +    # Use `find` to walk all subtrees and print their contents in `rsync`
> > +    # '--relative' path syntax. Filter paths through awk so we discard any
> > +    # duplicate files found while walking subtrees.
> > +    (cd "$PER_PACKAGE_DIR" && find "${subtrees[@]}" -mindepth 1 -printf
> "%H/./%P\0") \
> > +        | awk '
> > +        BEGIN {
> > +            RS="\0";
> > +            ORS="\0";
> > +            FS="/\\./";
> > +        }
> > +        {
> > +            if (!($2 in found)) {
> > +                found[$2]=1;
> > +                print;
> > +            }
> > +        }'
> > +}
> > +
> > +rsync_hardlink() {
> > +    # rsync hardlinking with --link-dest hard caps at 20 subtrees. Batch up the
> > +    # input files in $tmpdir until we have 20 subtrees, then flush them with
> an
> > +    # rsync call.
> > +    tmpdir=$(mktemp -d)
> > +    awk -v tmpdir="$tmpdir" '
> > +        function mkfiles() {
> > +            out_file=tmpdir "/" i ".files";
> > +            subtrees_file=tmpdir "/" i ".subtrees";
> > +        }
> > +        function batch_out() {
> > +            close(out_file);
> > +            close(subtrees_file);
> > +            delete subtrees;
> > +            print i;
> > +            i+=1;
> > +            mkfiles();
> > +        }
> > +        BEGIN {
> > +            RS="\0";
> > +            ORS="\0";
> > +            FS="/\\./";
> > +            i=0;
> > +            mkfiles();
> > +        }
> > +        !($1 in subtrees) && (length(subtrees) >= 20) {
> > +            batch_out();
> > +        }
> > +        {
> > +            print > out_file;
> > +            if (!($1 in subtrees)) {
> > +                subtrees[$1]=1;
> > +                print $1 > subtrees_file;
> > +            }
> > +        }
> > +        END {
> > +            if (length(subtrees) > 0) {
> > +                batch_out();
> > +            }
> > +        }' | while IFS= read -r -d $'\0' i; do
> > +            rsync_cmd_batch=( "${rsync_cmd[@]}" )
> > +            while IFS= read -r -d $'\0' subtree; do
> > +                rsync_cmd_batch+=( --link-dest="$PER_PACKAGE_DIR/$subtree/"
> )
> > +            done <"$tmpdir/$i.subtrees"
> > +            "${rsync_cmd_batch[@]}" "$PER_PACKAGE_DIR/" "$dest/"
> <"$tmpdir/$i.files"
> > +        done
> > +    rm -rf "$tmpdir"
> > +}
> > +
> > +if [ "$#" -lt 3 ]; then
> > +    usage "Invalid number of arguments"
> > +fi
> > +type=$1
> > +dest=$2
> > +link=$3
> > +shift 3
> > +
> > +case "$type" in
> > +host|target)
> > +    ;;
> > +*)
> > +    usage "Unknown type '$type'"
> > +    ;;
> > +esac
> > +
> > +subtrees=()
> > +for ((i = $#; i > 0; i--)); do
> > +    subtrees+=( "${!i}/$type" )
> > +done
> > +
> > +if [ -z "${PER_PACKAGE_DIR:-}" ]; then
> > +    usage "PER_PACKAGE_DIR must be defined"
> > +fi
> > +
> > +rsync_cmd=( rsync -a --hard-links --files-from=- --from0 )
> > +
> > +mkdir -p "$dest"
> > +
> > +if [ "${#subtrees[@]}" -eq 0 ]; then
> > +    exit 0
> > +fi
> > +
> > +case "$link" in
> > +hardlink)
> > +    search_subtrees | rsync_hardlink
> > +    ;;
> > +copy)
> > +    search_subtrees | "${rsync_cmd[@]}" "$PER_PACKAGE_DIR/" "$dest/"
> > +    ;;
> > +*)
> > +    usage "Unknown link command '$link'"
> > +    ;;
> > +esac
> > --
> > 2.43.0
> >
>
> --
> .-----------------.--------------------.------------------.--------------------.
> |  Yann E. MORIN  | Real-Time Embedded | /"\ ASCII RIBBON | Erics'
> conspiracy: |
> | +33 662 376 056 | Software  Designer | \ / CAMPAIGN     |  ___               |
> | +33 561 099 427 `------------.-------:  X  AGAINST      |  \e/  There is no  |
> | https://urldefense.com/v3/__http://ymorin.is-a-
> geek.org/__;!!MvWE!ESOcVSl3HkPjs74urdWxxKAVO8wqE207RmNp6WqZh
> MaKF8nydTSOWJWa0XHmpofTVshBIRx8ScF-eFb7_WMjzqWnbS3vZQ$  |
> _/*\_ | / \ HTML MAIL    |   v   conspiracy.  |
> '------------------------------^-------^------------------^--------------------'

Thanks,
Brandon

_______________________________________________
buildroot mailing list
buildroot@buildroot.org
https://lists.buildroot.org/mailman/listinfo/buildroot

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [Buildroot] [PATCH 1/1] ppd-merge: speed up per-package-rsync
  2023-11-28 21:07 ` [Buildroot] " Yann E. MORIN
  2023-11-29  0:00   ` [Buildroot] [External] " Maier, Brandon L Collins via buildroot
@ 2023-11-29  8:17   ` Herve Codina via buildroot
  1 sibling, 0 replies; 10+ messages in thread
From: Herve Codina via buildroot @ 2023-11-29  8:17 UTC (permalink / raw)
  To: Yann E. MORIN; +Cc: Thomas Petazzoni, Brandon Maier, buildroot

Hi Yann,

On Tue, 28 Nov 2023 22:07:17 +0100
"Yann E. MORIN" <yann.morin.1998@free.fr> wrote:

> Brandon, All,
> 
> +Thomas for the intial work on this topic
> 
> On 2023-11-27 22:41 +0000, Brandon Maier spake thusly:
> > The per-package-rsync stage can add a significant amount of time to
> > builds. They can also be annoying as the slowest rsyncs, the final
> > target-finalize and host-finalize stages, run on `make all` which we use
> > frequently during development.
> > 
> > The per-package-rsync is slow because it launches a new rsync for each
> > source tree, and each rsync must rescan the destination directory and
> > potentially overwrite files multiple times. So instead we manually walk
> > the source trees to find only the files that should be written to the
> > destination, then feed that to a single instance of rsync.  
> 
> Sorry, I haven't looked into the script in details, but why can't we
> just do something like:
> 
>     rsync -a \
>         --hard-links \
>         $(if $(filter hardlink,$(4)), \
>             --link-dest=$(PER_PACKAGE_DIR)/$(pkg)/$(2)/, \
>             $(if $(filter copy,$(4)), \
>                 $(empty), \
>                 $(error per-package-rsync can only "copy" or "hardlink", not "$(4)") \
>             ) \
>         ) \
>         $(patsubst %, $(PER_PACKAGE_DIR)/%/$(2)/, $(1)) \
>         $(3)
> 
> rsync can take multiple sources, and is usually pretty fast, so I would
> very much prefer that we do not invent our own tooling if a single rsync
> can do the job.
> 
> Also, the copy situation is now only ever needed when we aggregate the
> final target and host directories, while the hard-linking case is used
> when assembling each per-package; so there is no point in benchmarking
> the copy mode for the per-package preparation, or the hard-linking for
> the final target and host assembling.
> 
> Hervé, Thomas, do you remember if we ever addressed using a single rsync
> rather than multiple ones in your previous work on the topic, and if so,
> why we did not use that?

I think we did only multiple rsync calls.
We had in mind the file overwrite detection and so we did the operation
per package keeping the $(foreach pkg,$(1), ...) loop.

And why we did not use that ?
First because I didn't think about it...
and second because we were in the strategy to check overwrites per package.

Best regards,
Hervé
_______________________________________________
buildroot mailing list
buildroot@buildroot.org
https://lists.buildroot.org/mailman/listinfo/buildroot

^ permalink raw reply	[flat|nested] 10+ messages in thread

* [Buildroot] [PATCH v2 1/1] ppd-merge: speed up per-package-rsync
  2023-11-27 22:41 [Buildroot] [PATCH 1/1] ppd-merge: speed up per-package-rsync Brandon Maier via buildroot
  2023-11-28 15:49 ` Herve Codina via buildroot
  2023-11-28 21:07 ` [Buildroot] " Yann E. MORIN
@ 2023-12-01 17:05 ` Brandon Maier via buildroot
  2023-12-04 12:46   ` Herve Codina via buildroot
  2 siblings, 1 reply; 10+ messages in thread
From: Brandon Maier via buildroot @ 2023-12-01 17:05 UTC (permalink / raw)
  To: buildroot; +Cc: Thomas Petazzoni, Herve Codina, Yann E . MORIN, Brandon Maier

The per-package-rsync stage can add a significant amount of time to
builds. They can also be annoying as the target-finalize and
host-finalize targets are the slowest and run on every `make all`, which
is used frequently for partial rebuilds.

The per-package-rsync is slow because it launches a new rsync for each
source tree, and each rsync must rescan the destination directory and
potentially overwrite files multiple times. We can instead merge all the
rsync calls down into one call, and rsync is smarter about scanning all
the source directories and only copying over the files it needs to.

We feed the source trees to rsync in reverse-order, as this preserves
the original behaviour. I.e. when using multiple rsyncs, the last source
tree would overwrite anything in the destination. Now when using a
single rsync, we put the last tree first as rsync will select the first
file it finds.

This only supports the 'copy' mode, which is used in the finalize step.
The 'hardlink' mode requires specifying each source tree with the
--link-dest flag but enforces a maximum of 20 trees.

Below is a benchmark running the host-finalize target for a build with
200 packages.

Benchmark 1: before copy
  Time (mean ± σ):     27.171 s ±  0.777 s    [User: 6.170 s, System: 14.830 s]
  Range (min … max):   26.343 s … 28.566 s    10 runs

Benchmark 2: after copy
  Time (mean ± σ):      6.296 s ±  0.196 s    [User: 2.874 s, System: 5.600 s]
  Range (min … max):    6.094 s …  6.709 s    10 runs

Summary
  after copy ran
    4.32 ± 0.18 times faster than before copy

Cc: Herve Codina <herve.codina@bootlin.com>
Cc: Yann E. MORIN <yann.morin.1998@free.fr>
Cc: Thomas Petazzoni <thomas.petazzoni@bootlin.com>
Signed-off-by: Brandon Maier <brandon.maier@collins.com>
---
v1: https://patchwork.ozlabs.org/project/buildroot/patch/20231127224139.35969-1-brandon.maier@collins.com/

v2:
- Simplify logic for 'copy' mode
- Drop support for 'hardlink' mode

Signed-off-by: Brandon Maier <brandon.maier@collins.com>
---
 package/pkg-utils.mk | 18 ++++++------------
 1 file changed, 6 insertions(+), 12 deletions(-)

diff --git a/package/pkg-utils.mk b/package/pkg-utils.mk
index 059e86ae0a..bada328829 100644
--- a/package/pkg-utils.mk
+++ b/package/pkg-utils.mk
@@ -217,18 +217,12 @@ ifeq ($(BR2_PER_PACKAGE_DIRECTORIES),y)
 # $4: literal "copy" or "hardlink" to copy or hardlink files from src to dest
 define per-package-rsync
 	mkdir -p $(3)
-	$(foreach pkg,$(1),\
-		rsync -a \
-			--hard-links \
-			$(if $(filter hardlink,$(4)), \
-				--link-dest=$(PER_PACKAGE_DIR)/$(pkg)/$(2)/, \
-				$(if $(filter copy,$(4)), \
-					$(empty), \
-					$(error per-package-rsync can only "copy" or "hardlink", not "$(4)") \
-				) \
-			) \
-			$(PER_PACKAGE_DIR)/$(pkg)/$(2)/ \
-			$(3)$(sep))
+	$(if $(filter hardlink,$(4)), \
+		$(foreach pkg,$(1),\
+			rsync -a --hard-links --link-dest=$(PER_PACKAGE_DIR)/$(pkg)/$(2)/ \
+				$(PER_PACKAGE_DIR)/$(pkg)/$(2)/ $(3)$(sep)), \
+		printf "%s/$(2)/\n" $(1) | tac \
+			| rsync -a --hard-links --files-from=- --no-R -r $(PER_PACKAGE_DIR) $(3))
 endef
 
 # prepares the per-package HOST_DIR and TARGET_DIR of the current
-- 
2.43.0

_______________________________________________
buildroot mailing list
buildroot@buildroot.org
https://lists.buildroot.org/mailman/listinfo/buildroot

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [Buildroot] [PATCH v2 1/1] ppd-merge: speed up per-package-rsync
  2023-12-01 17:05 ` [Buildroot] [PATCH v2 " Brandon Maier via buildroot
@ 2023-12-04 12:46   ` Herve Codina via buildroot
  0 siblings, 0 replies; 10+ messages in thread
From: Herve Codina via buildroot @ 2023-12-04 12:46 UTC (permalink / raw)
  To: Brandon Maier; +Cc: Yann E . MORIN, Thomas Petazzoni, buildroot

Hi Brandon,

On Fri,  1 Dec 2023 17:05:52 +0000
Brandon Maier <brandon.maier@collins.com> wrote:

> The per-package-rsync stage can add a significant amount of time to
> builds. They can also be annoying as the target-finalize and
> host-finalize targets are the slowest and run on every `make all`, which
> is used frequently for partial rebuilds.
> 
> The per-package-rsync is slow because it launches a new rsync for each
> source tree, and each rsync must rescan the destination directory and
> potentially overwrite files multiple times. We can instead merge all the
> rsync calls down into one call, and rsync is smarter about scanning all
> the source directories and only copying over the files it needs to.
> 
> We feed the source trees to rsync in reverse-order, as this preserves
> the original behaviour. I.e. when using multiple rsyncs, the last source
> tree would overwrite anything in the destination. Now when using a
> single rsync, we put the last tree first as rsync will select the first
> file it finds.
> 
> This only supports the 'copy' mode, which is used in the finalize step.
> The 'hardlink' mode requires specifying each source tree with the
> --link-dest flag but enforces a maximum of 20 trees.
> 
> Below is a benchmark running the host-finalize target for a build with
> 200 packages.
> 
> Benchmark 1: before copy
>   Time (mean ± σ):     27.171 s ±  0.777 s    [User: 6.170 s, System: 14.830 s]
>   Range (min … max):   26.343 s … 28.566 s    10 runs
> 
> Benchmark 2: after copy
>   Time (mean ± σ):      6.296 s ±  0.196 s    [User: 2.874 s, System: 5.600 s]
>   Range (min … max):    6.094 s …  6.709 s    10 runs
> 
> Summary
>   after copy ran
>     4.32 ± 0.18 times faster than before copy
> 
> Cc: Herve Codina <herve.codina@bootlin.com>
> Cc: Yann E. MORIN <yann.morin.1998@free.fr>
> Cc: Thomas Petazzoni <thomas.petazzoni@bootlin.com>
> Signed-off-by: Brandon Maier <brandon.maier@collins.com>
> ---
> v1: https://patchwork.ozlabs.org/project/buildroot/patch/20231127224139.35969-1-brandon.maier@collins.com/
> 
> v2:
> - Simplify logic for 'copy' mode
> - Drop support for 'hardlink' mode
> 
> Signed-off-by: Brandon Maier <brandon.maier@collins.com>
> ---
>  package/pkg-utils.mk | 18 ++++++------------
>  1 file changed, 6 insertions(+), 12 deletions(-)
> 
> diff --git a/package/pkg-utils.mk b/package/pkg-utils.mk
> index 059e86ae0a..bada328829 100644
> --- a/package/pkg-utils.mk
> +++ b/package/pkg-utils.mk
> @@ -217,18 +217,12 @@ ifeq ($(BR2_PER_PACKAGE_DIRECTORIES),y)
>  # $4: literal "copy" or "hardlink" to copy or hardlink files from src to dest
>  define per-package-rsync
>  	mkdir -p $(3)
> -	$(foreach pkg,$(1),\
> -		rsync -a \
> -			--hard-links \
> -			$(if $(filter hardlink,$(4)), \
> -				--link-dest=$(PER_PACKAGE_DIR)/$(pkg)/$(2)/, \
> -				$(if $(filter copy,$(4)), \
> -					$(empty), \
> -					$(error per-package-rsync can only "copy" or "hardlink", not "$(4)") \
> -				) \
> -			) \
> -			$(PER_PACKAGE_DIR)/$(pkg)/$(2)/ \
> -			$(3)$(sep))
> +	$(if $(filter hardlink,$(4)), \
> +		$(foreach pkg,$(1),\
> +			rsync -a --hard-links --link-dest=$(PER_PACKAGE_DIR)/$(pkg)/$(2)/ \
> +				$(PER_PACKAGE_DIR)/$(pkg)/$(2)/ $(3)$(sep)), \
> +		printf "%s/$(2)/\n" $(1) | tac \
> +			| rsync -a --hard-links --files-from=- --no-R -r $(PER_PACKAGE_DIR) $(3))
>  endef
>  
>  # prepares the per-package HOST_DIR and TARGET_DIR of the current

On my side, it looks good.

Reviewed-by: Herve Codina <herve.codina@bootlin.com>

Regards,
Hervé
_______________________________________________
buildroot mailing list
buildroot@buildroot.org
https://lists.buildroot.org/mailman/listinfo/buildroot

^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2023-12-04 12:56 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-11-27 22:41 [Buildroot] [PATCH 1/1] ppd-merge: speed up per-package-rsync Brandon Maier via buildroot
2023-11-28 15:49 ` Herve Codina via buildroot
2023-11-28 17:21   ` [Buildroot] [External] " Maier, Brandon L Collins via buildroot
2023-11-28 17:54     ` Herve Codina via buildroot
2023-11-28 18:04       ` Maier, Brandon L Collins via buildroot
2023-11-28 21:07 ` [Buildroot] " Yann E. MORIN
2023-11-29  0:00   ` [Buildroot] [External] " Maier, Brandon L Collins via buildroot
2023-11-29  8:17   ` [Buildroot] " Herve Codina via buildroot
2023-12-01 17:05 ` [Buildroot] [PATCH v2 " Brandon Maier via buildroot
2023-12-04 12:46   ` Herve Codina via buildroot

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.