prepstrip/ecompressdir: parallelize operations
author Mike Frysinger <vapier@gentoo.org>
Fri, 11 May 2012 16:36:22 +0000 (12:36 -0400)
committer Mike Frysinger <vapier@gentoo.org>
Sat, 12 May 2012 04:09:34 +0000 (00:09 -0400)
Stealing some ideas from ferringb, add a new API for doing parallel
processing in bash, and then deploy this with the stripping and
compressing stages.

For stripping coreutils which has about 100 ELFs, this brings time
to strip down from ~7 seconds to ~0.7 seconds on my system.

Signed-off-by: Mike Frysinger <vapier@gentoo.org>
bin/ebuild-helpers/ecompressdir
bin/ebuild-helpers/prepstrip
bin/helper-functions.sh [new file with mode: 0644]

index 17ecd80821ba3b51b2396ed5de8264c516ab68c1..a2c9e52a0049d326800d83c6e3421611e8494263 100755 (executable)
@@ -2,7 +2,7 @@
 # Copyright 1999-2011 Gentoo Foundation
 # Distributed under the terms of the GNU General Public License v2
 
-source "${PORTAGE_BIN_PATH:-/usr/lib/portage/bin}"/isolated-functions.sh
+source "${PORTAGE_BIN_PATH:-/usr/lib/portage/bin}"/helper-functions.sh
 
 if [[ -z $1 ]] ; then
        helpers_die "${0##*/}: at least one argument needed"
@@ -116,6 +116,16 @@ ret=0
 
 rm -rf "${T}"/ecompress-skip
 
+# Suffix/decompressor table stored as a flat list of pairs: each even index
+# holds a file suffix and the following odd index holds the command that is
+# passed to funk_up_dir "decompress" for files with that suffix.
+decompressors=(
+       ".Z"    "gunzip -f"
+       ".gz"   "gunzip -f"
+       ".bz2"  "bunzip2 -f"
+       ".xz"   "unxz -f"
+       ".lzma" "unxz -f"
+)
+
+multijob_init
+
 for dir in "$@" ; do
        dir=${dir#/}
        dir="${ED}${dir}"
@@ -136,14 +146,26 @@ for dir in "$@" ; do
        find "${dir}" -type f -name '*.ecompress.file' -print0 | ${XARGS} -0 rm -f
 
        # not uncommon for packages to compress doc files themselves
-       funk_up_dir "decompress" ".Z" "gunzip -f"
-       funk_up_dir "decompress" ".gz" "gunzip -f"
-       funk_up_dir "decompress" ".bz2" "bunzip2 -f"
+       # Walk the decompressors table two entries at a time (suffix, command)
+       # and run each decompression pass as its own background job.
+       for (( d = 0; d < ${#decompressors[@]}; d += 2 )) ; do
+               # It's faster to parallelize at this stage than to try to
+               # parallelize the compressors.  This is because the find|xargs
+               # ends up launching fewer compressors overall, so the overhead
+               # of forking children ends up dominating.
+               (
+               multijob_child_init
+               # Index with the loop counter "d": entry d is the suffix and
+               # entry d+1 is the matching command.  (An unset index variable
+               # evaluates to 0 and would process only the first pair.)
+               funk_up_dir "decompress" "${decompressors[d]}" "${decompressors[d+1]}"
+               ) &
+               multijob_post_fork
+               : $(( ret |= $? ))
+       done
 
        # forcibly break all hard links as some compressors whine about it
        find "${dir}" -type f -links +1 -exec env file="{}" sh -c \
                'cp -p "${file}" "${file}.ecompress.break" ; mv -f "${file}.ecompress.break" "${file}"' \;
 
+       multijob_finish
+       : $(( ret |= $? ))
+
        # now lets do our work
        if [[ -n ${suffix} ]] ; then
                vecho "${0##*/}: $(ecompress --bin) /${actual_dir#${ED}}"
index daaa25250d8977b2fbc28d0d6c2bd9bf9fdf3fb7..09b0333d9543c9553982e376190b41d93b3a2e1c 100755 (executable)
@@ -1,8 +1,8 @@
 #!/bin/bash
-# Copyright 1999-2011 Gentoo Foundation
+# Copyright 1999-2012 Gentoo Foundation
 # Distributed under the terms of the GNU General Public License v2
 
-source "${PORTAGE_BIN_PATH:-/usr/lib/portage/bin}"/isolated-functions.sh
+source "${PORTAGE_BIN_PATH:-/usr/lib/portage/bin}"/helper-functions.sh
 
 # avoid multiple calls to `has`.  this creates things like:
 #   FEATURES_foo=false
@@ -62,6 +62,8 @@ prepstrip_sources_dir=${EPREFIX}/usr/src/debug/${CATEGORY}/${PF}
 type -P debugedit >/dev/null && debugedit_found=true || debugedit_found=false
 debugedit_warned=false
 
+multijob_init
+
 unset ${!INODE_*}
 
 inode_var_name() {
@@ -171,6 +173,8 @@ process_elf() {
 # We want to log already stripped binaries, as this may be a QA violation.
 # They prevent us from getting the splitdebug data.
 if ! ${RESTRICT_binchecks} && ! ${RESTRICT_strip} ; then
+       (
+       multijob_child_init
        log=$T/scanelf-already-stripped.log
        qa_var="QA_PRESTRIPPED_${ARCH/-/_}"
        [[ -n ${!qa_var} ]] && QA_PRESTRIPPED="${!qa_var}"
@@ -193,6 +197,8 @@ if ! ${RESTRICT_binchecks} && ! ${RESTRICT_strip} ; then
        else
                rm -f "$log"
        fi
+       ) &
+       multijob_post_fork
 fi
 
 # Now we look for unstripped binaries.
@@ -205,8 +211,10 @@ do
                banner=true
        fi
 
-       f=$(file "${x}") || continue
-       [[ -z ${f} ]] && continue
+       (
+       multijob_child_init
+       f=$(file "${x}") || exit 0
+       [[ -z ${f} ]] && exit 0
 
        if ! ${SKIP_STRIP} ; then
                # The noglob funk is to support STRIP_MASK="/*/booga" and to keep
@@ -253,6 +261,8 @@ do
        if ${was_not_writable} ; then
                chmod u-w "${x}"
        fi
+       ) &
+       multijob_post_fork
 done
 
 if [[ -s ${T}/debug.sources ]] && \
@@ -274,3 +284,5 @@ then
                >> "$emptydir"/.keepdir
        done < <(find "${D}${prepstrip_sources_dir}/" -type d -empty -print0)
 fi
+
+multijob_finish
diff --git a/bin/helper-functions.sh b/bin/helper-functions.sh
new file mode 100644 (file)
index 0000000..1c355e2
--- /dev/null
@@ -0,0 +1,62 @@
+#!/bin/bash
+# Copyright 1999-2012 Gentoo Foundation
+# Distributed under the terms of the GNU General Public License v2
+
+# For routines we want to use in ebuild-helpers/ but don't want to
+# expose to the general ebuild environment.
+
+source "${PORTAGE_BIN_PATH:-/usr/lib/portage/bin}"/isolated-functions.sh
+
+#
+# API functions for doing parallel processing
+#
+numjobs() {
+       # Print the job count requested in MAKEOPTS (-jN, -j N, --jobs=N or
+       # --jobs N), or 1 when no such option carries a number.
+       # The expression is copied from eutils.eclass:makeopts_jobs(); it needs
+       # whitespace in front of the option, hence the padded input string.
+       local padded=" ${MAKEOPTS} " count
+       count=$(sed -r -n \
+               's:.*[[:space:]](-j|--jobs[=[:space:]])[[:space:]]*([0-9]+).*:\2:p' \
+               <<< "${padded}")
+       echo ${count:-1}
+}
+
+multijob_init() {
+       # Prepare the parallel-job bookkeeping for the calling script.
+       # Globals written:
+       #   mj_max_jobs     - job limit taken from numjobs
+       #   mj_num_jobs     - number of children currently outstanding (0)
+       #   mj_control_pipe - path of the (already unlinked) control fifo
+       #   mj_control_fd   - descriptor children report "<pid> <status>" on
+       # Returns non-zero if the control fifo could not be set up.
+
+       # See how many children we can fork based on the user's settings.
+       mj_max_jobs=$(numjobs)
+       mj_num_jobs=0
+
+       # Setup a pipe for children to write their pids to when they finish.
+       # mktemp only reserves a unique name; the file is swapped for a fifo.
+       mj_control_pipe=$(mktemp -t multijob.XXXXXX) || return 1
+       rm -f "${mj_control_pipe}"
+       # Owner-only permissions: nobody else should be able to feed us data.
+       mkfifo -m 600 "${mj_control_pipe}" || return 1
+       # {var} redirection has bash pick a free descriptor and store its
+       # number in mj_control_fd.  The target is quoted so a temp path with
+       # whitespace does not turn into an ambiguous redirect.
+       # NOTE(review): opened read/write (<>), presumably so the open does
+       # not block waiting for a peer -- confirm.
+       exec {mj_control_fd}<>"${mj_control_pipe}"
+       local ret=$?
+       # The open descriptor keeps the fifo alive; the name is not needed.
+       rm -f "${mj_control_pipe}"
+       return ${ret}
+}
+
+multijob_child_init() {
+       # To be called first thing inside a forked child: arrange for the
+       # child to write "<BASHPID> <exit status>" to the control descriptor
+       # when it exits.  The descriptor number is baked into the trap text
+       # now; BASHPID and $? stay literal and expand when the trap fires.
+       local report='echo ${BASHPID} $? >&'
+       report+=${mj_control_fd}
+       trap "${report}" EXIT
+       # Turn INT/TERM into a plain failing exit.
+       trap 'exit 1' INT TERM
+}
+
+multijob_finish_one() {
+       # Read one "<pid> <status>" report from the control descriptor,
+       # decrement the outstanding-job counter and return the reported status.
+       local child_pid child_status
+       read -r -u ${mj_control_fd} child_pid child_status
+       : $(( mj_num_jobs -= 1 ))
+       return ${child_status}
+}
+
+multijob_finish() {
+       # Collect the reports of every job still counted in mj_num_jobs and
+       # return the bitwise OR of their exit statuses (0 if all succeeded).
+       local status=0
+       until [[ ${mj_num_jobs} -le 0 ]] ; do
+               multijob_finish_one
+               : $(( status |= $? ))
+       done
+       # Let bash clean up its internal child tracking state.
+       wait
+       return ${status}
+}
+
+multijob_post_fork() {
+       # To be called in the parent right after forking a child: count the
+       # new job and, once mj_max_jobs children are outstanding, wait for one
+       # of them to report before returning.
+       # Returns the exit status of the job that was collected, or 0 when no
+       # job had to be collected.  Passing the status on matters: a failure
+       # collected here is not seen again by multijob_finish.
+       : $(( ++mj_num_jobs ))
+       if [[ ${mj_num_jobs} -ge ${mj_max_jobs} ]] ; then
+               multijob_finish_one
+               return $?
+       fi
+       return 0
+}