summaryrefslogtreecommitdiff
path: root/eclass/multiprocessing.eclass
blob: 06e004aa1669eb8acf917a4c25198e5d3e8a0825 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
# Copyright 1999-2014 Gentoo Foundation
# Distributed under the terms of the GNU General Public License v2
# $Id$

# @ECLASS: multiprocessing.eclass
# @MAINTAINER:
# base-system@gentoo.org
# @AUTHOR:
# Brian Harring <ferringb@gentoo.org>
# Mike Frysinger <vapier@gentoo.org>
# @BLURB: parallelization with bash (wtf?)
# @DESCRIPTION:
# The multiprocessing eclass contains a suite of functions that allow ebuilds
# to quickly run things in parallel using shell code.
#
# It has two modes: pre-fork and post-fork.  If you don't want to dive into any
# more nuts & bolts, just use the pre-fork mode.  For main threads that mostly
# spawn children and then wait for them to finish, use the pre-fork mode.  For
# main threads that do a bit of processing themselves, use the post-fork mode.
# You may mix & match them for longer computation loops.
# @EXAMPLE:
#
# @CODE
# # First initialize things:
# multijob_init
#
# # Then hash a bunch of files in parallel:
# for n in {0..20} ; do
# 	multijob_child_init md5sum data.${n} > data.${n}
# done
#
# # Then wait for all the children to finish:
# multijob_finish
# @CODE

if [[ -z ${_MULTIPROCESSING_ECLASS} ]]; then
_MULTIPROCESSING_ECLASS=1

# @FUNCTION: bashpid
# @DESCRIPTION:
# Return the process id of the current sub shell.  This is to support bash
# versions older than 4.0 that lack $BASHPID support natively.  Simply do:
# echo ${BASHPID:-$(bashpid)}
#
# Note: Using this func in any other way than the one above is not supported.
bashpid() {
	# Print the pid of the process that spawns the `sh` below, by having that
	# freshly started `sh` echo its own parent pid (${PPID}).
	#
	# Running bashpid plainly will return incorrect results.  This func must
	# be run in a subshell of the current subshell to get the right pid.
	# i.e. This will show the wrong value:
	#   bashpid
	# But this will show the right value:
	#   (bashpid)
	#
	# The single quotes matter: ${PPID} has to be expanded by the child `sh`,
	# not by the shell that is calling this function.
	sh -c 'echo ${PPID}'
}

# @FUNCTION: makeopts_jobs
# @USAGE: [${MAKEOPTS}]
# @DESCRIPTION:
# Searches the arguments (defaults to ${MAKEOPTS}) and extracts the jobs number
# specified therein.  Useful for running non-make tools in parallel too.
# i.e. if the user has MAKEOPTS=-j9, this will echo "9" -- we can't return the
# number as bash normalizes it to [0, 255].  If the flags haven't specified a
# -j flag, then "1" is shown as that is the default `make` uses.  Since there's
# no way to represent infinity, we return 999 if the user has -j without a number.
makeopts_jobs() {
	# With no arguments, inspect the user's MAKEOPTS instead.  The expansion
	# is deliberately left unquoted so that any whitespace (including
	# newlines) collapses into single-space separated words.
	[[ $# -eq 0 ]] && set -- ${MAKEOPTS}

	# The flags are padded with a space on both sides so that every flag,
	# including the first and the last, is delimited by whitespace.
	#
	# This assumes the first .* will be more greedy than the second .*
	# since POSIX doesn't specify a non-greedy match (i.e. ".*?").
	#
	# 1st expression: a jobs flag followed by a number -> print the number.
	# 2nd expression: a jobs flag without a number     -> print 999.
	# "-[a-z]*j" also accepts clustered short options such as "-kj4".
	# Declaration and assignment are split so the status of the pipeline is
	# not hidden behind the (always successful) `local`.
	local jobs
	jobs=$(echo " $* " | sed -r -n \
		-e 's:.*[[:space:]](-[a-z]*j|--jobs[=[:space:]])[[:space:]]*([0-9]+).*:\2:p' \
		-e 's:.*[[:space:]](-[a-z]*j|--jobs)[[:space:]].*:999:p')

	# No jobs flag at all: `make` defaults to a single job.
	echo "${jobs:-1}"
}

# @FUNCTION: makeopts_loadavg
# @USAGE: [${MAKEOPTS}]
# @DESCRIPTION:
# Searches the arguments (defaults to ${MAKEOPTS}) and extracts the value set
# for load-average. For make and ninja based builds this will mean new jobs are
# not only limited by the jobs-value, but also by the current load - which might
# get excessive due to I/O and not just due to CPU load.
# Be aware that the returned number might be a floating-point number. Test
# whether your software supports that.
makeopts_loadavg() {
	# With no arguments, inspect the user's MAKEOPTS instead.  The expansion
	# is deliberately left unquoted so that any whitespace (including
	# newlines) collapses into single-space separated words.
	[[ $# -eq 0 ]] && set -- ${MAKEOPTS}

	# The flags are padded with a space on both sides so that every flag,
	# including the first and the last, is delimited by whitespace.
	#
	# This assumes the first .* will be more greedy than the second .*
	# since POSIX doesn't specify a non-greedy match (i.e. ".*?").
	#
	# 1st expression: a load flag followed by an integer or decimal number
	#                 -> print the number (group 3; group 4 is the fraction).
	# 2nd expression: a load flag without a number -> print 999.
	# "-[a-z]*l" also accepts clustered short options such as "-kl2".
	# Declaration and assignment are split so the status of the pipeline is
	# not hidden behind the (always successful) `local`.
	local lavg
	lavg=$(echo " $* " | sed -r -n \
		-e 's:.*[[:space:]](-[a-z]*l|--(load-average|max-load)[=[:space:]])[[:space:]]*([0-9]+(\.[0-9]+)?).*:\3:p' \
		-e 's:.*[[:space:]](-[a-z]*l|--(load-average|max-load))[[:space:]].*:999:p')

	# Default to 999 since the default is to not use a load limit.
	echo "${lavg:-999}"
}

# @FUNCTION: multijob_init
# @USAGE: [${MAKEOPTS}]
# @DESCRIPTION:
# Setup the environment for executing code in parallel.
# You must call this before any other multijob function.
multijob_init() {
	# When something goes wrong, try to wait for all the children so we
	# don't leave any zombies around.
	has wait ${EBUILD_DEATH_HOOKS} || EBUILD_DEATH_HOOKS+=" wait "

	# Setup a pipe for children to write their pids to when they finish.
	# We have to allocate two fd's because POSIX has undefined behavior
	# when you open a FIFO for simultaneous read/write. #487056
	local pipe="${T}/multijob.pipe"
	# Bail out if the FIFO cannot be created: the redirections below would
	# otherwise create and open a plain file at that path, and the job
	# accounting done through it would no longer work.
	mkfifo -m 600 "${pipe}" || die "failed to create fifo ${pipe}"
	redirect_alloc_fd mj_write_fd "${pipe}"
	redirect_alloc_fd mj_read_fd "${pipe}"
	# Both ends are open now, so the name on disk is no longer needed.
	rm -f "${pipe}"

	# See how many children we can fork based on the user's settings.
	mj_max_jobs=$(makeopts_jobs "$@")
	mj_num_jobs=0
}

# @FUNCTION: multijob_child_init
# @USAGE: [--pre|--post] [command to run in background]
# @DESCRIPTION:
# This function has two forms.  You can either pass it a simple command to
# execute in the background (and it takes care of everything else), or you
# can call it without a command as the first thing in your own forked child
# process.
#
# The --pre/--post options allow you to select the child generation mode.
#
# @CODE
# # 1st form: pass the command line as arguments:
# multijob_child_init ls /dev
# # Or if you want to use pre/post fork modes:
# multijob_child_init --pre ls /dev
# multijob_child_init --post ls /dev
#
# # 2nd form: execute multiple stuff in the background (post fork):
# (
# multijob_child_init
# out=`ls`
# if echo "${out}" | grep foo ; then
# 	echo "YEAH"
# fi
# ) &
# multijob_post_fork
#
# # 2nd form: execute multiple stuff in the background (pre fork):
# multijob_pre_fork
# (
# multijob_child_init
# out=`ls`
# if echo "${out}" | grep foo ; then
# 	echo "YEAH"
# fi
# ) &
# @CODE
multijob_child_init() {
	# Fork mode used by the "run this command" form; pre-fork is the default.
	local mode="pre"
	case $1 in
	--pre)  mode="pre" ; shift ;;
	--post) mode="post"; shift ;;
	esac

	if [[ $# -eq 0 ]] ; then
		# Child form: when this (sub)shell exits, write "<pid> <exit status>"
		# to the job pipe.
		# The exit status is saved before anything else is expanded: when
		# BASHPID is unavailable the $(bashpid) command substitution runs
		# first and would overwrite $? with its own status.
		# The fd number is spliced in now, while the trap string is built.
		trap '_mj_child_ret=$?; echo ${BASHPID:-$(bashpid)} ${_mj_child_ret} >&'${mj_write_fd} EXIT
		# Turn these signals into a normal exit so the EXIT trap above still
		# reports the child.
		trap 'exit 1' INT TERM
	else
		# Command form: run "$@" in a background subshell that registers
		# itself via the child form above, doing the parent-side book
		# keeping before or after the fork as selected by ${mode}.
		local ret
		[[ ${mode} == "pre" ]] && { multijob_pre_fork; ret=$?; }
		( multijob_child_init ; "$@" ) &
		[[ ${mode} == "post" ]] && { multijob_post_fork; ret=$?; }
		return ${ret}
	fi
}

# @FUNCTION: _multijob_fork
# @INTERNAL
# @DESCRIPTION:
# Do the actual book keeping.
_multijob_fork() {
	[[ $# -eq 1 ]] || die "incorrect number of arguments"

	# $1 is "pre" or "post": whether the caller is about to fork a child or
	# has just forked one.
	local when=$1 rc=0

	# A child forked before this call already occupies a slot.
	if [[ ${when} == "post" ]] ; then
		mj_num_jobs=$(( mj_num_jobs + 1 ))
	fi

	# At the limit: block until one child reports back and hand its exit
	# status to our caller.
	if [[ ${mj_num_jobs} -ge ${mj_max_jobs} ]] ; then
		multijob_finish_one
		rc=$?
	fi

	# A child forked after this call takes its slot only now.
	if [[ ${when} == "pre" ]] ; then
		mj_num_jobs=$(( mj_num_jobs + 1 ))
	fi

	return ${rc}
}

# @FUNCTION: multijob_pre_fork
# @DESCRIPTION:
# You must call this in the parent process before forking a child process.
# If the parallel limit has been hit, it will wait for one child to finish
# and return its exit status.
multijob_pre_fork() {
	# Pre-fork book keeping; any extra arguments are forwarded so that
	# _multijob_fork can reject them.
	_multijob_fork pre "$@"
}

# @FUNCTION: multijob_post_fork
# @DESCRIPTION:
# You must call this in the parent process after forking a child process.
# If the parallel limit has been hit, it will wait for one child to finish
# and return its exit status.
multijob_post_fork() {
	# Post-fork book keeping; any extra arguments are forwarded so that
	# _multijob_fork can reject them.
	_multijob_fork post "$@"
}

# @FUNCTION: multijob_finish_one
# @DESCRIPTION:
# Wait for a single process to exit and return its exit code.
multijob_finish_one() {
	[[ $# -eq 0 ]] || die "${FUNCNAME} takes no arguments"

	# Each finished child writes one "<pid> <exit status>" line to the job
	# pipe; block here until the next such line arrives.
	local child_pid child_status
	read -r -u ${mj_read_fd} child_pid child_status || die

	# One slot is free again.
	mj_num_jobs=$(( mj_num_jobs - 1 ))

	# Pass the child's exit status on to our caller.
	return ${child_status}
}

# @FUNCTION: multijob_finish
# @DESCRIPTION:
# Wait for all pending processes to exit and return the bitwise or
# of all their exit codes.
multijob_finish() {
	# Collect every outstanding child, OR-ing the exit statuses together.
	local combined=0
	while [[ ${mj_num_jobs} -gt 0 ]] ; do
		multijob_finish_one
		combined=$(( combined | $? ))
	done

	# Let bash clean up its internal child tracking state.
	wait

	# The argument check is done only now so that the children have been
	# reaped before we possibly die.
	[[ $# -eq 0 ]] || die "${FUNCNAME} takes no arguments"

	# Drop the " wait " entry from the death hooks; it is no longer needed.
	EBUILD_DEATH_HOOKS=${EBUILD_DEATH_HOOKS/ wait / }

	return ${combined}
}

# @FUNCTION: redirect_alloc_fd
# @USAGE: <var> <file> [redirection]
# @DESCRIPTION:
# Find a free fd and redirect the specified file via it.  Store the new
# fd in the specified variable.  Useful for the cases where we don't care
# about the exact fd #.
redirect_alloc_fd() {
	# $1: name of the variable that receives the allocated fd number
	# $2: file to open
	# $3: redirection operator to open it with (defaults to read/write "<>")
	local var=$1 file=$2 redir=${3:-"<>"}

	# Make sure /dev/fd is sane on Linux hosts. #479656
	if [[ ! -L /dev/fd && ${CBUILD} == *linux* ]] ; then
		eerror "You're missing a /dev/fd symlink to /proc/self/fd."
		eerror "Please fix the symlink and check your boot scripts (udev/etc...)."
		die "/dev/fd is broken"
	fi

	# In both evals below only the variable name / fd number and the
	# redirection operator are spliced into the string.  The file name is
	# left as a quoted "${file}" for eval itself to expand, so paths with
	# quotes, spaces or other shell metacharacters are opened verbatim
	# instead of being re-parsed as shell code.
	if [[ $(( (BASH_VERSINFO[0] << 8) + BASH_VERSINFO[1] )) -ge $(( (4 << 8) + 1 )) ]] ; then
		# Newer bash (>= 4.1) provides this functionality: {name}<redir>
		# picks a free fd and stores its number in the named variable.
		eval "exec {${var}}${redir}\"\${file}\""
	else
		# Need to provide the functionality ourselves: probe fd numbers
		# from 10 upwards until one is unused and can be opened.
		local fd=10
		while :; do
			# Make sure the fd isn't open.  It could be a char device,
			# or a symlink (possibly broken) to something else.
			if [[ ! -e /dev/fd/${fd} ]] && [[ ! -L /dev/fd/${fd} ]] ; then
				eval "exec ${fd}${redir}\"\${file}\"" && break
			fi
			[[ ${fd} -gt 1024 ]] && die 'could not locate a free temp fd !?'
			: $(( ++fd ))
		done
		# Hand the fd number back through the caller's variable.
		: $(( ${var} = fd ))
	fi
}

fi