aboutsummaryrefslogtreecommitdiff
path: root/git-deltafy-script
blob: 476d8796ecbb60688a91888f311ac6074eead45b (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
#!/bin/bash

# Example script to deltify an entire GIT repository based on the commit list.
# The most recent version of a file is the reference and previous versions
# are made delta against the best earlier version available. And so on for
# successive versions going back in time.  This way the increasing delta
# overhead is pushed towards older versions of any given file.
#
# The -d argument lets you provide a limit on the delta chain depth.
# If 0 is passed then everything is undeltafied.  Limiting the delta
# depth is meaningful for subsequent access performance to old revisions.
# A value of 16 might be a good compromise between performance and good
# space saving.  Current default is unbounded.
#
# The --max-behind=30 argument is passed to git-mkdelta so as to keep
# combinations and memory usage bounded a bit.  If you have lots of memory
# and CPU power you may remove it (or set to 0) to let git-mkdelta find the
# best delta match regardless of the number of revisions for a given file.
# You can also make the value smaller to make it faster and less
# memory hungry.  A value of 5 ought to still give pretty good results.
# When set to 0 or omitted then look behind is unbounded.  Note that
# git-mkdelta might die with a segmentation fault in that case if it
# runs out of memory.  Note that the GIT repository will still be consistent
# even if git-mkdelta dies unexpectedly.

# Stop at the first command that fails.
set -e

# Optional "-d <depth>": turned into the --max-depth option that is later
# handed to git-mkdelta.  Left empty (no option at all) when -d is absent.
max_depth=
if [[ "$1" == "-d" ]]; then
	max_depth="--max-depth=$2"
	shift 2
fi

# Number of revisions git-mkdelta may look behind; the same number is reused
# as the overlap between successive revision batches.
overlap=30
max_behind="--max-behind=${overlap}"

# Feed the object names accumulated in $list to git-mkdelta.
#
# Globals read:
#   list        whitespace-separated object names; nothing is done when empty
#   curr_file   label shown in the progress line
#   max_depth   option strings for git-mkdelta; expanded unquoted on purpose
#   max_behind  so that an empty value contributes no argument at all
process_list() {
	[ -n "$list" ] || return 0
	printf '%s\n' "Processing $curr_file"
	echo "$list" | xargs git-mkdelta $max_depth $max_behind -v
}

curr_file=""
list=""

# Read "sha1 path" lines, already sorted by path, from stdin and call
# process_list once per distinct path with $curr_file set to that path and
# $list set to the space-separated object names seen for it.
#
# The final group is flushed here, inside the function, because this function
# runs as the last stage of a pipeline (i.e. in a subshell): whatever is left
# in $list when the input ends would otherwise be lost to the caller.
function deltafy_path_groups() {
	local sha1 file
	curr_file=""
	list=""
	while read -r sha1 file; do
		if [ "$file" == "$curr_file" ]; then
			list="$list $sha1"
		else
			# Path changed: hand over the previous group (a no-op
			# for the very first line, where list is still empty).
			process_list
			curr_file="$file"
			list="$sha1"
		fi
	done
	process_list
}

# Read revision names from stdin, one per line, and deltafy the blobs/trees
# they modify, one batch of revisions at a time.
#
# Arguments: $1 - number of new revisions per batch (optional, default 1000)
# Globals:   overlap (read) - how many revisions of the previous batch are
#            repeated at the start of the next one
function deltafy_revisions() {
	local batch_size="${1:-1000}"
	local nl=$'\n'
	local pending="" rev="" fresh
	while true; do
		# Let's batch revisions into groups of 1000 to give it a chance
		# to scale with repositories containing long revision lists.
		# We also overlap with the previous batch the size of mkdelta's
		# look behind value in order to account for the processing
		# discontinuity.
		if [ -n "$pending" ]; then
			pending="$(printf '%s' "$pending" | tail -n "$overlap")"
			# $(...) strips the trailing newline; put it back so the
			# next revision does not get glued to the last kept one.
			[ -z "$pending" ] || pending="$pending$nl"
		fi
		fresh=0
		while [ "$fresh" -lt "$batch_size" ] && read -r rev; do
			pending="$pending$rev$nl"
			fresh=$((fresh + 1))
		done
		# Nothing new was read: only already-processed overlap (or
		# nothing at all) is left, so there is no work to do.
		[ "$fresh" -gt 0 ] || break
		# For every modified ("M") entry emit "new-sha1 path" and
		# "old-sha1 path", group the lines by path (stable sort keeps
		# the revision order within a path) and drop adjacent repeats.
		printf '%s' "$pending" |
		git-diff-tree -r -t --stdin |
		awk '/^:/ { if ($5 == "M") printf "%s %s\n%s %s\n", $4, $6, $3, $6 }' |
		LC_ALL=C sort -s -k 2 | uniq |
		deltafy_path_groups
		# read leaves $rev empty once the input is exhausted.
		[ "$rev" ] || break
	done
}

# No trailing process_list is needed here: every batch flushes its own last
# group inside deltafy_path_groups.
git-rev-list HEAD | deltafy_revisions

# Print the root tree object name of every commit listed by
# "git-rev-list HEAD", one per line, in git-rev-list order.
function list_root_trees() {
	local commit
	git-rev-list HEAD |
	while read -r commit; do
		# Only the first line of the commit object is examined: if it
		# starts with "tree " the rest of that line is printed, then
		# sed quits.  POSIX "q" replaces the GNU-only "Q"; with -n
		# neither of them prints the pattern space on exit.
		git-cat-file commit "$commit" |
		sed -n -e 's/^tree //p' -e q
	done
}

# Finally deltafy the successive root trees themselves, as one more "file".
curr_file="root directory"
list="$(list_root_trees)"
process_list