Merge branch 'jk/rev-list-disk-usage'

"git rev-list" command learned "--disk-usage" option.

* jk/rev-list-disk-usage:
  docs/rev-list: add some examples of --disk-usage
  docs/rev-list: add an examples section
  rev-list: add --disk-usage option for calculating disk usage
  t: add --no-tag option to test_commit
diff --git a/Documentation/git-rev-list.txt b/Documentation/git-rev-list.txt
index 5da6623..20bb8e8 100644
--- a/Documentation/git-rev-list.txt
+++ b/Documentation/git-rev-list.txt
@@ -31,6 +31,99 @@
 
 include::pretty-formats.txt[]
 
+EXAMPLES
+--------
+
+* Print the list of commits reachable from the current branch.
++
+----------
+git rev-list HEAD
+----------
+
+* Print the list of commits on this branch, but not present in the
+  upstream branch.
++
+----------
+git rev-list @{upstream}..HEAD
+----------
+
+* Format commits with their author and commit message (see also the
+  porcelain linkgit:git-log[1]).
++
+----------
+git rev-list --format=medium HEAD
+----------
+
+* Format commits along with their diffs (see also the porcelain
+  linkgit:git-log[1], which can do this in a single process).
++
+----------
+git rev-list HEAD |
+git diff-tree --stdin --format=medium -p
+----------
+
+* Print the list of commits on the current branch that touched any
+  file in the `Documentation` directory.
++
+----------
+git rev-list HEAD -- Documentation/
+----------
+
+* Print the list of commits authored by you in the past year, on
+  any branch, tag, or other ref.
++
+----------
+git rev-list --author=you@example.com --since=1.year.ago --all
+----------
+
+* Print the list of objects reachable from the current branch (i.e., all
+  commits and the blobs and trees they contain).
++
+----------
+git rev-list --objects HEAD
+----------
+
+* Compare the disk size of all reachable objects, versus those
+  reachable from reflogs, versus the total packed size. This can tell
+  you whether running `git repack -ad` might reduce the repository size
+  (by dropping unreachable objects), and whether expiring reflogs might
+  help.
++
+----------
+# reachable objects
+git rev-list --disk-usage --objects --all
+# plus reflogs
+git rev-list --disk-usage --objects --all --reflog
+# total disk size used
+du -c .git/objects/pack/*.pack .git/objects/??/*
+# alternative to du: add up "size" and "size-pack" fields
+git count-objects -v
+----------
+
+* Report the disk size of each branch, not including objects used by the
+  current branch. This can find outliers that are contributing to a
+  bloated repository size (e.g., because somebody accidentally committed
+  large build artifacts).
++
+----------
+git for-each-ref --format='%(refname)' |
+while read branch
+do
+	size=$(git rev-list --disk-usage --objects HEAD..$branch)
+	echo "$size $branch"
+done |
+sort -n
+----------
+
+* Compare the on-disk size of branches in one group of refs, excluding
+  another. If you co-mingle objects from multiple remotes in a single
+  repository, this can show which remotes are contributing to the
+  repository size (taking the size of `origin` as a baseline).
++
+----------
+git rev-list --disk-usage --objects --remotes=$suspect --not --remotes=origin
+----------
+
 GIT
 ---
 Part of the linkgit:git[1] suite
diff --git a/Documentation/rev-list-options.txt b/Documentation/rev-list-options.txt
index 96cc89d..1238bfd 100644
--- a/Documentation/rev-list-options.txt
+++ b/Documentation/rev-list-options.txt
@@ -227,6 +227,15 @@
 	test the exit status to see if a range of objects is fully
 	connected (or not).  It is faster than redirecting stdout
 	to `/dev/null` as the output does not have to be formatted.
+
+--disk-usage::
+	Suppress normal output; instead, print the sum of the bytes used
+	for on-disk storage by the selected commits or objects. This is
+	equivalent to piping the output into `git cat-file
+	--batch-check='%(objectsize:disk)'`, except that it runs much
+	faster (especially with `--use-bitmap-index`). See the `CAVEATS`
+	section in linkgit:git-cat-file[1] for the limitations of what
+	"on-disk storage" means.
 endif::git-rev-list[]
 
 --cherry-mark::
diff --git a/builtin/rev-list.c b/builtin/rev-list.c
index 25c6c3b..b4d8ea0 100644
--- a/builtin/rev-list.c
+++ b/builtin/rev-list.c
@@ -80,6 +80,19 @@
 
 #define DEFAULT_OIDSET_SIZE     (16*1024)
 
+static int show_disk_usage;
+static off_t total_disk_usage;
+
+static off_t get_object_disk_usage(struct object *obj)
+{
+	off_t size;
+	struct object_info oi = OBJECT_INFO_INIT;
+	oi.disk_sizep = &size;
+	if (oid_object_info_extended(the_repository, &obj->oid, &oi, 0) < 0)
+		die(_("unable to get disk usage of %s"), oid_to_hex(&obj->oid));
+	return size;
+}
+
 static void finish_commit(struct commit *commit);
 static void show_commit(struct commit *commit, void *data)
 {
@@ -88,6 +101,9 @@
 
 	display_progress(progress, ++progress_counter);
 
+	if (show_disk_usage)
+		total_disk_usage += get_object_disk_usage(&commit->object);
+
 	if (info->flags & REV_LIST_QUIET) {
 		finish_commit(commit);
 		return;
@@ -258,6 +274,8 @@
 	if (finish_object(obj, name, cb_data))
 		return;
 	display_progress(progress, ++progress_counter);
+	if (show_disk_usage)
+		total_disk_usage += get_object_disk_usage(obj);
 	if (info->flags & REV_LIST_QUIET)
 		return;
 
@@ -452,6 +470,23 @@
 	return 0;
 }
 
+static int try_bitmap_disk_usage(struct rev_info *revs,
+				 struct list_objects_filter_options *filter)
+{
+	struct bitmap_index *bitmap_git;
+
+	if (!show_disk_usage)
+		return -1;
+
+	bitmap_git = prepare_bitmap_walk(revs, filter);
+	if (!bitmap_git)
+		return -1;
+
+	printf("%"PRIuMAX"\n",
+	       (uintmax_t)get_disk_usage_from_bitmap(bitmap_git, revs));
+	return 0;
+}
+
 int cmd_rev_list(int argc, const char **argv, const char *prefix)
 {
 	struct rev_info revs;
@@ -584,6 +619,12 @@
 			continue;
 		}
 
+		if (!strcmp(arg, "--disk-usage")) {
+			show_disk_usage = 1;
+			info.flags |= REV_LIST_QUIET;
+			continue;
+		}
+
 		usage(rev_list_usage);
 
 	}
@@ -626,6 +667,8 @@
 	if (use_bitmap_index) {
 		if (!try_bitmap_count(&revs, &filter_options))
 			return 0;
+		if (!try_bitmap_disk_usage(&revs, &filter_options))
+			return 0;
 		if (!try_bitmap_traversal(&revs, &filter_options))
 			return 0;
 	}
@@ -690,5 +733,8 @@
 			printf("%d\n", revs.count_left + revs.count_right);
 	}
 
+	if (show_disk_usage)
+		printf("%"PRIuMAX"\n", (uintmax_t)total_disk_usage);
+
 	return 0;
 }
diff --git a/pack-bitmap.c b/pack-bitmap.c
index 60fe20f..1f69b5f 100644
--- a/pack-bitmap.c
+++ b/pack-bitmap.c
@@ -1430,3 +1430,84 @@
 	return bitmap_git &&
 		bitmap_walk_contains(bitmap_git, bitmap_git->haves, oid);
 }
+
+static off_t get_disk_usage_for_type(struct bitmap_index *bitmap_git,
+				     enum object_type object_type)
+{
+	struct bitmap *result = bitmap_git->result;
+	struct packed_git *pack = bitmap_git->pack;
+	off_t total = 0;
+	struct ewah_iterator it;
+	eword_t filter;
+	size_t i;
+
+	init_type_iterator(&it, bitmap_git, object_type);
+	for (i = 0; i < result->word_alloc &&
+			ewah_iterator_next(&filter, &it); i++) {
+		eword_t word = result->words[i] & filter;
+		size_t base = (i * BITS_IN_EWORD);
+		unsigned offset;
+
+		if (!word)
+			continue;
+
+		for (offset = 0; offset < BITS_IN_EWORD; offset++) {
+			size_t pos;
+
+			if ((word >> offset) == 0)
+				break;
+
+			offset += ewah_bit_ctz64(word >> offset);
+			pos = base + offset;
+			total += pack_pos_to_offset(pack, pos + 1) -
+				 pack_pos_to_offset(pack, pos);
+		}
+	}
+
+	return total;
+}
+
+static off_t get_disk_usage_for_extended(struct bitmap_index *bitmap_git)
+{
+	struct bitmap *result = bitmap_git->result;
+	struct packed_git *pack = bitmap_git->pack;
+	struct eindex *eindex = &bitmap_git->ext_index;
+	off_t total = 0;
+	struct object_info oi = OBJECT_INFO_INIT;
+	off_t object_size;
+	size_t i;
+
+	oi.disk_sizep = &object_size;
+
+	for (i = 0; i < eindex->count; i++) {
+		struct object *obj = eindex->objects[i];
+
+		if (!bitmap_get(result, pack->num_objects + i))
+			continue;
+
+		if (oid_object_info_extended(the_repository, &obj->oid, &oi, 0) < 0)
+			die(_("unable to get disk usage of %s"),
+			    oid_to_hex(&obj->oid));
+
+		total += object_size;
+	}
+	return total;
+}
+
+off_t get_disk_usage_from_bitmap(struct bitmap_index *bitmap_git,
+				 struct rev_info *revs)
+{
+	off_t total = 0;
+
+	total += get_disk_usage_for_type(bitmap_git, OBJ_COMMIT);
+	if (revs->tree_objects)
+		total += get_disk_usage_for_type(bitmap_git, OBJ_TREE);
+	if (revs->blob_objects)
+		total += get_disk_usage_for_type(bitmap_git, OBJ_BLOB);
+	if (revs->tag_objects)
+		total += get_disk_usage_for_type(bitmap_git, OBJ_TAG);
+
+	total += get_disk_usage_for_extended(bitmap_git);
+
+	return total;
+}
diff --git a/pack-bitmap.h b/pack-bitmap.h
index 25dfcf5..36d9993 100644
--- a/pack-bitmap.h
+++ b/pack-bitmap.h
@@ -68,6 +68,8 @@
  */
 int bitmap_has_oid_in_uninteresting(struct bitmap_index *, const struct object_id *oid);
 
+off_t get_disk_usage_from_bitmap(struct bitmap_index *, struct rev_info *);
+
 void bitmap_writer_show_progress(int show);
 void bitmap_writer_set_checksum(unsigned char *sha1);
 void bitmap_writer_build_type_index(struct packing_data *to_pack,
diff --git a/t/t4208-log-magic-pathspec.sh b/t/t4208-log-magic-pathspec.sh
index 5e10136..7f0c1dc 100755
--- a/t/t4208-log-magic-pathspec.sh
+++ b/t/t4208-log-magic-pathspec.sh
@@ -31,13 +31,8 @@
 test_expect_success '"git log :/detached -- " should find a commit only in HEAD' '
 	test_when_finished "git checkout main" &&
 	git checkout --detach &&
-	# Must manually call `test_tick` instead of using `test_commit`,
-	# because the latter additionally creates a tag, which would make
-	# the commit reachable not only via HEAD.
-	test_tick &&
-	git commit --allow-empty -m detached &&
-	test_tick &&
-	git commit --allow-empty -m something-else &&
+	test_commit --no-tag detached &&
+	test_commit --no-tag something-else &&
 	git log :/detached --
 '
 
diff --git a/t/t6115-rev-list-du.sh b/t/t6115-rev-list-du.sh
new file mode 100755
index 0000000..b4aef32
--- /dev/null
+++ b/t/t6115-rev-list-du.sh
@@ -0,0 +1,51 @@
+#!/bin/sh
+
+test_description='basic tests of rev-list --disk-usage'
+. ./test-lib.sh
+
+# we want a mix of reachable and unreachable, as well as
+# objects in the bitmapped pack and some outside of it
+test_expect_success 'set up repository' '
+	test_commit --no-tag one &&
+	test_commit --no-tag two &&
+	git repack -adb &&
+	git reset --hard HEAD^ &&
+	test_commit --no-tag three &&
+	test_commit --no-tag four &&
+	git reset --hard HEAD^
+'
+
+# We don't want to hardcode sizes, because they depend on the exact details of
+# packing, zlib, etc. We'll assume that the regular rev-list and cat-file
+# machinery works and compare the --disk-usage output to that.
+disk_usage_slow () {
+	git rev-list --no-object-names "$@" |
+	git cat-file --batch-check="%(objectsize:disk)" |
+	perl -lne '$total += $_; END { print $total}'
+}
+
+# check behavior with given rev-list options; note that
+# whitespace is not preserved in args
+check_du () {
+	args=$*
+
+	test_expect_success "generate expected size ($args)" "
+		disk_usage_slow $args >expect
+	"
+
+	test_expect_success "rev-list --disk-usage without bitmaps ($args)" "
+		git rev-list --disk-usage $args >actual &&
+		test_cmp expect actual
+	"
+
+	test_expect_success "rev-list --disk-usage with bitmaps ($args)" "
+		git rev-list --disk-usage --use-bitmap-index $args >actual &&
+		test_cmp expect actual
+	"
+}
+
+check_du HEAD
+check_du --objects HEAD
+check_du --objects HEAD^..HEAD
+
+test_done
diff --git a/t/test-lib-functions.sh b/t/test-lib-functions.sh
index c96a555..6348e8d 100644
--- a/t/test-lib-functions.sh
+++ b/t/test-lib-functions.sh
@@ -190,6 +190,7 @@
 	author= &&
 	signoff= &&
 	indir= &&
+	no_tag= &&
 	while test $# != 0
 	do
 		case "$1" in
@@ -216,6 +217,9 @@
 			indir="$2"
 			shift
 			;;
+		--no-tag)
+			no_tag=yes
+			;;
 		*)
 			break
 			;;
@@ -238,7 +242,10 @@
 	git ${indir:+ -C "$indir"} commit \
 	    ${author:+ --author "$author"} \
 	    $signoff -m "$1" &&
-	git ${indir:+ -C "$indir"} tag "${4:-$1}"
+	if test -z "$no_tag"
+	then
+		git ${indir:+ -C "$indir"} tag "${4:-$1}"
+	fi
 }
 
 # Call test_merge with the arguments "<message> <commit>", where <commit>