Merge branch 'kw/patch-ids-optim'

When "git rebase" tries to compare set of changes on the updated
upstream and our own branch, it computes patch-id for all of these
changes and attempts to find matches. This has been optimized by
lazily computing the full patch-id (which is expensive) to be
compared only for changes that touch the same set of paths.

* kw/patch-ids-optim:
  rebase: avoid computing unnecessary patch IDs
  patch-ids: add flag to create the diff patch id using header only data
  patch-ids: replace the seen indicator with a commit pointer
  patch-ids: stop using a hand-rolled hashmap implementation
diff --git a/builtin/log.c b/builtin/log.c
index 1f116be..92dc34d 100644
--- a/builtin/log.c
+++ b/builtin/log.c
@@ -1343,7 +1343,7 @@ static void prepare_bases(struct base_tree_info *bases,
 		struct object_id *patch_id;
 		if (commit->util)
 			continue;
-		if (commit_patch_id(commit, &diffopt, sha1))
+		if (commit_patch_id(commit, &diffopt, sha1, 0))
 			die(_("cannot get patch id"));
 		ALLOC_GROW(bases->patch_id, bases->nr_patch_id + 1, bases->alloc_patch_id);
 		patch_id = bases->patch_id + bases->nr_patch_id;
diff --git a/diff.c b/diff.c
index b43d3dd..534c12e 100644
--- a/diff.c
+++ b/diff.c
@@ -4462,7 +4462,7 @@ static void patch_id_consume(void *priv, char *line, unsigned long len)
 }
 
 /* returns 0 upon success, and writes result into sha1 */
-static int diff_get_patch_id(struct diff_options *options, unsigned char *sha1)
+static int diff_get_patch_id(struct diff_options *options, unsigned char *sha1, int diff_header_only)
 {
 	struct diff_queue_struct *q = &diff_queued_diff;
 	int i;
@@ -4497,9 +4497,6 @@ static int diff_get_patch_id(struct diff_options *options, unsigned char *sha1)
 
 		diff_fill_sha1_info(p->one);
 		diff_fill_sha1_info(p->two);
-		if (fill_mmfile(&mf1, p->one) < 0 ||
-				fill_mmfile(&mf2, p->two) < 0)
-			return error("unable to read files to diff");
 
 		len1 = remove_space(p->one->path, strlen(p->one->path));
 		len2 = remove_space(p->two->path, strlen(p->two->path));
@@ -4534,6 +4531,13 @@ static int diff_get_patch_id(struct diff_options *options, unsigned char *sha1)
 					len2, p->two->path);
 		git_SHA1_Update(&ctx, buffer, len1);
 
+		if (diff_header_only)
+			continue;
+
+		if (fill_mmfile(&mf1, p->one) < 0 ||
+		    fill_mmfile(&mf2, p->two) < 0)
+			return error("unable to read files to diff");
+
 		if (diff_filespec_is_binary(p->one) ||
 		    diff_filespec_is_binary(p->two)) {
 			git_SHA1_Update(&ctx, oid_to_hex(&p->one->oid),
@@ -4556,11 +4560,11 @@ static int diff_get_patch_id(struct diff_options *options, unsigned char *sha1)
 	return 0;
 }
 
-int diff_flush_patch_id(struct diff_options *options, unsigned char *sha1)
+int diff_flush_patch_id(struct diff_options *options, unsigned char *sha1, int diff_header_only)
 {
 	struct diff_queue_struct *q = &diff_queued_diff;
 	int i;
-	int result = diff_get_patch_id(options, sha1);
+	int result = diff_get_patch_id(options, sha1, diff_header_only);
 
 	for (i = 0; i < q->nr; i++)
 		diff_free_filepair(q->queue[i]);
diff --git a/diff.h b/diff.h
index 125447b..7883729 100644
--- a/diff.h
+++ b/diff.h
@@ -342,7 +342,7 @@ extern int run_diff_files(struct rev_info *revs, unsigned int option);
 extern int run_diff_index(struct rev_info *revs, int cached);
 
 extern int do_diff_cache(const unsigned char *, struct diff_options *);
-extern int diff_flush_patch_id(struct diff_options *, unsigned char *);
+extern int diff_flush_patch_id(struct diff_options *, unsigned char *, int);
 
 extern int diff_result_code(struct diff_options *, int);
 
diff --git a/patch-ids.c b/patch-ids.c
index a4d0016..082412a 100644
--- a/patch-ids.c
+++ b/patch-ids.c
@@ -5,7 +5,7 @@
 #include "patch-ids.h"
 
 int commit_patch_id(struct commit *commit, struct diff_options *options,
-		    unsigned char *sha1)
+		    unsigned char *sha1, int diff_header_only)
 {
 	if (commit->parents)
 		diff_tree_sha1(commit->parents->item->object.oid.hash,
@@ -13,93 +13,86 @@ int commit_patch_id(struct commit *commit, struct diff_options *options,
 	else
 		diff_root_tree_sha1(commit->object.oid.hash, "", options);
 	diffcore_std(options);
-	return diff_flush_patch_id(options, sha1);
+	return diff_flush_patch_id(options, sha1, diff_header_only);
 }
 
-static const unsigned char *patch_id_access(size_t index, void *table)
+/*
+ * When we cannot load the full patch-id for both commits for whatever
+ * reason, the function returns -1 (i.e. return error(...)). Despite
+ * the "cmp" in the name of this function, the caller only cares about
+ * the return value being zero (a and b are equivalent) or non-zero (a
+ * and b are different), and returning non-zero would keep both in the
+ * result, even if they actually were equivalent, in order to err on
+ * the side of safety.  The actual value being negative does not have
+ * any significance; only that it is non-zero matters.
+ */
+static int patch_id_cmp(struct patch_id *a,
+			struct patch_id *b,
+			struct diff_options *opt)
 {
-	struct patch_id **id_table = table;
-	return id_table[index]->patch_id;
+	if (is_null_sha1(a->patch_id) &&
+	    commit_patch_id(a->commit, opt, a->patch_id, 0))
+		return error("Could not get patch ID for %s",
+			oid_to_hex(&a->commit->object.oid));
+	if (is_null_sha1(b->patch_id) &&
+	    commit_patch_id(b->commit, opt, b->patch_id, 0))
+		return error("Could not get patch ID for %s",
+			oid_to_hex(&b->commit->object.oid));
+	return hashcmp(a->patch_id, b->patch_id);
 }
 
-static int patch_pos(struct patch_id **table, int nr, const unsigned char *id)
-{
-	return sha1_pos(id, table, nr, patch_id_access);
-}
-
-#define BUCKET_SIZE 190 /* 190 * 21 = 3990, with slop close enough to 4K */
-struct patch_id_bucket {
-	struct patch_id_bucket *next;
-	int nr;
-	struct patch_id bucket[BUCKET_SIZE];
-};
-
 int init_patch_ids(struct patch_ids *ids)
 {
 	memset(ids, 0, sizeof(*ids));
 	diff_setup(&ids->diffopts);
 	DIFF_OPT_SET(&ids->diffopts, RECURSIVE);
 	diff_setup_done(&ids->diffopts);
+	hashmap_init(&ids->patches, (hashmap_cmp_fn)patch_id_cmp, 256);
 	return 0;
 }
 
 int free_patch_ids(struct patch_ids *ids)
 {
-	struct patch_id_bucket *next, *patches;
-
-	free(ids->table);
-	for (patches = ids->patches; patches; patches = next) {
-		next = patches->next;
-		free(patches);
-	}
+	hashmap_free(&ids->patches, 1);
 	return 0;
 }
 
-static struct patch_id *add_commit(struct commit *commit,
-				   struct patch_ids *ids,
-				   int no_add)
+static int init_patch_id_entry(struct patch_id *patch,
+			       struct commit *commit,
+			       struct patch_ids *ids)
 {
-	struct patch_id_bucket *bucket;
-	struct patch_id *ent;
-	unsigned char sha1[20];
-	int pos;
+	unsigned char header_only_patch_id[GIT_SHA1_RAWSZ];
 
-	if (commit_patch_id(commit, &ids->diffopts, sha1))
-		return NULL;
-	pos = patch_pos(ids->table, ids->nr, sha1);
-	if (0 <= pos)
-		return ids->table[pos];
-	if (no_add)
-		return NULL;
+	patch->commit = commit;
+	if (commit_patch_id(commit, &ids->diffopts, header_only_patch_id, 1))
+		return -1;
 
-	pos = -1 - pos;
-
-	bucket = ids->patches;
-	if (!bucket || (BUCKET_SIZE <= bucket->nr)) {
-		bucket = xcalloc(1, sizeof(*bucket));
-		bucket->next = ids->patches;
-		ids->patches = bucket;
-	}
-	ent = &bucket->bucket[bucket->nr++];
-	hashcpy(ent->patch_id, sha1);
-
-	ALLOC_GROW(ids->table, ids->nr + 1, ids->alloc);
-	if (pos < ids->nr)
-		memmove(ids->table + pos + 1, ids->table + pos,
-			sizeof(ent) * (ids->nr - pos));
-	ids->nr++;
-	ids->table[pos] = ent;
-	return ids->table[pos];
+	hashmap_entry_init(patch, sha1hash(header_only_patch_id));
+	return 0;
 }
 
 struct patch_id *has_commit_patch_id(struct commit *commit,
 				     struct patch_ids *ids)
 {
-	return add_commit(commit, ids, 1);
+	struct patch_id patch;
+
+	memset(&patch, 0, sizeof(patch));
+	if (init_patch_id_entry(&patch, commit, ids))
+		return NULL;
+
+	return hashmap_get(&ids->patches, &patch, &ids->diffopts);
 }
 
 struct patch_id *add_commit_patch_id(struct commit *commit,
 				     struct patch_ids *ids)
 {
-	return add_commit(commit, ids, 0);
+	struct patch_id *key = xcalloc(1, sizeof(*key));
+
+	if (init_patch_id_entry(key, commit, ids)) {
+		free(key);
+		return NULL;
+	}
+
+	hashmap_add(&ids->patches, key);
+	return key;
 }
diff --git a/patch-ids.h b/patch-ids.h
index eeb56b3..0f34ea1 100644
--- a/patch-ids.h
+++ b/patch-ids.h
@@ -2,19 +2,18 @@
 #define PATCH_IDS_H
 
 struct patch_id {
-	unsigned char patch_id[20];
-	char seen;
+	struct hashmap_entry ent;
+	unsigned char patch_id[GIT_SHA1_RAWSZ];
+	struct commit *commit;
 };
 
 struct patch_ids {
+	struct hashmap patches;
 	struct diff_options diffopts;
-	int nr, alloc;
-	struct patch_id **table;
-	struct patch_id_bucket *patches;
 };
 
 int commit_patch_id(struct commit *commit, struct diff_options *options,
-		    unsigned char *sha1);
+		    unsigned char *sha1, int);
 int init_patch_ids(struct patch_ids *);
 int free_patch_ids(struct patch_ids *);
 struct patch_id *add_commit_patch_id(struct commit *, struct patch_ids *);
diff --git a/revision.c b/revision.c
index 15873bf..8a29cb0 100644
--- a/revision.c
+++ b/revision.c
@@ -846,7 +846,7 @@ static void cherry_pick_list(struct commit_list *list, struct rev_info *revs)
 		 */
 		if (left_first != !!(flags & SYMMETRIC_LEFT))
 			continue;
-		commit->util = add_commit_patch_id(commit, &ids);
+		add_commit_patch_id(commit, &ids);
 	}
 
 	/* either cherry_mark or cherry_pick are true */
@@ -873,21 +873,9 @@ static void cherry_pick_list(struct commit_list *list, struct rev_info *revs)
 		id = has_commit_patch_id(commit, &ids);
 		if (!id)
 			continue;
-		id->seen = 1;
+
 		commit->object.flags |= cherry_flag;
-	}
-
-	/* Now check the original side for seen ones */
-	for (p = list; p; p = p->next) {
-		struct commit *commit = p->item;
-		struct patch_id *ent;
-
-		ent = commit->util;
-		if (!ent)
-			continue;
-		if (ent->seen)
-			commit->object.flags |= cherry_flag;
-		commit->util = NULL;
+		id->commit->object.flags |= cherry_flag;
 	}
 
 	free_patch_ids(&ids);
diff --git a/t/perf/p3400-rebase.sh b/t/perf/p3400-rebase.sh
new file mode 100644
index 0000000..b3e7d52
--- /dev/null
+++ b/t/perf/p3400-rebase.sh
@@ -0,0 +1,36 @@
+#!/bin/sh
+
+test_description='Tests rebase performance'
+. ./perf-lib.sh
+
+test_perf_default_repo
+
+test_expect_success 'setup' '
+	git checkout -f -b base &&
+	git checkout -b to-rebase &&
+	git checkout -b upstream &&
+	for i in $(seq 100)
+	do
+		# simulate huge diffs
+		echo change$i >unrelated-file$i &&
+		seq 1000 >>unrelated-file$i &&
+		git add unrelated-file$i &&
+		test_tick &&
+		git commit -m commit$i unrelated-file$i &&
+		echo change$i >unrelated-file$i &&
+		seq 1000 | tac >>unrelated-file$i &&
+		git add unrelated-file$i &&
+		test_tick &&
+		git commit -m commit$i-reverse unrelated-file$i ||
+		break
+	done &&
+	git checkout to-rebase &&
+	test_commit our-patch interesting-file
+'
+
+test_perf 'rebase on top of a lot of unrelated changes' '
+	git rebase --onto upstream HEAD^ &&
+	git rebase --onto base HEAD^
+'
+
+test_done
diff --git a/t/t6007-rev-list-cherry-pick-file.sh b/t/t6007-rev-list-cherry-pick-file.sh
index 28d4f6b..1408b60 100755
--- a/t/t6007-rev-list-cherry-pick-file.sh
+++ b/t/t6007-rev-list-cherry-pick-file.sh
@@ -207,4 +207,25 @@
 	test_cmp expect actual
 '
 
+# Corrupt the object store deliberately to make sure
+# the object is not even checked for its existence.
+remove_loose_object () {
+	sha1="$(git rev-parse "$1")" &&
+	remainder=${sha1#??} &&
+	firsttwo=${sha1%$remainder} &&
+	rm .git/objects/$firsttwo/$remainder
+}
+
+test_expect_success '--cherry-pick avoids looking at full diffs' '
+	git checkout -b shy-diff &&
+	test_commit dont-look-at-me &&
+	echo Hello >dont-look-at-me.t &&
+	test_tick &&
+	git commit -m tip dont-look-at-me.t &&
+	git checkout -b mainline HEAD^ &&
+	test_commit to-cherry-pick &&
+	remove_loose_object shy-diff^:dont-look-at-me.t &&
+	git rev-list --cherry-pick ...shy-diff
+'
+
 test_done