reftable/block.c - git - Git at Google

 /*
  * Copyright 2020 Google LLC
  *
  * Use of this source code is governed by a BSD-style
  * license that can be found in the LICENSE file or at
  * https://developers.google.com/open-source/licenses/bsd
  */

 #include "block.h"

 #include "blocksource.h"
 #include "constants.h"
 #include "iter.h"
 #include "record.h"
 #include "reftable-error.h"
 #include "system.h"

 size_t header_size(int version)
 {
 	switch (version) {
 	case 1:
 		return 24;
 	case 2:
 		return 28;
 	}
 	abort();
 }

 size_t footer_size(int version)
 {
 	switch (version) {
 	case 1:
 		return 68;
 	case 2:
 		return 72;
 	}
 	abort();
 }

 static int block_writer_register_restart(struct block_writer *w, int n,
 					 int is_restart, struct reftable_buf *key)
 {
 	uint32_t rlen;
 	int err;

 	rlen = w->restart_len;
 	if (rlen >= MAX_RESTARTS)
 		is_restart = 0;

 	if (is_restart)
 		rlen++;
 	if (2 + 3 * rlen + n > w->block_size - w->next)
 		return REFTABLE_ENTRY_TOO_BIG_ERROR;
 	if (is_restart) {
 		REFTABLE_ALLOC_GROW_OR_NULL(w->restarts, w->restart_len + 1,
 					    w->restart_cap);
 		if (!w->restarts)
 			return REFTABLE_OUT_OF_MEMORY_ERROR;
 		w->restarts[w->restart_len++] = w->next;
 	}

 	w->next += n;

 	reftable_buf_reset(&w->last_key);
 	err = reftable_buf_add(&w->last_key, key->buf, key->len);
 	if (err < 0)
 		return err;

 	w->entries++;
 	return 0;
 }

 int block_writer_init(struct block_writer *bw, uint8_t typ, uint8_t *block,
 		      uint32_t block_size, uint32_t header_off, uint32_t hash_size)
 {
 	bw->block = block;
 	bw->hash_size = hash_size;
 	bw->block_size = block_size;
 	bw->header_off = header_off;
 	bw->block[header_off] = typ;
 	bw->next = header_off + 4;
 	bw->restart_interval = 16;
 	bw->entries = 0;
 	bw->restart_len = 0;
 	bw->last_key.len = 0;
 	if (!bw->zstream) {
 		REFTABLE_CALLOC_ARRAY(bw->zstream, 1);
 		if (!bw->zstream)
 			return REFTABLE_OUT_OF_MEMORY_ERROR;
 		deflateInit(bw->zstream, 9);
 	}

 	return 0;
 }

 uint8_t block_writer_type(struct block_writer *bw)
 {
 	return bw->block[bw->header_off];
 }

 /*
  * Adds the reftable_record to the block. Returns 0 on success and
  * appropriate error codes on failure.
  */
 int block_writer_add(struct block_writer *w, struct reftable_record *rec)
 {
 	struct reftable_buf empty = REFTABLE_BUF_INIT;
 	struct reftable_buf last =
 		w->entries % w->restart_interval == 0 ? empty : w->last_key;
 	struct string_view out = {
 		.buf = w->block + w->next,
 		.len = w->block_size - w->next,
 	};
 	struct string_view start = out;
 	int is_restart = 0;
 	int n = 0;
 	int err;

 	err = reftable_record_key(rec, &w->scratch);
 	if (err < 0)
 		goto done;

 	if (!w->scratch.len) {
 		err = REFTABLE_API_ERROR;
 		goto done;
 	}

 	n = reftable_encode_key(&is_restart, out, last, w->scratch,
 				reftable_record_val_type(rec));
 	if (n < 0) {
 		err = n;
 		goto done;
 	}
 	string_view_consume(&out, n);

 	n = reftable_record_encode(rec, out, w->hash_size);
 	if (n < 0) {
 		err = n;
 		goto done;
 	}
 	string_view_consume(&out, n);

 	err = block_writer_register_restart(w, start.len - out.len, is_restart,
 					    &w->scratch);
 done:
 	return err;
 }

 int block_writer_finish(struct block_writer *w)
 {
 	for (uint32_t i = 0; i < w->restart_len; i++) {
 		reftable_put_be24(w->block + w->next, w->restarts[i]);
 		w->next += 3;
 	}

 	reftable_put_be16(w->block + w->next, w->restart_len);
 	w->next += 2;
 	reftable_put_be24(w->block + 1 + w->header_off, w->next);

 	/*
 	 * Log records are stored zlib-compressed. Note that the compression
 	 * also spans over the restart points we have just written.
 	 */
 	if (block_writer_type(w) == REFTABLE_BLOCK_TYPE_LOG) {
 		int block_header_skip = 4 + w->header_off;
 		uLongf src_len = w->next - block_header_skip, compressed_len;
 		int ret;

 		ret = deflateReset(w->zstream);
 		if (ret != Z_OK)
 			return REFTABLE_ZLIB_ERROR;

 		/*
 		 * Precompute the upper bound of how many bytes the compressed
 		 * data may end up with. Combined with `Z_FINISH`, `deflate()`
 		 * is guaranteed to return `Z_STREAM_END`.
 		 */
 		compressed_len = deflateBound(w->zstream, src_len);
 		REFTABLE_ALLOC_GROW_OR_NULL(w->compressed, compressed_len,
 					    w->compressed_cap);
 		if (!w->compressed) {
 			ret = REFTABLE_OUT_OF_MEMORY_ERROR;
 			return ret;
 		}

 		w->zstream->next_out = w->compressed;
 		w->zstream->avail_out = compressed_len;
 		w->zstream->next_in = w->block + block_header_skip;
 		w->zstream->avail_in = src_len;

 		/*
 		 * We want to perform all decompression in a single step, which
 		 * is why we can pass Z_FINISH here. As we have precomputed the
 		 * deflated buffer's size via `deflateBound()` this function is
 		 * guaranteed to succeed according to the zlib documentation.
 		 */
 		ret = deflate(w->zstream, Z_FINISH);
 		if (ret != Z_STREAM_END)
 			return REFTABLE_ZLIB_ERROR;

 		/*
 		 * Overwrite the uncompressed data we have already written and
 		 * adjust the `next` pointer to point right after the
 		 * compressed data.
 		 */
 		memcpy(w->block + block_header_skip, w->compressed,
 		       w->zstream->total_out);
 		w->next = w->zstream->total_out + block_header_skip;
 	}

 	return w->next;
 }

 static int read_block(struct reftable_block_source *source,
 		      struct reftable_block_data *dest, uint64_t off,
 		      uint32_t sz)
 {
 	size_t size = block_source_size(source);
 	block_source_release_data(dest);
 	if (off >= size)
 		return 0;
 	if (off + sz > size)
 		sz = size - off;
 	return block_source_read_data(source, dest, off, sz);
 }

 int reftable_block_init(struct reftable_block *block,
 			struct reftable_block_source *source,
 			uint32_t offset, uint32_t header_size,
 			uint32_t table_block_size, uint32_t hash_size,
 			uint8_t want_type)
 {
 	uint32_t guess_block_size = table_block_size ?
 		table_block_size : DEFAULT_BLOCK_SIZE;
 	uint32_t full_block_size = table_block_size;
 	uint16_t restart_count;
 	uint32_t restart_off;
 	uint32_t block_size;
 	uint8_t block_type;
 	int err;

 	err = read_block(source, &block->block_data, offset, guess_block_size);
 	if (err < 0)
 		goto done;

 	block_type = block->block_data.data[header_size];
 	if (!reftable_is_block_type(block_type)) {
 		err = REFTABLE_FORMAT_ERROR;
 		goto done;
 	}
 	if (want_type != REFTABLE_BLOCK_TYPE_ANY && block_type != want_type) {
 		err = 1;
 		goto done;
 	}

 	block_size = reftable_get_be24(block->block_data.data + header_size + 1);
 	if (block_size > guess_block_size) {
 		err = read_block(source, &block->block_data, offset, block_size);
 		if (err < 0)
 			goto done;
 	}

 	if (block_type == REFTABLE_BLOCK_TYPE_LOG) {
 		uint32_t block_header_skip = 4 + header_size;
 		uLong dst_len = block_size - block_header_skip;
 		uLong src_len = block->block_data.len - block_header_skip;

 		/* Log blocks specify the *uncompressed* size in their header. */
 		REFTABLE_ALLOC_GROW_OR_NULL(block->uncompressed_data, block_size,
 					    block->uncompressed_cap);
 		if (!block->uncompressed_data) {
 			err = REFTABLE_OUT_OF_MEMORY_ERROR;
 			goto done;
 		}

 		/* Copy over the block header verbatim. It's not compressed. */
 		memcpy(block->uncompressed_data, block->block_data.data, block_header_skip);

 		if (!block->zstream) {
 			REFTABLE_CALLOC_ARRAY(block->zstream, 1);
 			if (!block->zstream) {
 				err = REFTABLE_OUT_OF_MEMORY_ERROR;
 				goto done;
 			}

 			err = inflateInit(block->zstream);
 		} else {
 			err = inflateReset(block->zstream);
 		}
 		if (err != Z_OK) {
 			err = REFTABLE_ZLIB_ERROR;
 			goto done;
 		}

 		block->zstream->next_in = block->block_data.data + block_header_skip;
 		block->zstream->avail_in = src_len;
 		block->zstream->next_out = block->uncompressed_data + block_header_skip;
 		block->zstream->avail_out = dst_len;

 		/*
 		 * We know both input as well as output size, and we know that
 		 * the sizes should never be bigger than `uInt_MAX` because
 		 * blocks can at most be 16MB large. We can thus use `Z_FINISH`
 		 * here to instruct zlib to inflate the data in one go, which
 		 * is more efficient than using `Z_NO_FLUSH`.
 		 */
 		err = inflate(block->zstream, Z_FINISH);
 		if (err != Z_STREAM_END) {
 			err = REFTABLE_ZLIB_ERROR;
 			goto done;
 		}
 		err = 0;

 		if (block->zstream->total_out + block_header_skip != block_size) {
 			err = REFTABLE_FORMAT_ERROR;
 			goto done;
 		}

 		/* We're done with the input data. */
 		block_source_release_data(&block->block_data);
 		block->block_data.data = block->uncompressed_data;
 		block->block_data.len = block_size;
 		full_block_size = src_len + block_header_skip - block->zstream->avail_in;
 	} else if (full_block_size == 0) {
 		full_block_size = block_size;
 	} else if (block_size < full_block_size && block_size < block->block_data.len &&
 		   block->block_data.data[block_size] != 0) {
 		/* If the block is smaller than the full block size, it is
 		   padded (data followed by '\0') or the next block is
 		   unaligned. */
 		full_block_size = block_size;
 	}

 	restart_count = reftable_get_be16(block->block_data.data + block_size - 2);
 	restart_off = block_size - 2 - 3 * restart_count;

 	block->block_type = block_type;
 	block->hash_size = hash_size;
 	block->restart_off = restart_off;
 	block->full_block_size = full_block_size;
 	block->header_off = header_size;
 	block->restart_count = restart_count;

 	err = 0;

 done:
 	if (err < 0)
 		reftable_block_release(block);
 	return err;
 }

 void reftable_block_release(struct reftable_block *block)
 {
 	inflateEnd(block->zstream);
 	reftable_free(block->zstream);
 	reftable_free(block->uncompressed_data);
 	block_source_release_data(&block->block_data);
 	memset(block, 0, sizeof(*block));
 }

 uint8_t reftable_block_type(const struct reftable_block *b)
 {
 	return b->block_data.data[b->header_off];
 }

 int reftable_block_first_key(const struct reftable_block *block, struct reftable_buf *key)
 {
 	int off = block->header_off + 4, n;
 	struct string_view in = {
 		.buf = block->block_data.data + off,
 		.len = block->restart_off - off,
 	};
 	uint8_t extra = 0;

 	reftable_buf_reset(key);

 	n = reftable_decode_key(key, &extra, in);
 	if (n < 0)
 		return n;
 	if (!key->len)
 		return REFTABLE_FORMAT_ERROR;

 	return 0;
 }

 static uint32_t block_restart_offset(const struct reftable_block *b, size_t idx)
 {
 	return reftable_get_be24(b->block_data.data + b->restart_off + 3 * idx);
 }

 void block_iter_init(struct block_iter *it, const struct reftable_block *block)
 {
 	it->block = block;
 	block_iter_seek_start(it);
 }

 void block_iter_seek_start(struct block_iter *it)
 {
 	reftable_buf_reset(&it->last_key);
 	it->next_off = it->block->header_off + 4;
 }

 struct restart_needle_less_args {
 	int error;
 	struct reftable_buf needle;
 	const struct reftable_block *block;
 };

 static int restart_needle_less(size_t idx, void *_args)
 {
 	struct restart_needle_less_args *args = _args;
 	uint32_t off = block_restart_offset(args->block, idx);
 	struct string_view in = {
 		.buf = args->block->block_data.data + off,
 		.len = args->block->restart_off - off,
 	};
 	uint64_t prefix_len, suffix_len;
 	uint8_t extra;
 	int n;

 	/*
 	 * Records at restart points are stored without prefix compression, so
 	 * there is no need to fully decode the record key here. This removes
 	 * the need for allocating memory.
 	 */
 	n = reftable_decode_keylen(in, &prefix_len, &suffix_len, &extra);
 	if (n < 0 || prefix_len) {
 		args->error = 1;
 		return -1;
 	}

 	string_view_consume(&in, n);
 	if (suffix_len > in.len) {
 		args->error = 1;
 		return -1;
 	}

 	n = memcmp(args->needle.buf, in.buf,
 		   args->needle.len < suffix_len ? args->needle.len : suffix_len);
 	if (n)
 		return n < 0;
 	return args->needle.len < suffix_len;
 }

 int block_iter_next(struct block_iter *it, struct reftable_record *rec)
 {
 	struct string_view in = {
 		.buf = (unsigned char *) it->block->block_data.data + it->next_off,
 		.len = it->block->restart_off - it->next_off,
 	};
 	struct string_view start = in;
 	uint8_t extra = 0;
 	int n = 0;

 	if (it->next_off >= it->block->restart_off)
 		return 1;

 	n = reftable_decode_key(&it->last_key, &extra, in);
 	if (n < 0)
 		return -1;
 	if (!it->last_key.len)
 		return REFTABLE_FORMAT_ERROR;

 	string_view_consume(&in, n);
 	n = reftable_record_decode(rec, it->last_key, extra, in, it->block->hash_size,
 				   &it->scratch);
 	if (n < 0)
 		return -1;
 	string_view_consume(&in, n);

 	it->next_off += start.len - in.len;
 	return 0;
 }

 void block_iter_reset(struct block_iter *it)
 {
 	reftable_buf_reset(&it->last_key);
 	it->next_off = 0;
 	it->block = NULL;
 }

 void block_iter_close(struct block_iter *it)
 {
 	reftable_buf_release(&it->last_key);
 	reftable_buf_release(&it->scratch);
 }

 int block_iter_seek_key(struct block_iter *it, struct reftable_buf *want)
 {
 	struct restart_needle_less_args args = {
 		.needle = *want,
 		.block = it->block,
 	};
 	struct reftable_record rec;
 	int err = 0;
 	size_t i;

 	/*
 	 * Perform a binary search over the block's restart points, which
 	 * avoids doing a linear scan over the whole block. Like this, we
 	 * identify the section of the block that should contain our key.
 	 *
 	 * Note that we explicitly search for the first restart point _greater_
 	 * than the sought-after record, not _greater or equal_ to it. In case
 	 * the sought-after record is located directly at the restart point we
 	 * would otherwise start doing the linear search at the preceding
 	 * restart point. While that works alright, we would end up scanning
 	 * too many record.
 	 */
 	i = binsearch(it->block->restart_count, &restart_needle_less, &args);
 	if (args.error) {
 		err = REFTABLE_FORMAT_ERROR;
 		goto done;
 	}

 	/*
 	 * Now there are multiple cases:
 	 *
 	 *   - `i == 0`: The wanted record is smaller than the record found at
 	 *     the first restart point. As the first restart point is the first
 	 *     record in the block, our wanted record cannot be located in this
 	 *     block at all. We still need to position the iterator so that the
 	 *     next call to `block_iter_next()` will yield an end-of-iterator
 	 *     signal.
 	 *
 	 *   - `i == restart_count`: The wanted record was not found at any of
 	 *     the restart points. As there is no restart point at the end of
 	 *     the section the record may thus be contained in the last block.
 	 *
 	 *   - `i > 0`: The wanted record must be contained in the section
 	 *     before the found restart point. We thus do a linear search
 	 *     starting from the preceding restart point.
 	 */
 	if (i > 0)
 		it->next_off = block_restart_offset(it->block, i - 1);
 	else
 		it->next_off = it->block->header_off + 4;

 	err = reftable_record_init(&rec, reftable_block_type(it->block));
 	if (err < 0)
 		goto done;

 	/*
 	 * We're looking for the last entry less than the wanted key so that
 	 * the next call to `block_reader_next()` would yield the wanted
 	 * record. We thus don't want to position our iterator at the sought
 	 * after record, but one before. To do so, we have to go one entry too
 	 * far and then back up.
 	 */
 	while (1) {
 		size_t prev_off = it->next_off;

 		err = block_iter_next(it, &rec);
 		if (err < 0)
 			goto done;
 		if (err > 0) {
 			it->next_off = prev_off;
 			err = 0;
 			goto done;
 		}

 		err = reftable_record_key(&rec, &it->last_key);
 		if (err < 0)
 			goto done;

 		/*
 		 * Check whether the current key is greater or equal to the
 		 * sought-after key. In case it is greater we know that the
 		 * record does not exist in the block and can thus abort early.
 		 * In case it is equal to the sought-after key we have found
 		 * the desired record.
 		 *
 		 * Note that we store the next record's key record directly in
 		 * `last_key` without restoring the key of the preceding record
 		 * in case we need to go one record back. This is safe to do as
 		 * `block_iter_next()` would return the ref whose key is equal
 		 * to `last_key` now, and naturally all keys share a prefix
 		 * with themselves.
 		 */
 		if (reftable_buf_cmp(&it->last_key, want) >= 0) {
 			it->next_off = prev_off;
 			goto done;
 		}
 	}

 done:
 	reftable_record_release(&rec);
 	return err;
 }

 static int block_iter_seek_void(void *it, struct reftable_record *want)
 {
 	struct reftable_buf buf = REFTABLE_BUF_INIT;
 	struct block_iter *bi = it;
 	int err;

 	if (bi->block->block_type != want->type)
 		return REFTABLE_API_ERROR;

 	err = reftable_record_key(want, &buf);
 	if (err < 0)
 		goto out;

 	err = block_iter_seek_key(it, &buf);
 	if (err < 0)
 		goto out;

 	err = 0;

 out:
 	reftable_buf_release(&buf);
 	return err;
 }

 static int block_iter_next_void(void *it, struct reftable_record *rec)
 {
 	return block_iter_next(it, rec);
 }

 static void block_iter_close_void(void *it)
 {
 	block_iter_close(it);
 }

 static struct reftable_iterator_vtable block_iter_vtable = {
 	.seek = &block_iter_seek_void,
 	.next = &block_iter_next_void,
 	.close = &block_iter_close_void,
 };

 int reftable_block_init_iterator(const struct reftable_block *b,
 				 struct reftable_iterator *it)
 {
 	struct block_iter *bi;

 	REFTABLE_CALLOC_ARRAY(bi, 1);
 	block_iter_init(bi, b);

 	assert(!it->ops);
 	it->iter_arg = bi;
 	it->ops = &block_iter_vtable;

 	return 0;
 }

 void block_writer_release(struct block_writer *bw)
 {
 	deflateEnd(bw->zstream);
 	REFTABLE_FREE_AND_NULL(bw->zstream);
 	REFTABLE_FREE_AND_NULL(bw->restarts);
 	REFTABLE_FREE_AND_NULL(bw->compressed);
 	reftable_buf_release(&bw->scratch);
 	reftable_buf_release(&bw->last_key);
 	/* the block is not owned. */
 }
	/*
	* Copyright 2020 Google LLC
	*
	* Use of this source code is governed by a BSD-style
	* license that can be found in the LICENSE file or at
	* https://developers.google.com/open-source/licenses/bsd
	*/

	#include "block.h"

	#include "blocksource.h"
	#include "constants.h"
	#include "iter.h"
	#include "record.h"
	#include "reftable-error.h"
	#include "system.h"

	size_t header_size(int version)
	{
	switch (version) {
	case 1:
	return 24;
	case 2:
	return 28;
	}
	abort();
	}

	size_t footer_size(int version)
	{
	switch (version) {
	case 1:
	return 68;
	case 2:
	return 72;
	}
	abort();
	}

	static int block_writer_register_restart(struct block_writer *w, int n,
	int is_restart, struct reftable_buf *key)
	{
	uint32_t rlen;
	int err;

	rlen = w->restart_len;
	if (rlen >= MAX_RESTARTS)
	is_restart = 0;

	if (is_restart)
	rlen++;
	if (2 + 3 * rlen + n > w->block_size - w->next)
	return REFTABLE_ENTRY_TOO_BIG_ERROR;
	if (is_restart) {
	REFTABLE_ALLOC_GROW_OR_NULL(w->restarts, w->restart_len + 1,
	w->restart_cap);
	if (!w->restarts)
	return REFTABLE_OUT_OF_MEMORY_ERROR;
	w->restarts[w->restart_len++] = w->next;
	}

	w->next += n;

	reftable_buf_reset(&w->last_key);
	err = reftable_buf_add(&w->last_key, key->buf, key->len);
	if (err < 0)
	return err;

	w->entries++;
	return 0;
	}

	int block_writer_init(struct block_writer bw, uint8_t typ, uint8_t block,
	uint32_t block_size, uint32_t header_off, uint32_t hash_size)
	{
	bw->block = block;
	bw->hash_size = hash_size;
	bw->block_size = block_size;
	bw->header_off = header_off;
	bw->block[header_off] = typ;
	bw->next = header_off + 4;
	bw->restart_interval = 16;
	bw->entries = 0;
	bw->restart_len = 0;
	bw->last_key.len = 0;
	if (!bw->zstream) {
	REFTABLE_CALLOC_ARRAY(bw->zstream, 1);
	if (!bw->zstream)
	return REFTABLE_OUT_OF_MEMORY_ERROR;
	deflateInit(bw->zstream, 9);
	}

	return 0;
	}

	uint8_t block_writer_type(struct block_writer *bw)
	{
	return bw->block[bw->header_off];
	}

	/*
	* Adds the reftable_record to the block. Returns 0 on success and
	* appropriate error codes on failure.
	*/
	int block_writer_add(struct block_writer w, struct reftable_record rec)
	{
	struct reftable_buf empty = REFTABLE_BUF_INIT;
	struct reftable_buf last =
	w->entries % w->restart_interval == 0 ? empty : w->last_key;
	struct string_view out = {
	.buf = w->block + w->next,
	.len = w->block_size - w->next,
	};
	struct string_view start = out;
	int is_restart = 0;
	int n = 0;
	int err;

	err = reftable_record_key(rec, &w->scratch);
	if (err < 0)
	goto done;

	if (!w->scratch.len) {
	err = REFTABLE_API_ERROR;
	goto done;
	}

	n = reftable_encode_key(&is_restart, out, last, w->scratch,
	reftable_record_val_type(rec));
	if (n < 0) {
	err = n;
	goto done;
	}
	string_view_consume(&out, n);

	n = reftable_record_encode(rec, out, w->hash_size);
	if (n < 0) {
	err = n;
	goto done;
	}
	string_view_consume(&out, n);

	err = block_writer_register_restart(w, start.len - out.len, is_restart,
	&w->scratch);
	done:
	return err;
	}

	int block_writer_finish(struct block_writer *w)
	{
	for (uint32_t i = 0; i < w->restart_len; i++) {
	reftable_put_be24(w->block + w->next, w->restarts[i]);
	w->next += 3;
	}

	reftable_put_be16(w->block + w->next, w->restart_len);
	w->next += 2;
	reftable_put_be24(w->block + 1 + w->header_off, w->next);

	/*
	* Log records are stored zlib-compressed. Note that the compression
	* also spans over the restart points we have just written.
	*/
	if (block_writer_type(w) == REFTABLE_BLOCK_TYPE_LOG) {
	int block_header_skip = 4 + w->header_off;
	uLongf src_len = w->next - block_header_skip, compressed_len;
	int ret;

	ret = deflateReset(w->zstream);
	if (ret != Z_OK)
	return REFTABLE_ZLIB_ERROR;

	/*
	* Precompute the upper bound of how many bytes the compressed
	* data may end up with. Combined with `Z_FINISH`, `deflate()`
	* is guaranteed to return `Z_STREAM_END`.
	*/
	compressed_len = deflateBound(w->zstream, src_len);
	REFTABLE_ALLOC_GROW_OR_NULL(w->compressed, compressed_len,
	w->compressed_cap);
	if (!w->compressed) {
	ret = REFTABLE_OUT_OF_MEMORY_ERROR;
	return ret;
	}

	w->zstream->next_out = w->compressed;
	w->zstream->avail_out = compressed_len;
	w->zstream->next_in = w->block + block_header_skip;
	w->zstream->avail_in = src_len;

	/*
	* We want to perform all decompression in a single step, which
	* is why we can pass Z_FINISH here. As we have precomputed the
	* deflated buffer's size via `deflateBound()` this function is
	* guaranteed to succeed according to the zlib documentation.
	*/
	ret = deflate(w->zstream, Z_FINISH);
	if (ret != Z_STREAM_END)
	return REFTABLE_ZLIB_ERROR;

	/*
	* Overwrite the uncompressed data we have already written and
	* adjust the `next` pointer to point right after the
	* compressed data.
	*/
	memcpy(w->block + block_header_skip, w->compressed,
	w->zstream->total_out);
	w->next = w->zstream->total_out + block_header_skip;
	}

	return w->next;
	}

	static int read_block(struct reftable_block_source *source,
	struct reftable_block_data *dest, uint64_t off,
	uint32_t sz)
	{
	size_t size = block_source_size(source);
	block_source_release_data(dest);
	if (off >= size)
	return 0;
	if (off + sz > size)
	sz = size - off;
	return block_source_read_data(source, dest, off, sz);
	}

	int reftable_block_init(struct reftable_block *block,
	struct reftable_block_source *source,
	uint32_t offset, uint32_t header_size,
	uint32_t table_block_size, uint32_t hash_size,
	uint8_t want_type)
	{
	uint32_t guess_block_size = table_block_size ?
	table_block_size : DEFAULT_BLOCK_SIZE;
	uint32_t full_block_size = table_block_size;
	uint16_t restart_count;
	uint32_t restart_off;
	uint32_t block_size;
	uint8_t block_type;
	int err;

	err = read_block(source, &block->block_data, offset, guess_block_size);
	if (err < 0)
	goto done;

	block_type = block->block_data.data[header_size];
	if (!reftable_is_block_type(block_type)) {
	err = REFTABLE_FORMAT_ERROR;
	goto done;
	}
	if (want_type != REFTABLE_BLOCK_TYPE_ANY && block_type != want_type) {
	err = 1;
	goto done;
	}

	block_size = reftable_get_be24(block->block_data.data + header_size + 1);
	if (block_size > guess_block_size) {
	err = read_block(source, &block->block_data, offset, block_size);
	if (err < 0)
	goto done;
	}

	if (block_type == REFTABLE_BLOCK_TYPE_LOG) {
	uint32_t block_header_skip = 4 + header_size;
	uLong dst_len = block_size - block_header_skip;
	uLong src_len = block->block_data.len - block_header_skip;

	/* Log blocks specify the uncompressed size in their header. */
	REFTABLE_ALLOC_GROW_OR_NULL(block->uncompressed_data, block_size,
	block->uncompressed_cap);
	if (!block->uncompressed_data) {
	err = REFTABLE_OUT_OF_MEMORY_ERROR;
	goto done;
	}

	/* Copy over the block header verbatim. It's not compressed. */
	memcpy(block->uncompressed_data, block->block_data.data, block_header_skip);

	if (!block->zstream) {
	REFTABLE_CALLOC_ARRAY(block->zstream, 1);
	if (!block->zstream) {
	err = REFTABLE_OUT_OF_MEMORY_ERROR;
	goto done;
	}

	err = inflateInit(block->zstream);
	} else {
	err = inflateReset(block->zstream);
	}
	if (err != Z_OK) {
	err = REFTABLE_ZLIB_ERROR;
	goto done;
	}

	block->zstream->next_in = block->block_data.data + block_header_skip;
	block->zstream->avail_in = src_len;
	block->zstream->next_out = block->uncompressed_data + block_header_skip;
	block->zstream->avail_out = dst_len;

	/*
	* We know both input as well as output size, and we know that
	* the sizes should never be bigger than `uInt_MAX` because
	* blocks can at most be 16MB large. We can thus use `Z_FINISH`
	* here to instruct zlib to inflate the data in one go, which
	* is more efficient than using `Z_NO_FLUSH`.
	*/
	err = inflate(block->zstream, Z_FINISH);
	if (err != Z_STREAM_END) {
	err = REFTABLE_ZLIB_ERROR;
	goto done;
	}
	err = 0;

	if (block->zstream->total_out + block_header_skip != block_size) {
	err = REFTABLE_FORMAT_ERROR;
	goto done;
	}

	/* We're done with the input data. */
	block_source_release_data(&block->block_data);
	block->block_data.data = block->uncompressed_data;
	block->block_data.len = block_size;
	full_block_size = src_len + block_header_skip - block->zstream->avail_in;
	} else if (full_block_size == 0) {
	full_block_size = block_size;
	} else if (block_size < full_block_size && block_size < block->block_data.len &&
	block->block_data.data[block_size] != 0) {
	/* If the block is smaller than the full block size, it is
	padded (data followed by '\0') or the next block is
	unaligned. */
	full_block_size = block_size;
	}

	restart_count = reftable_get_be16(block->block_data.data + block_size - 2);
	restart_off = block_size - 2 - 3 * restart_count;

	block->block_type = block_type;
	block->hash_size = hash_size;
	block->restart_off = restart_off;
	block->full_block_size = full_block_size;
	block->header_off = header_size;
	block->restart_count = restart_count;

	err = 0;

	done:
	if (err < 0)
	reftable_block_release(block);
	return err;
	}

	void reftable_block_release(struct reftable_block *block)
	{
	inflateEnd(block->zstream);
	reftable_free(block->zstream);
	reftable_free(block->uncompressed_data);
	block_source_release_data(&block->block_data);
	memset(block, 0, sizeof(*block));
	}

	uint8_t reftable_block_type(const struct reftable_block *b)
	{
	return b->block_data.data[b->header_off];
	}

	int reftable_block_first_key(const struct reftable_block block, struct reftable_buf key)
	{
	int off = block->header_off + 4, n;
	struct string_view in = {
	.buf = block->block_data.data + off,
	.len = block->restart_off - off,
	};
	uint8_t extra = 0;

	reftable_buf_reset(key);

	n = reftable_decode_key(key, &extra, in);
	if (n < 0)
	return n;
	if (!key->len)
	return REFTABLE_FORMAT_ERROR;

	return 0;
	}

	static uint32_t block_restart_offset(const struct reftable_block *b, size_t idx)
	{
	return reftable_get_be24(b->block_data.data + b->restart_off + 3 * idx);
	}

	void block_iter_init(struct block_iter it, const struct reftable_block block)
	{
	it->block = block;
	block_iter_seek_start(it);
	}

	void block_iter_seek_start(struct block_iter *it)
	{
	reftable_buf_reset(&it->last_key);
	it->next_off = it->block->header_off + 4;
	}

	struct restart_needle_less_args {
	int error;
	struct reftable_buf needle;
	const struct reftable_block *block;
	};

	static int restart_needle_less(size_t idx, void *_args)
	{
	struct restart_needle_less_args *args = _args;
	uint32_t off = block_restart_offset(args->block, idx);
	struct string_view in = {
	.buf = args->block->block_data.data + off,
	.len = args->block->restart_off - off,
	};
	uint64_t prefix_len, suffix_len;
	uint8_t extra;
	int n;

	/*
	* Records at restart points are stored without prefix compression, so
	* there is no need to fully decode the record key here. This removes
	* the need for allocating memory.
	*/
	n = reftable_decode_keylen(in, &prefix_len, &suffix_len, &extra);
	if (n < 0 \|\| prefix_len) {
	args->error = 1;
	return -1;
	}

	string_view_consume(&in, n);
	if (suffix_len > in.len) {
	args->error = 1;
	return -1;
	}

	n = memcmp(args->needle.buf, in.buf,
	args->needle.len < suffix_len ? args->needle.len : suffix_len);
	if (n)
	return n < 0;
	return args->needle.len < suffix_len;
	}

	int block_iter_next(struct block_iter it, struct reftable_record rec)
	{
	struct string_view in = {
	.buf = (unsigned char *) it->block->block_data.data + it->next_off,
	.len = it->block->restart_off - it->next_off,
	};
	struct string_view start = in;
	uint8_t extra = 0;
	int n = 0;

	if (it->next_off >= it->block->restart_off)
	return 1;

	n = reftable_decode_key(&it->last_key, &extra, in);
	if (n < 0)
	return -1;
	if (!it->last_key.len)
	return REFTABLE_FORMAT_ERROR;

	string_view_consume(&in, n);
	n = reftable_record_decode(rec, it->last_key, extra, in, it->block->hash_size,
	&it->scratch);
	if (n < 0)
	return -1;
	string_view_consume(&in, n);

	it->next_off += start.len - in.len;
	return 0;
	}

	void block_iter_reset(struct block_iter *it)
	{
	reftable_buf_reset(&it->last_key);
	it->next_off = 0;
	it->block = NULL;
	}

	void block_iter_close(struct block_iter *it)
	{
	reftable_buf_release(&it->last_key);
	reftable_buf_release(&it->scratch);
	}

	int block_iter_seek_key(struct block_iter it, struct reftable_buf want)
	{
	struct restart_needle_less_args args = {
	.needle = *want,
	.block = it->block,
	};
	struct reftable_record rec;
	int err = 0;
	size_t i;

	/*
	* Perform a binary search over the block's restart points, which
	* avoids doing a linear scan over the whole block. Like this, we
	* identify the section of the block that should contain our key.
	*
	* Note that we explicitly search for the first restart point _greater_
	* than the sought-after record, not _greater or equal_ to it. In case
	* the sought-after record is located directly at the restart point we
	* would otherwise start doing the linear search at the preceding
	* restart point. While that works alright, we would end up scanning
	* too many record.
	*/
	i = binsearch(it->block->restart_count, &restart_needle_less, &args);
	if (args.error) {
	err = REFTABLE_FORMAT_ERROR;
	goto done;
	}

	/*
	* Now there are multiple cases:
	*
	* - `i == 0`: The wanted record is smaller than the record found at
	* the first restart point. As the first restart point is the first
	* record in the block, our wanted record cannot be located in this
	* block at all. We still need to position the iterator so that the
	* next call to `block_iter_next()` will yield an end-of-iterator
	* signal.
	*
	* - `i == restart_count`: The wanted record was not found at any of
	* the restart points. As there is no restart point at the end of
	* the section the record may thus be contained in the last block.
	*
	* - `i > 0`: The wanted record must be contained in the section
	* before the found restart point. We thus do a linear search
	* starting from the preceding restart point.
	*/
	if (i > 0)
	it->next_off = block_restart_offset(it->block, i - 1);
	else
	it->next_off = it->block->header_off + 4;

	err = reftable_record_init(&rec, reftable_block_type(it->block));
	if (err < 0)
	goto done;

	/*
	* We're looking for the last entry less than the wanted key so that
	* the next call to `block_reader_next()` would yield the wanted
	* record. We thus don't want to position our iterator at the sought
	* after record, but one before. To do so, we have to go one entry too
	* far and then back up.
	*/
	while (1) {
	size_t prev_off = it->next_off;

	err = block_iter_next(it, &rec);
	if (err < 0)
	goto done;
	if (err > 0) {
	it->next_off = prev_off;
	err = 0;
	goto done;
	}

	err = reftable_record_key(&rec, &it->last_key);
	if (err < 0)
	goto done;

	/*
	* Check whether the current key is greater or equal to the
	* sought-after key. In case it is greater we know that the
	* record does not exist in the block and can thus abort early.
	* In case it is equal to the sought-after key we have found
	* the desired record.
	*
	* Note that we store the next record's key record directly in
	* `last_key` without restoring the key of the preceding record
	* in case we need to go one record back. This is safe to do as
	* `block_iter_next()` would return the ref whose key is equal
	* to `last_key` now, and naturally all keys share a prefix
	* with themselves.
	*/
	if (reftable_buf_cmp(&it->last_key, want) >= 0) {
	it->next_off = prev_off;
	goto done;
	}
	}

	done:
	reftable_record_release(&rec);
	return err;
	}

	static int block_iter_seek_void(void it, struct reftable_record want)
	{
	struct reftable_buf buf = REFTABLE_BUF_INIT;
	struct block_iter *bi = it;
	int err;

	if (bi->block->block_type != want->type)
	return REFTABLE_API_ERROR;

	err = reftable_record_key(want, &buf);
	if (err < 0)
	goto out;

	err = block_iter_seek_key(it, &buf);
	if (err < 0)
	goto out;

	err = 0;

	out:
	reftable_buf_release(&buf);
	return err;
	}

	static int block_iter_next_void(void it, struct reftable_record rec)
	{
	return block_iter_next(it, rec);
	}

	static void block_iter_close_void(void *it)
	{
	block_iter_close(it);
	}

	static struct reftable_iterator_vtable block_iter_vtable = {
	.seek = &block_iter_seek_void,
	.next = &block_iter_next_void,
	.close = &block_iter_close_void,
	};

	int reftable_block_init_iterator(const struct reftable_block *b,
	struct reftable_iterator *it)
	{
	struct block_iter *bi;

	REFTABLE_CALLOC_ARRAY(bi, 1);
	block_iter_init(bi, b);

	assert(!it->ops);
	it->iter_arg = bi;
	it->ops = &block_iter_vtable;

	return 0;
	}

	void block_writer_release(struct block_writer *bw)
	{
	deflateEnd(bw->zstream);
	REFTABLE_FREE_AND_NULL(bw->zstream);
	REFTABLE_FREE_AND_NULL(bw->restarts);
	REFTABLE_FREE_AND_NULL(bw->compressed);
	reftable_buf_release(&bw->scratch);
	reftable_buf_release(&bw->last_key);
	/* the block is not owned. */
	}