blob: b6b5260e1b8c39218563bb25e07d2f4d6cabeaa3 [file] [log] [blame]
// Copyright 2010-2015, Google Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
// * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "storage/existence_filter.h"
#include <cstring>
#include <cmath>
#include "base/logging.h"
#include "base/port.h"
namespace mozc {
namespace storage {
namespace {
// Rotate the value in 'original' by 'num_bits'
inline uint64 RotateLeft64(uint64 original, int num_bits) {
// TODO(team): we may want to use rotl64 depending on the platform.
DCHECK(0 < num_bits && num_bits < 64) << num_bits;
return (original << (64 - num_bits)) | (original >> num_bits);
}
inline uint32 BitsToWords(uint32 bits) {
uint32 words = (bits + 31) >> 5;
if (bits > 0 && words == 0) {
words = 1 << (32 - 5); // possible overflow
}
return words;
}
} // namespace
class ExistenceFilter::BlockBitmap {
public:
BlockBitmap(uint32 length, bool is_mutable);
~BlockBitmap();
void Clear();
bool Get(uint32 index) const;
void Set(uint32 index);
// REQUIRES: "iter" is zero, or was set by a preceding call
// to GetMutableFragment().
//
// This allows caller to peek into and write to the underlying bitmap
// as a series of non-empty fragments in whole number of 4-byte words.
// If the entire bitmap has been exhausted, return false. Otherwise,
// return true and point caller to the next non-empty fragment.
//
// Usage:
// char** ptr;
// size_t bytes;
// for (uint32 iter = 0; bm.GetMutableFragment(&iter, &ptr, &bytes); ) {
// Process(*ptr, bytes);
// }
bool GetMutableFragment(uint32* itr, char*** ptr, size_t* size);
private:
static const int kBlockShift = 21; // 2^21 bits == 256KB block
static const int kBlockBits = 1 << kBlockShift;
static const int kBlockMask = kBlockBits - 1;
static const int kBlockBytes = kBlockBits >> 3;
static const int kBlockWords = kBlockBits >> 5;
// Array of blocks. Each block has kBlockBits region except for last block.
uint32 **block_;
uint32 num_blocks_;
uint32 bytes_in_last_;
const bool is_mutable_;
DISALLOW_COPY_AND_ASSIGN(BlockBitmap);
};
ExistenceFilter::BlockBitmap::BlockBitmap(uint32 length, bool is_mutable)
: is_mutable_(is_mutable) {
CHECK_GT(length, 0);
const uint32 bits_in_last_block = (length & kBlockMask);
// Allocate the block array
num_blocks_ = (length >> kBlockShift);
if (bits_in_last_block) {
// Need an extra block for the leftover bits
num_blocks_++;
}
CHECK_GT(num_blocks_, 0);
block_ = new uint32*[num_blocks_];
CHECK(block_);
// Allocate full blocks
for (size_t i = 0; i < num_blocks_ - 1; ++i) {
block_[i] = is_mutable_ ? new uint32[kBlockWords] : NULL;
}
// Allocate the last block
if (bits_in_last_block) {
bytes_in_last_ = sizeof(uint32) * ((bits_in_last_block + 31)/32);
} else {
bytes_in_last_ = kBlockBytes;
}
CHECK_EQ(bytes_in_last_ % sizeof(uint32), 0);
block_[num_blocks_-1] =
is_mutable_ ? new uint32[bytes_in_last_/sizeof(uint32)] : NULL;
}
ExistenceFilter::BlockBitmap::~BlockBitmap() {
if (is_mutable_) {
for (int i = 0; i < num_blocks_; ++i) {
delete [] block_[i];
}
}
delete [] block_;
}
void ExistenceFilter::BlockBitmap::Clear() {
if (!is_mutable_) {
return;
}
for (int i = 0; i < num_blocks_ - 1; ++i) {
memset(block_[i], 0, kBlockBytes);
}
memset(block_[num_blocks_-1], 0, bytes_in_last_);
}
ExistenceFilter::ExistenceFilter(uint32 m, uint32 n, int k)
: vec_size_(m ? m : 1),
expected_nelts_(n),
num_hashes_(k) {
CHECK_LT(num_hashes_, 8);
rep_.reset(new BlockBitmap(m ? m : 1, true));
rep_->Clear();
}
// this is private constructor
ExistenceFilter::ExistenceFilter(uint32 m, uint32 n, int k,
bool is_mutable)
: vec_size_(m ? m : 1),
expected_nelts_(n),
num_hashes_(k) {
CHECK_LT(num_hashes_, 8);
rep_.reset(new BlockBitmap(m ? m : 1, is_mutable));
rep_->Clear();
}
// static
ExistenceFilter *
ExistenceFilter::CreateImmutableExietenceFilter(uint32 m,
uint32 n,
int k) {
return new ExistenceFilter(m, n, k, false);
}
ExistenceFilter* ExistenceFilter::CreateOptimal(size_t size_in_bytes,
uint32 estimated_insertions) {
CHECK_LT(size_in_bytes, (1 << 29))
<< "Requested size is too big";
CHECK_GT(estimated_insertions, 0);
const uint32 m = size_in_bytes * 8;
const uint32 n = estimated_insertions;
int optimal_k = static_cast<int>((static_cast<float>(m) / n * log(2.0))
+ 0.5);
if (optimal_k < 1) {
optimal_k = 1;
}
if (optimal_k > 7) {
optimal_k = 7;
}
VLOG(1) << "optimal_k: " << optimal_k;
ExistenceFilter *filter = new ExistenceFilter(m, n, optimal_k);
CHECK(filter);
return filter;
}
ExistenceFilter::~ExistenceFilter() {
}
void ExistenceFilter::Clear() {
rep_->Clear();
}
inline bool ExistenceFilter::BlockBitmap::Get(uint32 index) const {
const uint32 bindex = index >> kBlockShift;
const uint32 windex = (index & kBlockMask) >> 5;
const uint32 bitpos = index & 31;
return (block_[bindex][windex] >> bitpos) & 1;
}
inline void ExistenceFilter::BlockBitmap::Set(uint32 index) {
const uint32 bindex = index >> kBlockShift;
const uint32 windex = (index & kBlockMask) >> 5;
const uint32 bitpos = index & 31;
block_[bindex][windex] |= (static_cast<uint32>(1) << bitpos);
}
bool ExistenceFilter::BlockBitmap::GetMutableFragment(uint32 *iter,
char ***ptr,
size_t *size) {
const uint32 b = *iter;
if (b >= num_blocks_) {
// |iter| reached to the end of the block.
return false;
}
(*iter)++;
*ptr = reinterpret_cast<char **>(&block_[b]);
*size = (b == num_blocks_-1) ? bytes_in_last_ : kBlockBytes;
return true;
}
bool ExistenceFilter::Exists(uint64 hash) const {
for (size_t i = 0; i < num_hashes_; ++i) {
hash = RotateLeft64(hash, 8);
uint32 index = hash % vec_size_;
if (!rep_->Get(index)) {
return false;
}
}
return true;
}
void ExistenceFilter::Insert(uint64 hash) {
for (size_t i = 0; i < num_hashes_; ++i) {
hash = RotateLeft64(hash, 8);
uint32 index = hash % vec_size_;
rep_->Set(index);
}
}
size_t ExistenceFilter::Size() const {
return (BitsToWords(vec_size_) * sizeof(uint32));
}
size_t ExistenceFilter::MinFilterSizeInBytesForErrorRate(float error_rate,
size_t num_elements) {
// (-num_hashes * num_elements) / log(1 - error_rate^(1/num_hashes))
double min_bits = 0;
for (size_t num_hashes = 1; num_hashes < 8; ++num_hashes) {
double num_bits = (-1.0 * num_hashes * num_elements) /
log(1.0 - pow(static_cast<double>(error_rate), (1.0 / num_hashes)));
if (min_bits == 0 || num_bits < min_bits)
min_bits = num_bits;
}
return static_cast<size_t>(ceil(min_bits / 8));
}
// allocate 'buf' and write filter to the buf.
// 'size' will hold the size of buf
void ExistenceFilter::Write(char **buf, size_t *size) {
const int require_bytes = sizeof(Header) + Size();
*buf = new char[require_bytes];
CHECK(*buf);
*size = require_bytes;
char *buf_ptr = *buf;
// write header
memcpy(buf_ptr, &vec_size_, sizeof(vec_size_));
buf_ptr += sizeof(vec_size_);
memcpy(buf_ptr, &expected_nelts_, sizeof(expected_nelts_));
buf_ptr += sizeof(expected_nelts_);
memcpy(buf_ptr, &num_hashes_, sizeof(num_hashes_));
buf_ptr += sizeof(num_hashes_);
LOG(INFO) << "Write header : vec_size" << vec_size_ << " expected_nelts "
<< expected_nelts_ << " num_hashes " << num_hashes_;
// write bitmap
char **fragment_ptr = NULL;
size_t bytes = 0;
size_t write = 0;
for (uint32 iter = 0;
rep_->GetMutableFragment(&iter, &fragment_ptr, &bytes);) {
memcpy(buf_ptr, *fragment_ptr, bytes);
buf_ptr += bytes;
write += bytes;
}
if (write != Size()) {
LOG(ERROR) << "Write " << write << " bytes instead of " << Size();
}
}
bool ExistenceFilter::ReadHeader(const char *buf, Header* header) {
memcpy(&(header->m), buf, sizeof(header->m));
buf += sizeof(header->m);
memcpy(&(header->n), buf, sizeof(header->n));
buf += sizeof(header->n);
memcpy(&(header->k), buf, sizeof(header->k));
buf += sizeof(header->k);
if (header->k >= 8 || header->k <= 0) {
LOG(ERROR) << "Bad number of hashes (header->k)";
return false;
}
return true;
}
ExistenceFilter* ExistenceFilter::Read(const char *buf, size_t size) {
Header header;
const uint32 header_bytes =
sizeof(header.m) + sizeof(header.n) + sizeof(header.k);
if (size < header_bytes) {
LOG(ERROR) << "Not enough bufsize: could not read header";
return NULL;
}
if (!ReadHeader(buf, &header)) {
LOG(ERROR) << "Invalid format: could not read header";
return NULL;
}
buf += header_bytes;
const uint32 filter_size = BitsToWords(header.m);
const uint32 filter_bytes = filter_size * sizeof(uint32);
VLOG(1) << "Reading bloom filter with size: " << filter_bytes << " bytes, "
<< "estimated insertions: " << header.n << " (k: " << header.k
<< ")";
if (size < header_bytes + filter_bytes) {
LOG(ERROR) << "Not enough bufsize: could not read filter";
return NULL;
}
ExistenceFilter* filter =
ExistenceFilter::CreateImmutableExietenceFilter(header.m,
header.n,
header.k);
char **ptr = NULL;
size_t n = 0;
size_t read = 0;
for (uint32 iter = 0; filter->rep_->GetMutableFragment(&iter, &ptr, &n);) {
*ptr = const_cast<char *>(buf); // TODO(taku): don't want to remove const
buf += n;
read += n;
}
if (read != filter_bytes) {
LOG(ERROR) << "Read " << read << " bytes instead of " << filter_bytes;
delete filter;
return NULL;
}
return filter;
}
} // namespace storage
} // namespace mozc