blob: abf8338d8acfba92380faa1abcc5aa501178a646 [file] [log] [blame]
// Copyright 2010-2015, Google Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
// * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "storage/louds/simple_succinct_bit_vector_index.h"
#include <algorithm>
#include <iterator>
#include <vector>
#include "base/iterator_adapter.h"
#include "base/logging.h"
#include "base/port.h"
namespace mozc {
namespace storage {
namespace louds {
namespace {
#ifdef __GNUC__
// TODO(hidehiko): Support XMM and 64-bits popcount for 64bits architectures.
inline int BitCount1(uint32 x) {
return __builtin_popcount(x);
}
#else
int BitCount1(uint32 x) {
x = ((x & 0xaaaaaaaa) >> 1) + (x & 0x55555555);
x = ((x & 0xcccccccc) >> 2) + (x & 0x33333333);
x = ((x >> 4) + x) & 0x0f0f0f0f;
x = (x >> 8) + x;
x = ((x >> 16) + x) & 0x3f;
return x;
}
#endif
inline int BitCount0(uint32 x) {
// Flip all bits, and count 1-bits.
return BitCount1(~x);
}
inline bool IsPowerOfTwo(int value) {
// value & -value is the well-known idiom to take the lowest 1-bit in
// value, so value & ~(value & -value) clears the lowest 1-bit in value.
// If the result is 0, value has only one 1-bit i.e. it is power of two.
return value != 0 && (value & ~(value & -value)) == 0;
}
// Returns 1-bits in the data[0] ... data[length - 1].
int Count1Bits(const uint32 *data, int length) {
int num_bits = 0;
for (; length > 0; ++data, --length) {
num_bits += BitCount1(*data);
}
return num_bits;
}
// Stores index (the camulative number of the 1-bits from begin of each chunk).
void InitIndex(
const uint8 *data, int length, int chunk_size, vector<int> *index) {
DCHECK_GE(chunk_size, 4);
DCHECK(IsPowerOfTwo(chunk_size)) << chunk_size;
DCHECK_EQ(length % 4, 0);
index->clear();
// Count the number of chunks with ceiling.
const int chunk_length = (length + chunk_size - 1) / chunk_size;
// Reserve the memory including a sentinel.
index->reserve(chunk_length + 1);
int num_bits = 0;
for (int remaining_num_words = length / 4; remaining_num_words > 0;
data += chunk_size, remaining_num_words -= chunk_size / 4) {
index->push_back(num_bits);
num_bits += Count1Bits(
reinterpret_cast<const uint32 *>(data),
min(chunk_size / 4, remaining_num_words));
}
index->push_back(num_bits);
CHECK_EQ(chunk_length + 1, index->size());
}
} // namespace
void SimpleSuccinctBitVectorIndex::Init(const uint8 *data, int length) {
data_ = data;
length_ = length;
InitIndex(data, length, chunk_size_, &index_);
}
void SimpleSuccinctBitVectorIndex::Reset() {
data_ = NULL;
length_ = 0;
index_.clear();
}
int SimpleSuccinctBitVectorIndex::Rank1(int n) const {
// Look up pre-computed 1-bits for the preceding chunks.
const int num_chunks = n / (chunk_size_ * 8);
int result = index_[n / (chunk_size_ * 8)];
// Count 1-bits for remaining "words".
result += Count1Bits(
reinterpret_cast<const uint32 *>(data_ + num_chunks * chunk_size_),
(n / 8 - num_chunks * chunk_size_) / 4);
// Count 1-bits for remaining "bits".
if (n % 32 > 0) {
const int index = n / 32;
const int shift = 32 - n % 32;
result +=
BitCount1(reinterpret_cast<const uint32 *>(data_)[index] << shift);
}
return result;
}
namespace {
// Simple adapter to convert from 1-bit index to 0-bit index.
class ZeroBitAdapter : public AdapterBase<int> {
public:
// Needs to be default constructive to create invalid iterator.
ZeroBitAdapter() {}
ZeroBitAdapter(const vector<int>* index, int chunk_size)
: index_(index), chunk_size_(chunk_size) {
}
template<typename Iter>
value_type operator()(Iter iter) const {
// The number of 0-bits
// = (total num bits) - (1-bits)
// = (chunk_size [bytes] * 8 [bits/byte] * (iter's position) - (1-bits)
return chunk_size_ * 8 * distance(index_->begin(), iter) - *iter;
}
private:
const vector<int> *index_;
int chunk_size_;
};
} // namespace
int SimpleSuccinctBitVectorIndex::Select0(int n) const {
DCHECK_GT(n, 0);
// Binary search on chunks.
ZeroBitAdapter adapter(&index_, chunk_size_);
const vector<int>::const_iterator iter = lower_bound(
MakeIteratorAdapter(index_.begin(), adapter),
MakeIteratorAdapter(index_.end(), adapter),
n).base();
const int chunk_index = distance(index_.begin(), iter) - 1;
DCHECK_GE(chunk_index, 0);
n -= chunk_size_ * 8 * chunk_index - index_[chunk_index];
// Linear search on remaining "words"
const uint32 *ptr =
reinterpret_cast<const uint32 *>(data_) + chunk_index * chunk_size_ / 4;
while (true) {
const int bit_count = BitCount0(*ptr);
if (bit_count >= n) {
break;
}
n -= bit_count;
++ptr;
}
int index = (ptr - reinterpret_cast<const uint32 *>(data_)) * 32;
for (uint32 word = ~(*ptr); n > 0; word >>= 1, ++index) {
n -= (word & 1);
}
// Index points to the "next bit" of the target one.
// Thus, subtract one to adjust.
return index - 1;
}
int SimpleSuccinctBitVectorIndex::Select1(int n) const {
DCHECK_GT(n, 0);
// Binary search on chunks.
const vector<int>::const_iterator iter =
lower_bound(index_.begin(), index_.end(), n);
const int chunk_index = distance(index_.begin(), iter) - 1;
DCHECK_GE(chunk_index, 0);
n -= index_[chunk_index];
// Linear search on remaining "words"
const uint32 *ptr =
reinterpret_cast<const uint32 *>(data_) + chunk_index * chunk_size_ / 4;
while (true) {
const int bit_count = BitCount1(*ptr);
if (bit_count >= n) {
break;
}
n -= bit_count;
++ptr;
}
int index = (ptr - reinterpret_cast<const uint32 *>(data_)) * 32;
for (uint32 word = *ptr; n > 0; word >>= 1, ++index) {
n -= (word & 1);
}
// Index points to the "next bit" of the target one.
// Thus, subtract one to adjust.
return index - 1;
}
} // namespace louds
} // namespace storage
} // namespace mozc