tesseract 4.1.1
Loading...
Searching...
No Matches
bitvector.cpp
Go to the documentation of this file.
1// Copyright 2011 Google Inc. All Rights Reserved.
2// Author: rays@google.com (Ray Smith)
4// File: bitvector.cpp
5// Description: Class replacement for BITVECTOR.
6// Author: Ray Smith
7// Created: Mon Jan 10 17:45:01 PST 2011
8//
9// (C) Copyright 2011, Google Inc.
10// Licensed under the Apache License, Version 2.0 (the "License");
11// you may not use this file except in compliance with the License.
12// You may obtain a copy of the License at
13// http://www.apache.org/licenses/LICENSE-2.0
14// Unless required by applicable law or agreed to in writing, software
15// distributed under the License is distributed on an "AS IS" BASIS,
16// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17// See the License for the specific language governing permissions and
18// limitations under the License.
19//
21
22#include "bitvector.h"
23#include <algorithm>
24#include <cstring>
25#include "helpers.h"
26#include "serialis.h" // for tesseract::Serialize
27
28namespace tesseract {
29
30// Fast lookup table to get the first least significant set bit in a byte.
31// For zero, the table has 255, but since it is a special case, most code
32// that uses this table will check for zero before looking up lsb_index_.
33const uint8_t BitVector::lsb_index_[256] = {
34 255, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
35 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
36 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
37 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
38 6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
39 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
40 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
41 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
42 7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
43 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
44 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
45 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
46 6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
47 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
48 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
49 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0
50};
51
52// Fast lookup table to get the residual bits after zeroing the first (lowest)
53// set bit in a byte.
54const uint8_t BitVector::lsb_eroded_[256] = {
55 0, 0, 0, 0x2, 0, 0x4, 0x4, 0x6,
56 0, 0x8, 0x8, 0x0a, 0x08, 0x0c, 0x0c, 0x0e,
57 0, 0x10, 0x10, 0x12, 0x10, 0x14, 0x14, 0x16,
58 0x10, 0x18, 0x18, 0x1a, 0x18, 0x1c, 0x1c, 0x1e,
59 0, 0x20, 0x20, 0x22, 0x20, 0x24, 0x24, 0x26,
60 0x20, 0x28, 0x28, 0x2a, 0x28, 0x2c, 0x2c, 0x2e,
61 0x20, 0x30, 0x30, 0x32, 0x30, 0x34, 0x34, 0x36,
62 0x30, 0x38, 0x38, 0x3a, 0x38, 0x3c, 0x3c, 0x3e,
63 0, 0x40, 0x40, 0x42, 0x40, 0x44, 0x44, 0x46,
64 0x40, 0x48, 0x48, 0x4a, 0x48, 0x4c, 0x4c, 0x4e,
65 0x40, 0x50, 0x50, 0x52, 0x50, 0x54, 0x54, 0x56,
66 0x50, 0x58, 0x58, 0x5a, 0x58, 0x5c, 0x5c, 0x5e,
67 0x40, 0x60, 0x60, 0x62, 0x60, 0x64, 0x64, 0x66,
68 0x60, 0x68, 0x68, 0x6a, 0x68, 0x6c, 0x6c, 0x6e,
69 0x60, 0x70, 0x70, 0x72, 0x70, 0x74, 0x74, 0x76,
70 0x70, 0x78, 0x78, 0x7a, 0x78, 0x7c, 0x7c, 0x7e,
71 0, 0x80, 0x80, 0x82, 0x80, 0x84, 0x84, 0x86,
72 0x80, 0x88, 0x88, 0x8a, 0x88, 0x8c, 0x8c, 0x8e,
73 0x80, 0x90, 0x90, 0x92, 0x90, 0x94, 0x94, 0x96,
74 0x90, 0x98, 0x98, 0x9a, 0x98, 0x9c, 0x9c, 0x9e,
75 0x80, 0xa0, 0xa0, 0xa2, 0xa0, 0xa4, 0xa4, 0xa6,
76 0xa0, 0xa8, 0xa8, 0xaa, 0xa8, 0xac, 0xac, 0xae,
77 0xa0, 0xb0, 0xb0, 0xb2, 0xb0, 0xb4, 0xb4, 0xb6,
78 0xb0, 0xb8, 0xb8, 0xba, 0xb8, 0xbc, 0xbc, 0xbe,
79 0x80, 0xc0, 0xc0, 0xc2, 0xc0, 0xc4, 0xc4, 0xc6,
80 0xc0, 0xc8, 0xc8, 0xca, 0xc8, 0xcc, 0xcc, 0xce,
81 0xc0, 0xd0, 0xd0, 0xd2, 0xd0, 0xd4, 0xd4, 0xd6,
82 0xd0, 0xd8, 0xd8, 0xda, 0xd8, 0xdc, 0xdc, 0xde,
83 0xc0, 0xe0, 0xe0, 0xe2, 0xe0, 0xe4, 0xe4, 0xe6,
84 0xe0, 0xe8, 0xe8, 0xea, 0xe8, 0xec, 0xec, 0xee,
85 0xe0, 0xf0, 0xf0, 0xf2, 0xf0, 0xf4, 0xf4, 0xf6,
86 0xf0, 0xf8, 0xf8, 0xfa, 0xf8, 0xfc, 0xfc, 0xfe
87};
88
89// Fast lookup table to give the number of set bits in a byte.
90const int BitVector::hamming_table_[256] = {
91 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
92 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
93 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
94 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
95 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
96 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
97 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
98 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
99 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
100 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
101 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
102 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
103 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
104 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
105 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
106 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
107};
108
109
110BitVector::BitVector() : bit_size_(0), array_(nullptr) {}
111
112BitVector::BitVector(int length) : bit_size_(length) {
113 array_ = new uint32_t[WordLength()];
114 SetAllFalse();
115}
116
117BitVector::BitVector(const BitVector& src) : bit_size_(src.bit_size_) {
118 if (src.bit_size_ > 0) {
119 array_ = new uint32_t[WordLength()];
120 memcpy(array_, src.array_, ByteLength());
121 } else {
122 array_ = nullptr;
123 }
124}
125
127 Alloc(src.bit_size_);
128 if (src.bit_size_ > 0) {
129 memcpy(array_, src.array_, ByteLength());
130 }
131 return *this;
132}
133
135 delete [] array_;
136}
137
138// Initializes the array to length * false.
139void BitVector::Init(int length) {
140 Alloc(length);
141 SetAllFalse();
142}
143
144// Writes to the given file. Returns false in case of error.
145bool BitVector::Serialize(FILE* fp) const {
146 if (!tesseract::Serialize(fp, &bit_size_)) return false;
147 int wordlen = WordLength();
148 return tesseract::Serialize(fp, &array_[0], wordlen);
149}
150
151// Reads from the given file. Returns false in case of error.
152// If swap is true, assumes a big/little-endian swap is needed.
153bool BitVector::DeSerialize(bool swap, FILE* fp) {
154 uint32_t new_bit_size;
155 if (!tesseract::DeSerialize(fp, &new_bit_size)) return false;
156 if (swap) {
157 ReverseN(&new_bit_size, sizeof(new_bit_size));
158 }
159 Alloc(new_bit_size);
160 int wordlen = WordLength();
161 if (!tesseract::DeSerialize(fp, &array_[0], wordlen)) return false;
162 if (swap) {
163 for (int i = 0; i < wordlen; ++i)
164 ReverseN(&array_[i], sizeof(array_[i]));
165 }
166 return true;
167}
168
170 memset(array_, 0, ByteLength());
171}
173 memset(array_, ~0, ByteLength());
174}
175
176// Returns the index of the next set bit after the given index.
177// Useful for quickly iterating through the set bits in a sparse vector.
178int BitVector::NextSetBit(int prev_bit) const {
179 // Move on to the next bit.
180 int next_bit = prev_bit + 1;
181 if (next_bit >= bit_size_) return -1;
182 // Check the remains of the word containing the next_bit first.
183 int next_word = WordIndex(next_bit);
184 int bit_index = next_word * kBitFactor;
185 int word_end = bit_index + kBitFactor;
186 uint32_t word = array_[next_word];
187 uint8_t byte = word & 0xff;
188 while (bit_index < word_end) {
189 if (bit_index + 8 > next_bit && byte != 0) {
190 while (bit_index + lsb_index_[byte] < next_bit && byte != 0)
191 byte = lsb_eroded_[byte];
192 if (byte != 0)
193 return bit_index + lsb_index_[byte];
194 }
195 word >>= 8;
196 bit_index += 8;
197 byte = word & 0xff;
198 }
199 // next_word didn't contain a 1, so find the next word with set bit.
200 ++next_word;
201 int wordlen = WordLength();
202 while (next_word < wordlen && (word = array_[next_word]) == 0) {
203 ++next_word;
204 bit_index += kBitFactor;
205 }
206 if (bit_index >= bit_size_) return -1;
207 // Find the first non-zero byte within the word.
208 while ((word & 0xff) == 0) {
209 word >>= 8;
210 bit_index += 8;
211 }
212 return bit_index + lsb_index_[word & 0xff];
213}
214
215// Returns the number of set bits in the vector.
217 int wordlen = WordLength();
218 int total_bits = 0;
219 for (int w = 0; w < wordlen; ++w) {
220 uint32_t word = array_[w];
221 for (int i = 0; i < 4; ++i) {
222 total_bits += hamming_table_[word & 0xff];
223 word >>= 8;
224 }
225 }
226 return total_bits;
227}
228
229// Logical in-place operations on whole bit vectors. Tries to do something
230// sensible if they aren't the same size, but they should be really.
232 int length = std::min(WordLength(), other.WordLength());
233 for (int w = 0; w < length; ++w)
234 array_[w] |= other.array_[w];
235}
237 int length = std::min(WordLength(), other.WordLength());
238 for (int w = 0; w < length; ++w)
239 array_[w] &= other.array_[w];
240 for (int w = WordLength() - 1; w >= length; --w)
241 array_[w] = 0;
242}
244 int length = std::min(WordLength(), other.WordLength());
245 for (int w = 0; w < length; ++w)
246 array_[w] ^= other.array_[w];
247}
248// Set subtraction *this = v1 - v2.
249void BitVector::SetSubtract(const BitVector& v1, const BitVector& v2) {
250 Alloc(v1.size());
251 int length = std::min(v1.WordLength(), v2.WordLength());
252 for (int w = 0; w < length; ++w)
253 array_[w] = v1.array_[w] ^ (v1.array_[w] & v2.array_[w]);
254 for (int w = WordLength() - 1; w >= length; --w)
255 array_[w] = v1.array_[w];
256}
257
258// Allocates memory for a vector of the given length.
259// Reallocates if the array is a different size, larger or smaller.
260void BitVector::Alloc(int length) {
261 int initial_wordlength = WordLength();
262 bit_size_ = length;
263 int new_wordlength = WordLength();
264 if (new_wordlength != initial_wordlength) {
265 delete [] array_;
266 array_ = new uint32_t[new_wordlength];
267 }
268}
269
270
271} // namespace tesseract.
void ReverseN(void *ptr, int num_bytes)
Definition: helpers.h:185
bool DeSerialize(FILE *fp, char *data, size_t n)
Definition: serialis.cpp:28
bool Serialize(FILE *fp, const char *data, size_t n)
Definition: serialis.cpp:60
void Init(int length)
Definition: bitvector.cpp:139
static const uint8_t lsb_index_[256]
Definition: bitvector.h:35
void operator^=(const BitVector &other)
Definition: bitvector.cpp:243
static const int hamming_table_[256]
Definition: bitvector.h:40
BitVector & operator=(const BitVector &src)
Definition: bitvector.cpp:126
void SetSubtract(const BitVector &v1, const BitVector &v2)
Definition: bitvector.cpp:249
void operator&=(const BitVector &other)
Definition: bitvector.cpp:236
int NumSetBits() const
Definition: bitvector.cpp:216
bool DeSerialize(bool swap, FILE *fp)
Definition: bitvector.cpp:153
int NextSetBit(int prev_bit) const
Definition: bitvector.cpp:178
void operator|=(const BitVector &other)
Definition: bitvector.cpp:231
int size() const
Definition: bitvector.h:53
bool Serialize(FILE *fp) const
Definition: bitvector.cpp:145
static const uint8_t lsb_eroded_[256]
Definition: bitvector.h:38