Why is std::vector<bool> faster?

Question

As I was implementing the Sieve of Eratosthenes I ran into an issue with std::vector<bool> : there is no access to the raw data.

So I decided to use a custom minimalistic implementation where I would have access to the data pointer.

#ifndef LIB_BITS_T_H
#define LIB_BITS_T_H

#include <algorithm>
template <typename B>

class bits_t{

public:

    typedef B block_t;
    static const size_t block_size = sizeof(block_t) * 8;

    block_t* data;
    size_t size;
    size_t blocks;

    class bit_ref{
    public:
        block_t* const block;
        const block_t mask;

        bit_ref(block_t& block, const block_t mask) noexcept : block(&block), mask(mask){}

        inline void operator=(bool v) const noexcept{
            if(v) *block |= mask;
            else  *block &= ~mask;
        }

        inline operator bool() const noexcept{
            return (bool)(*block & mask);
        }
    };



    bits_t() noexcept : data(nullptr){}

    void resize(const size_t n, const bool v) noexcept{
        block_t fill = v ? ~block_t(0) : block_t(0);
        size = n;
        blocks = (n + block_size - 1) / block_size;
        data = new block_t[blocks];
        std::fill(data, data + blocks, fill);
    }

    inline block_t& block_at_index(const size_t i) const noexcept{
        return data[i / block_size];
    }

    inline size_t index_in_block(const size_t i) const noexcept{
        return i % block_size;
    }

    inline bit_ref operator[](const size_t i) noexcept{
        return bit_ref(block_at_index(i), block_t(1) << index_in_block(i));
    }

    ~bits_t(){
        delete[] data;
    }

};

#endif // LIB_BITS_T_H

The code is nearly the same than the one in /usr/include/c++/4.7/bits/stl_bvector.h but is slower.

I tried an optimization,

#ifndef LIB_BITS_T_H
#define LIB_BITS_T_H

#include <algorithm>
template <typename B>

class bits_t{

const B mask[64] = {
    0b0000000000000000000000000000000000000000000000000000000000000001,
    0b0000000000000000000000000000000000000000000000000000000000000010,
    0b0000000000000000000000000000000000000000000000000000000000000100,
    0b0000000000000000000000000000000000000000000000000000000000001000,
    0b0000000000000000000000000000000000000000000000000000000000010000,
    0b0000000000000000000000000000000000000000000000000000000000100000,
    0b0000000000000000000000000000000000000000000000000000000001000000,
    0b0000000000000000000000000000000000000000000000000000000010000000,
    0b0000000000000000000000000000000000000000000000000000000100000000,
    0b0000000000000000000000000000000000000000000000000000001000000000,
    0b0000000000000000000000000000000000000000000000000000010000000000,
    0b0000000000000000000000000000000000000000000000000000100000000000,
    0b0000000000000000000000000000000000000000000000000001000000000000,
    0b0000000000000000000000000000000000000000000000000010000000000000,
    0b0000000000000000000000000000000000000000000000000100000000000000,
    0b0000000000000000000000000000000000000000000000001000000000000000,
    0b0000000000000000000000000000000000000000000000010000000000000000,
    0b0000000000000000000000000000000000000000000000100000000000000000,
    0b0000000000000000000000000000000000000000000001000000000000000000,
    0b0000000000000000000000000000000000000000000010000000000000000000,
    0b0000000000000000000000000000000000000000000100000000000000000000,
    0b0000000000000000000000000000000000000000001000000000000000000000,
    0b0000000000000000000000000000000000000000010000000000000000000000,
    0b0000000000000000000000000000000000000000100000000000000000000000,
    0b0000000000000000000000000000000000000001000000000000000000000000,
    0b0000000000000000000000000000000000000010000000000000000000000000,
    0b0000000000000000000000000000000000000100000000000000000000000000,
    0b0000000000000000000000000000000000001000000000000000000000000000,
    0b0000000000000000000000000000000000010000000000000000000000000000,
    0b0000000000000000000000000000000000100000000000000000000000000000,
    0b0000000000000000000000000000000001000000000000000000000000000000,
    0b0000000000000000000000000000000010000000000000000000000000000000,
    0b0000000000000000000000000000000100000000000000000000000000000000,
    0b0000000000000000000000000000001000000000000000000000000000000000,
    0b0000000000000000000000000000010000000000000000000000000000000000,
    0b0000000000000000000000000000100000000000000000000000000000000000,
    0b0000000000000000000000000001000000000000000000000000000000000000,
    0b0000000000000000000000000010000000000000000000000000000000000000,
    0b0000000000000000000000000100000000000000000000000000000000000000,
    0b0000000000000000000000001000000000000000000000000000000000000000,
    0b0000000000000000000000010000000000000000000000000000000000000000,
    0b0000000000000000000000100000000000000000000000000000000000000000,
    0b0000000000000000000001000000000000000000000000000000000000000000,
    0b0000000000000000000010000000000000000000000000000000000000000000,
    0b0000000000000000000100000000000000000000000000000000000000000000,
    0b0000000000000000001000000000000000000000000000000000000000000000,
    0b0000000000000000010000000000000000000000000000000000000000000000,
    0b0000000000000000100000000000000000000000000000000000000000000000,
    0b0000000000000001000000000000000000000000000000000000000000000000,
    0b0000000000000010000000000000000000000000000000000000000000000000,
    0b0000000000000100000000000000000000000000000000000000000000000000,
    0b0000000000001000000000000000000000000000000000000000000000000000,
    0b0000000000010000000000000000000000000000000000000000000000000000,
    0b0000000000100000000000000000000000000000000000000000000000000000,
    0b0000000001000000000000000000000000000000000000000000000000000000,
    0b0000000010000000000000000000000000000000000000000000000000000000,
    0b0000000100000000000000000000000000000000000000000000000000000000,
    0b0000001000000000000000000000000000000000000000000000000000000000,
    0b0000010000000000000000000000000000000000000000000000000000000000,
    0b0000100000000000000000000000000000000000000000000000000000000000,
    0b0001000000000000000000000000000000000000000000000000000000000000,
    0b0010000000000000000000000000000000000000000000000000000000000000,
    0b0100000000000000000000000000000000000000000000000000000000000000,
    0b1000000000000000000000000000000000000000000000000000000000000000
};

public:

    typedef B block_t;
    static const size_t block_size = sizeof(block_t) * 8;

    block_t* data;
    size_t size;
    size_t blocks;

    class bit_ref{
    public:
        block_t* const block;
        const block_t mask;

        bit_ref(block_t& block, const block_t mask) noexcept : block(&block), mask(mask){}

        inline void operator=(bool v) const noexcept{
            if(v) *block |= mask;
            else  *block &= ~mask;
        }

        inline operator bool() const noexcept{
            return (bool)(*block & mask);
        }
    };



    bits_t() noexcept : data(nullptr){}

    void resize(const size_t n, const bool v) noexcept{
        block_t fill = v ? ~block_t(0) : block_t(0);
        size = n;
        blocks = (n + block_size - 1) / block_size;
        data = new block_t[blocks];
        std::fill(data, data + blocks, fill);
    }

    inline block_t& block_at_index(const size_t i) const noexcept{
        return data[i / block_size];
    }

    inline size_t index_in_block(const size_t i) const noexcept{
        return i % block_size;
    }

    inline bit_ref operator[](const size_t i) noexcept{
        return bit_ref(block_at_index(i), mask[index_in_block(i)]);
    }

    ~bits_t(){
        delete[] data;
    }

};

#endif // LIB_BITS_T_H

(Compiling with g++4.7 -O3)

Eratosthenes sieve algorithm (33.333.333 bits)

std::vector<bool> 19.1s

bits_t<size_t> 19.9s

bits_t<size_t> (with lookup table) 19.7s

ctor + resize(33.333.333 bits) + dtor

std::vector<bool> 120ms

bits_t<size_t> 150ms

QUESTION : Where does the slowdown come from?

Answer 1

Outside of all the problems as pointed out by some other users, your resize is allocating more memory each time the current block limit is reached to add ONE block. The std::vector will double the size of the buffer (so if you already had 16 blocks, now you have 32 blocks). In other words, they will do less new than you.

This being said, you do not do the necessary delete & copy and that could have a "positive" impact in your version... ("positive" impact speed wise, it is not positive that you do not delete the old data, nor copy it in your new buffer.)

Also, the std::vector will properly enlarge the buffer and thus copy data that is likely already in your CPU cache. With your version, that cache is lost since you just ignore the old buffer on each resize().

Also when a class handles a memory buffer it is customary to implement the copy and assignment operators, for some reasons... and you could look into using a shared_ptr<>() too. The delete is then hidden and the class is a template so it is very fast (it does not add any code that you would not already have in your own version.)

=== Update

There is one other thing. You're operator [] implementation:

inline bit_ref operator[](const size_t i) noexcept{
    return bit_ref(block_at_index(i), mask[index_in_block(i)]);
}

(side note: the inline is not required since the fact that you write the code within the class already means you okayed the inline capability already.)

You only offer a non-const version which "is slow" because it creates a sub-class. You should try implementing a const version that returns bool and see whether that accounts for the ~3% difference you see.

bool operator[](const size_t i) const noexcept
{
    return (block_at_index(i) & mask[index_in_block(i)]) != 0;
}

Also, using a mask[] array can also slow down things. (1LL << (index & 0x3F)) should be faster (2 CPU instructions with 0 memory access).

Answer 2

Apparently, the wrapping of i % block_size in a function was the culprit

inline size_t index_in_block ( const size_t i ) const noexcept {
    return i % block_size;
}

inline bit_ref operator[] ( const size_t i ) noexcept {
    return bit_ref( block_at_index( i ), block_t( 1 ) << index_in_block( i ) );
}

so replacing the above code with

inline bit_ref operator[] ( const size_t i ) noexcept {
    return bit_ref( block_at_index( i ), block_t( 1 ) << ( i % block_size ) );
}

solves the issue. However, I still don't know why it is. My best guess is that I didn't get the signature of index_in_block right and that the optimizer is thus not able to inline this function in a similar way to the manual inlining way.

Here is the new code.

#ifndef LIB_BITS_2_T_H
#define LIB_BITS_2_T_H

#include <algorithm>

template <typename B>

class bits_2_t {

public:

    typedef B block_t;
    static const int block_size = sizeof( block_t ) * __CHAR_BIT__;


private:

    block_t* _data;
    size_t _size;
    size_t _blocks;


public:

    class bit_ref {

    public:

        block_t* const block;
        const block_t mask;


        bit_ref ( block_t& block, const block_t mask) noexcept
        : block( &block ), mask( mask ) {}


        inline bool operator= ( const bool v ) const noexcept {

            if ( v ) *block |= mask;
            else     *block &= ~mask;

            return v;

        }

        inline operator bool() const noexcept {
            return (bool)( *block & mask );
        }


    };


    bits_2_t () noexcept : _data( nullptr ), _size( 0 ), _blocks( 0 ) {}

    bits_2_t ( const size_t n ) noexcept : _data( nullptr ), _size( n ) {

        _blocks = number_of_blocks_needed( n );
        _data = new block_t[_blocks];

        const block_t fill( 0 );
        std::fill( _data, _data + _blocks, fill );

    }

    bits_2_t ( const size_t n, const bool v ) noexcept : _data( nullptr ), _size( n ) {

        _blocks = number_of_blocks_needed( n );
        _data = new block_t[_blocks];

        const block_t fill = v ? ~block_t( 0 ) : block_t( 0 );
        std::fill( _data, _data + _blocks, fill );

    }

    void resize ( const size_t n ) noexcept {
        resize( n, false );
    }

    void resize ( const size_t n, const bool v ) noexcept {

        const size_t tmpblocks = number_of_blocks_needed( n );
        const size_t copysize = std::min( _blocks, tmpblocks );

        block_t* tmpdata = new block_t[tmpblocks];
        std::copy( _data, _data + copysize, tmpdata );

        const block_t fill = v ? ~block_t( 0 ) : block_t( 0 );
        std::fill( tmpdata + copysize, tmpdata + tmpblocks, fill );

        delete[] _data;

        _data = tmpdata;
        _blocks = tmpblocks;
        _size = n;

    }

    inline size_t number_of_blocks_needed ( const size_t n ) const noexcept {
        return ( n + block_size - 1 ) / block_size;
    }

    inline block_t& block_at_index ( const size_t i ) const noexcept {
        return _data[i / block_size];
    }

    inline bit_ref operator[] ( const size_t i ) noexcept {
        return bit_ref( block_at_index( i ), block_t( 1 ) << ( i % block_size ) );
    }

    inline bool operator[] ( const size_t i ) const noexcept {
        return (bool)( block_at_index( i ) & ( block_t( 1 ) << ( i % block_size ) ) );
    }

    inline block_t* data () {
        return _data;
    }

    inline const block_t* data () const {
        return _data;
    }

    inline size_t size () const {
        return _size;
    }

    void clear () noexcept {

        delete[] _data;

        _size = 0;
        _blocks = 0;
        _data = nullptr;

    }

    ~bits_2_t () {
        clear();
    }


};

#endif // LIB_BITS_2_T_H

Here are the results for this new code on my amd64 machine for primes up to 1.000.000.000 (best of 3 runs, real time).

Sieve of Eratosthenes with 1 memory unit per number ( not skipping multiples of 2 ).

bits_t<uint8_t>

real 0m23.614s user 0m23.493s sys 0m0.092s

bits_t<uint16_t>

real 0m24.399s user 0m24.294s sys 0m0.084s

bits_t<uint32_t>

real 0m23.501s user 0m23.372s sys 0m0.108s <-- best

bits_t<uint64_t>

real 0m24.393s user 0m24.304s sys 0m0.068s

std::vector<bool>

real 0m24.362s user 0m24.276s sys 0m0.056s

std::vector<uint8_t>

real 0m38.303s user 0m37.570s sys 0m0.683s

Here is the code of the sieve (where (...) should be replaced by the bit array of your choice).

#include <iostream>

typedef (...) array_t;

int main ( int argc, char const *argv[] ) {

    if ( argc != 2 ) {
        std::cout << "#0 missing" << std::endl;
        return 1;
    }

    const size_t count = std::stoull( argv[1] );
    array_t prime( count, true );
    prime[0] = prime[1] = false;


    for ( size_t k = 2 ; k * k < count ; ++k ) {

        if ( prime[k] ) {

            for ( size_t i = k * k ; i < count ; i += k ) {
                prime[i] = false;
            }

        }

    }

    return 0;
}

Why is std::vector<bool> faster?

Question

2 answers

solution1
3 2013-11-23 03:21:58

solution2
0 ACCPTED 2014-10-21 06:41:23

Why is std::vector<bool> faster?

Question

2 answers

solution1 3 2013-11-23 03:21:58

solution2 0 ACCPTED 2014-10-21 06:41:23

solution1
3 2013-11-23 03:21:58

solution2
0 ACCPTED 2014-10-21 06:41:23