Why are some of the bits in my Postgres VarBit (C-language function) being zeroed for specific input?

Question

I am attempting to write a C-language function for Postgres that takes an integer array in and converts the integers to a bitstring representing the morton encoding (z-curve encoding) of the integers in the array. Two parameters are passed in: first the actual array, and second, the number of bits to use from each integer, starting with the least significant bit (max possible is 31 since we must use unsigned integers). I am using an implementation (see comment in code) that I have tested outside of Postgres and I know works.

The code (without some obviously necessary checks for negativeness in integers, size bounds, etc.):

#include <postgres.h>
#include <utils/array.h>
#include <utils/varbit.h>
#include <fmgr.h>
#include <limits.h>

#ifdef PG_MODULE_MAGIC
PG_MODULE_MAGIC;
#endif

/*
 * Location of interleaved bit number (bit * ndim + dim) inside a byte array:
 * MC_SEGMENT yields the byte index, MC_SEGMENTBIT the bit index within that
 * byte.  Every argument is parenthesized so expression arguments expand with
 * the intended precedence.
 */
#define MC_SEGMENT(bit, dim, ndim)  ((((bit)*(ndim)) + (dim)) / CHAR_BIT)
#define MC_SEGMENTBIT(bit, dim, ndim)   ((((bit)*(ndim)) + (dim)) % CHAR_BIT)

/*
 * pg_morton_encode_integer -- version originally posted in the question.
 *
 * SQL-callable function that reads an integer array (argument 0) and a
 * smallint bit count (argument 1) and returns a VarBit holding
 * n * num_bits_per bits, where n is the extent of the array's first
 * dimension.  Bit `bit` of element `dim` is OR-ed into the result at byte
 * MC_SEGMENT(bit, dim, n), bit position MC_SEGMENTBIT(bit, dim, n).
 *
 * NOTE(review): the question reports that this version returns wrong bit
 * strings (see the example queries in the question text); the code is kept
 * exactly as posted.
 */
Datum pg_morton_encode_integer(PG_FUNCTION_ARGS);
PG_FUNCTION_INFO_V1(pg_morton_encode_integer);
Datum
pg_morton_encode_integer(PG_FUNCTION_ARGS)
{
    ArrayType   *input_vector;
    int32       *toencode;
    int     i, n, rlen, len;
    int16       num_bits_per;
    VarBit      *result;
    int     mask;
    int     ipad;
    char        tempvalue;

    int     bit;
    int     dim;
    int     num_bits;
    int     num_bytes;
    /* NOTE(review): i, mask and ipad are declared but never used below. */

    /* Ensure first argument is not null. */
    if (PG_ARGISNULL(0))
    {
        ereport(ERROR, (errmsg("First operand must be non-null")));
    }
    /* Ensure second argument is not null. */
    if (PG_ARGISNULL(1))
    {
        ereport(ERROR, (errmsg("Second operand must be non-null")));
    }

    /* Get input array and its length (first argument). */
    input_vector    = PG_GETARG_ARRAYTYPE_P(0);
    /* Only the extent of the first dimension is read. */
    n       = (ARR_DIMS(input_vector))[0];
    /* Debug output of the element count. */
ereport(NOTICE, (errmsg("n=%d", n) ));
    /* The array payload is accessed as a flat int32 buffer. */
    toencode    = (int32 *)ARR_DATA_PTR(input_vector);

    /* Get number of bits per dimension (second argument). */
    num_bits_per    = PG_GETARG_INT16(1);

    /* Allocate the zero-filled VarBit and record its size and bit length. */
    len     = n*num_bits_per;
    rlen        = VARBITTOTALLEN(len);
    result      = palloc0(rlen);
    SET_VARSIZE(result, rlen);
    VARBITLEN(result)
                = len;

    /* Perform the morton encoding. */
    num_bits        = num_bits_per*n;
    /* Byte count rounded up; computed here but not referenced again in this version. */
    num_bytes       = (num_bits/8) + (num_bits % 8 == 0 ? 0 : 1);

    /*** TESTED THIS CODE, IT WORKS, BUT NOT IN POSTGRES. :( ***/
    for (bit = 0; bit < num_bits_per; ++bit)
    for (dim = 0; dim < n; ++dim)
    {
        /* Read the target byte, OR in one bit, and write the byte back. */
        tempvalue
            = VARBITS(result)[MC_SEGMENT(bit, dim, n)];
        /*
         * Isolate bit `bit` of element `dim` as 0 or 1, then shift it to the
         * slot inside the byte.  The first iteration (bit 0, dim 0) targets
         * byte 0 with a shift of 0.
         */
        tempvalue
            |= (char)
             (((toencode[dim] & (1 << bit)) >> bit)
              << MC_SEGMENTBIT(bit, dim, n));
        VARBITS(result)[MC_SEGMENT(bit, dim, n)]
            = tempvalue;
        /* Debug output: byte index, bit index, extracted bit, resulting byte. */
ereport(NOTICE, (errmsg("[%d,%d]=%d:%x", MC_SEGMENT(bit, dim, n), MC_SEGMENTBIT(bit,dim,n), (((toencode[dim] & (1 << bit)) >> bit)), (int)tempvalue)) );
    }
    /*** END OF TESTED CODE. ***/
    PG_RETURN_VARBIT_P(result);
}

Makefile:

# Build the pgmorton module from pgmorton.o using the PGXS makefile whose
# path is reported by `pg_config --pgxs`.
MODULE_big = pgmorton
OBJS = pgmorton.o
PGXS := $(shell pg_config --pgxs)
include $(PGXS)

Make instructions (in directory where code and Makefile reside):

make install

Once installed, run this from a postgres shell to make the function useable:

-- (Re)create the SQL-level function backed by the C symbol
-- pg_morton_encode_integer in the 'pgmorton' library.
-- STRICT: per the PostgreSQL CREATE FUNCTION documentation, the function is
-- not called (and NULL is returned) when any argument is NULL.
DROP FUNCTION IF EXISTS pg_morton_encode_integer(integer[], smallint);
CREATE FUNCTION pg_morton_encode_integer(integer[], smallint) RETURNS varbit
  AS 'pgmorton', 'pg_morton_encode_integer'
  LANGUAGE C STRICT;

Some examples producing obviously erroneous output:

SELECT pg_morton_encode_integer('{2147483647, 2147483647, 2147483647}'::integer[], 31::smallint); -- gives '111111111111111111111111111111111111111111111111111111111111111111111111111111111111111100011' instead of '111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111'
SELECT pg_morton_encode_integer('{2147483647, 2147483647}'::integer[], 2::smallint); -- gives '0000' instead of '1111'

I am doing all of this on OS X Yosemite with Postgres.app version 9.4.2.0, with default clang compiler.

EDIT: As noted in a comment to my question below, the bytes needed to be written in a different order because Postgres normalizes bitstring byte-order by making the most significant byte first. I've also additionally discovered that bitstrings of length n bits where n is not a multiple of 8 (CHAR_BIT, the number of bits in a byte) lose the least significant bits from the least significant byte. For example, if my bitstring is 14 bits, then 2 bytes (16 bits) are needed to store the bitstring, with byte zero being most significant and byte 1 being least. We lose bits 0 and 1 (the two least significant bits) from the least significant byte, namely byte 1. (The following diagram has B for bits used and X for bits truncated to make the above clear).

byte 0            byte 1
=================================
7 6 5 4 3 2 1 0 | 7 6 5 4 3 2 1 0
B B B B B B B B | B B B B B B X X

It should also be noted that truncated bits can only have zeroes written to them, otherwise you risk Postgres crashing. :)

Here is the updated code fixing the first problem (the second regarding bit truncation I still haven't solved), noting the lines that changed with in-code comments below:

#include <postgres.h>
#include <utils/array.h>
#include <utils/varbit.h>
#include <fmgr.h>
#include <limits.h>

#ifdef PG_MODULE_MAGIC
PG_MODULE_MAGIC;
#endif

/*
 * Location of interleaved bit number (bit * ndim + dim) inside a byte array:
 * MC_SEGMENT yields the byte index, MC_SEGMENTBIT the bit index within that
 * byte.  Every argument is parenthesized so expression arguments expand with
 * the intended precedence.
 */
#define MC_SEGMENT(bit, dim, ndim)  ((((bit)*(ndim)) + (dim)) / CHAR_BIT)
#define MC_SEGMENTBIT(bit, dim, ndim)   ((((bit)*(ndim)) + (dim)) % CHAR_BIT)

/*
 * pg_morton_encode_integer -- second version posted in the question.
 *
 * Same structure as the first posted version: reads an integer array
 * (argument 0) and a smallint bit count (argument 1) and returns a VarBit of
 * n * num_bits_per bits.  The only code difference is the byte index, which
 * is mirrored as num_bytes - 1 - MC_SEGMENT(bit, dim, n) so that interleaved
 * bit 0 lands in the last byte instead of the first.
 *
 * NOTE(review): the question text says the loss of low-order bits for
 * lengths that are not a multiple of 8 is still unsolved in this version;
 * the code is kept exactly as posted.
 */
Datum pg_morton_encode_integer(PG_FUNCTION_ARGS);
PG_FUNCTION_INFO_V1(pg_morton_encode_integer);
Datum
pg_morton_encode_integer(PG_FUNCTION_ARGS)
{
    ArrayType   *input_vector;
    int32       *toencode;
    int     i, n, rlen, len;
    int16       num_bits_per;
    VarBit      *result;
    int     mask;
    int     ipad;
    char        tempvalue;

    int     bit;
    int     dim;
    int     num_bits;
    int     num_bytes;
    /* NOTE(review): i, mask and ipad are declared but never used below. */

    /* Ensure first argument is not null. */
    if (PG_ARGISNULL(0))
    {
        ereport(ERROR, (errmsg("First operand must be non-null")));
    }
    /* Ensure second argument is not null. */
    if (PG_ARGISNULL(1))
    {
        ereport(ERROR, (errmsg("Second operand must be non-null")));
    }

    /* Get input array and its length (first argument). */
    input_vector    = PG_GETARG_ARRAYTYPE_P(0);
    /* Only the extent of the first dimension is read. */
    n       = (ARR_DIMS(input_vector))[0];
    /* Debug output of the element count. */
ereport(NOTICE, (errmsg("n=%d", n) ));
    /* The array payload is accessed as a flat int32 buffer. */
    toencode    = (int32 *)ARR_DATA_PTR(input_vector);

    /* Get number of bits per dimension (second argument). */
    num_bits_per    = PG_GETARG_INT16(1);

    /* Allocate the zero-filled VarBit and record its size and bit length. */
    len     = n*num_bits_per;
    rlen        = VARBITTOTALLEN(len);
    result      = palloc0(rlen);
    SET_VARSIZE(result, rlen);
    VARBITLEN(result)
                = len;

    /* Perform the morton encoding. */
    num_bits        = num_bits_per*n;
    /* Byte count rounded up; used below to mirror the byte index. */
    num_bytes       = (num_bits/8) + (num_bits % 8 == 0 ? 0 : 1);

    /*** TESTED THIS CODE, IT WORKS, BUT NOT IN POSTGRES. :( ***/
    for (bit = 0; bit < num_bits_per; ++bit)
    for (dim = 0; dim < n; ++dim)
    {
        // CHANGE!
        /* Byte index counted from the end of the data area. */
        tempvalue
            = VARBITS(result)[num_bytes - 1 - MC_SEGMENT(bit, dim, n)];
        /* Isolate bit `bit` of element `dim` as 0 or 1 and shift it into place. */
        tempvalue
            |= (char)
             (((toencode[dim] & (1 << bit)) >> bit)
              << MC_SEGMENTBIT(bit, dim, n));
        // CHANGE!
        VARBITS(result)[num_bytes - 1 - MC_SEGMENT(bit, dim, n)]
            = tempvalue;
        /* Debug output: unmirrored byte index, bit index, extracted bit, resulting byte. */
ereport(NOTICE, (errmsg("[%d,%d]=%d:%x", MC_SEGMENT(bit, dim, n), MC_SEGMENTBIT(bit,dim,n), (((toencode[dim] & (1 << bit)) >> bit)), (int)tempvalue)) );
    }
    /*** END OF TESTED CODE. ***/
    PG_RETURN_VARBIT_P(result);
}

Answer 1

The underlying varbit representation fills out the high-order bits first, so for example, 1111 would be stored as the byte 11110000. Your implementation starts by filling the low-order bits in the last byte (since your shift distances are all zero in the first iteration), and these bits are out of bounds for any non-multiple of 8.

It's probably simpler, at least conceptually, to work through the output bits in order, though the offset calculations get a bit less straightforward. The implementation below seems to work. Note that I'm traversing the input array backwards; all of the sources I saw draw the first bit from the last coordinate, but I'm not sure if this was the intention in your original code.

Datum
pg_morton_encode_integer(PG_FUNCTION_ARGS)
{
    ArrayType   *input_vector;
    int32       *toencode;
    int16       num_bits_per;
    VarBit      *result;
    int         n, rlen, num_bits;
    int         bit, dim;
    int         in_bitpos, out_bitnum, out_bytenum, out_bitpos;
    bits8       in_bitval;

    /* Get input array and its length (first argument). */
    input_vector = PG_GETARG_ARRAYTYPE_P(0);
    n            = (ARR_DIMS(input_vector))[0];
    toencode     = (int32 *)ARR_DATA_PTR(input_vector);

    /* Get number of bits per dimensions (second argument). */
    num_bits_per = PG_GETARG_INT16(1);

    /* Allocated the VarBit. */
    num_bits = n * num_bits_per;
    rlen     = VARBITTOTALLEN(num_bits);
    result   = palloc0(rlen);
    SET_VARSIZE(result, rlen);
    VARBITLEN(result) = num_bits;

    /* Perform the morton encoding. */
    for (bit = 0; bit < num_bits_per; ++bit)
    {
        in_bitpos = num_bits_per - 1 - bit;
        for (dim = 0; dim < n; ++dim)
        {
            in_bitval = (toencode[n - dim - 1] & (1 << in_bitpos)) >> in_bitpos;
            out_bitnum = bit * n + dim;
            out_bytenum = out_bitnum / CHAR_BIT;
            out_bitpos = CHAR_BIT - 1 - (out_bitnum % CHAR_BIT);

            VARBITS(result)[out_bytenum] |= in_bitval << out_bitpos;
        }
    }

    PG_RETURN_VARBIT_P(result);
}

Here's a simple test case which reproduces the table on the wiki page.

 -- For each y in 0..7, encode [y] alone and [x, y] for x = 0..7 with 3 bits
 -- per value; each x_NNN column is named after the binary form of x.
 SELECT
   pg_morton_encode_integer(ARRAY[y],   3::INT2) AS y,
   pg_morton_encode_integer(ARRAY[0,y], 3::INT2) AS x_000,
   pg_morton_encode_integer(ARRAY[1,y], 3::INT2) AS x_001,
   pg_morton_encode_integer(ARRAY[2,y], 3::INT2) AS x_010,
   pg_morton_encode_integer(ARRAY[3,y], 3::INT2) AS x_011,
   pg_morton_encode_integer(ARRAY[4,y], 3::INT2) AS x_100,
   pg_morton_encode_integer(ARRAY[5,y], 3::INT2) AS x_101,
   pg_morton_encode_integer(ARRAY[6,y], 3::INT2) AS x_110,
   pg_morton_encode_integer(ARRAY[7,y], 3::INT2) AS x_111
 FROM generate_series(0,7) s(y)

Answer 2

The simplest solution, albeit not necessarily the most efficient, is to perform a substring operation on an oversized bitstring that we initially write the bits into. This will essentially "shift" the bits over for you automatically into a new bitstring so you don't write non-zero values into truncated bits.

Unfortunately, Postgres's underlying substring function for bitstrings is declared static (see the function bitsubstring in: https://github.com/postgres/postgres/blob/master/src/backend/utils/adt/varbit.c ). The quick fix is to copy the entire function into your code. The working solution is then the following:

#include <postgres.h>
#include <utils/array.h>
#include <utils/varbit.h>
#include <fmgr.h>
#include <limits.h>

#ifdef PG_MODULE_MAGIC
PG_MODULE_MAGIC;
#endif

// Copy bitstring substring function here.

/*
 * MC_SEGMENT / MC_SEGMENTBIT: byte index and bit-within-byte index of
 * interleaved bit number (bit * ndim + dim).
 * ROUNDUP_MULT_IS_POW_2: round toRound up to the next multiple of mult,
 * where mult must be a power of two.
 * All arguments and each whole expansion are parenthesized so the macros can
 * be used inside larger expressions.
 */
#define MC_SEGMENT(bit, dim, ndim)  ((((bit)*(ndim)) + (dim)) / CHAR_BIT)
#define MC_SEGMENTBIT(bit, dim, ndim)   ((((bit)*(ndim)) + (dim)) % CHAR_BIT)
#define ROUNDUP_MULT_IS_POW_2(toRound, mult) (((toRound) + (mult) - 1) & ~((mult) - 1))

Datum pg_morton_encode_integer(PG_FUNCTION_ARGS);
PG_FUNCTION_INFO_V1(pg_morton_encode_integer);
Datum
pg_morton_encode_integer(PG_FUNCTION_ARGS)
{
    ArrayType   *input_vector;
    int32       *toencode;
    int     i, n, rlen, len;
    int16       num_bits_per;
    VarBit  *temp,  *result;
    int     mask;
    int     ipad;
    char        tempvalue;

    int     bit;
    int     dim;
    int     num_bits;
    int     num_bytes;

    /* Ensure first argument is not null. */
    if (PG_ARGISNULL(0))
    {
        ereport(ERROR, (errmsg("First operand must be non-null")));
    }
    /* Ensure second argument is not null. */
    if (PG_ARGISNULL(1))
    {
        ereport(ERROR, (errmsg("Second operand must be non-null")));
    }

    /* Get input array and its length (first argument). */
    input_vector    = PG_GETARG_ARRAYTYPE_P(0);
    n       = (ARR_DIMS(input_vector))[0];
    toencode    = (int32 *)ARR_DATA_PTR(input_vector);

    /* Get number of bits per dimensions (second argument). */
    num_bits_per    = PG_GETARG_INT16(1);

    /* Allocated the VarBit. */
    len     = n*num_bits_per;
    len             = ROUNDUP_MULT_IS_POW_2(len, CHAR_BIT);
    rlen        = VARBITTOTALLEN(len);
    result      = palloc0(rlen);
    SET_VARSIZE(temp, rlen);
    VARBITLEN(temp) = len;

    /* Perform the morton encoding. */
    num_bits        = num_bits_per*n;
    num_bytes   = (num_bits/8) + (num_bits % 8 == 0 ? 0 : 1);

    for (bit = 0; bit < num_bits_per; ++bit)
    for (dim = 0; dim < n; ++dim)
    {
        tempvalue
            = VARBITS(temp)[num_bytes - 1 - MC_SEGMENT(bit, dim, n)];
        tempvalue
            |= (char)
             (((toencode[dim] & (1 << bit)) >> bit)
              << MC_SEGMENTBIT(bit, dim, n));
        VARBITS(temp)[num_bytes - 1 - MC_SEGMENT(bit, dim, n)]
            = tempvalue;
    }

    if (len == num_bits)
    {
        result     = temp;
    }
    else
    {
        result     = bitsubstring(temp, len - num_bits+1, num_bits, false);
        pfree(temp);
    }
    PG_RETURN_VARBIT_P(result);
}

Why are some of the bits in my Postgres VarBit (C-language function) being zeroed for specific input?

Question

2 answers

solution1
1 ACCEPTED 2015-09-17 11:53:20

solution2
0 2015-09-16 19:39:19

Why are some of the bits in my Postgres VarBit (C-language function) being zeroed for specific input?

Question

2 answers

solution1 1 ACCEPTED 2015-09-17 11:53:20

solution2 0 2015-09-16 19:39:19

solution1
1 ACCEPTED 2015-09-17 11:53:20

solution2
0 2015-09-16 19:39:19