Fastest way for wrapping a value in an interval

Question

I am curious about the ways to wrap a floating-point value x in a semi-closed interval [0; a[ [0; a[ .

For instance, I could have an arbitrary real number, say x = 354638.515 , that I wish to fold into [0; 2π[ [0; 2π[ because I have a good sin approximation for that range.

The fmod standard C functions show up quite high in my benchmarks, and by checking the source code of various libc implementations, I can understand why: the thing is fairly branch-ey, likely in order to handle a lot of IEEE754-specific issues:

glibc: https://github.com/bminor/glibc/blob/master/sysdeps/ieee754/flt-32/e_fmodf.c
Apple: https://opensource.apple.com/source/Libm/Libm-315/Source/ARM/fmod.c.auto.html
musl: https://git.musl-libc.org/cgit/musl/tree/src/math/fmod.c
Running with -ffast-math causes GCC to generate code that goes through the x87 FPU on x86/x86_64 which comes with its own set of problems (such as 80-bit doubles, FP state, and other fun things). I would like the implementation to vectorize at least semi-correctly, and if possible to not go through the x87 FPU but through vector registers at least as the rest of my code ends up vectorized, even if not necessarily an optimal way, by the compiler.
This one looks much simpler: https://github.com/KnightOS/libc/blob/master/src/fmod.c

In my case, I am only concerned about usual real values, not NaNs, not infinity. My range is also known at compile-time and sane (a common occurence being π/2 ), thus the checks for "special cases" such as range == 0 are unnecessary.

Thus, what would be good implementations of fmod for that specific use case?

Answer 1

Assuming that the range is constant and positive you can compute its reciprocal to avoid costly division.

void fast_fmod(float * restrict dst, const float * restrict src, size_t n, float divisor) {
   float reciprocal = 1.0f / divisor;
   for (size_t i = 0; i < n; ++i)
     dst[i] = src[i] - divisor * (int)(src[i] * reciprocal);
}

The final code with a simple demo is:

#include <stdlib.h>
#include <stdio.h>
#include <math.h>

void fast_fmod(float * restrict dst, const float * restrict src, size_t n, float divisor) {
   float reciprocal = 1.0f / divisor;

   for (size_t i = 0; i < n; ++i)
     dst[i] = src[i] - divisor * (int)(src[i] * reciprocal);
}

int main() {
    float src[9] = {-4, -3, -2, -1, 0, 1, 2, 3, 4};
    float dst[9];
    float div = 3;

    fast_fmod(dst, src, 9, div);

    for (int i = 0; i < 9; ++i) {
        printf("fmod(%g, %g) = %g vs %g\n", src[i], div, dst[i], fmod(src[i], div));
    }
}

produces an expected output:

fmod(-4, 3) = -1 vs -1
fmod(-3, 3) = 0 vs -0
fmod(-2, 3) = -2 vs -2
fmod(-1, 3) = -1 vs -1
fmod(0, 3) = 0 vs 0
fmod(1, 3) = 1 vs 1
fmod(2, 3) = 2 vs 2
fmod(3, 3) = 0 vs 0
fmod(4, 3) = 1 vs 1

Compilation with GCC with command:

$ gcc prog.c -o prog -O3 -march=haswell -lm -fopt-info-vec
prog.c:8:4: optimized: loop vectorized using 32 byte vectors
prog.c:8:4: optimized: loop vectorized using 32 byte vectors
prog.c:8:30: optimized: basic block part vectorized using 32 byte vectors

Thus the code was nicely vectorized.

EDIT

It looks that CLANG does even a better job vectorizing this code:

  401170:   c5 fc 10 24 8e          vmovups (%rsi,%rcx,4),%ymm4
  401175:   c5 fc 10 6c 8e 20       vmovups 0x20(%rsi,%rcx,4),%ymm5
  40117b:   c5 fc 10 74 8e 40       vmovups 0x40(%rsi,%rcx,4),%ymm6
  401181:   c5 fc 10 7c 8e 60       vmovups 0x60(%rsi,%rcx,4),%ymm7
  401187:   c5 6c 59 c4             vmulps %ymm4,%ymm2,%ymm8
  40118b:   c5 6c 59 cd             vmulps %ymm5,%ymm2,%ymm9
  40118f:   c5 6c 59 d6             vmulps %ymm6,%ymm2,%ymm10
  401193:   c5 6c 59 df             vmulps %ymm7,%ymm2,%ymm11
  401197:   c4 41 7e 5b c0          vcvttps2dq %ymm8,%ymm8
  40119c:   c4 41 7e 5b c9          vcvttps2dq %ymm9,%ymm9
  4011a1:   c4 41 7e 5b d2          vcvttps2dq %ymm10,%ymm10
  4011a6:   c4 41 7e 5b db          vcvttps2dq %ymm11,%ymm11
  4011ab:   c4 41 7c 5b c0          vcvtdq2ps %ymm8,%ymm8
  4011b0:   c4 41 7c 5b c9          vcvtdq2ps %ymm9,%ymm9
  4011b5:   c4 41 7c 5b d2          vcvtdq2ps %ymm10,%ymm10
  4011ba:   c4 41 7c 5b db          vcvtdq2ps %ymm11,%ymm11
  4011bf:   c5 3c 59 c3             vmulps %ymm3,%ymm8,%ymm8
  4011c3:   c5 34 59 cb             vmulps %ymm3,%ymm9,%ymm9
  4011c7:   c5 2c 59 d3             vmulps %ymm3,%ymm10,%ymm10
  4011cb:   c5 24 59 db             vmulps %ymm3,%ymm11,%ymm11
  4011cf:   c4 c1 5c 5c e0          vsubps %ymm8,%ymm4,%ymm4
  4011d4:   c4 c1 54 5c e9          vsubps %ymm9,%ymm5,%ymm5
  4011d9:   c4 c1 4c 5c f2          vsubps %ymm10,%ymm6,%ymm6
  4011de:   c4 c1 44 5c fb          vsubps %ymm11,%ymm7,%ymm7
  4011e3:   c5 fc 11 24 8f          vmovups %ymm4,(%rdi,%rcx,4)
  4011e8:   c5 fc 11 6c 8f 20       vmovups %ymm5,0x20(%rdi,%rcx,4)
  4011ee:   c5 fc 11 74 8f 40       vmovups %ymm6,0x40(%rdi,%rcx,4)
  4011f4:   c5 fc 11 7c 8f 60       vmovups %ymm7,0x60(%rdi,%rcx,4)
  4011fa:   48 83 c1 20             add    $0x20,%rcx
  4011fe:   48 39 c8                cmp    %rcx,%rax
  401201:   0f 85 69 ff ff ff       jne    401170 <fast_fmod+0x40>

Answer 2

This is a fmod()-alternative without precision loss I wrote.
The computation can take very long if the counter has a very high exponent and the denominator has a very low exponent, but it is still faster than the current Gnu C libary implementation:

#include <stdint.h>
#include <string.h>
#include <fenv.h>
#if defined(_MSC_VER)
    #include <intrin.h>
#endif

#if defined(__GNUC__) || defined(__clang__)
    #define likely(x) __builtin_expect((x), 1)
    #define unlikely(x) __builtin_expect((x), 0)
#else
    #define likely(x) (x)
    #define unlikely(x) (x)
#endif

#define MAX_EXP (0x7FF)
#define SIGN_BIT ((uint64_t)1 << 63)
#define EXP_MASK ((uint64_t)MAX_EXP << 52)
#define IMPLCIT_BIT ((uint64_t)1 << 52)
#define MANT_MASK (IMPLCIT_BIT - 1)
#define QNAN_BIT (IMPLCIT_BIT >> 1)

inline uint64_t bin( double d )
{
    uint64_t u;
    memcpy( &u, &d, sizeof d );
    return u;
}

inline double dbl( uint64_t u )
{
    double d;
    memcpy( &d, &u, sizeof u );
    return d;
}

inline double NaN()
{
    feraiseexcept( FE_INVALID );
    return dbl( SIGN_BIT | EXP_MASK | QNAN_BIT );
}

inline void normalize( uint64_t *mant, int *exp )
{
#if defined(__GNUC__) || defined(__clang__)
    unsigned bits = __builtin_clz( *mant ) - 11;
    *mant <<= bits;
#elif defined(_MSC_VER)
    unsigned long bits;
    _BitScanReverse64( &bits, *mant );
    bits = (bits ^ 63) - 11;
    *mant <<= bits;
#else
    unsigned bits = 0;
    for( ; !(*mant & IMPLCIT_BIT); *mant <<= 1, ++bits );
#endif
    *exp -= bits;
}

double myFmodC( double counter, double denominator )
{
    uint64_t const
        bCounter = bin( counter ),
        bDenom = bin( denominator );
    uint64_t const sign = bCounter & SIGN_BIT;
    if( unlikely((bCounter & EXP_MASK) == EXP_MASK) )
        // +/-[Inf|QNaN|SNaN] % ... = -QNaN
        return NaN();
    if( unlikely((bDenom & EXP_MASK) == EXP_MASK) )
        // +/-x % +/-[Inf|QNan|SNaN]
        if( likely(!(bDenom & MANT_MASK)) )
            // +/-x % +/-Inf = -/+x
            return dbl( sign | bCounter & ~SIGN_BIT );
        else
            // +/-x % +/-[QNaN|SNaN] = -NaN
            return NaN();
    int
        counterExp = bCounter >> 52 & MAX_EXP,
        denomExp = bDenom >> 52 & MAX_EXP;
    uint64_t
        counterMant = (uint64_t)!!counterExp << 52 | bCounter & MANT_MASK,
        denomMant = (uint64_t)!!denomExp << 52 | bDenom & MANT_MASK;
    if( unlikely(!counterExp) )
        // counter is denormal
        if( likely(!counterMant) )
            // counter == +/-0.0
            if( likely(denomMant) )
                // +/-0.0 % +/-x = -/+0.0
                return dbl( sign );
            else
                // +/-0.0 % +/-0.0 = -QNaN
                return NaN();
        else
            // normalize counter
            normalize( &counterMant, &counterExp ),
            ++counterExp;
    if( unlikely(!denomExp) )
        // denominator is denormal
        if( likely(!denomMant) )
            // +/-x % +/-0.0 = -/+QNaN
            return NaN();
        else
            // normalize denominator
            normalize( &denomMant, &denomExp ),
            ++denomExp;
    int remExp = counterExp;
    uint64_t remMant = counterMant;
    for( ; ; )
    {
        int below = remMant < denomMant;
        if( unlikely(remExp - below < denomExp) )
            break;
        remExp -= below;
        remMant <<= below;
        if( unlikely(!(remMant -= denomMant)) )
        {
            remExp = 0;
            break;
        }
        normalize( &remMant, &remExp );
    };
    if( unlikely(remExp <= 0) )
        // denormal result
        remMant >>= -remExp + 1,
        remExp = 0;
    return dbl( sign | (uint64_t)remExp << 52 | remMant & MANT_MASK );
}

Fastest way for wrapping a value in an interval

Question

2 answers

solution1
2 ACCPTED 2021-09-17 11:58:46

solution2
1

Fastest way for wrapping a value in an interval

Question

2 answers

solution1 2 ACCPTED 2021-09-17 11:58:46

solution2 1

solution1
2 ACCPTED 2021-09-17 11:58:46

solution2
1