基准可变参数模板函数调用

Question

I used Quick Bench (see link below) to compare the performance of decoding a buffer with a single call to a function taking variadic template arguments against decoding the same buffer with separate calls to a function that uses no variadic pack expansion. 我使用了 Quick Bench（请参见下面的链接）来比较两种方式的性能：通过一次调用带有可变参数模板的函数来解码缓冲区，以及通过多次调用不使用可变参数包展开的函数来解码同一缓冲区。

Any idea on how to make the variadic implementation perform on par with the other implementation? 有什么办法可以让可变参数实现的性能与另一种实现相当吗？

The result from the benchmark is a ratio of (CPU time / Noop time). 基准测试的结果是（CPU时间 / Noop时间）的比率。 The benchmark runs on a pool of AWS machines whose load is unknown. 基准测试在负载未知的AWS机器池上运行。 The purpose is to give a reasonably good comparison between two snippets of code, run in the same conditions. 目的是对在相同条件下运行的两个代码片段进行合理的比较。 The CPU time for the non-variadic template function was 5.9 and for the variadic implementation it was 21.3. 非可变参数模板函数的CPU时间为5.9，可变参数实现的CPU时间为21.3。 Compiler: Clang 5.0 with optimization level O3. 编译器：Clang 5.0，优化级别为O3。

#include <cstdint>
#include <cstring>
#include <string>
#include <type_traits>

namespace core { namespace decoder 
{
   /// Bounds-checked reader of little-endian unsigned integers from a
   /// caller-owned byte buffer.
   ///
   /// The decoder stores only the pointer and length it is constructed with;
   /// it never copies or frees the buffer. Every decodeUxx method either
   /// succeeds (writes the value, advances `offset` past the field, returns
   /// true) or fails (returns false and leaves both `offset` and
   /// `decodedValue` untouched).
   class LittleEndian
   {
   public:
      LittleEndian(const LittleEndian&) = delete;
      LittleEndian& operator=(const LittleEndian&) = delete;

   public:
      /// @param buffer  First byte of the data to decode.
      /// @param size    Number of readable bytes starting at `buffer`.
      constexpr LittleEndian(const std::uint8_t* buffer, size_t size) noexcept
      : m_buffer(buffer),
        m_size(size)
      {}

      /// Reads one byte at `offset`.
      /// @return false when `offset` is outside the buffer.
      constexpr bool decodeU8(
        size_t& offset, std::uint8_t& decodedValue) const noexcept
      {
        return decodeUnsigned(offset, decodedValue);
      }

      /// Reads a 2-byte little-endian value at `offset`.
      /// @return false when fewer than 2 bytes remain at `offset`.
      constexpr bool decodeU16(
        size_t& offset, std::uint16_t& decodedValue) const noexcept
      {
        return decodeUnsigned(offset, decodedValue);
      }

      /// Reads a 4-byte little-endian value at `offset`.
      /// @return false when fewer than 4 bytes remain at `offset`.
      constexpr bool decodeU32(
        size_t& offset, std::uint32_t& decodedValue) const noexcept
      {
        return decodeUnsigned(offset, decodedValue);
      }

      /// Reads an 8-byte little-endian value at `offset`.
      /// @return false when fewer than 8 bytes remain at `offset`.
      constexpr bool decodeU64(
        size_t& offset, std::uint64_t& decodedValue) const noexcept
      {
        return decodeUnsigned(offset, decodedValue);
      }

   private:
      /// Shared implementation for all fixed-width unsigned reads.
      template<typename UnsignedT>
      constexpr bool decodeUnsigned(
        size_t& offset, UnsignedT& decodedValue) const noexcept
      {
        constexpr size_t width = sizeof(UnsignedT);

        // Written as a subtraction so that a very large `offset` cannot wrap
        // around and slip past the check (as `offset + width > m_size` would).
        if (m_size < width || offset > m_size - width)
            return false;

        // Accumulate in a 64-bit unsigned value: shifting a promoted `int`
        // left by 24 is undefined when the top byte is >= 0x80.
        std::uint64_t assembled = 0;
        for (size_t i = 0; i < width; ++i)
            assembled |= static_cast<std::uint64_t>(m_buffer[offset + i]) << (8 * i);

        decodedValue = static_cast<UnsignedT>(assembled);
        offset += width;
        return true;
      }

  private:
    const std::uint8_t* m_buffer;
    const size_t m_size;
  };

  /// Decodes a sequence of fixed-width fields from a caller-owned byte buffer.
  ///
  /// Integer fields (std::uint8_t/16/32/64) are delegated to an
  /// `EndianDecoderT` constructed over the same buffer; `char[N]` fields are
  /// copied by this class. The buffer is neither copied nor owned.
  ///
  /// @tparam EndianDecoderT  Type constructible from (const std::uint8_t*, size_t)
  ///                         that provides decodeU8/U16/U32/U64(size_t&, T&)
  ///                         for the integer widths actually used.
  template<typename EndianDecoderT>
  class ByteDecoder
  {
  public:
    ByteDecoder(const ByteDecoder&) = delete;
    ByteDecoder& operator=(const ByteDecoder&) = delete;

  public:
    /// @param buffer  First byte of the data to decode.
    /// @param size    Number of readable bytes starting at `buffer`.
    constexpr ByteDecoder(const std::uint8_t* buffer, size_t size)
      : m_buffer(buffer),
        m_size(size),
        m_endianDecoder(buffer, size)
    {}

    /// Decodes `args` one after another, starting at `offset`.
    ///
    /// Fields are processed left to right and decoding stops at the first
    /// failure, so later targets are left untouched rather than being filled
    /// from a stale offset. An empty argument list trivially succeeds.
    ///
    /// @param offset  Byte position of the first field (taken by value; the
    ///                caller's variable is not modified).
    /// @return true when every field was decoded.
    template<typename ...Args>
    constexpr bool decode(size_t offset, Args&... args) const noexcept
    {
        // `&&` sequences its operands, so each decodeValue call sees the
        // offset as advanced by the previous one.
        return (decodeValue(offset, args) && ...);
    }

    /// Decodes a single field located at `offset`.
    /// @return true when the field was decoded.
    template<typename T>
    constexpr bool decode(size_t offset, T& decodedValue) const noexcept
    {
        return decodeValue(offset, decodedValue);
    }

  private:
    /// Dispatches on the static type of `decodedValue`.
    /// On success `offset` is advanced past the field.
    /// @return false for unsupported types or when the field does not fit.
    template<typename T>
    constexpr bool decodeValue(
        size_t& offset, T& decodedValue) const noexcept
    {
        using Value = std::decay_t<T>;

        if constexpr (std::is_same<Value, std::uint8_t>::value)
            return m_endianDecoder.decodeU8(offset, decodedValue);
        else if constexpr (std::is_same<Value, std::uint16_t>::value)
            return m_endianDecoder.decodeU16(offset, decodedValue);
        else if constexpr (std::is_same<Value, std::uint32_t>::value)
            return m_endianDecoder.decodeU32(offset, decodedValue);
        else if constexpr (std::is_same<Value, std::uint64_t>::value)
            return m_endianDecoder.decodeU64(offset, decodedValue);
        else if constexpr (std::is_same<Value, char*>::value ||
                           std::is_same<Value, char const*>::value)
            return decodeCHR(offset, decodedValue);  // arrays decay to char*
        else
            return false;
    }

    /// Copies a fixed-width character field of SIZE bytes into `buffer`.
    ///
    /// At most SIZE - 1 bytes are copied and the last element is always set
    /// to '\0', so the result is a valid C string. `offset` advances by the
    /// full field width SIZE.
    ///
    /// @return false (leaving `buffer` and `offset` untouched) when fewer than
    ///         SIZE bytes remain at `offset`.
    template<size_t SIZE>
    constexpr bool decodeCHR(
        size_t& offset, char (&buffer)[SIZE]) const noexcept
    {
        // Subtraction form: `offset + SIZE` could wrap for a huge offset.
        if (SIZE > m_size || offset > m_size - SIZE)
            return false;

        // Plain loop instead of memset/memcpy keeps this usable in constant
        // expressions; SIZE is a compile-time constant.
        for (size_t i = 0; i + 1 < SIZE; ++i)
            buffer[i] = static_cast<char>(m_buffer[offset + i]);
        buffer[SIZE - 1] = '\0';

        offset += SIZE;
        return true;
    }

  private:
    const std::uint8_t* m_buffer;
    const size_t m_size;
    EndianDecoderT m_endianDecoder;
  };

}} // namespace core::decoder

static void NonVariadicDecoding(benchmark::State& state) {
  // Destination for the five fields laid out back to back in the payload:
  // u16, u32, char[6], u16, u64 (22 bytes in total).
  struct DecodedValue
  {
      std::uint16_t v1_U16;
      std::uint32_t v2_U32;
      char v3_CHR[6];
      std::uint16_t v4_U16;
      std::uint64_t v5_U64;
  };

  // Setup: built once, before the timed loop.
  constexpr std::uint8_t payload[] = { 0x0D, 0x0C, 0x84, 0x03, 0x00, 0x00, 'H', 'e', 'l', 'l', 'o', '\0',  0x84, 0x03, 0x84, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 };
  core::decoder::ByteDecoder<core::decoder::LittleEndian> reader(payload, sizeof(payload));

  // Timed region: one decode() call per field. decode() takes the offset by
  // value, so the cursor is advanced by hand after each field.
  for (auto _ : state) {
    DecodedValue fields;
    size_t cursor = 0;

    reader.decode(cursor, fields.v1_U16);
    cursor += sizeof(fields.v1_U16);

    reader.decode(cursor, fields.v2_U32);
    cursor += sizeof(fields.v2_U32);

    reader.decode(cursor, fields.v3_CHR);
    cursor += sizeof(fields.v3_CHR);

    reader.decode(cursor, fields.v4_U16);
    cursor += sizeof(fields.v4_U16);

    reader.decode(cursor, fields.v5_U64);

    // Keep the decoded struct observable so the work is not optimized away.
    benchmark::DoNotOptimize(fields);
  }
}
// Register the function as a benchmark.
BENCHMARK(NonVariadicDecoding);

static void VariadicDecoding(benchmark::State& state) {
  // Destination for the five fields laid out back to back in the payload:
  // u16, u32, char[6], u16, u64 (22 bytes in total).
  struct DecodedValue
  {
      std::uint16_t v1_U16;
      std::uint32_t v2_U32;
      char v3_CHR[6];
      std::uint16_t v4_U16;
      std::uint64_t v5_U64;
  };

  // Setup: built once, before the timed loop.
  constexpr std::uint8_t payload[] = { 0x0D, 0x0C, 0x84, 0x03, 0x00, 0x00, 'H', 'e', 'l', 'l', 'o', '\0',  0x84, 0x03, 0x84, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 };
  core::decoder::ByteDecoder<core::decoder::LittleEndian> reader(payload, sizeof(payload));

  // Timed region: all five fields go through a single variadic decode() call
  // starting at offset 0.
  for (auto _ : state) {
    DecodedValue fields;
    reader.decode(0, fields.v1_U16, fields.v2_U32, fields.v3_CHR, fields.v4_U16, fields.v5_U64);

    // Keep the decoded struct observable so the work is not optimized away.
    benchmark::DoNotOptimize(fields);
  }
}
// Register the function as a benchmark.
BENCHMARK(VariadicDecoding);

Quick-Bench 快速工作台

Answer 1

If you have recursive calls to your variadic implementation and use perfect forwarding then you can have better performance: 如果递归调用可变参数实现并使用完美的转发，则可以得到更好的性能：

    /// Base case of the recursion: decodes one value at `offset`.
    /// `offset` is a by-value copy, so the caller's offset is not changed.
    /// @return the result reported by decodeValue.
    template <typename Type>
    constexpr bool decode_impl(size_t offset, Type&& value) const noexcept
    {
      const bool decoded = decodeValue(offset, std::forward<Type>(value));
      return decoded;
    }

    /// Recursive case: decodes `first` at `offset`, then the remaining values
    /// at the positions that follow it.
    ///
    /// Stops at the first failure, leaving the remaining targets untouched.
    ///
    /// @param offset  Byte position of `first` (by value; the caller's offset
    ///                is not modified).
    /// @return true when every value was decoded.
    template <typename First, typename Second, typename... Other>
    constexpr bool decode_impl(size_t offset, First&& first, Second&& second, Other&&... others) const noexcept
    {
      // Decode `first` through decodeValue directly: it takes the offset by
      // reference and advances this function's local copy. Going through the
      // single-value decode_impl would advance only that overload's private
      // copy, so every subsequent value would be read from the same position.
      if (!decodeValue(offset, std::forward<First>(first)))
        return false;

      return decode_impl(offset, std::forward<Second>(second), std::forward<Other>(others)...);
    }

    /// Public entry point: hands `offset` and every target in `args` to
    /// decode_impl and reports its result.
    /// @return true when decode_impl reports success.
    template<typename ...Args>
    constexpr bool decode(size_t offset, Args&&... args) const noexcept
    {
      const bool allDecoded = decode_impl(offset, std::forward<Args>(args)...);
      return allDecoded;
    }

See there . 看那里。

基准可变参数模板函数调用

问题描述

1 个解决方案

解决方案1
2 已采纳 2017-12-05 16:29:01

基准可变参数模板函数调用

问题描述

1 个解决方案

解决方案1 2 已采纳 2017-12-05 16:29:01

解决方案1
2 已采纳 2017-12-05 16:29:01