簡體   English   中英

opengl圖像處理性能差

[英]Poor opengl image processing performance

我正在嘗試使用opengl進行一些簡單的圖像處理。 因為我找不到任何好的庫來做這個好事我一直在努力做自己的解決方案。

我只想在gpu上編寫一些圖像然后再讀回來。 然而,我的實現的性能似乎幾乎等於它在cpu上所做的......有些不對勁......

我試圖遵循我在網上找到的最佳實踐。 但它仍然做錯了什么。

我已經嘗試刪除所有不相關的代碼。

關於為什么這個實現性能不佳的任何想法?

int image_width = 1280;
int image_height = 720;
int image_size = image_width * image_height;

class texture
{
public:
    texture()
    {   
        glGenTextures(1, &texture_);    
        bind();
        glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
        glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
        glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP);
        glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP);
        glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8, image_width, image_height, 0, GL_BGRA, GL_UNSIGNED_BYTE, NULL);
    }

    ~texture(){ glDeleteTextures(1, &texture_); }
    void bind(){ glBindTexture(GL_TEXTURE_2D, texture_); }
    GLuint handle() { return texture_; }

private:
    GLuint texture_;
};
typedef std::shared_ptr<texture> texture_ptr;

class pixel_buffer // pixel buffer with associated texture
{
public:
    pixel_buffer()
    {
        glGenBuffersARB(1, &pbo_);
        bind_pbo();
        glBufferDataARB(GL_PIXEL_UNPACK_BUFFER_ARB, image_size, 0, GL_STREAM_DRAW);
    }

    ~pixel_buffer(){ glDeleteBuffers(1, &pbo_); }

    void begin_write(void* src)
    {
        texture_.bind();
        glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, pbo_);
        glBufferDataARB(GL_PIXEL_UNPACK_BUFFER_ARB, image_size, 0, GL_STREAM_DRAW);
        void* ptr = glMapBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, GL_WRITE_ONLY);
        assert(ptr); 
        memcpy(ptr, src, image_size);
        glUnmapBuffer(GL_PIXEL_UNPACK_BUFFER);
    }

    void end_write()
    {
        bind_texture();
        bind_pbo();
        glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, image_width, image_height, GL_BGRA, GL_UNSIGNED_BYTE, BUFFER_OFFSET(0));
        unbind_pbo();
    }

    void begin_read(GLuint buffer)
    {
        glReadBuffer(buffer);
        glBindBuffer(GL_PIXEL_PACK_BUFFER_ARB, pbo_);
        glBufferData(GL_PIXEL_PACK_BUFFER_ARB, image_size, NULL, GL_STREAM_READ);
        glReadPixels(0, 0, image_width, image_height, GL_BGRA, GL_UNSIGNED_BYTE, BUFFER_OFFSET(0));
    }

    void end_read(void* dest)
    {
        void* ptr = glMapBuffer(GL_PIXEL_PACK_BUFFER_ARB, GL_READ_ONLY);   
        memcpy(dest, ptr, image_size);
        glUnmapBuffer(GL_PIXEL_PACK_BUFFER_ARB);
        unbind_pbo();
    }

    void bind_pbo(){ glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, pbo_); }
    void unbind_pbo(){ glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, pbo_); }
    void bind_texture() { texture_.bind(); }

    GLuint texture_handle() { return texture_.handle(); }

private:
    texture texture_;
    GLuint pbo_;
};
typedef std::shared_ptr<pixel_buffer> pixel_buffer_ptr;

class frame_buffer// frame buffer with associated pixel buffer
{
public:
    frame_buffer()
    {
        glGenFramebuffersEXT(1, &fbo_);

        bind();
        pbo_.bind_texture();
        glFramebufferTexture2DEXT(GL_FRAMEBUFFER_EXT, GL_COLOR_ATTACHMENT0_EXT, GL_TEXTURE_2D, pbo_.texture_handle(), 0);
    }

    ~frame_buffer() { glDeleteFramebuffersEXT(1, &fbo_); }

    void bind() { glBindFramebufferEXT(GL_FRAMEBUFFER_EXT, fbo_); }
    void unbind() { glBindFramebufferEXT(GL_FRAMEBUFFER_EXT, 0);    }

    void begin_read()
    {
        bind();
        pbo_.begin_read(GL_COLOR_ATTACHMENT0_EXT);
    }

    void end_read(void* dest)
    {
        pbo_.end_read(dest);
        unbind();
    }

private:
    pixel_buffer pbo_;
    GLuint fbo_;
};
typedef std::shared_ptr<frame_buffer> frame_buffer_ptr;

struct image_processor::implementation
{   
    void compose(const std::vector<image_ptr>& images)
    {                   
        // END PREVIOUS READ
        if(reading_fbo_)
        {
            image_ptr result_image = std::make_shared<image>(image_size);
            reading_fbo_->end_read(result_image->data());
            output_.push(reading_result_image_);

            reading_fbo_ = nullptr;
        }

        // END PREVIOUS WRITE
        frame_buffer_ptr written_fbo;
        if(!writing_pbo_group_.empty())
        {
            // END
            written_fbo = get_fbo();
            written_fbo->bind();
            glClear(GL_COLOR_BUFFER_BIT);   

            for(size_t n = 0; n < writing_pbo_group_.size(); ++n)
            {
                writing_pbo_group_[n]->end_write();
                writing_pbo_group_[n]->bind_texture();
                quad_->draw(); // DRAW FULLSCREEN QUAD
            }
            written_fbo->unbind();

            writing_pbo_group_.clear();
        }

        // BEGIN NEW WRITE
        if(!images.empty())
        {           
            for(size_t n = 0; n < images.size(); ++n)
            {
                auto pbo = get_pbo();
                pbo->begin_write(images[n]->data());

                writing_pbo_group_.push_back(pbo);
            }
        }

        // BEGIN NEW READ       
        if(written_fbo)
        {   
            written_fbo->begin_read();

            reading_fbo_ = written_fbo; 
        }
    }

    pixel_buffer_ptr get_pbo()
    {
        if(pbo_pool_.empty())
            pbo_pool_.push_back(std::make_shared<pixel_buffer>());

        auto pbo = pbo_pool_.front();
        pbo_pool_.pop_front();

        return pixel_buffer_ptr(pbo.get(), [=](pixel_buffer*){pbo_pool_.push_back(pbo);});
    }

    frame_buffer_ptr get_fbo()
    {
        if(fbo_pool_.empty())
            fbo_pool_.push_back(std::make_shared<frame_buffer>());

        auto fbo = fbo_pool_.front();
        fbo_pool_.pop_front();

        return frame_buffer_ptr(fbo.get(), [=](frame_buffer*){fbo_pool_.push_back(fbo);});
    }

    std::vector<pixel_buffer_ptr>   writing_pbo_group_;
    frame_buffer_ptr                reading_fbo_;

    std::deque<pixel_buffer_ptr> pbo_pool_;
    std::deque<frame_buffer_ptr> fbo_pool_;
};

編輯:

做了一些剖析。 大多數cpu時間似乎花在begin_write();

雖然看不出它有什么問題......

void begin_write(void* src)
{
    texture_.bind();
    glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, pbo_);
    glBufferDataARB(GL_PIXEL_UNPACK_BUFFER_ARB, image_size, 0, GL_STREAM_DRAW);
    void* ptr = glMapBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, GL_WRITE_ONLY);
    assert(ptr); 
    memcpy(ptr, src, image_size);
    glUnmapBuffer(GL_PIXEL_UNPACK_BUFFER);
}

您的瓶頸是從CPU到GPU的內存傳輸。 當您需要對存儲在GPU內存中的相同少量數據執行許多並行計算時(例如,每秒30幀繪制相同的場景),GPU可以提高性能。 當您需要從主內存和GPU內存來回傳輸數據時,大部分時間都是浪費在上面。

嘗試改變pixelformat(即BGRA而不是ARGB)。

我不確定具體細節,但卡片實際使用和你想要的東西不匹配會產生災難性的表現。

(如果沒有關於數據集的信息,正在運行的算法和一些硬基准數據,很難提供更多幫助...)

暫無
暫無

聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.

 
粵ICP備18138465號  © 2020-2024 STACKOOM.COM