简体   繁体   English

使用 OpenGL 3.3 实例化似乎很慢

[英]Instancing with OpenGL 3.3 seems very slow

I wrote a minimal code-sample in C++, which is rendering 10000 colored quads on the screen.我用 C++ 编写了一个最小的代码示例,它在屏幕上呈现 10000 个彩色四边形。 I am using "instancing" and so updating only the model-matrix for each quad each frame.我正在使用“实例化”,因此每帧只更新每个四边形的模型矩阵。 The data of the 6 vertices are stored in an individual VBO and will be reused all the time. 6 个顶点的数据存储在一个单独的 VBO 中,并且将一直重复使用。 The projection-matrix (orthographic) is injected once at program-start via a uniform.投影矩阵(正交)在程序开始时通过 uniform 变量注入一次。 The model-matrix is calculated on the CPU with the library GLM.模型矩阵是在 CPU 上使用库 GLM 计算的。 I measured the rendering-time and I got only an average FPS of 52. I think this is MUCH too low, but I cannot find the mistake/bottleneck in my little sample program.我测量了渲染时间,我得到的平均 FPS 仅为 52。我认为这实在太低了,但我在我的小示例程序中找不到错误/瓶颈。

After some analysis, it seems that the 3 calculations done with GLM are very slow.经过一些分析,似乎用 GLM 完成的 3 次计算非常慢。 Am I doing something wrong here?我在这里做错了吗? For example, if I remove the rotation calculation, I get an FPS-boost of 10 FPS!例如,如果我删除旋转计算,我将获得 10 FPS 的 FPS 提升! Maybe you can help me to find out what I can do better here and how I can optimize my sample.也许您可以帮助我找出我可以在这里做得更好的地方以及如何优化我的样本。 It is important for me that each quad is individually configurable during runtime, so I decided to use instancing.对我来说很重要的是,每个四边形在运行时都可以单独配置,所以我决定使用实例化。 Moving the matrix-calculations to the GPU seems another option, but I am really confused why the CPU has so many problems calculating the 10000 model-matrices!将矩阵计算转移到 GPU 似乎是另一种选择,但我真的很困惑,为什么 CPU 在计算 10000 个模型矩阵时有这么多问题! Ok, my CPU is very bad (Athlon 2 Core-Duo M300, GPU is ATI Mobility Radeon 4100), but it should do this task in no measurable time, shouldn't it?好吧,我的 CPU 非常糟糕(Athlon 2 Core-Duo M300,GPU 是 ATI Mobility Radeon 4100),但它应该在几乎无法测量的时间内完成这项任务,不是吗?

Here is a minimal, fully working, compilable example (if you have GLFW and GLM).这是最小的、完全工作的、可编译的示例(如果你有 GLFW 和 GLM)。 Maybe someone has some time and can help me out here :)也许有人有时间可以在这里帮助我:)

#define GLEW_STATIC
#define GLM_FORCE_INLINE
#define GLM_FORCE_SSE2
#include "glew.h"
#include "glfw3.h"
#include "glm.hpp"
#include "glm/gtc/matrix_transform.hpp"
#include <conio.h>
#include <cstdlib>
#include <ctime>
#include <iostream>
#include <string>
#include <vector>

GLuint buildShader()
{
    std::string strVSCode = 
    "#version 330 core\n"
    "in vec3 vertexPosition;\n"
    "in mat4 modelMatrix;\n"
    "uniform mat4 projectionMatrix;\n"
    "out vec4 m_color;\n"
    "void main() {\n"
    "   vec4 vecVertex = vec4(vertexPosition, 1);\n"
    "   gl_Position = projectionMatrix * modelMatrix * vecVertex;\n"
    "   m_color = gl_Position;\n"
    "}\n";

    std::string strFSCode = "#version 330 core\n"
    "out vec4 frag_colour;\n"
    "in vec4 m_color;\n"
    "void main() {\n"
    "   frag_colour = vec4(m_color.x, m_color.y, m_color.z, 0.5f);\n"
    "}\n";

    GLuint gluiVertexShaderId = glCreateShader(GL_VERTEX_SHADER);
    char const * VertexSourcePointer = strVSCode.c_str();
    glShaderSource(gluiVertexShaderId, 1, &VertexSourcePointer, NULL);
    glCompileShader(gluiVertexShaderId);
    GLuint gluiFragmentShaderId = glCreateShader(GL_FRAGMENT_SHADER);
    char const * FragmentSourcePointer = strFSCode.c_str();
    glShaderSource(gluiFragmentShaderId, 1, &FragmentSourcePointer, NULL);
    glCompileShader(gluiFragmentShaderId);
    GLuint gluiProgramId = glCreateProgram();
    glAttachShader(gluiProgramId, gluiVertexShaderId);
    glAttachShader(gluiProgramId, gluiFragmentShaderId);
    glLinkProgram(gluiProgramId);
    glDeleteShader(gluiVertexShaderId);
    glDeleteShader(gluiFragmentShaderId);
    return gluiProgramId;
}

// CPU-side state of one animated quad; main() turns it into a model matrix
// every frame.
struct Sprite
{
    glm::vec3 position;   // corner of the quad; half the dimension is added to find its centre
    glm::vec3 dimension;  // scale applied to the unit quad
    float speed;          // per-second rate used for both movement and rotation
    float rotation;       // current angle about the Z axis, reset to 0 once it reaches 360
    float rx;             // horizontal direction flag: 1 moves +x, 0 moves -x
    float ry;             // vertical direction flag: 1 moves +y, 0 moves -y
};

// One quad corner: three tightly packed floats that are copied verbatim into
// the vertex buffer (stride sizeof(Vertex)).
struct Vertex
{
    float x, y, z;

    // Zero-initialises the position so default-constructed elements (e.g. in an
    // array) never hold indeterminate values.
    Vertex() : x(0.0f), y(0.0f), z(0.0f) {}

    // Builds a vertex from explicit coordinates.
    Vertex(float x, float y, float z) : x(x), y(y), z(z) {}
};

int main(int arc, char **argv)
{
    // GLFW init
    int displayResWith   = 1366; //modify this here
    int displayResHeight = 768;  //modify this here
    glfwInit();
    glfwWindowHint(GLFW_CONTEXT_VERSION_MAJOR, 3);
    glfwWindowHint(GLFW_CONTEXT_VERSION_MINOR, 3);
    glfwWindowHint(GLFW_OPENGL_FORWARD_COMPAT, 1);
    glfwWindowHint(GLFW_OPENGL_PROFILE, GLFW_OPENGL_CORE_PROFILE);
    glfwWindowHint(GLFW_RED_BITS, 8);
    glfwWindowHint(GLFW_GREEN_BITS, 8);
    glfwWindowHint(GLFW_BLUE_BITS, 8);
    glfwWindowHint(GLFW_ALPHA_BITS, 8);
    glfwWindowHint(GLFW_DEPTH_BITS, 32);
    glfwWindowHint(GLFW_STENCIL_BITS, 32);
    GLFWwindow* window = glfwCreateWindow(displayResWith, displayResHeight,"Instancing", glfwGetPrimaryMonitor(),NULL);
    int width, height;
    glfwMakeContextCurrent(window);
    glfwSwapInterval(0);
    glfwGetFramebufferSize(window, &width, &height);

    //GLEW init
    glewExperimental = GL_TRUE;
    glewInit();
    const GLubyte* renderer = glGetString(GL_RENDERER);
    const GLubyte* version = glGetString(GL_VERSION);
    std::cout << "Renderer: " << renderer << std::endl;
    std::cout << "OpenGL supported version: " << version << std::endl;

    //OpenGL init
    glEnable(GL_CULL_FACE); 
    glCullFace(GL_BACK);
    glEnable(GL_DEPTH_TEST);
    glDepthFunc(GL_LESS);
    glEnable(GL_BLEND);
    glBlendFunc(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA);
    glClearColor(255.0f, 255.0f, 255.0f, 255.0f);

    //Shader
    GLuint programID = buildShader();

    //VBO vertexBuffer
    GLuint vertexBuffer;
    glGenBuffers(1, &vertexBuffer);
    glBindBuffer(GL_ARRAY_BUFFER, vertexBuffer);
    Vertex VertexBufferData[6];
    VertexBufferData[0] = Vertex(-0.5f, 0.5f, 0.0f);    //Links oben
    VertexBufferData[1] = Vertex(-0.5f, -0.5f, 0.0f);   //Links unten
    VertexBufferData[2] = Vertex(0.5f, -0.5f, 0.0f);    //Rechts unten
    VertexBufferData[3] = VertexBufferData[2];          //Rechts unten
    VertexBufferData[4] = Vertex(0.5f, 0.5f, 0.0f);     //Rechts oben
    VertexBufferData[5] = VertexBufferData[0];          //Links oben
    glBufferData(GL_ARRAY_BUFFER, sizeof(Vertex)*6, VertexBufferData, GL_STATIC_DRAW);

    //VBO instanceBuffer
    GLuint instanceBuffer;
    glGenBuffers(1, &instanceBuffer);
    glBindBuffer(GL_ARRAY_BUFFER, instanceBuffer);
    int iMaxInstanceCount = 30000;
    glm::mat4 *ptrInstanceBufferData = new glm::mat4[iMaxInstanceCount];
    glBufferData(GL_ARRAY_BUFFER, iMaxInstanceCount * sizeof(glm::mat4), NULL, GL_STREAM_DRAW);

    //VAO - Start
    GLuint vertexArrayObject;
    glGenVertexArrays(1, &vertexArrayObject);
    glBindVertexArray(vertexArrayObject);

        //For VBO vertexbuffer
        glEnableVertexAttribArray(glGetAttribLocation(programID, "vertexPosition"));
        glBindBuffer(GL_ARRAY_BUFFER, vertexBuffer);
        glVertexAttribPointer(
            glGetAttribLocation(programID, "vertexPosition"),
            3,                                                  
            GL_FLOAT,                                           
            GL_FALSE,                                           
            sizeof(Vertex),                                     
            (void*)0                                            
            );

        glVertexAttribDivisor(0, 0);

        //For VBO instanceBuffer
        int pos = glGetAttribLocation(programID, "modelMatrix");
        int pos1 = pos + 0;
        int pos2 = pos + 1;
        int pos3 = pos + 2;
        int pos4 = pos + 3;
        glEnableVertexAttribArray(pos1);
        glEnableVertexAttribArray(pos2);
        glEnableVertexAttribArray(pos3);
        glEnableVertexAttribArray(pos4);
        glBindBuffer(GL_ARRAY_BUFFER, instanceBuffer);
        glVertexAttribPointer(pos1, 4, GL_FLOAT, GL_FALSE, sizeof(GLfloat) * 4 * 4, (void*)(0));
        glVertexAttribPointer(pos2, 4, GL_FLOAT, GL_FALSE, sizeof(GLfloat) * 4 * 4, (void*)(sizeof(float) * 4));
        glVertexAttribPointer(pos3, 4, GL_FLOAT, GL_FALSE, sizeof(GLfloat) * 4 * 4, (void*)(sizeof(float) * 8));
        glVertexAttribPointer(pos4, 4, GL_FLOAT, GL_FALSE, sizeof(GLfloat) * 4 * 4, (void*)(sizeof(float) * 12));
        glVertexAttribDivisor(pos1, 1);
        glVertexAttribDivisor(pos2, 1);
        glVertexAttribDivisor(pos3, 1);
        glVertexAttribDivisor(pos4, 1);

    glBindVertexArray(0); //VAO - End

    //Matrix vars
    glm::mat4 Projection, Rotating, Scaling, Translation, Identity;
    glm::vec3 ZRotateVec(0.0f, 0.0f, 1.0f);

    //Calc projection-matrix and put shader (uniform)
    Projection = glm::ortho(0.0f, (float)width, 0.0f, (float)height, 0.0f, 1.0f);
    glUseProgram(programID);
    glUniformMatrix4fv(glGetUniformLocation(programID, "projectionMatrix"), 1, GL_FALSE, &Projection[0][0]);

    //Creating sprites
    std::srand(static_cast<unsigned int>(std::time(0)));
    int iActInstanceCount = 10000;
    Sprite *ptrSprites = new Sprite[iActInstanceCount];
    for (int i = 0; i < iActInstanceCount; ++i)
    {
        ptrSprites[i].dimension = glm::vec3(16, 16, 1.0f);
        ptrSprites[i].position = glm::vec3(std::rand()%(width-32),std::rand()%(height-32),-1.0f *((std::rand()%256)/256.0f));
        ptrSprites[i].rotation = rand() % 360 + 0.0f;
        ptrSprites[i].rx = static_cast<float>(std::rand() % 2);
        ptrSprites[i].ry = static_cast<float>(std::rand() % 2);
        ptrSprites[i].speed = (std::rand() % 100) + 1.0f;
        if (ptrSprites[i].speed < 1.0f) ptrSprites[i].speed = 1.0f;
    }

    //FPS init
    double fFramesRendered = 0.0f;
    double fFrameMeasurementStart = 0.0f;
    double fFPS = 0.0f;
    double fCurrentTime = 0.0f;
    glfwSetTime(0);

    //Main-loop (also renderloop)
    while (!glfwWindowShouldClose(window))
    {
        //application-logic
        if (glfwGetKey(window, GLFW_KEY_ESCAPE)== GLFW_PRESS)
            glfwSetWindowShouldClose(window, GL_TRUE);

        const double fNewTime = glfwGetTime();
        double fDeltaTime = fNewTime - fCurrentTime;
        fCurrentTime = fNewTime;

        for (int i = 0; i < iActInstanceCount; ++i)
        {
            float fSpeed = ptrSprites[i].speed * static_cast<float>(fDeltaTime);
            ptrSprites[i].rotation += fSpeed;
            if (ptrSprites[i].rotation >= 360.0f) ptrSprites[i].rotation = 0.0f;
            if (ptrSprites[i].rx == 1)  ptrSprites[i].position.x = ptrSprites[i].position.x + fSpeed;
            if (ptrSprites[i].rx == 0)  ptrSprites[i].position.x = ptrSprites[i].position.x - fSpeed;
            if (ptrSprites[i].ry == 1)  ptrSprites[i].position.y = ptrSprites[i].position.y + fSpeed;
            if (ptrSprites[i].ry == 0)  ptrSprites[i].position.y = ptrSprites[i].position.y - fSpeed;
            if (ptrSprites[i].position.x <= 0) ptrSprites[i].rx = 1;
            if (ptrSprites[i].position.x + ptrSprites[i].dimension.x >= width) ptrSprites[i].rx = 0;
            if (ptrSprites[i].position.y <= 0) ptrSprites[i].ry = 1;
            if (ptrSprites[i].position.y + ptrSprites[i].dimension.y >= height) ptrSprites[i].ry = 0;

            //matrix-calculations (saved in local buffer)
            Translation = glm::translate(Identity, ptrSprites[i].position + glm::vec3(ptrSprites[i].dimension.x / 2.0f, ptrSprites[i].dimension.y / 2.0f, 0.0f));
            Scaling = glm::scale(Translation, ptrSprites[i].dimension);
            ptrInstanceBufferData[i] = glm::rotate(Scaling, ptrSprites[i].rotation, ZRotateVec);
        }

        //render-call
        glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);
        glUseProgram(programID);
        glBindVertexArray(vertexArrayObject);
        glBindBuffer(GL_ARRAY_BUFFER, instanceBuffer);
        glBufferData(GL_ARRAY_BUFFER, iMaxInstanceCount * sizeof(glm::mat4), NULL, GL_STREAM_DRAW); // Buffer orphaning
        glBufferSubData(GL_ARRAY_BUFFER, 0, iActInstanceCount * sizeof(glm::mat4), ptrInstanceBufferData);
        glDrawArraysInstanced(GL_TRIANGLES, 0, 6, iActInstanceCount);
        glBindVertexArray(0);
        glfwSwapBuffers(window);
        glfwPollEvents();


        //FPS-stuff
        ++fFramesRendered;

        if ((fCurrentTime*1000.0f) >= (fFrameMeasurementStart*1000.0f) + 1000.0f)
        {
            fFPS = ((fCurrentTime*1000.0f) - (fFrameMeasurementStart*1000.0f)) / 1000.0f * fFramesRendered;
            fFrameMeasurementStart = fCurrentTime;
            fFramesRendered = 0;
            std::cout << "FPS: " << fFPS << std::endl;
        }
    }

    //Termination and cleanup
    glDeleteBuffers(1, &vertexBuffer);
    glDeleteBuffers(1, &instanceBuffer);
    glDeleteVertexArrays(1, &vertexArrayObject);
    glDeleteProgram(programID);
    glfwDestroyWindow(window);
    glfwTerminate();
    return _getch();
}

Well, after testing it on my machine, it is definitely CPU limited, so nothing you do with OGL is going to make much difference.嗯,在我的机器上测试之后,它肯定是 CPU 限制的,所以你用 OGL 做的任何事情都不会有太大的不同。 I get about ~300fps with GCC on at least -O1, but only ~80 with -O0.我至少在 -O1 上使用 GCC 获得约 300fps,但在使用 -O0 时仅获得约 80 帧。 My CPU is very fast (i7 2600k, 4.7ghz), but my GPU is rather slow (GT 520).我的 CPU 速度非常快(i7 2600k,4.7ghz),但我的 GPU 速度很慢(GT 520)。 I'm also on Ubuntu.我也在 Ubuntu 上。

Some quick ideas for things that might speed it up a little:一些可能会加快速度的快速想法:

  • Put the vertex positions in an array in the vertex shader and use gl_VertexID to access them将顶点位置放在顶点着色器的数组中,并使用 gl_VertexID 访问它们
  • Use GL_TRIANGLE_STRIP instead of GL_TRIANGLES使用 GL_TRIANGLE_STRIP 而不是 GL_TRIANGLES
  • Use radians for angles, as otherwise GLM has to convert them使用弧度作为角度,否则 GLM 必须转换它们

None of these are likely to make much of any impact, really.真的,这些都不太可能产生任何影响。 Just make sure your compiler is set up right, and there probably isn't much more to do.只要确保你的编译器设置正确,可能没有更多的事情要做。

声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM