rootbeer CUDA 示例代碼量化吞吐量增益

Question

以下是 Nvidia CUDA 的 rootbeer 示例代碼，我在裝有 Ubuntu 12.04 (Precise)、bumblebee 和 optirun 的筆記本電腦上運行它。 筆記本電腦配備了 Nvidia Optimus，因此需要使用 optirun。 GPU 是 Nvidia GeForce GT 540M，Nvidia 網站稱其具有 96 個內核。 我幾乎沒有獲得吞吐量增益。 問題是什麼？

package com.random.test;

import java.util.ArrayList;
import java.util.Formatter;
import java.util.List;

import edu.syr.pcpratts.rootbeer.runtime.Kernel;
import edu.syr.pcpratts.rootbeer.runtime.Rootbeer;

/**
 * Benchmark driver for the array-sum example.
 *
 * <p>It times a single {@link ArraySum} job by calling {@code gpuMethod()}
 * directly, then times {@link #numberOfJobs} jobs submitted together through
 * {@code Rootbeer.runAll}, and prints the ratio between the extrapolated
 * one-at-a-time cost and the measured batch cost.
 */
public class ArraySumApp {
    final static int numberOfJobs = 1024; // 1024 in the original example
    final static int sizeOfArray = 512; // 512 in the original example
    // 0 + 1 + ... + 511, i.e. the expected total when sizeOfArray is 512.
    final static int theAnswer = 130816;

    /**
     * Creates one {@link ArraySum} job per input array and hands the whole
     * batch to {@code Rootbeer.runAll}.
     *
     * @param arrays the arrays to sum; element {@code i} is summed into slot
     *               {@code i} of the result
     * @return an array with one slot per input array, shared with every job
     */
    public int[] sumArrays(List<int[]> arrays) {
        final int jobCount = arrays.size();
        final int[] totals = new int[jobCount];
        final List<Kernel> kernels = new ArrayList<Kernel>();
        for (int slot = 0; slot < jobCount; slot++) {
            kernels.add(new ArraySum(arrays.get(slot), totals, slot));
        }

        new Rootbeer().runAll(kernels);
        return totals;
    }

    /**
     * Builds an array whose element at position {@code k} holds {@code k}.
     *
     * @param length number of elements to create
     * @return the freshly filled array
     */
    private static int[] ascendingArray(int length) {
        final int[] values = new int[length];
        for (int k = 0; k < length; k++) {
            values[k] = k;
        }
        return values;
    }

    /**
     * Times one job by invoking {@code gpuMethod()} directly as an ordinary
     * Java call, prints the elapsed time and checks the computed sum.
     *
     * @return elapsed time of the single job in milliseconds
     */
    private static long measureOneJob() {
        final int[] input = ascendingArray(ArraySumApp.sizeOfArray);
        final int[] output = new int[1];
        final Kernel singleJob = new ArraySum(input, output, 0);

        final ElapsedTimer stopwatch = new ElapsedTimer();
        singleJob.gpuMethod();
        final long elapsedMs = stopwatch.stopInMilliseconds();
        System.out.println("measureOneJob " + stopwatch.stringInMilliseconds());

        assert output[0] == ArraySumApp.theAnswer : "cosmic rays";
        return elapsedMs;
    }

    /**
     * Runs the benchmark and prints the timings and the throughput gain.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        Helper.assertAssertionEnabled();

        // The first measurement is discarded (presumably a warm-up run);
        // only the second one feeds the gain computation.
        ArraySumApp.measureOneJob();
        final long oneJob = ArraySumApp.measureOneJob();

        // The original example notes that thousands of jobs are wanted for a
        // speedup, hence one input array per job.
        final List<int[]> arrays = new ArrayList<int[]>();
        for (int job = 0; job < ArraySumApp.numberOfJobs; job++) {
            arrays.add(ascendingArray(ArraySumApp.sizeOfArray));
        }

        final ArraySumApp app = new ArraySumApp();
        final ElapsedTimer stopwatch = new ElapsedTimer();
        final int[] sums = app.sumArrays(arrays);
        final long allJobs = stopwatch.stopInMilliseconds();
        System.out.println("measureAllJobs " + stopwatch.stringInMilliseconds());

        // Cost of running every job one after another, extrapolated from the
        // single-job timing, divided by the measured batch time.
        final double extrapolatedSerialMs = ((double) ArraySumApp.numberOfJobs) * oneJob;
        final double gainFactor = extrapolatedSerialMs / allJobs;
        final String report = String.format(
                "throughput gain factor %.1f\nthroughput gain %.1f\n",
                gainFactor, gainFactor - 1.0d);
        System.out.println(report);

        // One result per job is expected ...
        assert sums.length == ArraySumApp.numberOfJobs : "cosmic rays";

        // ... and every result must be the known total.
        for (int sum : sums) {
            assert sum == ArraySumApp.theAnswer : "cosmic rays";
        }
    }
}

/**
 * A single summing job: adds up every element of one input array and stores
 * the total in one slot of a shared result array.
 */
class ArraySum implements Kernel {

    // Number of times the whole summation is repeated per job.
    final static int repetitionFactor = 100000;

    private int[] source;
    private int[] ret;
    private int index;

    /**
     * @param src the array whose elements are summed
     * @param dst the array that receives the total
     * @param i   the slot of {@code dst} that this job writes
     */
    public ArraySum(int[] src, int[] dst, int i) {
        this.source = src;
        this.ret = dst;
        this.index = i;
    }

    /**
     * Computes the sum of {@code source} and writes it to {@code ret[index]},
     * repeating the full computation {@link #repetitionFactor} times.
     */
    public void gpuMethod() {
        int pass = 0;
        while (pass < ArraySum.repetitionFactor) {
            int total = 0;
            int position = 0;
            while (position < source.length) {
                total += source[position];
                position++;
            }
            // The slot is rewritten on every pass with the same kind of total.
            ret[index] = total;
            pass++;
        }
    }
}

/**
 * Static sanity-check helpers; not instantiable.
 */
class Helper {
    private Helper() {
    }

    /**
     * Verifies that the JVM is running with assertions enabled.
     *
     * @throws RuntimeException with message {@code "cosmic rays"} when
     *                          assertions are disabled
     */
    static void assertAssertionEnabled() {
        boolean assertionsOn = false;
        // Intentional side effect: the assignment only executes when the JVM
        // evaluates assert statements.
        assert assertionsOn = true;
        if (!assertionsOn) {
            Helper.noteCosmicRays();
        }
    }

    /**
     * Signals a programmer design or logic error.
     *
     * @throws RuntimeException always, with message {@code "cosmic rays"}
     */
    static void noteCosmicRays() {
        throw new RuntimeException("cosmic rays");
    }
}

/**
 * Simple stopwatch that starts when it is constructed.
 *
 * <p>Elapsed time is measured with {@link System#nanoTime()}, so the result
 * is not affected by adjustments of the system wall clock. Every
 * {@code stop...} method records the time elapsed since construction; the
 * {@code string...} methods without the {@code stop} prefix format the most
 * recently recorded value and report zero if no stop has happened yet.
 */
class ElapsedTimer {
    private static final long NANOS_PER_MILLI = 1000000L;

    // Reading of System.nanoTime() taken at construction.
    private final long startNanos;
    // Elapsed milliseconds recorded by the most recent stop; 0 until then.
    private long savedStopInMilliseconds;

    /** Starts the timer. */
    public ElapsedTimer() {
        this.startNanos = System.nanoTime();
    }

    /**
     * Records a stop.
     *
     * @return milliseconds elapsed since construction
     */
    public long stopInMilliseconds() {
        return stop();
    }

    /**
     * Formats the saved stop as whole milliseconds; relies on a saved stop.
     *
     * @return for example {@code "26 ms"}
     */
    public String stringInMilliseconds() {
        return String.format("%d ms", this.savedStopInMilliseconds);
    }

    /**
     * Records a stop and formats it as whole milliseconds.
     *
     * @return for example {@code "26 ms"}
     */
    public String stopStringInMilliseconds() {
        stop();
        return stringInMilliseconds();
    }

    /**
     * Formats the saved stop as seconds with three decimals; relies on a
     * saved stop.
     *
     * @return for example {@code "0.026 s"} (decimal separator follows the
     *         default locale)
     */
    public String stringInSecondsAndMilliseconds() {
        return String.format("%5.3f s", this.savedStopInMilliseconds / 1000.0d);
    }

    /**
     * Records a stop and formats it as seconds with three decimals.
     *
     * @return for example {@code "0.026 s"}
     */
    public String stopStringInSecondsAndMilliseconds() {
        stop();
        return stringInSecondsAndMilliseconds();
    }

    /**
     * Records a stop.
     *
     * @return seconds elapsed since construction, rounded to the nearest
     *         second
     */
    public long stopInSeconds() {
        return (stop() + 500L) / 1000L; // rounding
    }

    /**
     * Formats the saved stop as rounded whole seconds; relies on a saved
     * stop.
     *
     * @return for example {@code "24 s"}
     */
    public String stringInSeconds() {
        long elapsed = (this.savedStopInMilliseconds + 500L) / 1000L; // rounding
        return String.format("%d s", elapsed);
    }

    /**
     * Records a stop and formats it as rounded whole seconds.
     *
     * @return for example {@code "24 s"}
     */
    public String stopStringInSeconds() {
        stop();
        return stringInSeconds();
    }

    /**
     * This is private. Use the stopInMilliseconds method if this is what you
     * need.
     *
     * @return milliseconds elapsed since construction, also saved for the
     *         {@code string...} methods
     */
    private long stop() {
        // Only differences of nanoTime() readings are meaningful.
        savedStopInMilliseconds = (System.nanoTime() - this.startNanos) / NANOS_PER_MILLI;
        return savedStopInMilliseconds;
    }
}

這是輸出：

measureOneJob 110 ms
measureOneJob 26 ms
CudaRuntime2 ctor: elapsedTimeMillis: 609
measureAllJobs 24341 ms
throughput gain factor 1.1
throughput gain 0.1

Answer 1

rootbeer 開發人員表示，對數組元素求和的示例代碼不是最好的示例，另一個替代示例會顯示出吞吐量增益。

Answer 2

可以參考： https://github.com/pcpratts/rootbeer1/tree/develop/gtc2013/Matrix

這是 2013 年 NVIDIA GTC 大會的一個例子。 相比使用轉置的 4 核 Java 矩陣乘法，我獲得了 20 倍的加速。

該示例是使用 GPU 上的共享內存的平鋪矩陣乘法。 從 NVIDIA 文獻來看，使用共享內存是獲得良好加速的最重要方面之一。 要使用共享內存，您需要讓塊中的每個線程將值加載到共享數組中。 然後你必須多次重用這些共享值。 這節省了從全局內存中獲取的時間。

在 Tesla 2.0 架構上，從全局內存中獲取大約需要 200-300 個時鐘週期，從共享內存中獲取大約需要 2-3 個時鐘週期。

rootbeer CUDA 示例代碼量化吞吐量增益

問題描述

2 個解決方案

解決方案1
1 已采納 2012-12-11 08:58:20

解決方案2
1 2013-04-03 20:54:25

rootbeer CUDA 示例代碼量化吞吐量增益

問題描述

2 個解決方案

解決方案1 1 已采納 2012-12-11 08:58:20

解決方案2 1 2013-04-03 20:54:25

解決方案1
1 已采納 2012-12-11 08:58:20

解決方案2
1 2013-04-03 20:54:25