簡體   English   中英

從用戶空間應用程序讀取原始 GPU 內存

[英]Reading raw GPU memory from userspace application

我正在嘗試從用戶空間應用程序中讀取原始 gpu 內存。 這個想法是從應用程序映射/sys/bus/pci/devices/[device addr]/resource1並對其進行加載和存儲。

這里的設備是具有 8GiB 板載內存的 Nvidia 3060Ti。 BAR 配置為可調整大小,因此所有 8GiB 的內存都應該可以訪問:

(base) [xps] pcimem git:(master) ✗ ls -lah /sys/bus/pci/devices/0000:01:00.0/resource*                   
-r--r--r-- 1 root root 4,0K avril 22 11:17 /sys/bus/pci/devices/0000:01:00.0/resource
-rw------- 1 root root  16M avril 22 11:17 /sys/bus/pci/devices/0000:01:00.0/resource0
-rw------- 1 root root 8,0G avril 22 11:17 /sys/bus/pci/devices/0000:01:00.0/resource1
-rw------- 1 root root 8,0G avril 22 11:17 /sys/bus/pci/devices/0000:01:00.0/resource1_wc
-rw------- 1 root root  32M avril 22 11:17 /sys/bus/pci/devices/0000:01:00.0/resource3
-rw------- 1 root root  32M avril 22 11:17 /sys/bus/pci/devices/0000:01:00.0/resource3_wc
-rw------- 1 root root  128 avril 22 11:17 /sys/bus/pci/devices/0000:01:00.0/resource5

使用 pcimem 訪問內存不起作用。將 0 寫入某個位置後,下一次讀取會返回零,但之後的任何讀取都會返回 0x000000005665BDF5。第一次讀取之後,所有位置讀到的值都是同一個 0x000000005665BDF5。

對這些(失敗的)讀/寫進行基准測試似乎表明它們確實到達了 GPU。 讀取延遲約為 900ns,接近 PCIe 往返時間。

我已經嘗試直接mmap幀緩沖區( /dev/fb0 )並讀/寫它。 這行得通,我看到類似的讀/寫延遲。 但是,對於我的用例來說,幀緩沖區太小了。

CUDA 不起作用,因為在從設備內存讀取時,GPU 會將該頁面移動到主機。

有沒有辦法從 Linux 訪問 GPU 上的內存?

我在這里的目標是能夠在用戶空間應用程序中映射 GPU 的內存並將其用作內存擴展。 用戶空間應用程序(在 CPU 上運行)將直接在 GPU 的內存上分配和訪問數據結構。

TIA

看來您可以使用 GDRCopy 庫,或者至少可以使用它的內核驅動程序。 網站

GDRCopy 是一個基於 GPUDirect RDMA 技術的低延遲 GPU 內存復制庫,允許 CPU 直接映射和訪問 GPU 內存。

解決方案是使用 Vulkan API 在 GPU 上分配一個堆並訪問它。 但是,由於 x86 無法緩存 MMIO 地址,因此每次訪問都將通過 PCIe 轉到 GPU。

該實現的延遲與 Nvidia 的服務器解決方案大致相同。

這是 C++ 中的一個快速而骯臟的實現,它將 GPU 抽象為堆內存,並允許在其上調用 malloc() 和 free()。

要找出堆類型,請檢查: http://vulkan.gpuinfo.org/displayreport.php?id=14928#memory

createVertexBuffer()調用findMemoryType()時,您需要檢查 GPU 支持的標志

#include <chrono>
#include <vulkan/vulkan.h>

#include <algorithm>
#include <array>
#include <cassert>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iostream>
#include <limits>
#include <memory>
#include <optional>
#include <set>
#include <stdexcept>
#include <string>
#include <vector>

#include "libvram/libvram.hh"
// Forward declaration; the class itself is defined further down in this file.
class VRamWrapper;

// Process-wide wrapper instance. Null until ctor_libvram() assigns it.
VRamWrapper *vrw_obj = nullptr;

// Device extensions a physical device must expose to be selected; the same
// list is enabled when the logical device is created.
const char *deviceExtensions[] = {VK_KHR_SWAPCHAIN_EXTENSION_NAME};

// Number of entries in deviceExtensions, computed from the array itself.
const size_t DEV_EXT_LEN =
    sizeof(deviceExtensions) / sizeof(deviceExtensions[0]);

/// Queue family indices required by the wrapper.
///
/// Only a graphics-capable family is tracked; the index stays empty until a
/// matching family has been found.
struct QueueFamilyIndices {
  /// Index of a queue family that supports graphics operations, if any.
  std::optional<uint32_t> graphicsFamily;

  /// @return true once every required queue family index has been set.
  /// Declared const so it can be called on const objects and references.
  [[nodiscard]] bool isComplete() const noexcept {
    return graphicsFamily.has_value();
  }
};

class VRamWrapper {
public:
  void init() { initVulkan(); }

  void *malloc(size_t bytes) { return this->createVertexBuffer(bytes); }
  void free(void *buf) { assert(0); }

private:
  VkInstance instance;

  VkPhysicalDevice physicalDevice = VK_NULL_HANDLE;
  VkDevice device;

  VkQueue graphicsQueue;

  std::vector<VkBuffer> buffers;
  std::vector<VkDeviceMemory> bufferMemories;

  void initVulkan() {
    createInstance();
    pickPhysicalDevice();
    createLogicalDevice();
  }

  void cleanup() {
    for (auto buf : buffers) {
      vkDestroyBuffer(device, buf, nullptr);
    }

    for (auto mem : bufferMemories) {
      vkFreeMemory(device, mem, nullptr);
    }

    vkDestroyDevice(device, nullptr);
    vkDestroyInstance(instance, nullptr);
  }

  void createInstance() {
    VkApplicationInfo appInfo{};
    appInfo.sType = VK_STRUCTURE_TYPE_APPLICATION_INFO;
    appInfo.pApplicationName = "Hello Triangle";
    appInfo.applicationVersion = VK_MAKE_VERSION(1, 0, 0);
    appInfo.pEngineName = "No Engine";
    appInfo.engineVersion = VK_MAKE_VERSION(1, 0, 0);
    appInfo.apiVersion = VK_API_VERSION_1_0;

    VkInstanceCreateInfo createInfo{};
    createInfo.sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO;
    createInfo.pApplicationInfo = &appInfo;

    createInfo.enabledLayerCount = 0;

    createInfo.pNext = nullptr;

    if (vkCreateInstance(&createInfo, nullptr, &instance) != VK_SUCCESS) {
      throw std::runtime_error("failed to create instance!");
    }
  }

  void pickPhysicalDevice() {
    uint32_t deviceCount = 0;
    vkEnumeratePhysicalDevices(instance, &deviceCount, nullptr);

    if (deviceCount == 0) {
      throw std::runtime_error("failed to find GPUs with Vulkan support!");
    }

    std::vector<VkPhysicalDevice> devices(deviceCount);
    vkEnumeratePhysicalDevices(instance, &deviceCount, devices.data());

    for (const auto &device : devices) {
      if (isDeviceSuitable(device)) {
        physicalDevice = device;
        break;
      }
    }

    if (physicalDevice == VK_NULL_HANDLE) {
      throw std::runtime_error("failed to find a suitable GPU!");
    }
  }

  void createLogicalDevice() {
    QueueFamilyIndices indices = findQueueFamilies(physicalDevice);

    std::vector<VkDeviceQueueCreateInfo> queueCreateInfos;
    std::set<uint32_t> uniqueQueueFamilies = {indices.graphicsFamily.value()};

    float queuePriority = 1.0f;
    for (uint32_t queueFamily : uniqueQueueFamilies) {
      VkDeviceQueueCreateInfo queueCreateInfo{};
      queueCreateInfo.sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO;
      queueCreateInfo.queueFamilyIndex = queueFamily;
      queueCreateInfo.queueCount = 1;
      queueCreateInfo.pQueuePriorities = &queuePriority;
      queueCreateInfos.push_back(queueCreateInfo);
    }

    VkPhysicalDeviceFeatures deviceFeatures{};

    VkDeviceCreateInfo createInfo{};
    createInfo.sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO;

    createInfo.queueCreateInfoCount =
        static_cast<uint32_t>(queueCreateInfos.size());
    createInfo.pQueueCreateInfos = queueCreateInfos.data();

    createInfo.pEnabledFeatures = &deviceFeatures;

    createInfo.enabledExtensionCount = static_cast<uint32_t>(DEV_EXT_LEN);
    createInfo.ppEnabledExtensionNames = deviceExtensions;

    createInfo.enabledLayerCount = 0;

    if (vkCreateDevice(physicalDevice, &createInfo, nullptr, &device) !=
        VK_SUCCESS) {
      throw std::runtime_error("failed to create logical device!");
    }

    vkGetDeviceQueue(device, indices.graphicsFamily.value(), 0, &graphicsQueue);
  }

  void *createVertexBuffer(size_t bytes) {
    VkBuffer buffer;
    VkDeviceMemory bufferMemory;

    VkBufferCreateInfo bufferInfo{};
    bufferInfo.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO;
    bufferInfo.size = bytes;
    bufferInfo.usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT;
    bufferInfo.sharingMode = VK_SHARING_MODE_EXCLUSIVE;

    if (vkCreateBuffer(device, &bufferInfo, nullptr, &buffer) != VK_SUCCESS) {
      throw std::runtime_error("failed to create vertex buffer!");
    }

    VkMemoryRequirements memRequirements;
    vkGetBufferMemoryRequirements(device, buffer, &memRequirements);

    assert(memRequirements.size == bytes);

    VkMemoryAllocateInfo allocInfo{};
    allocInfo.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO;
    allocInfo.allocationSize = memRequirements.size;
    allocInfo.memoryTypeIndex =
        findMemoryType(memRequirements.memoryTypeBits,
                       VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
                           VK_MEMORY_PROPERTY_HOST_COHERENT_BIT |
                           VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);

    if (auto res = vkAllocateMemory(device, &allocInfo, nullptr, &bufferMemory);
        res != VK_SUCCESS) {
      throw std::runtime_error("failed to allocate vertex buffer memory");
    }

    vkBindBufferMemory(device, buffer, bufferMemory, 0);

    void *data;
    auto res = vkMapMemory(device, bufferMemory, 0, bytes, 0, &data);
    if (res != VK_SUCCESS) {
      throw std::runtime_error("Map failed");
    }

    fprintf(stderr, "Map completed. Allocated %lu MiB at %p\n",
            (bytes) / (1024UL * 1024), data);

    this->buffers.push_back(buffer);
    this->bufferMemories.push_back(bufferMemory);

    return data;
  }

  uint32_t findMemoryType(uint32_t typeFilter,
                          VkMemoryPropertyFlags properties) {
    VkPhysicalDeviceMemoryProperties memProperties;
    vkGetPhysicalDeviceMemoryProperties(physicalDevice, &memProperties);

    for (uint32_t i = 0; i < memProperties.memoryTypeCount; i++) {
      if ((typeFilter & (1 << i)) &&
          (memProperties.memoryTypes[i].propertyFlags & properties) ==
              properties) {
        return i;
      }
    }

    throw std::runtime_error("failed to find suitable memory type!");
  }

  bool isDeviceSuitable(VkPhysicalDevice device) {
    QueueFamilyIndices indices = findQueueFamilies(device);

    bool extensionsSupported = checkDeviceExtensionSupport(device);

    return indices.isComplete() &&
           extensionsSupported /* && swapChainAdequate */;
  }

  bool checkDeviceExtensionSupport(VkPhysicalDevice device) {
    uint32_t extensionCount;
    vkEnumerateDeviceExtensionProperties(device, nullptr, &extensionCount,
                                         nullptr);

    std::vector<VkExtensionProperties> availableExtensions(extensionCount);
    vkEnumerateDeviceExtensionProperties(device, nullptr, &extensionCount,
                                         availableExtensions.data());

    std::set<std::string> requiredExtensions(deviceExtensions,
                                             deviceExtensions + DEV_EXT_LEN);

    for (const auto &extension : availableExtensions) {
      requiredExtensions.erase(extension.extensionName);
    }

    return requiredExtensions.empty();
  }

  QueueFamilyIndices findQueueFamilies(VkPhysicalDevice device) {
    QueueFamilyIndices indices;

    uint32_t queueFamilyCount = 0;
    vkGetPhysicalDeviceQueueFamilyProperties(device, &queueFamilyCount,
                                             nullptr);

    std::vector<VkQueueFamilyProperties> queueFamilies(queueFamilyCount);
    vkGetPhysicalDeviceQueueFamilyProperties(device, &queueFamilyCount,
                                             queueFamilies.data());

    int i = 0;
    for (const auto &queueFamily : queueFamilies) {
      if (queueFamily.queueFlags & VK_QUEUE_GRAPHICS_BIT) {
        indices.graphicsFamily = i;
      }

      if (indices.isComplete()) {
        break;
      }

      i++;
    }

    return indices;
  }
};

void ctor_libvram() {
  fprintf(stderr, "%s() called\n", __FUNCTION__);
  vrw_obj = new VRamWrapper();
  vrw_obj->init();
}

/// Allocates @p bytes through the process-wide wrapper.
///
/// @return the pointer returned by VRamWrapper::malloc(), or nullptr when the
///         wrapper has not been created yet (vrw_obj is still null), rather
///         than dereferencing a null pointer.
void *libvram::malloc(size_t bytes) {
  if (vrw_obj == nullptr) {
    return nullptr;
  }
  return vrw_obj->malloc(bytes);
}

/// Releases @p ptr through the process-wide wrapper.
///
/// A null @p ptr is ignored, matching the usual free() contract, and nothing
/// is done when the wrapper has not been created (vrw_obj is null).
void libvram::free(void *ptr) {
  if (ptr == nullptr || vrw_obj == nullptr) {
    return;
  }
  vrw_obj->free(ptr);
}

暫無
暫無

聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.

 
粵ICP備18138465號  © 2020-2024 STACKOOM.COM