
How to mmap a Linux kernel buffer to user space?

Let's say the buffer is allocated using a page based scheme. One way to implement mmap would be to use remap_pfn_range, but LDD3 says this does not work for conventional memory. It appears we can work around this by marking the page(s) reserved using SetPageReserved so that they get locked in memory. But isn't all kernel memory already non-swappable, i.e. already reserved? Why the need to set the reserved bit explicitly?

Does this have something to do with pages allocated from HIGH_MEM?
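
For reference, the kind of remap_pfn_range + SetPageReserved approach I mean would look roughly like the following sketch (my_buf, MY_BUF_SIZE and my_mmap are placeholder names; the buffer is assumed to be kmalloc'ed and page-aligned):

#include <linux/fs.h>
#include <linux/io.h> /* virt_to_phys */
#include <linux/mm.h>

#define MY_BUF_SIZE (4 * PAGE_SIZE) /* placeholder size */

static char *my_buf; /* assumed kmalloc'ed and page-aligned elsewhere */

static int my_mmap(struct file *filp, struct vm_area_struct *vma)
{
    unsigned long size = vma->vm_end - vma->vm_start;

    if (size > MY_BUF_SIZE)
        return -EINVAL;
    /* LDD3-style code would also SetPageReserved() every page of my_buf
     * before this point, which is exactly what the question is about. */
    return remap_pfn_range(vma, vma->vm_start,
                           virt_to_phys(my_buf) >> PAGE_SHIFT,
                           size, vma->vm_page_prot);
}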

The simplest way to map a set of pages from the kernel in your mmap method is to use the fault handler to map the pages. Basically you end up with something like:

static int my_mmap(struct file *filp, struct vm_area_struct *vma)
{
    vma->vm_ops = &my_vm_ops;
    return 0;
}

static const struct file_operations my_fops = {
    .owner  = THIS_MODULE,
    .open   = nonseekable_open,
    .mmap   = my_mmap,
    .llseek = no_llseek,
};

(where the other file operations are whatever your module needs). Also in my_mmap you do whatever range checking etc. is needed to validate the mmap parameters.
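
For instance, a sketch of that kind of range check might look like this, assuming the backing buffer is MY_BUF_SIZE bytes (a placeholder name):

static int my_mmap(struct file *filp, struct vm_area_struct *vma)
{
    unsigned long len = vma->vm_end - vma->vm_start;

    /* Reject mappings that would extend past the end of the buffer. */
    if (len + (vma->vm_pgoff << PAGE_SHIFT) > MY_BUF_SIZE)
        return -EINVAL;

    vma->vm_ops = &my_vm_ops;
    return 0;
}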

Then the vm_ops look like:

static int my_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
    vmf->page = my_page_at_index(vmf->pgoff);
    get_page(vmf->page);

    return 0;
} 

static const struct vm_operations_struct my_vm_ops = {
    .fault      = my_fault
};

where you just need to figure out, for a given vma / vmf passed to your fault function, which page to map into userspace. This depends on exactly how your module works. For example, if you did

my_buf = vmalloc_user(MY_BUF_SIZE);

then the page you use would be something like

vmalloc_to_page(my_buf + (vmf->pgoff << PAGE_SHIFT));

But you could easily create an array and allocate a page for each entry, use kmalloc, whatever.
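
Putting those pieces together, a fault handler for a buffer allocated with vmalloc_user() might look roughly like this sketch (my_buf and MY_BUF_SIZE are placeholders, and the two-argument .fault signature matches the older kernels the snippet above was written against):

#include <linux/mm.h>
#include <linux/vmalloc.h>

#define MY_BUF_SIZE (16 * PAGE_SIZE) /* placeholder */

static char *my_buf; /* my_buf = vmalloc_user(MY_BUF_SIZE); done at init time */

static int my_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
    /* Refuse faults beyond the end of the buffer. */
    if (vmf->pgoff >= (MY_BUF_SIZE >> PAGE_SHIFT))
        return VM_FAULT_SIGBUS;

    /* Translate the faulting page offset to the backing vmalloc page. */
    vmf->page = vmalloc_to_page(my_buf + (vmf->pgoff << PAGE_SHIFT));
    get_page(vmf->page);
    return 0;
}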

[just noticed that my_fault is a slightly amusing name for a function]

Minimal runnable example and userland test

Kernel module:

#include <linux/fs.h>
#include <linux/init.h>
#include <linux/kernel.h> /* min */
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/uaccess.h> /* copy_from_user, copy_to_user */
#include <linux/slab.h>

static const char *filename = "lkmc_mmap";

enum { BUFFER_SIZE = 4 };

struct mmap_info {
    char *data;
};

/* After unmap. */
static void vm_close(struct vm_area_struct *vma)
{
    pr_info("vm_close\n");
}

/* First page access. */
static vm_fault_t vm_fault(struct vm_fault *vmf)
{
    struct page *page;
    struct mmap_info *info;

    pr_info("vm_fault\n");
    info = (struct mmap_info *)vmf->vma->vm_private_data;
    if (info->data) {
        page = virt_to_page(info->data);
        get_page(page);
        vmf->page = page;
    }
    return 0;
}

/* After mmap. TODO vs mmap, when can this happen at a different time than mmap? */
static void vm_open(struct vm_area_struct *vma)
{
    pr_info("vm_open\n");
}

static struct vm_operations_struct vm_ops =
{
    .close = vm_close,
    .fault = vm_fault,
    .open = vm_open,
};

static int mmap(struct file *filp, struct vm_area_struct *vma)
{
    pr_info("mmap\n");
    vma->vm_ops = &vm_ops;
    vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
    vma->vm_private_data = filp->private_data;
    vm_open(vma);
    return 0;
}

static int open(struct inode *inode, struct file *filp)
{
    struct mmap_info *info;

    pr_info("open\n");
    info = kmalloc(sizeof(struct mmap_info), GFP_KERNEL);
    pr_info("virt_to_phys = 0x%llx\n", (unsigned long long)virt_to_phys((void *)info));
    info->data = (char *)get_zeroed_page(GFP_KERNEL);
    memcpy(info->data, "asdf", BUFFER_SIZE);
    filp->private_data = info;
    return 0;
}

static ssize_t read(struct file *filp, char __user *buf, size_t len, loff_t *off)
{
    struct mmap_info *info;
    ssize_t ret;

    pr_info("read\n");
    if ((size_t)BUFFER_SIZE <= *off) {
        ret = 0;
    } else {
        info = filp->private_data;
        ret = min(len, (size_t)BUFFER_SIZE - (size_t)*off);
        if (copy_to_user(buf, info->data + *off, ret)) {
            ret = -EFAULT;
        } else {
            *off += ret;
        }
    }
    return ret;
}

static ssize_t write(struct file *filp, const char __user *buf, size_t len, loff_t *off)
{
    struct mmap_info *info;

    pr_info("write\n");
    info = filp->private_data;
    if (copy_from_user(info->data, buf, min(len, (size_t)BUFFER_SIZE))) {
        return -EFAULT;
    } else {
        return len;
    }
}

static int release(struct inode *inode, struct file *filp)
{
    struct mmap_info *info;

    pr_info("release\n");
    info = filp->private_data;
    free_page((unsigned long)info->data);
    kfree(info);
    filp->private_data = NULL;
    return 0;
}

static const struct file_operations fops = {
    .mmap = mmap,
    .open = open,
    .release = release,
    .read = read,
    .write = write,
};

static int myinit(void)
{
    proc_create(filename, 0, NULL, &fops);
    return 0;
}

static void myexit(void)
{
    remove_proc_entry(filename, NULL);
}

module_init(myinit)
module_exit(myexit)
MODULE_LICENSE("GPL");

GitHub upstream.

Userland test:

#define _XOPEN_SOURCE 700
#include <assert.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h> /* uintmax_t */
#include <string.h>
#include <sys/mman.h>
#include <unistd.h> /* sysconf */

/* Format documented at:
 * https://github.com/torvalds/linux/blob/v4.9/Documentation/vm/pagemap.txt
 */
typedef struct {
    uint64_t pfn : 55;
    unsigned int soft_dirty : 1;
    unsigned int file_page : 1;
    unsigned int swapped : 1;
    unsigned int present : 1;
} PagemapEntry;

/* Parse the pagemap entry for the given virtual address.
 *
 * @param[out] entry      the parsed entry
 * @param[in]  pagemap_fd file descriptor to an open /proc/pid/pagemap file
 * @param[in]  vaddr      virtual address to get entry for
 * @return                0 for success, 1 for failure
 */
int pagemap_get_entry(PagemapEntry *entry, int pagemap_fd, uintptr_t vaddr)
{
    size_t nread;
    ssize_t ret;
    uint64_t data;

    nread = 0;
    while (nread < sizeof(data)) {
        ret = pread(pagemap_fd, ((uint8_t*)&data) + nread, sizeof(data) - nread,
                (vaddr / sysconf(_SC_PAGE_SIZE)) * sizeof(data) + nread);
        if (ret <= 0) {
            return 1;
        }
        nread += ret;
    }
    /* pagemap format: bits 0-54 PFN, bit 55 soft-dirty, bit 61 file-page,
     * bit 62 swapped, bit 63 present. */
    entry->pfn = data & (((uint64_t)1 << 55) - 1);
    entry->soft_dirty = (data >> 55) & 1;
    entry->file_page = (data >> 61) & 1;
    entry->swapped = (data >> 62) & 1;
    entry->present = (data >> 63) & 1;
    return 0;
}

/* Convert the given virtual address to physical using /proc/PID/pagemap.
 *
 * @param[out] paddr physical address
 * @param[in]  pid   process to convert for
 * @param[in] vaddr  virtual address to get entry for
 * @return           0 for success, 1 for failure
 */
int virt_to_phys_user(uintptr_t *paddr, pid_t pid, uintptr_t vaddr)
{
    char pagemap_file[BUFSIZ];
    int pagemap_fd;

    snprintf(pagemap_file, sizeof(pagemap_file), "/proc/%ju/pagemap", (uintmax_t)pid);
    pagemap_fd = open(pagemap_file, O_RDONLY);
    if (pagemap_fd < 0) {
        return 1;
    }
    PagemapEntry entry;
    if (pagemap_get_entry(&entry, pagemap_fd, vaddr)) {
        close(pagemap_fd);
        return 1;
    }
    close(pagemap_fd);
    *paddr = (entry.pfn * sysconf(_SC_PAGE_SIZE)) + (vaddr % sysconf(_SC_PAGE_SIZE));
    return 0;
}

enum { BUFFER_SIZE = 4 };

int main(int argc, char **argv)
{
    int fd;
    long page_size;
    char *address1, *address2;
    char buf[BUFFER_SIZE];
    uintptr_t paddr;

    if (argc < 2) {
        printf("Usage: %s <mmap_file>\n", argv[0]);
        return EXIT_FAILURE;
    }
    page_size = sysconf(_SC_PAGE_SIZE);
    printf("open pathname = %s\n", argv[1]);
    fd = open(argv[1], O_RDWR | O_SYNC);
    if (fd < 0) {
        perror("open");
        assert(0);
    }
    printf("fd = %d\n", fd);

    /* mmap twice for double fun. */
    puts("mmap 1");
    address1 = mmap(NULL, page_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
    if (address1 == MAP_FAILED) {
        perror("mmap");
        assert(0);
    }
    puts("mmap 2");
    address2 = mmap(NULL, page_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
    if (address2 == MAP_FAILED) {
        perror("mmap");
        return EXIT_FAILURE;
    }
    assert(address1 != address2);

    /* Read and modify memory. */
    puts("access 1");
    assert(!strcmp(address1, "asdf"));
    /* vm_fault */
    puts("access 2");
    assert(!strcmp(address2, "asdf"));
    /* vm_fault */
    strcpy(address1, "qwer");
    /* Also modified. So both virtual addresses point to the same physical address. */
    assert(!strcmp(address2, "qwer"));

    /* Check that the physical addresses are the same.
     * They are, but TODO why virt_to_phys on kernel gives a different value? */
    assert(!virt_to_phys_user(&paddr, getpid(), (uintptr_t)address1));
    printf("paddr1 = 0x%jx\n", (uintmax_t)paddr);
    assert(!virt_to_phys_user(&paddr, getpid(), (uintptr_t)address2));
    printf("paddr2 = 0x%jx\n", (uintmax_t)paddr);

    /* Check that modifications made from userland are also visible from the kernel. */
    read(fd, buf, BUFFER_SIZE);
    assert(!memcmp(buf, "qwer", BUFFER_SIZE));

    /* Modify the data from the kernel, and check that the change is visible from userland. */
    write(fd, "zxcv", 4);
    assert(!strcmp(address1, "zxcv"));
    assert(!strcmp(address2, "zxcv"));

    /* Cleanup. */
    puts("munmap 1");
    if (munmap(address1, page_size)) {
        perror("munmap");
        assert(0);
    }
    puts("munmap 2");
    if (munmap(address2, page_size)) {
        perror("munmap");
        assert(0);
    }
    puts("close");
    close(fd);
    return EXIT_SUCCESS;
}

GitHub upstream.

Tested on kernel 5.4.3.

Though the pages are reserved via a kernel driver, they are meant to be accessed from user space. As a result, the PTEs (page table entries) do not know whether the pfn belongs to user space or kernel space (even though the pages are allocated via a kernel driver).

This is why they are marked with SetPageReserved.
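
For illustration, marking and unmarking the pages of a kmalloc'ed buffer might look roughly like the following sketch (my_buf and my_buf_size are hypothetical names, not something from the code above):

#include <linux/mm.h>

/* Mark every page of the buffer reserved before remapping it, and clear
 * the flag again on teardown. All names here are placeholders. */
static void my_reserve_pages(char *my_buf, size_t my_buf_size)
{
    size_t i;

    for (i = 0; i < my_buf_size; i += PAGE_SIZE)
        SetPageReserved(virt_to_page(my_buf + i));
}

static void my_unreserve_pages(char *my_buf, size_t my_buf_size)
{
    size_t i;

    for (i = 0; i < my_buf_size; i += PAGE_SIZE)
        ClearPageReserved(virt_to_page(my_buf + i));
}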
