简体   繁体   English

如何在 kernel 空间调用系统调用?

[英]How to call system call in kernel space?

I'm trying to trigger system call in kernel space and it works fine if the system call does not take arguments such as getpid() .我正在尝试在 kernel 空间中触发系统调用,如果系统调用不采用 arguments 例如getpid()它工作正常。

The method how I do it:我怎么做的方法:

  1. get the address of system table获取系统表的地址
static void **syscall_table;
  2. Index it with the number of the system call you want and use the entry as a function pointer:用您想要的系统调用号对其进行索引,并将该表项作为函数指针使用:
typedef long (*sys_call_ptr_t)(const struct __user pt_regs *);

// call system call
((sys_call_ptr_t *)syscall_table)[system_call_number](reg);
  3. If the system call takes arguments, store them in regs before calling it:如果系统调用带有参数,请在调用之前将它们存入 regs:
struct __user pt_regs *reg = kmalloc....;
reg->di = ...
reg->si = ...

Currently, I'm trying to use write but it fails.目前,我正在尝试使用write但它失败了。

write(int fd, const void *buf, size_t count);

For buf , I've tried both user space address and kernel space address.对于buf ,我尝试了用户空间地址和 kernel 空间地址。 count may not be a problem. count可能不是问题。 So, I guess problem maybe occur in file descriptor (maybe fd is different between in lower level's and user space's).所以,我猜问题可能出现在文件描述符中(可能fd在较低级别和用户空间之间是不同的)。 For basic testing, I only want to write text into terminal, so fd should be 1 (at least in user space).对于基本测试,我只想将文本写入终端,所以fd应该是1 (至少在用户空间中)。

There're two questions here:这里有两个问题:

  1. For some reason, I need to stick to the way of calling system calls described above.出于某种原因,我需要坚持上述调用系统调用的方法。 Is it reasonable, or did I miss a step that causes write to fail?这种方法合理吗,还是我遗漏了某个步骤才导致write失败?

  2. Is something wrong with the way I call write?我调用write的方式有问题吗? Does the problem come from fd?问题来自fd吗? If so, how do I get the fd that corresponds to 1 in user space?如果是这样,我如何获得与用户空间中的1相对应的fd?

Foreword前言

By definition, a system call is a service offered by the system to user space applications.根据定义,系统调用是系统向用户空间应用程序提供的服务。 Code that runs inside the kernel should not call a service intended for user space.在内核内部运行的代码不应调用面向用户空间的服务。 Hence, doing this is not advisable.因此,不建议这样做。

First try with a kernel space buffer首先尝试使用 kernel 空间缓冲区

The write() system call is defined in fs/read_write.c. write()系统调用在fs/read_write.c中定义。 It calls ksys_write(), which in turn calls vfs_write():它调用ksys_write(),后者又调用vfs_write():

/*
 * Write up to count bytes from buf to file at position *pos.
 *
 * Returns -EBADF if FMODE_WRITE is not set in file->f_mode, -EINVAL if
 * FMODE_CAN_WRITE is not set, -EFAULT if access_ok() rejects buf/count,
 * the non-zero result of rw_verify_area() if that check fails, and
 * otherwise whatever __vfs_write() returns.
 */
ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
{
    ssize_t ret;

    if (!(file->f_mode & FMODE_WRITE))
        return -EBADF;
    if (!(file->f_mode & FMODE_CAN_WRITE))
        return -EINVAL;
    /* A buffer that fails access_ok() is refused before anything is written */
    if (unlikely(!access_ok(buf, count)))
        return -EFAULT;

    ret = rw_verify_area(WRITE, file, pos, count);
    if (!ret) {
        /* Clamp the request to MAX_RW_COUNT */
        if (count > MAX_RW_COUNT)
            count =  MAX_RW_COUNT;
        file_start_write(file);
        ret = __vfs_write(file, buf, count, pos);
        /* fsnotify_modify() and add_wchar() only run when ret > 0 */
        if (ret > 0) {
            fsnotify_modify(file);
            add_wchar(current, ret);
        }
        /* inc_syscw() runs whether or not __vfs_write() succeeded */
        inc_syscw(current);
        file_end_write(file);
    }

    return ret;
}
[...]
/*
 * Resolve fd with fdget_pos() and write count bytes from buf to the
 * resulting file through vfs_write().
 *
 * Returns -EBADF when fdget_pos() yields no file, otherwise the result of
 * vfs_write().
 */
ssize_t ksys_write(unsigned int fd, const char __user *buf, size_t count)
{
    struct fd f = fdget_pos(fd);
    ssize_t ret = -EBADF;

    if (f.file) {
        loff_t pos, *ppos = file_ppos(f.file);
        /* Work on a local copy of the position when the file has one */
        if (ppos) {
            pos = *ppos;
            ppos = &pos;
        }
        ret = vfs_write(f.file, buf, count, ppos);
        /* Store the new position back only when vfs_write() did not fail */
        if (ret >= 0 && ppos)
            f.file->f_pos = pos;
        fdput_pos(f);
    }

    return ret;
}

/* write() system call entry: forwards its three arguments unchanged to ksys_write(). */
SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
        size_t, count)
{
    return ksys_write(fd, buf, count);
}

The file descriptor passed as the first parameter is not a problem.作为第一个参数传递的文件描述符不是问题。 The value passed from user space is used to retrieve the file structure of the output file (in ksys_write()).从用户空间传递的值用于检索输出文件的文件结构(在ksys_write()中)。 But the second parameter must reference a user space memory area.但是第二个参数必须引用一个用户空间内存区域。 In vfs_write(), a check is done on the second parameter:在vfs_write()中,对第二个参数进行检查:

    if (unlikely(!access_ok(buf, count)))
        return -EFAULT;

access_ok() checks if the buffer is in the user-level space. access_ok()检查缓冲区是否在用户级空间中。 Hence, if you pass an address referencing the kernel space, the return code from write() will be -EFAULT (-14).因此,如果您传递引用内核空间的地址,则write()的返回代码将为-EFAULT (-14)。

The example below is a simple module calling the write() system call with a kernel space buffer.下面的示例是一个使用内核空间缓冲区调用write()系统调用的简单模块。 On x86_64, the convention for the parameters of the system calls is:在 x86_64 上,系统调用的参数约定是:

   RDI = arg#0
   RSI = arg#1
   RDX = arg#2
   R10 = arg#3
   R8  = arg#4
   R9  = arg#5
#include <linux/version.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <asm/ptrace.h>
#include <linux/socket.h>
#include <linux/kallsyms.h>


MODULE_LICENSE("GPL");

/*
 * Signature used to call an entry of the system call table: the arguments
 * are passed through the registers stored in a struct pt_regs.
 */
typedef int (* syscall_wrapper)(struct pt_regs *);

/* Address of sys_call_table, filled in by device2_init() */
unsigned long sys_call_table_addr;

/* Prefix put in front of every message logged by this module */
#define DEV_NAME "[DEVICE2]"


/* Text that device2_init() formats into buf before calling write() */
#define DEV_STR  DEV_NAME "String from driver"

/* Kernel space buffer handed to the write() system call */
static char buf[1024];


/*
 * Module entry point: looks up the system call table and calls the write()
 * entry with a kernel space buffer, logging the value it returns.
 *
 * Returns 0 on success, -ENOENT if sys_call_table cannot be resolved.
 */
static int __init device2_init(void) {

  syscall_wrapper write_syscall;
  int rc;
  /* Start from all-zero registers so unused fields never hold stack garbage */
  struct pt_regs param = { 0 };

  printk(KERN_INFO DEV_NAME "module has been loaded\n");

  sys_call_table_addr = kallsyms_lookup_name("sys_call_table");

  printk(KERN_INFO DEV_NAME "sys_call_table@%lx\n", sys_call_table_addr);

  /* kallsyms_lookup_name() returns 0 for an unknown symbol: do not index address 0 */
  if (!sys_call_table_addr) {
    printk(KERN_ALERT DEV_NAME "sys_call_table not found\n");
    return -ENOENT;
  }

  write_syscall = ((syscall_wrapper *)sys_call_table_addr)[__NR_write];

  /*
    Call to write() system call with a kernel space buffer
    (x86_64 convention: di = fd, si = buffer, dx = count)
  */
  snprintf(buf, sizeof(buf), "%s\n", DEV_STR);
  param.di = 1;
  param.si = (unsigned long)buf;
  param.dx = strlen(buf);
  rc = (* write_syscall)(&param);

  printk(KERN_INFO DEV_NAME "write() with a kernel space buffer = %d\n", rc);

  return 0;
}

/* Module exit point: nothing to release here, only logs the unload. */
static void __exit device2_exit(void)
{
  printk(KERN_INFO DEV_NAME "module has been unloaded\n");
}

module_init(device2_init);
module_exit(device2_exit);

At module insertion time, we can verify that the system call returns -EFAULT:在模块插入时,我们可以验证系统调用是否返回 -EFAULT:

$ sudo insmod ./device2.ko
$ dmesg
[15716.262977] [DEVICE2]module has been loaded
[15716.270566] [DEVICE2]sys_call_table@ffffffff926013a0
[15716.270568] [DEVICE2]write() with a kernel space buffer = -14

But the same module works with a system call like dup(), which involves a file descriptor but no user space buffer.但是,同一个模块在使用像dup()这样只涉及文件描述符、不涉及用户空间缓冲区的系统调用时可以正常工作。 Let's change the previous code to:让我们将之前的代码更改为:

/*
 * Module entry point: looks up the system call table, calls write() with a
 * kernel space buffer, then calls dup(1), close(0) and dup(1) again,
 * logging the value returned by each call.
 *
 * Returns 0 on success, -ENOENT if sys_call_table cannot be resolved.
 */
static int __init device2_init(void) {

  syscall_wrapper write_syscall;
  syscall_wrapper dup_syscall;
  syscall_wrapper close_syscall;
  int rc;
  /* Start from all-zero registers so unused fields never hold stack garbage */
  struct pt_regs param = { 0 };

  printk(KERN_INFO DEV_NAME "module has been loaded\n");

  sys_call_table_addr = kallsyms_lookup_name("sys_call_table");

  printk(KERN_INFO DEV_NAME "sys_call_table@%lx\n", sys_call_table_addr);

  /* kallsyms_lookup_name() returns 0 for an unknown symbol: do not index address 0 */
  if (!sys_call_table_addr) {
    printk(KERN_ALERT DEV_NAME "sys_call_table not found\n");
    return -ENOENT;
  }

  write_syscall = ((syscall_wrapper *)sys_call_table_addr)[__NR_write];
  dup_syscall = ((syscall_wrapper *)sys_call_table_addr)[__NR_dup];
  close_syscall = ((syscall_wrapper *)sys_call_table_addr)[__NR_close];

  /*
    Call to write() system call with a kernel space buffer
    (x86_64 convention: di = fd, si = buffer, dx = count)
  */
  snprintf(buf, sizeof(buf), "%s\n", DEV_STR);
  param.di = 1;
  param.si = (unsigned long)buf;
  param.dx = strlen(buf);
  rc = (* write_syscall)(&param);

  printk(KERN_INFO DEV_NAME "write() with a kernel space buffer = %d\n", rc);

  /*
    Call to dup() system call
  */
  param.di = 1;
  rc = (* dup_syscall)(&param);

  printk(KERN_INFO DEV_NAME "dup() = %d\n", rc);

  /*
    Call to close() system call
  */
  param.di = 0;
  rc = (* close_syscall)(&param);

  printk(KERN_INFO DEV_NAME "close() = %d\n", rc);

  /*
    Call to dup() system call ==> Must return 0 as it is available
  */
  param.di = 1;
  rc = (* dup_syscall)(&param);

  printk(KERN_INFO DEV_NAME "dup() = %d\n", rc);

  return 0;
}

The result of dup() is OK: dup()的结果是好的:

$ sudo insmod ./device2.ko
$ dmesg
[17444.098469] [DEVICE2]module has been loaded
[17444.106935] [DEVICE2]sys_call_table@ffffffff926013a0
[17444.106937] [DEVICE2]write() with a kernel space buffer = -14
[17444.106939] [DEVICE2]dup() = 4
[17444.106940] [DEVICE2]close() = 0
[17444.106940] [DEVICE2]dup() = 0

The first call to dup() returns 4 because the current process is insmod .第一次调用dup()返回 4 因为当前进程是insmod The latter opened the module file and got file descriptor 3. Hence, the first available file descriptor is 4. The second call to dup() returns 0 because we closed the file descriptor 0.后者打开模块文件并获得文件描述符 3。因此,第一个可用的文件描述符是 4。对dup()的第二次调用返回 0,因为我们关闭了文件描述符 0。

Second try with a user space buffer第二次尝试使用用户空间缓冲区

To use a user space buffer, let's add some file operations to the kernel module (open(), release() and write()).要使用用户空间缓冲区,让我们向内核模块添加一些文件操作(open()、release()和write())。 In the write() entry point, we echo what is passed from user space to stderr (file descriptor 2), using the user space buffer passed to the write() entry point:在write()入口点中,我们使用传递给write()入口点的用户空间缓冲区,将从用户空间传入的内容回显到stderr(文件描述符 2):

#include <linux/version.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <asm/ptrace.h>
#include <linux/socket.h>
#include <linux/kallsyms.h>
#include <linux/cdev.h>


MODULE_LICENSE("GPL");

/*
 * Signature used to call an entry of the system call table: the arguments
 * are passed through the registers stored in a struct pt_regs.
 */
typedef int (* syscall_wrapper)(struct pt_regs *);

/* Address of sys_call_table, filled in by device2_init() */
static unsigned long sys_call_table_addr;

/* Prefix put in front of every message logged by this module */
#define DEV_NAME "[DEVICE2]"

/* write() entry of the system call table, set by device2_init() and used by device2_write() */
static syscall_wrapper write_syscall;

/*
 * write() entry point of the device: echoes the caller's buffer to file
 * descriptor 2 of the calling process through the write() system call.
 *
 * buff is the buffer pointer received from the writer and is forwarded
 * untouched to the system call; len is its length.
 *
 * Returns len so the caller sees the whole buffer as consumed, whatever the
 * echo returned, or -ENODEV if the system call entry is not resolved yet.
 */
static ssize_t device2_write(struct file *filp, const char *buff, size_t len, loff_t * off)
{
  /* Start from all-zero registers so unused fields never hold stack garbage */
  struct pt_regs param = { 0 };
  int rc;

  printk(KERN_INFO DEV_NAME "write %p, %zu\n", buff, len);

  /* The device may be written to before device2_init() has set write_syscall */
  if (!write_syscall)
    return -ENODEV;

  /*
    Call to write() system call to echo the write to stderr
    (x86_64 convention: di = fd, si = buffer, dx = count)
  */
  param.di = 2;
  param.si = (unsigned long)buff;
  param.dx = len;
  rc = (* write_syscall)(&param);

  printk(KERN_INFO DEV_NAME "write() = %d\n", rc);

  return len;  // <-------------- To stop the write
}

/* open() entry point of the device: logs the call and always reports success. */
static int device2_open(struct inode *inode, struct file *file)
{
    const int status = 0;

    printk(KERN_INFO DEV_NAME "open\n");

    return status;
}

/* release() entry point of the device: logs the call and always reports success. */
static int device2_release(struct inode *inode, struct file *file)
{
    const int status = 0;

    printk(KERN_INFO DEV_NAME "released\n");

    return status;
}

/* File operations of the character device: open, release and write only. */
static const struct file_operations fops = {
    .owner   = THIS_MODULE,
    .open    = device2_open,
    .release = device2_release,
    .write   = device2_write,
};

/* Character device object, allocated by cdev_alloc() in device2_init() */
struct cdev *device_cdev;
/* Device number filled in by alloc_chrdev_region() in device2_init() */
dev_t deviceNumbers;

/*
 * Module entry point: resolves the write() entry of the system call table,
 * then registers one character device whose operations are in fops.
 *
 * Returns 0 on success or a negative errno value. On failure everything
 * acquired so far is released again.
 */
static int __init device2_init(void) {

  int rc;

  printk(KERN_INFO DEV_NAME "module has been loaded\n");

  /*
    Resolve the system call before the device is registered: cdev_add()
    makes the device usable at once and device2_write() needs write_syscall
  */
  sys_call_table_addr = kallsyms_lookup_name("sys_call_table");

  /* kallsyms_lookup_name() returns 0 for an unknown symbol: do not index address 0 */
  if (!sys_call_table_addr) {
    printk(KERN_ALERT DEV_NAME "sys_call_table not found\n");
    return -ENOENT;
  }

  write_syscall = ((syscall_wrapper *)sys_call_table_addr)[__NR_write];

  // This returns the major number chosen dynamically in deviceNumbers
  rc = alloc_chrdev_region(&deviceNumbers, 0, 1, DEV_NAME);

  if (rc < 0) {
    printk(KERN_ALERT DEV_NAME "Error registering: %d\n", rc);
    return rc;
  }

  device_cdev = cdev_alloc();

  if (!device_cdev) {
    printk(KERN_ALERT DEV_NAME "Error allocating cdev\n");
    rc = -ENOMEM;
    goto err_region;
  }

  /*
    cdev_alloc() hands back an initialised object. Running cdev_init() on it
    would re-initialise the embedded kobject with the release type meant for
    statically allocated cdevs, so the allocation would never be freed.
    Only the owner and the operations have to be filled in.
  */
  device_cdev->owner = THIS_MODULE;
  device_cdev->ops = &fops;

  rc = cdev_add(device_cdev, deviceNumbers, 1);

  if (rc < 0) {
    printk(KERN_ALERT DEV_NAME "Error adding cdev: %d\n", rc);
    goto err_cdev;
  }

  printk(KERN_INFO DEV_NAME "initialized (major number is %d)\n", MAJOR(deviceNumbers));

  printk(KERN_INFO DEV_NAME "sys_call_table@%lx\n", sys_call_table_addr);

  printk(KERN_INFO DEV_NAME "write_syscall@%p\n", write_syscall);

  return 0;

err_cdev:
  /* The cdev was never added: dropping its reference frees it */
  kobject_put(&device_cdev->kobj);
err_region:
  unregister_chrdev_region(deviceNumbers, 1);
  return rc;
}

/*
 * Module exit point: removes the character device and gives back its
 * device number, so nothing keeps pointing at the operations of the
 * unloaded module.
 */
static void __exit device2_exit(void) {
  /* Undo cdev_add() first, then release the region from alloc_chrdev_region() */
  cdev_del(device_cdev);
  unregister_chrdev_region(deviceNumbers, 1);

  printk(KERN_INFO DEV_NAME "module has been unloaded\n");
}

module_init(device2_init);
module_exit(device2_exit);

The loading of the module:模块的加载:

$ sudo insmod device2.ko
$ dmesg
[ 2255.183196] [DEVICE2]module has been loaded
[ 2255.183202] [DEVICE2]initialized (major number is 508)
[ 2255.193255] [DEVICE2]sys_call_table@ffffffffbcc013a0
[ 2255.193256] [DEVICE2]write_syscall@0000000030394929

Create the device node in the file system so that it can be written to:在文件系统中创建设备节点,以便能够向其写入:

$ sudo mknod /dev/device2 c 508 0
$ sudo chmod 666 /dev/device2
$ sudo ls -l /dev/device2
crw-rw-rw- 1 root root 508, 0 janv.  24 16:55 /dev/device2

The writing into the device triggers the expected echo on stderr :写入设备会触发stderr上的预期回声:

$ echo "qwerty for test purposes" > /dev/device2
qwerty for test purposes
$ echo "another string" > /dev/device2
another string
$ dmesg
[ 2255.183196] [DEVICE2]module has been loaded
[ 2255.183202] [DEVICE2]initialized (major number is 508)
[ 2255.193255] [DEVICE2]sys_call_table@ffffffffbcc013a0
[ 2255.193256] [DEVICE2]write_syscall@0000000030394929
[ 2441.674250] [DEVICE2]open
[ 2441.674268] [DEVICE2]write 0000000032fb5249, 25
[ 2441.674281] [DEVICE2]write() = 25
[ 2441.674286] [DEVICE2]released
[ 2475.538140] [DEVICE2]open
[ 2475.538159] [DEVICE2]write 0000000032fb5249, 15
[ 2475.538171] [DEVICE2]write() = 15
[ 2475.538175] [DEVICE2]released

声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM