linux内核exec过程


简介

本文分析linux内核exec系统调用执行过程中可执行文件的加载过程和栈的设置,内核代码版本为2.6.32

分析

\arch\ia64\kernel\process.c中有sys_execve函数的实现,它是execve的系统调用服务例程

/*
 * execve system-call service routine.
 *
 * Copies the path name in from user space with getname(), hands the real
 * work to do_execve(), then releases the name with putname().
 *
 * Returns the value of do_execve(), or the error encoded in the pointer
 * returned by getname() when that call fails.
 */
long
sys_execve (char __user *filename, char __user * __user *argv, char __user * __user *envp,
        struct pt_regs *regs)
{
    char *fname;
    int error;
  // fetch the file name from user space
    fname = getname(filename);
    /* error is set up front so the IS_ERR() path can jump straight to out */
    error = PTR_ERR(fname);
    if (IS_ERR(fname))
        goto out;
    error = do_execve(fname, argv, envp, regs);
    putname(fname);
out:
    return error;
}

\fs\namei.c中有getname函数的实现,在getname中,会从slab分配器中分配空间,然后从用户空间读取名字。所以sys_execve的主要工作由do_execve来实现,do_execve实现在\fs\exec.c中,下面分析do_execve的实现

首先是取消共享(必要时复制)打开文件描述符表

 struct files_struct *displaced;
retval = unshare_files(&displaced);

unshare_files用于取消当前进程与其他进程对打开文件表的共享。files_struct挂接在进程描述符(task_struct)上,表示一个进程打开文件的信息,包含打开文件列表等信息。如果原来的files_struct正与其他进程共享,这里的unshare_files就会复制一份原打开文件列表供当前进程私有使用。所以说,exec之后,新程序仍然保留原进程的打开文件列表(设置了close-on-exec标志的描述符除外),包括标准输入、标准输出和标准错误输出

struct linux_binprm *bprm;
bprm = kzalloc(sizeof(*bprm), GFP_KERNEL);

这里动态分配了linux_binprm结构,linux_binprm是exec过程中信息的结构

/*
 * This structure is used to hold the arguments that are used when loading binaries.
 *
 * One instance is allocated per exec and passed to the binary-format
 * loader's load_binary() callback.
 */
struct linux_binprm{
    /* first BINPRM_BUF_SIZE bytes of the file, read in by prepare_binprm() */
    char buf[BINPRM_BUF_SIZE];
#ifdef CONFIG_MMU
    /* NOTE(review): presumably the VMA backing the argument area — confirm */
    struct vm_area_struct *vma;
#else
# define MAX_ARG_PAGES    32
    /* pages holding the argument/environment strings */
    struct page *page[MAX_ARG_PAGES];
#endif
    struct mm_struct *mm;
    unsigned long p; /* current top of mem */
    unsigned int
        cred_prepared:1,/* true if creds already prepared (multiple
                 * preps happen for interpreters) */
        cap_effective:1;/* true if has elevated effective capabilities,
                 * false if not; except for init which inherits
                 * its parent's caps anyway */
#ifdef __alpha__
    unsigned int taso:1;
#endif
    unsigned int recursion_depth;
    struct file * file;
    struct cred *cred;    /* new credentials */
    int unsafe;        /* how unsafe this exec is (mask of LSM_UNSAFE_*) */
    unsigned int per_clear;    /* bits to clear in current->personality */
    /* number of argument and environment strings */
    int argc, envc;
    char * filename;    /* Name of binary as seen by procps */
    char * interp;        /* Name of the binary really executed. Most
                   of the time same as filename, but could be
                   different for binfmt_{misc,script} */
    unsigned interp_flags;
    unsigned interp_data;
    unsigned long loader, exec;
};
linux_binprm结构

接下来的prepare_bprm_creds新建一个cred结构,设置linux_binprm中的cred结构,就是信任状相关内容,包含gid,uid等信息,经常用来提权

retval = prepare_bprm_creds(bprm);

然后打开文件,并初始化文件相关结构

file = open_exec(filename);
bprm->file = file;
bprm->filename = filename;
bprm->interp = filename;

建立内存管理的mm结构

retval = bprm_mm_init(bprm);
然后初始化一下参数个数和环境变量个数
bprm->argc = count(argv, MAX_ARG_STRINGS);
if ((retval = bprm->argc) < 0)
    goto out;

bprm->envc = count(envp, MAX_ARG_STRINGS);
if ((retval = bprm->envc) < 0)
    goto out;

接着是prepare_binprm函数,prepare_binprm函数检查了文件是否可以执行,初始化了binprm中cred的几个字段,然后还从文件中读取了BINPRM_BUF_SIZE的内容到binprm的buf中

/*
 * Fill in the parts of bprm that are derived from the file being executed.
 *
 * Rejects a file that has no f_op, computes the effective uid/gid for the
 * new credentials (honouring the set-uid / set-gid bits unless the mount is
 * flagged MNT_NOSUID), calls the security hook, and finally reads the first
 * BINPRM_BUF_SIZE bytes of the file into bprm->buf.
 *
 * Returns -EACCES when f_op is NULL, the non-zero result of
 * security_bprm_set_creds() when it fails, otherwise the result of
 * kernel_read().
 */
int prepare_binprm(struct linux_binprm *bprm)
{
    umode_t mode;
    struct inode * inode = bprm->file->f_path.dentry->d_inode;
    int retval;

    mode = inode->i_mode;
    if (bprm->file->f_op == NULL)
        return -EACCES;

    /* clear any previous set[ug]id data from a previous binary */
    bprm->cred->euid = current_euid();
    bprm->cred->egid = current_egid();

    if (!(bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID)) {
        /* Set-uid? */
        if (mode & S_ISUID) {
            bprm->per_clear |= PER_CLEAR_ON_SETID;
            bprm->cred->euid = inode->i_uid;
        }

        /* Set-gid? */
        /*
         * If setgid is set but no group execute bit then this
         * is a candidate for mandatory locking, not a setgid
         * executable.
         */
        if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) {
            bprm->per_clear |= PER_CLEAR_ON_SETID;
            bprm->cred->egid = inode->i_gid;
        }
    }

    /* fill in binprm security blob */
    retval = security_bprm_set_creds(bprm);
    if (retval)
        return retval;
    bprm->cred_prepared = 1;

    /* zero the buffer first so a short read leaves no stale bytes behind */
    memset(bprm->buf, 0, BINPRM_BUF_SIZE);
    return kernel_read(bprm->file, 0, bprm->buf, BINPRM_BUF_SIZE);
}

 下面是复制几个字符串的工作

    retval = copy_strings_kernel(1, &bprm->filename, bprm);
    if (retval < 0)
        goto out;

    bprm->exec = bprm->p;
    retval = copy_strings(bprm->envc, envp, bprm);
    if (retval < 0)
        goto out;

    retval = copy_strings(bprm->argc, argv, bprm);
    if (retval < 0)
        goto out;

其中copy_strings_kernel也是调用copy_strings实现的,只不过是从内核中拷贝,具体实现就是使用set_fs设置段限制为内核数据段。

看copy_strings函数之前,先看看linux_binprm中的两个字段,page和p,page表示的是存放参数的页面数组,而p表示的是这些页面中当前的栈顶位置,因为这些字符串是按照栈的方式存放的,也就是说,先使用地址更高的页面,向低地址方向增长,p就指向栈顶部

下面copy_strings的实现也就清楚了。

/*
 * Copy argc strings, whose user-space pointers are stored in argv[], into
 * the argument area described by bprm.
 *
 * Strings are written from high addresses towards low ones: bprm->p is
 * lowered by the length of each string, and each string is copied in
 * page-sized chunks starting from its tail.
 *
 * Returns 0 on success, -EFAULT when a pointer or string cannot be read
 * from user space, and -E2BIG when a string fails valid_arg_len() or no
 * argument page can be obtained.
 */
static int copy_strings(int argc, char __user * __user * argv,
            struct linux_binprm *bprm)
{
    struct page *kmapped_page = NULL;
    char *kaddr = NULL;
    unsigned long kpos = 0;
    int ret;
    /*
     * argc counts down, so get_user() fetches the argv pointers in
     * reverse order (the last string is copied first).
     */
    while (argc-- > 0) {
        char __user *str;
        int len;
        unsigned long pos;

        if (get_user(str, argv+argc) ||
                !(len = strnlen_user(str, MAX_ARG_STRLEN))) {
            ret = -EFAULT;
            goto out;
        }

        if (!valid_arg_len(bprm, len)) {
            ret = -E2BIG;
            goto out;
        }
        /*
         * bprm->p is the current top of the argument area and is
         * lowered by len for every string.
         * pos starts at the position just past where this string ends.
         * str is advanced to the end of the user-space string.
         */
        /* We're going to work our way backwards. */
        pos = bprm->p;
        str += len;
        bprm->p -= len;

        while (len > 0) {
            int offset, bytes_to_copy;
            // offset: in-page offset of the end of the chunk to copy
            offset = pos % PAGE_SIZE;
            if (offset == 0)
                offset = PAGE_SIZE;

            bytes_to_copy = offset;
            if (bytes_to_copy > len)
                bytes_to_copy = len;
            // step offset back to the start of the chunk; the bytes land in
            // [offset, offset + bytes_to_copy) of the page
            offset -= bytes_to_copy;
            pos -= bytes_to_copy;
            str -= bytes_to_copy;
            len -= bytes_to_copy;

            if (!kmapped_page || kpos != (pos & PAGE_MASK)) {
                struct page *page;
                // look up the argument page that backs pos
                // NOTE(review): presumably allocated on demand; get_arg_page() is not shown here
                page = get_arg_page(bprm, pos, 1);
                if (!page) {
                    ret = -E2BIG;
                    goto out;
                }
                
                /* release the previously mapped page before switching */
                if (kmapped_page) {
                    flush_kernel_dcache_page(kmapped_page);
                    kunmap(kmapped_page);
                    put_arg_page(kmapped_page);
                }
                // map the new page; kaddr is the kernel address returned by kmap()
                kmapped_page = page;
                kaddr = kmap(kmapped_page);
                kpos = pos & PAGE_MASK;
                flush_arg_page(bprm, kpos, kmapped_page);
            }
            // copy this chunk from user space into the mapped argument page
            if (copy_from_user(kaddr+offset, str, bytes_to_copy)) {
                ret = -EFAULT;
                goto out;
            }
        }
    }
    ret = 0;
out:
    /* drop the mapping of whichever page is still held */
    if (kmapped_page) {
        flush_kernel_dcache_page(kmapped_page);
        kunmap(kmapped_page);
        put_arg_page(kmapped_page);
    }
    return ret;
}

 接着就是寻找可执行文件的过程

retval = search_binary_handler(bprm,regs);

在linux内核中,有一个全局的链表formats,表示系统中所有的可执行文件格式,其中链上挂接的结构是linux_binfmt结构,表示一个可执行文件格式,包含了3个重要的函数,分别用来加载可执行文件、共享库和生成core_dump核心转储文件,search_binary_handler就是依次调用这个链上各个格式的load_binary来尝试加载。

/*
 * Describes one executable file format.  Each format supplies callbacks
 * for loading an executable, loading a shared library and writing a core
 * dump.
 */
struct linux_binfmt {
    struct list_head lh;    /* list linkage */
    struct module *module;
    /* load the executable described by the linux_binprm */
    int (*load_binary)(struct linux_binprm *, struct  pt_regs * regs);
    /* load a shared library from the given file */
    int (*load_shlib)(struct file *);
    /* write a core dump to file */
    int (*core_dump)(long signr, struct pt_regs *regs, struct file *file, unsigned long limit);
    unsigned long min_coredump;    /* minimal dump size */
    int hasvdso;
};

其中elf的结构定义在\fs\binfmt_elf.c中,如下:

/*
 * Format descriptor for ELF: wires the ELF loader, shared-library loader
 * and core-dump writer into the linux_binfmt callbacks.
 */
static struct linux_binfmt elf_format = {
        .module        = THIS_MODULE,
        .load_binary    = load_elf_binary,
        .load_shlib    = load_elf_library,
        .core_dump    = elf_core_dump,
        .min_coredump    = ELF_EXEC_PAGESIZE,
        .hasvdso    = 1
};

回头看do_execve,search_binary_handler下面的内容也没有什么了,清除一些分配的结构等等。所以主要的加载实现是在load_elf_binary,这个函数接受了之前初始化的linux_binprm和寄存器上下文。加载可执行文件

下面看load_elf_binary函数

/*
 * load_binary callback for the ELF format.
 *
 * Validates the ELF header already read into bprm->buf, reads the program
 * header table, opens the interpreter named by PT_INTERP (if any), replaces
 * the old address space, sets up the stack, maps every PT_LOAD segment,
 * sets up bss/brk, loads the interpreter, builds the ELF tables on the
 * stack and finally starts the thread at the chosen entry point.
 *
 * bprm: exec state prepared by the caller (file, buf, argument area).
 * regs: register context that start_thread() fills in for the new image.
 *
 * Returns 0 on success or a negative errno value.  Failures that occur
 * after flush_old_exec() has succeeded also send a signal to the current
 * task, because the old image can no longer be resumed.
 */
static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
{
    struct file *interpreter = NULL; /* to shut gcc up */
    unsigned long load_addr = 0, load_bias = 0;
    int load_addr_set = 0;
    char * elf_interpreter = NULL;
    unsigned long error;
    struct elf_phdr *elf_ppnt, *elf_phdata;
    unsigned long elf_bss, elf_brk;
    int retval, i;
    unsigned int size;
    unsigned long elf_entry;
    unsigned long interp_load_addr = 0;
    unsigned long start_code, end_code, start_data, end_data;
    unsigned long reloc_func_desc = 0;
    int executable_stack = EXSTACK_DEFAULT;
    unsigned long def_flags = 0;
    // Two ELF headers, one for the executable and one for the interpreter.
    // They live in a kmalloc'ed struct (below) rather than on the stack.
    struct {
        struct elfhdr elf_ex;
        struct elfhdr interp_elf_ex;
    } *loc;
    // allocate the header pair
    loc = kmalloc(sizeof(*loc), GFP_KERNEL);
    if (!loc) {
        retval = -ENOMEM;
        goto out_ret;
    }

    // the start of the file was read into bprm->buf when bprm was prepared
    loc->elf_ex = *((struct elfhdr *)bprm->buf);

    retval = -ENOEXEC;
    // basic consistency checks on the ELF header
    if (memcmp(loc->elf_ex.e_ident, ELFMAG, SELFMAG) != 0)
        goto out;

    if (loc->elf_ex.e_type != ET_EXEC && loc->elf_ex.e_type != ET_DYN)
        goto out;
    if (!elf_check_arch(&loc->elf_ex))
        goto out;
    if (!bprm->file->f_op||!bprm->file->f_op->mmap)
        goto out;

    /* Now read in all of the header information */
    if (loc->elf_ex.e_phentsize != sizeof(struct elf_phdr))
        goto out;
    if (loc->elf_ex.e_phnum < 1 ||
         loc->elf_ex.e_phnum > 65536U / sizeof(struct elf_phdr))
        goto out;
    // size of the program header table
    size = loc->elf_ex.e_phnum * sizeof(struct elf_phdr);
    retval = -ENOMEM;
    // allocate memory for the program header table
    elf_phdata = kmalloc(size, GFP_KERNEL);
    if (!elf_phdata)
        goto out;
    // read the program header table from the file
    retval = kernel_read(bprm->file, loc->elf_ex.e_phoff,
                 (char *)elf_phdata, size);
    if (retval != size) {
        if (retval >= 0)
            retval = -EIO;
        goto out_free_ph;
    }

    elf_ppnt = elf_phdata;
    elf_bss = 0;
    elf_brk = 0;

    start_code = ~0UL;
    end_code = 0;
    start_data = 0;
    end_data = 0;
    // walk the program headers looking for PT_INTERP, which holds the
    // path name of the dynamic linker
    for (i = 0; i < loc->elf_ex.e_phnum; i++) {
        if (elf_ppnt->p_type == PT_INTERP) {
            /* This is the program interpreter used for
             * shared libraries - for now assume that this
             * is an a.out format binary
             */
            retval = -ENOEXEC;
            // sanity-check the length of the interpreter path
            if (elf_ppnt->p_filesz > PATH_MAX ||
                elf_ppnt->p_filesz < 2)
                goto out_free_ph;

            retval = -ENOMEM;
            // allocate a buffer for the interpreter path
            elf_interpreter = kmalloc(elf_ppnt->p_filesz,
                          GFP_KERNEL);
            if (!elf_interpreter)
                goto out_free_ph;
            // read the interpreter path from the file
            retval = kernel_read(bprm->file, elf_ppnt->p_offset,
                         elf_interpreter,
                         elf_ppnt->p_filesz);
            if (retval != elf_ppnt->p_filesz) {
                if (retval >= 0)
                    retval = -EIO;
                goto out_free_interp;
            }
            /* make sure path is NULL terminated */
            retval = -ENOEXEC;
            // the last byte of the path must be '\0'
            if (elf_interpreter[elf_ppnt->p_filesz - 1] != '\0')
                goto out_free_interp;

            /*
             * The early SET_PERSONALITY here is so that the lookup
             * for the interpreter happens in the namespace of the
             * to-be-execed image.  SET_PERSONALITY can select an
             * alternate root.
             *
             * However, SET_PERSONALITY is NOT allowed to switch
             * this task into the new images's memory mapping
             * policy - that is, TASK_SIZE must still evaluate to
             * that which is appropriate to the execing application.
             * This is because exit_mmap() needs to have TASK_SIZE
             * evaluate to the size of the old image.
             *
             * So if (say) a 64-bit application is execing a 32-bit
             * application it is the architecture's responsibility
             * to defer changing the value of TASK_SIZE until the
             * switch really is going to happen - do this in
             * flush_thread().    - akpm
             */
            // NOTE(review): presumably selects the execution domain (personality)
            SET_PERSONALITY(loc->elf_ex);
            // open the interpreter; open_exec() returns a struct file
            interpreter = open_exec(elf_interpreter);
            retval = PTR_ERR(interpreter);
            if (IS_ERR(interpreter))
                goto out_free_interp;

            /*
             * If the binary is not readable then enforce
             * mm->dumpable = 0 regardless of the interpreter's
             * permissions.
             */
            if (file_permission(interpreter, MAY_READ) < 0)
                bprm->interp_flags |= BINPRM_FLAGS_ENFORCE_NONDUMP;
            // bprm->buf is overwritten with the start of the interpreter file
            retval = kernel_read(interpreter, 0, bprm->buf,
                         BINPRM_BUF_SIZE);
            if (retval != BINPRM_BUF_SIZE) {
                if (retval >= 0)
                    retval = -EIO;
                goto out_free_dentry;
            }

            /* Get the exec headers */
            // keep a copy of the interpreter's ELF header
            loc->interp_elf_ex = *((struct elfhdr *)bprm->buf);
            break;
        }
        elf_ppnt++;
    }
    // second pass over the program headers: PT_GNU_STACK's PF_X flag
    // decides whether the stack is executable
    elf_ppnt = elf_phdata;
    for (i = 0; i < loc->elf_ex.e_phnum; i++, elf_ppnt++)
        if (elf_ppnt->p_type == PT_GNU_STACK) {
            if (elf_ppnt->p_flags & PF_X)
                executable_stack = EXSTACK_ENABLE_X;
            else
                executable_stack = EXSTACK_DISABLE_X;
            break;
        }

    /* Some simple consistency checks for the interpreter */
    if (elf_interpreter) {
        retval = -ELIBBAD;
        /* Not an ELF interpreter */
        if (memcmp(loc->interp_elf_ex.e_ident, ELFMAG, SELFMAG) != 0)
            goto out_free_dentry;
        /* Verify the interpreter has a valid arch */
        if (!elf_check_arch(&loc->interp_elf_ex))
            goto out_free_dentry;
    } else {
        /* Executables without an interpreter also need a personality  */
        SET_PERSONALITY(loc->elf_ex);
    }

    /* Flush all traces of the currently running executable */
    retval = flush_old_exec(bprm);
    if (retval)
        goto out_free_dentry;

    /* OK, This is the point of no return */
    current->flags &= ~PF_FORKNOEXEC;
    current->mm->def_flags = def_flags;

    /* Do this immediately, since STACK_TOP as used in setup_arg_pages
       may depend on the personality.  */
    SET_PERSONALITY(loc->elf_ex);
    if (elf_read_implies_exec(loc->elf_ex, executable_stack))
        current->personality |= READ_IMPLIES_EXEC;

    if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
        current->flags |= PF_RANDOMIZE;
    // choose an address-space layout
    arch_pick_mmap_layout(current->mm);

    /* Do this so that we can load the interpreter, if need be.  We will
       change some of these later */
    current->mm->free_area_cache = current->mm->mmap_base;
    current->mm->cached_hole_size = 0;
    // set up the stack area that holds the argument pages
    retval = setup_arg_pages(bprm, randomize_stack_top(STACK_TOP),
                 executable_stack);
    if (retval < 0) {
        send_sig(SIGKILL, current, 0);
        goto out_free_dentry;
    }

    current->mm->start_stack = bprm->p;

    /* Now we do a little grungy work by mmaping the ELF image into
       the correct location in memory. */
    // map every PT_LOAD segment
    for(i = 0, elf_ppnt = elf_phdata;
        i < loc->elf_ex.e_phnum; i++, elf_ppnt++) {
        int elf_prot = 0, elf_flags;
        unsigned long k, vaddr;

        if (elf_ppnt->p_type != PT_LOAD)
            continue;
        /*
         * elf_bss and elf_brk track the highest p_vaddr + p_filesz and
         * p_vaddr + p_memsz seen so far (see the end of this loop), so
         * elf_brk > elf_bss means an earlier segment had
         * p_memsz > p_filesz, i.e. it carries bss.
         */
        if (unlikely (elf_brk > elf_bss)) {
            unsigned long nbyte;

            /* There was a PT_LOAD segment with p_memsz > p_filesz
               before this one. Map anonymous pages, if needed,
               and clear the area.  */
            // make the bss range valid
            retval = set_brk (elf_bss + load_bias,
                      elf_brk + load_bias);
            if (retval) {
                send_sig(SIGKILL, current, 0);
                goto out_free_dentry;
            }
            // zero the bss bytes from elf_bss up to the next
            // ELF_MIN_ALIGN boundary (capped at elf_brk)
            nbyte = ELF_PAGEOFFSET(elf_bss);
            if (nbyte) {
                nbyte = ELF_MIN_ALIGN - nbyte;
                if (nbyte > elf_brk - elf_bss)
                    nbyte = elf_brk - elf_bss;
                if (clear_user((void __user *)elf_bss +
                            load_bias, nbyte)) {
                    /*
                     * This bss-zeroing can fail if the ELF
                     * file specifies odd protections. So
                     * we don't check the return value
                     */
                }
            }
        }

        if (elf_ppnt->p_flags & PF_R)
            elf_prot |= PROT_READ;
        if (elf_ppnt->p_flags & PF_W)
            elf_prot |= PROT_WRITE;
        if (elf_ppnt->p_flags & PF_X)
            elf_prot |= PROT_EXEC;

        elf_flags = MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE;

        vaddr = elf_ppnt->p_vaddr;
        // ordinary ET_EXEC executables take the first branch
        if (loc->elf_ex.e_type == ET_EXEC || load_addr_set) {
            elf_flags |= MAP_FIXED;
        } else if (loc->elf_ex.e_type == ET_DYN) {
            // shared objects; the dynamic linker itself has this type
            /* Try and get dynamic programs out of the way of the
             * default mmap base, as well as whatever program they
             * might try to exec.  This is because the brk will
             * follow the loader, and is not movable.  */
#ifdef CONFIG_X86
            load_bias = 0;
#else
            load_bias = ELF_PAGESTART(ELF_ET_DYN_BASE - vaddr);
#endif
        }
        // map the segment; for ET_EXEC load_bias is still 0, so the
        // requested address is the segment's own p_vaddr
        error = elf_map(bprm->file, load_bias + vaddr, elf_ppnt,
                elf_prot, elf_flags, 0);
        if (BAD_ADDR(error)) {
            send_sig(SIGKILL, current, 0);
            retval = IS_ERR((void *)error) ?
                PTR_ERR((void*)error) : -EINVAL;
            goto out_free_dentry;
        }

        if (!load_addr_set) {
            load_addr_set = 1;
            // virtual address corresponding to file offset 0 of the image
            load_addr = (elf_ppnt->p_vaddr - elf_ppnt->p_offset);
            if (loc->elf_ex.e_type == ET_DYN) {
                load_bias += error -
                             ELF_PAGESTART(load_bias + vaddr);
                load_addr += load_bias;
                reloc_func_desc = load_bias;
            }
        }
        k = elf_ppnt->p_vaddr;
        if (k < start_code)
            start_code = k;
        if (start_data < k)
            start_data = k;

        /*
         * Check to see if the section's size will overflow the
         * allowed task size. Note that p_filesz must always be
         * <= p_memsz so it is only necessary to check p_memsz.
         */
        if (BAD_ADDR(k) || elf_ppnt->p_filesz > elf_ppnt->p_memsz ||
            elf_ppnt->p_memsz > TASK_SIZE ||
            TASK_SIZE - elf_ppnt->p_memsz < k) {
            /* set_brk can never work. Avoid overflows. */
            send_sig(SIGKILL, current, 0);
            retval = -EINVAL;
            goto out_free_dentry;
        }

        // end of the file-backed part: p_vaddr + p_filesz
        k = elf_ppnt->p_vaddr + elf_ppnt->p_filesz;

        if (k > elf_bss)
            elf_bss = k;
        if ((elf_ppnt->p_flags & PF_X) && end_code < k)
            end_code = k;
        if (end_data < k)
            end_data = k;
        // end of the in-memory image: p_vaddr + p_memsz
        k = elf_ppnt->p_vaddr + elf_ppnt->p_memsz;
        if (k > elf_brk)
            elf_brk = k;
    }

    loc->elf_ex.e_entry += load_bias;
    elf_bss += load_bias;
    elf_brk += load_bias;
    start_code += load_bias;
    end_code += load_bias;
    start_data += load_bias;
    end_data += load_bias;

    /* Calling set_brk effectively mmaps the pages that we need
     * for the bss and break sections.  We must do this before
     * mapping in the interpreter, to make sure it doesn't wind
     * up getting placed where the bss needs to go.
     */
    retval = set_brk(elf_bss, elf_brk);
    if (retval) {
        send_sig(SIGKILL, current, 0);
        goto out_free_dentry;
    }
    if (likely(elf_bss != elf_brk) && unlikely(padzero(elf_bss))) {
        send_sig(SIGSEGV, current, 0);
        retval = -EFAULT; /* Nobody gets to see this, but.. */
        goto out_free_dentry;
    }
    // with an interpreter, execution starts at the interpreter's entry
    // point instead of the executable's
    if (elf_interpreter) {
        unsigned long uninitialized_var(interp_map_addr);

        elf_entry = load_elf_interp(&loc->interp_elf_ex,
                        interpreter,
                        &interp_map_addr,
                        load_bias);
        if (!IS_ERR((void *)elf_entry)) {
            /*
             * load_elf_interp() returns relocation
             * adjustment
             */
            interp_load_addr = elf_entry;
            elf_entry += loc->interp_elf_ex.e_entry;
        }
        if (BAD_ADDR(elf_entry)) {
            force_sig(SIGSEGV, current);
            retval = IS_ERR((void *)elf_entry) ?
                    (int)elf_entry : -EINVAL;
            goto out_free_dentry;
        }
        reloc_func_desc = interp_load_addr;

        allow_write_access(interpreter);
        fput(interpreter);
        kfree(elf_interpreter);
    } else {
        elf_entry = loc->elf_ex.e_entry;
        if (BAD_ADDR(elf_entry)) {
            force_sig(SIGSEGV, current);
            retval = -EINVAL;
            goto out_free_dentry;
        }
    }

    kfree(elf_phdata);

    set_binfmt(&elf_format);

#ifdef ARCH_HAS_SETUP_ADDITIONAL_PAGES
    retval = arch_setup_additional_pages(bprm, !!elf_interpreter);
    if (retval < 0) {
        send_sig(SIGKILL, current, 0);
        goto out;
    }
#endif /* ARCH_HAS_SETUP_ADDITIONAL_PAGES */

    install_exec_creds(bprm);
    current->flags &= ~PF_FORKNOEXEC;
    // lay out argc/argv/envp/auxv on the new stack
    retval = create_elf_tables(bprm, &loc->elf_ex,
              load_addr, interp_load_addr);
    if (retval < 0) {
        send_sig(SIGKILL, current, 0);
        goto out;
    }
    /* N.B. passed_fileno might not be initialized? */
    current->mm->end_code = end_code;
    current->mm->start_code = start_code;
    current->mm->start_data = start_data;
    current->mm->end_data = end_data;
    current->mm->start_stack = bprm->p;

#ifdef arch_randomize_brk
    if ((current->flags & PF_RANDOMIZE) && (randomize_va_space > 1))
        current->mm->brk = current->mm->start_brk =
            arch_randomize_brk(current->mm);
#endif

    if (current->personality & MMAP_PAGE_ZERO) {
        /* Why this, you ask???  Well SVr4 maps page 0 as read-only,
           and some applications "depend" upon this behavior.
           Since we do not have the power to recompile these, we
           emulate the SVr4 behavior. Sigh. */
        down_write(&current->mm->mmap_sem);
        error = do_mmap(NULL, 0, PAGE_SIZE, PROT_READ | PROT_EXEC,
                MAP_FIXED | MAP_PRIVATE, 0);
        up_write(&current->mm->mmap_sem);
    }

#ifdef ELF_PLAT_INIT
    /*
     * The ABI may specify that certain registers be set up in special
     * ways (on i386 %edx is the address of a DT_FINI function, for
     * example.  In addition, it may also specify (eg, PowerPC64 ELF)
     * that the e_entry field is the address of the function descriptor
     * for the startup routine, rather than the address of the startup
     * routine itself.  This macro performs whatever initialization to
     * the regs structure is required as well as any relocations to the
     * function descriptor entries when executing dynamically links apps.
     */
    ELF_PLAT_INIT(regs, reloc_func_desc);
#endif

    start_thread(regs, elf_entry, bprm->p);
    retval = 0;
out:
    kfree(loc);
out_ret:
    return retval;

    /* error cleanup */
out_free_dentry:
    allow_write_access(interpreter);
    if (interpreter)
        fput(interpreter);
out_free_interp:
    kfree(elf_interpreter);
out_free_ph:
    kfree(elf_phdata);
    goto out;
}

这段代码比较长,关键点有这些

1、加载可执行文件,这部分实现在寻找类型为PT_INTERP和PT_LOAD的段的循环中,对于可执行文件,它的PT_INTERP段中存放动态链接器的路径,然后遍历所有的PT_LOAD段,可执行文件主要有两个段,text段和data段,这两个段按照虚拟地址进行map。

2、加载动态链接器,主要实现在load_elf_interp函数,具体的加载方式也和可执行文件的加载方式十分类似,找到elf文件的所有类型为PT_LOAD的段,然后map,这里map的地址就是从2G+段虚拟地址的地址开始map,因为动态链接器的段虚拟地址是从0开始的。

3、设置栈空间,主要实现在create_elf_tables中

在create_elf_tables中对于栈空间的设置有下面这些部分

第一步:

p = arch_align_stack(p);

/*
 * Pick the starting stack pointer for the new image.
 *
 * When randomization is enabled (the ADDR_NO_RANDOMIZE personality bit is
 * clear and randomize_va_space is non-zero), sp is first lowered by a
 * random amount in the range 0..8191 bytes.  The result is always rounded
 * down to a 16-byte boundary.
 */
unsigned long arch_align_stack(unsigned long sp)
{
    if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
        sp -= get_random_int() % 8192;
    /* clear the low four bits: 16-byte alignment */
    return sp & ~0xf;
}

第二步:

if (k_platform) {
    size_t len = strlen(k_platform) + 1;

    u_platform = (elf_addr_t __user *)STACK_ALLOC(p, len);
    if (__copy_to_user(u_platform, k_platform, len))
        return -EFAULT;
}

第三步:

get_random_bytes(k_rand_bytes, sizeof(k_rand_bytes));
u_rand_bytes = (elf_addr_t __user *)
              STACK_ALLOC(p, sizeof(k_rand_bytes));
if (__copy_to_user(u_rand_bytes, k_rand_bytes, sizeof(k_rand_bytes)))
    return -EFAULT;

第四步:

sp = STACK_ADD(p, ei_index);
items = (argc + 1) + (envc + 1) + 1; bprm->p = STACK_ROUND(sp, items);

第五步:

/* Now, let's put argc (and argv, envp if appropriate) on the stack */
if (__put_user(argc, sp++))
    return -EFAULT;

第六步:

/* Populate argv and envp */
p = current->mm->arg_end = current->mm->arg_start;
while (argc-- > 0) {
    size_t len;
    if (__put_user((elf_addr_t)p, argv++))
        return -EFAULT;
    len = strnlen_user((void __user *)p, MAX_ARG_STRLEN);
    if (!len || len > MAX_ARG_STRLEN)
        return -EINVAL;
    p += len;
}
  if (__put_user(0, argv))   return -EFAULT;

第七步:

while (envc-- > 0) {
    size_t len;
    if (__put_user((elf_addr_t)p, envp++))
        return -EFAULT;
    len = strnlen_user((void __user *)p, MAX_ARG_STRLEN);
    if (!len || len > MAX_ARG_STRLEN)
        return -EINVAL;
    p += len;
}
if (__put_user(0, envp))
    return -EFAULT;
current->mm->env_end = p;

第八步:

sp = (elf_addr_t __user *)envp + 1;
if (copy_to_user(sp, elf_info, ei_index * sizeof(elf_addr_t)))
    return -EFAULT;

整个栈上的空间分配应该是

position            content                         size(bytes) + comment
-------------------------------------------------------------------------------------------------

stack pointer ->    [argc = number of args]         8
                    [argv[0](pointer)]              8
                    [argv[1](pointer)]              8
                    [argv[...](pointer)]            8 * x
                    [argv[n-1](pointer)]            8
                    [argv[n](pointer)]              8 (=NULL)


                    [envp[0](pointer)]              8
                    [envp[1](pointer)]              8
                    [envp[..](pointer)]             8 * x
                    [envp[term](pointer)]           8 (=NULL)


                    [auxv[0](Elf64_auxv_t)]         16
                    [auxv[1](Elf64_auxv_t)]         16
                    [auxv[..](Elf64_auxv_t)]        16 * x
                    [auxv[term](Elf64_auxv_t)]      16 (=NULL)

                    [padding]                       >= 0

                    [rand bytes]                    16

                    [String identifying platform]   >= 0

                    [padding for align]             >= 0 (sp - (get_random_int() % 8192)) & (~0xf)

                    [argument ASCIIZ strings]       >= 0
                    [environment ASCIIZ str]        >= 0
                    [file name]                     >= 0

标号1处将堆栈指针向下移动了x字节(x = get_random_int() % 8192,即0 <= x < 8192),然后再向下按16字节对齐,分配出[padding for align]这部分空间

标号2的代码为[String identifying platform]在栈上分配空间并进行赋值操作

标号3的代码生成16bytes大小的随机数,然后为[rand bytes]在栈上分配空间并进行赋值操作

标号4的代码

sp = STACK_ADD(p, ei_index)

在栈上分配辅助向量(auxiliary vector)所需存储空间

items = (argc + 1) + (envc + 1) + 1;
bprm->p = STACK_ROUND(sp, items);

在栈上分配argc, argv, environment vector所需存储空间, 值得注意的是这两步都只是移动堆栈指针分配空间,并没有进行赋值初始化数据操作

标号5, 6, 7, 8的代码做了如下操作
5. 初始化栈上argc的值
6. 初始化栈上argv的值
7. 初始化栈上envp的值
8. 初始化栈上auxv的值

通过阅读代码可以看出执行结果与上面的结构图是相匹配的, 可能会有疑惑的地方就是图示中[padding]这一区域, 这一块数据来源如下:

通过上面解释我们可以看到标号4的代码在栈上一次性分配了argc, argv, envp, auxv所需要的空间,然后再通过堆栈指针按顺序向上初始化每一块数据, 重点在于分配的空间并不刚好等于所需空间, 因为标号4的分配空间时使用了宏STACK_ROUND, 该宏定义如下

/*
 * Step sp down by `items` entries and round the resulting address down to
 * a 16-byte boundary (the low four bits are cleared).
 * NOTE(review): sp is presumably an elf_addr_t pointer here, so
 * `sp - items` moves by whole elements rather than bytes; confirm against
 * the caller.
 */
#define STACK_ROUND(sp, items) \
    (((unsigned long) (sp - items)) &~ 15UL)

会向下16字节对齐,因此分配的空间可能会比所需空间多,而标号5, 6, 7, 8的初始化操作又是从栈顶开始向高地址方向依次初始化的,所以最后在[auxv[term]]和[rand bytes]这两块区域之间会多出[padding]这块数据

参考

https://github.com/chenpengcong/blog/issues/18

https://www.cnblogs.com/joey-hua/p/5638306.html