open系统调用flow之do_last()


open系统调用flow之do_last()

本文章来分析下open file系统调用最后一个主要的函数do_last()

open系统调用来到do_last()时,主要看其nd参数,重点是这个结构体里的path、last两个成员:此时path表示待打开文件所在的父目录(即完整路径中最后一级目录),last表示待打开文件的file name。比如open file /data/test/test.txt,此时path结构体表示/data/test这个目录;而last(struct qstr)表示test.txt

有了这个前提之后再来分析下do_last()

和之前link_path_walk()里lookup dentry类似,do_last同样会去查找last file对应的dentry。先调用lookup_fast()看看last所表示的file是否有对应的dentry,如果有,则会直接跳到finish_lookup,不会再走lookup_open() flow;如果没有则会走lookup_open() flow。

不管dcache里是否有对应的dentry,都会call到step_into(),这个函数根据path结构体更新nd.path,即nd.path表示last文件。

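如果是走的lookup_open() flow,则正常情况下接下来会调用vfs_open()。如果是在dcache里找到了对应的dentry(lookup_fast()返回值大于0),从下面的代码可以看到,会跳到finish_lookup,经过step_into()、finish_open之后顺序执行到finish_open_created,同样会call vfs_open()。只有lookup_open()返回时file已经带有FMODE_OPENED标志的情况,才会直接跳到opened label而不再调用vfs_open()。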

/*
 * Handle the last step of open()
 *
 * @nd:   lookup state. On entry nd->path.dentry is used as the directory
 *        ("dir") in which the final component nd->last is looked up.
 * @file: the struct file being set up by this open.
 * @op:   open flags, access mode and lookup intent for this open.
 *
 * Returns 0 on success or a negative errno.  On the "out" path a positive
 * error value is reported with WARN_ON and turned into -EINVAL, and any
 * write access taken on the mount (got_write) is dropped.
 */
static int do_last(struct nameidata *nd,
           struct file *file, const struct open_flags *op)
{
    struct dentry *dir = nd->path.dentry;
    /* Captured up front; consumed by may_create_in_sticky() below. */
    kuid_t dir_uid = nd->inode->i_uid;
    umode_t dir_mode = nd->inode->i_mode;
    int open_flag = op->open_flag;
    bool will_truncate = (open_flag & O_TRUNC) != 0;
    /* True while we hold write access obtained via mnt_want_write(). */
    bool got_write = false;
    int acc_mode = op->acc_mode;
    unsigned seq;
    struct inode *inode;
    struct path path;
    int error;

    /* Replace the LOOKUP_PARENT flag with the intent flags of this open. */
    nd->flags &= ~LOOKUP_PARENT;
    nd->flags |= op->intent;

    /*
     * The last component is not a normal name: let handle_dots() deal
     * with it and skip the lookup entirely.
     */
    if (nd->last_type != LAST_NORM) {
        error = handle_dots(nd, nd->last_type);
        if (unlikely(error))
            return error;
        goto finish_open;
    }

    if (!(open_flag & O_CREAT)) { // the walkthrough this listing belongs to covers opening an already-existing file, so O_CREAT is not set and this branch is taken
        /*
         * A non-NUL byte right after the last component (the create
         * branch below labels this case "trailing slashes").
         */
        if (nd->last.name[nd->last.len])
            nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
        /* we _can_ be in RCU mode here */
        error = lookup_fast(nd, &path, &inode, &seq);
        /* Positive return: skip lookup_open() and go straight on. */
        if (likely(error > 0))
            goto finish_lookup;

        if (error < 0)
            return error;

        /* Zero return: fall through to lookup_open() below. */
        BUG_ON(nd->inode != dir->d_inode);
        BUG_ON(nd->flags & LOOKUP_RCU);
    } else {
        /* create side of things */
        /*
         * This will *only* deal with leaving RCU mode - LOOKUP_JUMPED
         * has been cleared when we got to the last component we are
         * about to look up
         */
        error = complete_walk(nd);
        if (error)
            return error;

        audit_inode(nd->name, dir, LOOKUP_PARENT);
        /* trailing slashes? */
        if (unlikely(nd->last.name[nd->last.len]))
            return -EISDIR;
    }

    if (open_flag & (O_CREAT | O_TRUNC | O_WRONLY | O_RDWR)) {
        error = mnt_want_write(nd->path.mnt);
        if (!error)
            got_write = true;
        /*
         * do _not_ fail yet - we might not need that or fail with
         * a different error; let lookup_open() decide; we'll be
         * dropping this one anyway.
         */
    }
    /*
     * Call lookup_open() with the directory inode locked: exclusively
     * when O_CREAT is set, shared otherwise.
     */
    if (open_flag & O_CREAT)
        inode_lock(dir->d_inode);
    else
        inode_lock_shared(dir->d_inode);
    error = lookup_open(nd, &path, file, op, got_write);
    if (open_flag & O_CREAT)
        inode_unlock(dir->d_inode);
    else
        inode_unlock_shared(dir->d_inode);

    if (error)
        goto out;

    /*
     * lookup_open() returned with the file already marked opened:
     * jump past may_open()/vfs_open().  Truncation is skipped for a
     * freshly created file or a non-regular file.
     */
    if (file->f_mode & FMODE_OPENED) {
        if ((file->f_mode & FMODE_CREATED) ||
            !S_ISREG(file_inode(file)->i_mode))
            will_truncate = false;

        audit_inode(nd->name, file->f_path.dentry, 0);
        goto opened;
    }

    /* The file was created by lookup_open() but has not been opened yet. */
    if (file->f_mode & FMODE_CREATED) {
        /* Don't check for write permission, don't truncate */
        open_flag &= ~O_TRUNC;
        will_truncate = false;
        acc_mode = 0;
        path_to_nameidata(&path, nd);
        goto finish_open_created;
    }

    /*
     * If atomic_open() acquired write access it is dropped now due to
     * possible mount and symlink following (this might be optimized away if
     * necessary...)
     */
    if (got_write) {
        mnt_drop_write(nd->path.mnt);
        got_write = false;
    }

    error = follow_managed(&path, nd);
    if (unlikely(error < 0))
        return error;

    /* Negative dentry at this point: the name does not exist. */
    if (unlikely(d_is_negative(path.dentry))) {
        path_to_nameidata(&path, nd);
        return -ENOENT;
    }

    /*
     * create/update audit record if it already exists.
     */
    audit_inode(nd->name, path.dentry, 0);

    /*
     * O_CREAT|O_EXCL was requested, but we got here with a positive
     * dentry that was not created by this call (the FMODE_CREATED case
     * left via finish_open_created above).
     */
    if (unlikely((open_flag & (O_EXCL | O_CREAT)) == (O_EXCL | O_CREAT))) {
        path_to_nameidata(&path, nd);
        return -EEXIST;
    }

    seq = 0;    /* out of RCU mode, so the value doesn't matter */
    inode = d_backing_inode(path.dentry);
finish_lookup:
    /* Reached both from lookup_fast() success and after lookup_open(). */
    error = step_into(nd, &path, 0, inode, seq);
    if (unlikely(error))
        return error;
finish_open:
    /* Why this, you ask?  _Now_ we might have grown LOOKUP_JUMPED... */
    error = complete_walk(nd);
    if (error)
        return error;
    audit_inode(nd->name, nd->path.dentry, 0);
    if (open_flag & O_CREAT) {
        /* O_CREAT on something that turned out to be a directory. */
        error = -EISDIR;
        if (d_is_dir(nd->path.dentry))
            goto out;
        error = may_create_in_sticky(dir_mode, dir_uid,
                         d_backing_inode(nd->path.dentry));
        if (unlikely(error))
            goto out;
    }
    /* LOOKUP_DIRECTORY was requested but the result cannot be looked up in. */
    error = -ENOTDIR;
    if ((nd->flags & LOOKUP_DIRECTORY) && !d_can_lookup(nd->path.dentry))
        goto out;
    /* Only regular files are truncated. */
    if (!d_is_reg(nd->path.dentry))
        will_truncate = false;

    /* Truncation needs write access to the mount; dropped again at "out". */
    if (will_truncate) {
        error = mnt_want_write(nd->path.mnt);
        if (error)
            goto out;
        got_write = true;
    }
finish_open_created:
    error = may_open(&nd->path, acc_mode, open_flag);
    if (error)
        goto out;
    BUG_ON(file->f_mode & FMODE_OPENED); /* once it's opened, it's opened */
    error = vfs_open(&nd->path, file);
    if (error)
        goto out;
opened:
    error = ima_file_check(file, op->acc_mode);
    if (!error && will_truncate)
        error = handle_truncate(file);
out:
    /* Callers expect 0 or a negative errno; a positive value is a bug. */
    if (unlikely(error > 0)) {
        WARN_ON(1);
        error = -EINVAL;
    }
    if (got_write)
        mnt_drop_write(nd->path.mnt);
    return error;
}

lookup_open()首先调用了d_lookup(),这个lookup同样是在dcache里查找last所表示的文件是否有对应的dentry,这个lookup是需要操作rcu锁以及dentry的d_lock自旋锁的。如果没有找到,说明这个文件之前没有被open过,还没有建立对应的dentry,则调用d_alloc_parallel给last分配一个dentry。

然后会走到no_open label执行dir_inode->i_op->lookup,这个lookup是具体文件系统的lookup函数。这部分alloc dentry和lookup和之前在分析link_path_walk flow时分析的lookup_slow() flow一样,这里不再赘述。

最后设置下path结构体,将path结构体里的dentry、mnt成员更新为表示当前文件:

/*
 * Look up (and possibly create, or atomically open) the last component
 * nd->last inside the directory nd->path.dentry.
 *
 * @nd:        lookup state; nd->path is the directory, nd->last the name.
 * @path:      output; on a 0 return through out_no_open, path->dentry is
 *             the dentry found or created and path->mnt is nd->path.mnt.
 * @file:      file being opened; FMODE_CREATED is cleared on entry and set
 *             again if this function creates the file through ->create.
 * @op:        open flags and creation mode.
 * @got_write: whether the caller managed to get write access to the mount.
 *
 * Returns 0 or a negative errno.  When the directory provides
 * ->atomic_open, the result of atomic_open() is returned directly.
 */
static int lookup_open(struct nameidata *nd, struct path *path,
            struct file *file,
            const struct open_flags *op,
            bool got_write)
{
    struct dentry *dir = nd->path.dentry;
    struct inode *dir_inode = dir->d_inode;
    int open_flag = op->open_flag;
    struct dentry *dentry;
    int error, create_error = 0;
    umode_t mode = op->mode;
    DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);

    if (unlikely(IS_DEADDIR(dir_inode)))
        return -ENOENT;

    file->f_mode &= ~FMODE_CREATED;
    dentry = d_lookup(dir, &nd->last);
    /*
     * Loop until we hold a dentry that is either still in lookup or has
     * passed revalidation; a dentry that fails revalidation with 0 is
     * invalidated, dropped, and a new one is allocated on the next pass.
     */
    for (;;) {
        if (!dentry) {
            dentry = d_alloc_parallel(dir, &nd->last, &wq);
            if (IS_ERR(dentry))
                return PTR_ERR(dentry);
        }
        if (d_in_lookup(dentry))
            break;

        error = d_revalidate(dentry, nd->flags);
        if (likely(error > 0))
            break;
        if (error)
            goto out_dput;
        d_invalidate(dentry);
        dput(dentry);
        dentry = NULL;
    }
    if (dentry->d_inode) {
        /* Cached positive dentry: will open in f_op->open */
        goto out_no_open;
    }

    /*
     * Checking write permission is tricky, because we don't know if we are
     * going to actually need it: O_CREAT opens should work as long as the
     * file exists.  But checking existence breaks atomicity.  The trick is
     * to check access and if not granted clear O_CREAT from the flags.
     *
     * Another problem is returning the "right" error value (e.g. for an
     * O_EXCL open we want to return EEXIST not EROFS).
     */
    if (open_flag & O_CREAT) {
        if (!IS_POSIXACL(dir->d_inode))
            mode &= ~current_umask();
        if (unlikely(!got_write)) {
            /* Remember the error; it is reported only if creation was needed. */
            create_error = -EROFS;
            open_flag &= ~O_CREAT;
            if (open_flag & (O_EXCL | O_TRUNC))
                goto no_open;
            /* No side effects, safe to clear O_CREAT */
        } else {
            create_error = may_o_create(&nd->path, dentry, mode);
            if (create_error) {
                open_flag &= ~O_CREAT;
                if (open_flag & O_EXCL)
                    goto no_open;
            }
        }
    } else if ((open_flag & (O_TRUNC|O_WRONLY|O_RDWR)) &&
           unlikely(!got_write)) {
        /*
         * No O_CREAT -> atomicity not a requirement -> fall
         * back to lookup + open
         */
        goto no_open;
    }

    /*
     * The directory's inode operations provide ->atomic_open: delegate
     * to atomic_open() and return its result.  An -ENOENT result is
     * replaced by the creation error recorded above, if any.
     */
    if (dir_inode->i_op->atomic_open) {
        error = atomic_open(nd, dentry, path, file, op, open_flag,
                    mode);
        if (unlikely(error == -ENOENT) && create_error)
            error = create_error;
        return error;
    }

no_open:
    /* Dentry is still in lookup: ask the filesystem's ->lookup to fill it. */
    if (d_in_lookup(dentry)) {
        struct dentry *res = dir_inode->i_op->lookup(dir_inode, dentry,
                                 nd->flags);
        d_lookup_done(dentry);
        if (unlikely(res)) {
            if (IS_ERR(res)) {
                error = PTR_ERR(res);
                goto out_dput;
            }
            /* ->lookup returned a different dentry: use it instead. */
            dput(dentry);
            dentry = res;
        }
    }

    /* Negative dentry, just create the file */
    if (!dentry->d_inode && (open_flag & O_CREAT)) {
        file->f_mode |= FMODE_CREATED;
        audit_inode_child(dir_inode, dentry, AUDIT_TYPE_CHILD_CREATE);
        if (!dir_inode->i_op->create) {
            error = -EACCES;
            goto out_dput;
        }
        error = dir_inode->i_op->create(dir_inode, dentry, mode,
                        open_flag & O_EXCL);
        if (error)
            goto out_dput;
        fsnotify_create(dir_inode, dentry);
    }
    /* Creation was wanted but not permitted, and the file does not exist. */
    if (unlikely(create_error) && !dentry->d_inode) {
        error = create_error;
        goto out_dput;
    }
out_no_open:
    path->dentry = dentry;
    path->mnt = nd->path.mnt;
    return 0;

out_dput:
    dput(dentry);
    return error;
}

vfs_open,将表示file的path赋值给此file的f_path成员,之后就可以根据file struct得到此file的dentry(f_path.dentry)以及此文件所在的文件系统vfsmount(f_path.mnt)了。根据这个dentry又可以得到这个file的inode struct。

然后调用do_dentry_open()

/*
 * vfs_open - record @path in @file and perform the actual open.
 * @path: location of the object being opened
 * @file: file structure to initialise
 *
 * Copies *@path into file->f_path, then passes the dentry's backing inode
 * to do_dentry_open() with no explicit open callback (NULL).  Returns
 * whatever do_dentry_open() returns.
 */
int vfs_open(const struct path *path, struct file *file)
{
    struct inode *target;

    file->f_path = *path;
    target = d_backing_inode(path->dentry);

    return do_dentry_open(file, target, NULL);
}

do_dentry_open参数说明:

f: 即表示此file的file struct;

inode: 即此file的inode struct;

open: 这里它为null

do_dentry_open()里会调用fops_get(inode->i_fop),并将返回值赋值给f->f_op。对于文件系统来说,其file_operations一般没有设置struct module类型的owner成员;同时CONFIG_MODULES和CONFIG_MODULE_UNLOAD(表示是否支持卸载已经加载的ko)一般是define了的,所以fops_get()使用的是module.c里的try_module_get()。在这个函数里,如果module指针是null,则直接return true,所以对于文件系统来说,就是直接将inode->i_fop赋值给f->f_op,即设置file的file_operations。这个file_operations是具体文件系统提供的,以ext4 fs为例,它是ext4_file_operations。然后如果具体文件系统提供的file_operations函数集里有open函数,则调用这个open函数;对应ext4 fs,它是有提供的,即ext4_file_open(),传给它的参数是inode和f,即此file的inode、file struct。至此open系统调用过程基本结束!

/*
 * Finish opening @f, whose f_path has already been set by the caller.
 *
 * @f:     the file being opened; f->f_path must already be filled in.
 * @inode: inode to attach to @f (stored in f->f_inode).
 * @open:  open callback to use; when NULL, f->f_op->open is used instead
 *         (and may itself be NULL, in which case no callback is invoked).
 *
 * Returns 0 on success or a negative errno.  On failure through the
 * cleanup labels, the path reference taken here is released and
 * f->f_path / f->f_inode are reset to NULL.
 */
static int do_dentry_open(struct file *f,
              struct inode *inode,
              int (*open)(struct inode *, struct file *))
{
    /* Operations table used for O_PATH opens; every method is NULL. */
    static const struct file_operations empty_fops = {};
    int error;

    /* Released again by path_put() under cleanup_file on failure. */
    path_get(&f->f_path);
    f->f_inode = inode;
    f->f_mapping = inode->i_mapping;

    /* Ensure that we skip any errors that predate opening of the file */
    f->f_wb_err = filemap_sample_wb_err(f->f_mapping);

    /* O_PATH: mark the file opened with empty operations; ->open is not called. */
    if (unlikely(f->f_flags & O_PATH)) {
        f->f_mode = FMODE_PATH | FMODE_OPENED;
        f->f_op = &empty_fops;
        return 0;
    }

    /* Any file opened for execve()/uselib() has to be a regular file. */
    if (unlikely(f->f_flags & FMODE_EXEC && !S_ISREG(inode->i_mode))) {
        error = -EACCES;
        goto cleanup_file;
    }

    /*
     * Opening a non-special file for writing: take write access on the
     * inode and on the mount.  FMODE_WRITER records that both were taken
     * so that cleanup_all knows to undo them.
     */
    if (f->f_mode & FMODE_WRITE && !special_file(inode->i_mode)) {
        error = get_write_access(inode);
        if (unlikely(error))
            goto cleanup_file;
        error = __mnt_want_write(f->f_path.mnt);
        if (unlikely(error)) {
            put_write_access(inode);
            goto cleanup_file;
        }
        f->f_mode |= FMODE_WRITER;
    }

    /* POSIX.1-2008/SUSv4 Section XSI 2.9.7 */
    if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))
        f->f_mode |= FMODE_ATOMIC_POS;

    /* Install the inode's file operations on the file. */
    f->f_op = fops_get(inode->i_fop);
    if (unlikely(WARN_ON(!f->f_op))) {
        error = -ENODEV;
        goto cleanup_all;
    }

    error = security_file_open(f);
    if (error)
        goto cleanup_all;

    error = break_lease(locks_inode(f), f->f_flags);
    if (error)
        goto cleanup_all;

    /* normally all 3 are set; ->open() can clear them if needed */
    f->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE;
    /* No explicit callback supplied: fall back to the file operations' ->open. */
    if (!open)
        open = f->f_op->open;
    if (open) {
        error = open(inode, f);
        if (error)
            goto cleanup_all;
    }
    /* From here on the file counts as opened. */
    f->f_mode |= FMODE_OPENED;
    /* Read-only open (FMODE_READ set, FMODE_WRITE clear). */
    if ((f->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
        i_readcount_inc(inode);
    /* Reading/writing is possible only if the operations provide a method for it. */
    if ((f->f_mode & FMODE_READ) &&
         likely(f->f_op->read || f->f_op->read_iter))
        f->f_mode |= FMODE_CAN_READ;
    if ((f->f_mode & FMODE_WRITE) &&
         likely(f->f_op->write || f->f_op->write_iter))
        f->f_mode |= FMODE_CAN_WRITE;

    f->f_write_hint = WRITE_LIFE_NOT_SET;
    /* These flags are cleared from f_flags once the open has completed. */
    f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC);

    file_ra_state_init(&f->f_ra, f->f_mapping->host->i_mapping);

    /* NB: we're sure to have correct a_ops only after f_op->open */
    if (f->f_flags & O_DIRECT) {
        /*
         * NOTE(review): this return does not go through cleanup_all;
         * FMODE_OPENED is already set, so presumably the caller
         * releases the file -- confirm against the callers.
         */
        if (!f->f_mapping->a_ops || !f->f_mapping->a_ops->direct_IO)
            return -EINVAL;
    }
    return 0;

cleanup_all:
    /* A positive error here is unexpected; warn once and normalise it. */
    if (WARN_ON_ONCE(error > 0))
        error = -EINVAL;
    fops_put(f->f_op);
    if (f->f_mode & FMODE_WRITER) {
        put_write_access(inode);
        __mnt_drop_write(f->f_path.mnt);
    }
cleanup_file:
    path_put(&f->f_path);
    f->f_path.mnt = NULL;
    f->f_path.dentry = NULL;
    f->f_inode = NULL;
    return error;
}
fs