关于linux的一点好奇心(二):linux启动过程之三大进程


  上一节我们通过对x86的linux内核的讲解,知道了它的一个大概的启动过程。

        /arch/x86/boot/header.S
        -> calll main    ->    /arch/x86/boot/main.c
        -> go_to_protected_mode()    ->    /arch/x86/boot/pm.c
        -> protected_mode_jump()    ->    /arch/x86/boot/pmjump.S
        -> jmpl    *%eax    ->    /arch/x86/kernel/head_32.S
        -> .long i386_start_kernel    ->    /arch/x86/kernel/head32.c
        -> start_kernel()    ->    /init/main.c    (C语言入口)

  这其中的动作,基本都是找到对应的地址,然后设置各种设备的初始化信息,中断设置,键盘,控制台,idt...

  当然,有相当一部分代码是用汇编语言完成的,这自然是底层硬件决定的,而且因为特殊性,再封装是没有必要的了。所以,汇编是最好的选择。

  本篇,我们再来看看与cpu架构无关的 /init/main.c 又都干了啥,从而解开心中的谜团。

1. start_kernel入口

  排除掉架构相关的代码,就是到了/init/main.c 中的 start_kernel(), 从这里我们可以看到操作系统启动时,大致干了啥。

// /init/main.c
/*
 * start_kernel - architecture-independent C entry point of the kernel.
 *
 * Brings up the core subsystems in a fixed order and finally hands over
 * to rest_init(). Local interrupts are switched off by the
 * local_irq_disable() call near the top and stay off until the
 * local_irq_enable() call that follows call_function_init(); everything
 * in between runs with early_boot_irqs_disabled set to true.
 *
 * Takes no arguments and returns nothing.
 * NOTE(review): the function is marked __init, so its text is presumably
 * released by free_initmem() later on - this is why the final hand-off
 * goes through a non-__init helper; confirm against the comment that
 * accompanies rest_init().
 */
asmlinkage __visible void __init start_kernel(void)
{
    char *command_line;
    char *after_dashes;

    set_task_stack_end_magic(&init_task);
    smp_setup_processor_id();
    debug_objects_early_init();

    cgroup_init_early();

    local_irq_disable();
    early_boot_irqs_disabled = true;

    /*
     * Interrupts are still disabled. Do necessary setups, then
     * enable them.
     */
    boot_cpu_init();
    page_address_init();
    pr_notice("%s", linux_banner);
    /* Arch-specific setup; it also hands back the command line pointer. */
    setup_arch(&command_line);
    /*
     * Set up the initial canary and entropy after arch
     * and after adding latent and command line entropy.
     */
    add_latent_entropy();
    add_device_randomness(command_line, strlen(command_line));
    boot_init_stack_canary();
    mm_init_cpumask(&init_mm);
    setup_command_line(command_line);
    setup_nr_cpu_ids();
    setup_per_cpu_areas();
    smp_prepare_boot_cpu();    /* arch-specific boot-cpu hooks */
    boot_cpu_hotplug_init();

    build_all_zonelists(NULL);
    page_alloc_init();

    pr_notice("Kernel command line: %s\n", boot_command_line);
    parse_early_param();
    /*
     * Parse the regular kernel parameters. A non-error, non-NULL return
     * value is parsed a second time with set_init_arg as the handler.
     * NOTE(review): presumably this is the text following "--" on the
     * command line, destined for init - confirm against parse_args().
     */
    after_dashes = parse_args("Booting kernel",
                  static_command_line, __start___param,
                  __stop___param - __start___param,
                  -1, -1, NULL, &unknown_bootoption);
    if (!IS_ERR_OR_NULL(after_dashes))
        parse_args("Setting init args", after_dashes, NULL, 0, -1, -1,
               NULL, set_init_arg);

    jump_label_init();

    /*
     * These use large bootmem allocations and must precede
     * kmem_cache_init()
     */
    setup_log_buf(0);
    vfs_caches_init_early();
    sort_main_extable();
    trap_init();
    mm_init();

    ftrace_init();

    /* trace_printk can be enabled here */
    early_trace_init();

    /*
     * Set up the scheduler prior starting any interrupts (such as the
     * timer interrupt). Full topology setup happens at smp_init()
     * time - but meanwhile we still have a functioning scheduler.
     */
    sched_init();
    /*
     * Disable preemption - early bootup scheduling is extremely
     * fragile until we cpu_idle() for the first time.
     */
    preempt_disable();
    /* Sanity check: nothing so far should have re-enabled interrupts. */
    if (WARN(!irqs_disabled(),
         "Interrupts were enabled *very* early, fixing it\n"))
        local_irq_disable();
    radix_tree_init();

    /*
     * Set up housekeeping before setting up workqueues to allow the unbound
     * workqueue to take non-housekeeping into account.
     */
    housekeeping_init();

    /*
     * Allow workqueue creation and work item queueing/cancelling
     * early.  Work item execution depends on kthreads and starts after
     * workqueue_init().
     */
    workqueue_init_early();

    rcu_init();

    /* Trace events are available after this */
    trace_init();

    if (initcall_debug)
        initcall_debug_enable();

    context_tracking_init();
    /* init some links before init_ISA_irqs() */
    early_irq_init();
    init_IRQ();
    tick_init();
    rcu_init_nohz();
    init_timers();
    hrtimers_init();
    softirq_init();
    timekeeping_init();
    time_init();
    sched_clock_postinit();
    printk_safe_init();
    perf_event_init();
    profile_init();
    call_function_init();
    /* End of the interrupts-off phase of boot. */
    WARN(!irqs_disabled(), "Interrupts were enabled early\n");
    early_boot_irqs_disabled = false;
    local_irq_enable();

    kmem_cache_init_late();

    /*
     * HACK ALERT! This is early. We're enabling the console before
     * we've done PCI setups etc, and console_init() must be aware of
     * this. But we do want output early, in case something goes wrong.
     */
    console_init();
    /*
     * panic_later is not set in this function; the panic is raised only
     * now that console_init() has run.
     * NOTE(review): presumably it is recorded during command-line parsing
     * and deferred so the message is visible - confirm.
     */
    if (panic_later)
        panic("Too many boot %s vars at `%s'", panic_later,
              panic_param);

    lockdep_info();

    /*
     * Need to run this when irqs are enabled, because it wants
     * to self-test [hard/soft]-irqs on/off lock inversion bugs
     * too:
     */
    locking_selftest();

    /*
     * This needs to be called before any devices perform DMA
     * operations that might use the SWIOTLB bounce buffers. It will
     * mark the bounce buffers as decrypted so that their usage will
     * not cause "plain-text" data to be decrypted when accessed.
     */
    mem_encrypt_init();

#ifdef CONFIG_BLK_DEV_INITRD
    /*
     * Drop the initrd (initrd_start = 0) when its first page frame lies
     * below min_low_pfn and initrd_below_start_ok is not set.
     */
    if (initrd_start && !initrd_below_start_ok &&
        page_to_pfn(virt_to_page((void *)initrd_start)) < min_low_pfn) {
        pr_crit("initrd overwritten (0x%08lx < 0x%08lx) - disabling it.\n",
            page_to_pfn(virt_to_page((void *)initrd_start)),
            min_low_pfn);
        initrd_start = 0;
    }
#endif
    page_ext_init();
    kmemleak_init();
    debug_objects_mem_init();
    setup_per_cpu_pageset();
    numa_policy_init();
    acpi_early_init();
    /* late_time_init is an optional hook; call it only when it is set. */
    if (late_time_init)
        late_time_init();
    calibrate_delay();
    pid_idr_init();
    anon_vma_init();
#ifdef CONFIG_X86
    if (efi_enabled(EFI_RUNTIME_SERVICES))
        efi_enter_virtual_mode();
#endif
    thread_stack_cache_init();
    cred_init();
    fork_init();
    proc_caches_init();
    uts_ns_init();
    buffer_init();
    key_init();
    security_init();
    dbg_late_init();
    vfs_caches_init();
    pagecache_init();
    signals_init();
    seq_file_init();
    proc_root_init();
    nsfs_init();
    cpuset_init();
    cgroup_init();
    taskstats_init_early();
    delayacct_init();

    check_bugs();

    acpi_subsystem_init();
    arch_post_acpi_subsys_init();
    sfi_init_late();

    if (efi_enabled(EFI_RUNTIME_SERVICES)) {
        efi_free_boot_services();
    }

    /*
     * Everything beyond the init calls above happens in rest_init(),
     * e.g. creating the first kernel threads.
     */
    /* Do the rest non-__init'ed, we're now alive */
    rest_init();
}

/*
 * We need to finalize in a non-__init function or else race conditions
 * between the root thread and the init thread may cause start_kernel to
 * be reaped by free_initmem before the root thread has proceeded to
 * cpu_idle.
 *
 * gcc-3.4 accidentally inlines this function, so use noinline.
 */

/*
 * Completion signalled by rest_init() once kthreadd_task has been looked
 * up; kernel_init_freeable() blocks on it before doing any work.
 */
static __initdata DECLARE_COMPLETION(kthreadd_done);
// main.c
/*
 * rest_init - spawn the init and kthreadd threads, then turn the calling
 * boot thread into the idle loop.
 *
 * Order of operations: create the kernel_init thread, restrict it to the
 * current CPU, create the kthreadd thread and record it in kthreadd_task,
 * set system_state to SYSTEM_SCHEDULING, complete kthreadd_done, call
 * schedule_preempt_disabled() once, and finally enter
 * cpu_startup_entry().
 */
static noinline void __ref rest_init(void)
{
    struct task_struct *tsk;
    int pid;

    rcu_scheduler_starting();
    /*
     * We need to spawn init first so that it obtains pid 1, however
     * the init task will end up wanting to create kthreads, which, if
     * we schedule it before we create kthreadd, will OOPS.
     */
    /* Create the init thread first; it is the one meant to get pid 1. */
    pid = kernel_thread(kernel_init, NULL, CLONE_FS);
    /*
     * Pin init on the boot CPU. Task migration is not properly working
     * until sched_init_smp() has been run. It will set the allowed
     * CPUs for init to the non isolated CPUs.
     */
    rcu_read_lock();
    tsk = find_task_by_pid_ns(pid, &init_pid_ns);
    set_cpus_allowed_ptr(tsk, cpumask_of(smp_processor_id()));
    rcu_read_unlock();

    numa_default_policy();
    /*
     * Then create the kthreadd thread.
     * NOTE(review): as the second thread created here it is expected to
     * get pid 2 - confirm.
     */
    pid = kernel_thread(kthreadd, NULL, CLONE_FS | CLONE_FILES);
    rcu_read_lock();
    kthreadd_task = find_task_by_pid_ns(pid, &init_pid_ns);
    rcu_read_unlock();

    /*
     * Enable might_sleep() and smp_processor_id() checks.
     * They cannot be enabled earlier because with CONFIG_PREEMPT=y
     * kernel_thread() would trigger might_sleep() splats. With
     * CONFIG_PREEMPT_VOLUNTARY=y the init task might have scheduled
     * already, but it's stuck on the kthreadd_done completion.
     */
    system_state = SYSTEM_SCHEDULING;

    /* Release the init thread waiting in kernel_init_freeable(). */
    complete(&kthreadd_done);

    /*
     * The boot idle thread must execute schedule()
     * at least once to get things moving:
     */
    schedule_preempt_disabled();
    /* Call into cpu_idle with preempt disabled */
    /* From here on the boot thread runs the idle loop. */
    cpu_startup_entry(CPUHP_ONLINE);
}

  同样,这里有大量设备的init操作。但 rest_init() 稍微不太一样,至少它和硬件的关系不那么大了。它主要干三件大事:1. 创建init进程; 2. 创建kthreadd进程; 3. 让当前启动线程进入idle循环,成为idle进程。这三个东西,也许更值得多探探究竟。毕竟对于硬件,我们还是太外行了。

2. init进程的初始化过程

  init进程,又叫第一个进程,即pid为1的进程,是系统必不可少的进程。那它都干了啥呢?我们来看一下:

// main.c
// 初始化进程,主要用于执行 /bin/init 等启动命令
/*
 * kernel_init - body of the init thread created by rest_init().
 *
 * @unused: thread argument, not used.
 *
 * Runs kernel_init_freeable(), waits for asynchronous init code, frees
 * the init memory, sets system_state to SYSTEM_RUNNING and then tries to
 * start an init program in this order: ramdisk_execute_command,
 * execute_command, "/sbin/init", "/etc/init", "/bin/init", "/bin/sh".
 *
 * Returns 0 as soon as one of the candidates was started successfully.
 * Panics if execute_command was requested but failed, or if none of the
 * fallbacks could be run.
 */
static int __ref kernel_init(void *unused)
{
    int ret;
    /*
     * Do the remaining kernel-side initialization (it blocks until
     * kthreadd_done is completed) before switching to user space.
     */
    kernel_init_freeable();
    /* need to finish all async __init code before freeing the memory */
    async_synchronize_full();
    ftrace_free_init_mem();
    jump_label_invalidate_initmem();
    free_initmem();
    mark_readonly();
    system_state = SYSTEM_RUNNING;
    numa_default_policy();

    rcu_end_inkernel_boot();

    /* A failure here is only logged; the candidates below are tried next. */
    if (ramdisk_execute_command) {
        ret = run_init_process(ramdisk_execute_command);
        if (!ret)
            return 0;
        pr_err("Failed to execute %s (error %d)\n",
               ramdisk_execute_command, ret);
    }

    /*
     * We try each of these until one succeeds.
     *
     * The Bourne shell can be used instead of init if we are
     * trying to recover a really broken machine.
     */
    /* An explicitly requested init that fails is fatal: no fallback. */
    if (execute_command) {
        ret = run_init_process(execute_command);
        if (!ret)
            return 0;
        panic("Requested init %s failed (error %d).",
              execute_command, ret);
    }
    /*
     * Try the well-known init locations in turn to get the system
     * running. Platforms and configurations differ, hence several
     * attempts; the first one that succeeds ends the search and we
     * return 0.
     */
    if (!try_to_run_init_process("/sbin/init") ||
        !try_to_run_init_process("/etc/init") ||
        !try_to_run_init_process("/bin/init") ||
        !try_to_run_init_process("/bin/sh"))
        return 0;

    panic("No working init found.  Try passing init= option to kernel. "
          "See Linux Documentation/admin-guide/init.rst for guidance.");
}

// /init/main.c
/*
 * kernel_init_freeable - the __init part of the init thread's work.
 *
 * Called from kernel_init(). Waits for kthreadd_done, brings up SMP and
 * the scheduler's SMP support, runs do_basic_setup(), opens /dev/console
 * and duplicates descriptor 0 twice, decides whether an early user-space
 * init (ramdisk_execute_command, default "/init") is accessible, and
 * finally loads integrity keys and default modules.
 */
static noinline void __init kernel_init_freeable(void)
{
    /*
     * Wait until kthreadd is all set-up.
     */
    wait_for_completion(&kthreadd_done);

    /* Now the scheduler is fully set up and can do blocking allocations */
    gfp_allowed_mask = __GFP_BITS_MASK;

    /*
     * init can allocate pages on any node
     */
    set_mems_allowed(node_states[N_MEMORY]);

    cad_pid = task_pid(current);

    smp_prepare_cpus(setup_max_cpus);
    /*
     * Second stage of workqueue bring-up.
     * NOTE(review): presumably this creates the per-CPU worker threads so
     * queued work items can actually execute - confirm against
     * workqueue_init().
     */
    workqueue_init();

    init_mm_internals();

    do_pre_smp_initcalls();
    lockup_detector_init();

    smp_init();
    sched_init_smp();

    page_alloc_init_late();
    /* CPUs are set up (smp_init() above); now do the real initialization. */
    do_basic_setup();

    /* Open the /dev/console on the rootfs, this should never fail */
    if (ksys_open((const char __user *) "/dev/console", O_RDWR, 0) < 0)
        pr_err("Warning: unable to open an initial console.\n");

    /*
     * Duplicate descriptor 0 twice.
     * NOTE(review): presumably this yields descriptors 1 and 2
     * (stdout/stderr) on the console - confirm.
     */
    (void) ksys_dup(0);
    (void) ksys_dup(0);
    /*
     * check if there is an early userspace init.  If yes, let it do all
     * the work
     */

    if (!ramdisk_execute_command)
        ramdisk_execute_command = "/init";

    /*
     * If that program is not accessible, forget it and let
     * prepare_namespace() run instead.
     */
    if (ksys_access((const char __user *)
            ramdisk_execute_command, 0) != 0) {
        ramdisk_execute_command = NULL;
        prepare_namespace();
    }

    /*
     * Ok, we have completed the initial bootup, and
     * we're essentially up and running. Get rid of the
     * initmem segments and start the user-mode stuff..
     *
     * rootfs is available now, try loading the public keys
     * and default modules
     */

    integrity_load_keys();
    /* Load the default modules. */
    load_default_modules();
}

/*
 * Ok, the machine is now initialized. None of the devices
 * have been touched yet, but the CPU subsystem is up and
 * running, and memory and process management works.
 *
 * Now we can finally start doing some real work..
 *
 * do_basic_setup() runs the listed setup steps strictly in sequence:
 * cpuset SMP init, shmem, the driver model (driver_init()), IRQ proc
 * entries, constructors, enabling the usermode helper, and finally
 * do_initcalls().
 */
static void __init do_basic_setup(void)
{
    cpuset_init_smp();
    shmem_init();
    driver_init();
    init_irq_proc();
    do_ctors();
    usermodehelper_enable();
    do_initcalls();
}

// /drivers/base/init.c  驱动初始化
/**
 * driver_init - initialize driver model.
 *
 * Call the driver model init functions to initialize their
 * subsystems. Called early from init/main.c.
 *
 * The calls are made in two groups: the core pieces first, then the
 * pieces that must come after them. Takes no arguments, returns nothing.
 */
void __init driver_init(void)
{
    /* These are the core pieces */
    devtmpfs_init();
    devices_init();
    buses_init();
    classes_init();
    firmware_init();
    hypervisor_init();

    /* These are also core pieces, but must come after the
     * core core pieces.
     */
    platform_bus_init();
    cpu_dev_init();
    memory_dev_init();
    container_dev_init();
    of_core_init();
}


// /init/main.c
/*
 * This function requests modules which should be loaded by default and is
 * called twice right after initrd is mounted and right before init is
 * exec'd.  If such modules are on either initrd or rootfs, they will be
 * loaded before control is passed to userland.
 *
 * In this version the only default module handled is the I/O elevator,
 * via load_default_elevator_module().
 */
void __init load_default_modules(void)
{
    load_default_elevator_module();
}
// /block/elevator.c
/*
 * called during boot to load the elevator chosen by the elevator param
 *
 * Does nothing when chosen_elevator is empty. Otherwise looks the name up
 * under elv_list_lock and, if no matching elevator type is found, requests
 * the module named "<chosen_elevator>-iosched".
 */
void __init load_default_elevator_module(void)
{
    struct elevator_type *e;

    /* No elevator was chosen: nothing to load. */
    if (!chosen_elevator[0])
        return;

    /*
     * Boot parameter is deprecated, we haven't supported that for MQ.
     * Only look for non-mq schedulers from here.
     */
    spin_lock(&elv_list_lock);
    e = elevator_find(chosen_elevator, false);
    spin_unlock(&elv_list_lock);

    /* Not found: ask for the corresponding module to be loaded. */
    if (!e)
        request_module("%s-iosched", chosen_elevator);
}

  可以看到,init进程承担着非常重要的工作,它需要初始化内存,页,队列,cpu等等,还要创建用户空间,加载默认模块等等。并且更重要的是,它要负责执行开机启动程序,而这决定了我们的系统如何运行。它如此重要以至于,它作为第一个进程被创建出来。是一个不可少的进程。

3. kthreadd内核进程运行流程

  继init进程之后,kthreadd是第二个被创建的进程,它又是在干什么呢?实际上,它主要负责按需创建其他内核线程:谁想创建内核线程,就把请求挂到 kthread_create_list 上,由kthreadd统一取出并创建。

// /kernel/kthread.c  (函数声明在 /include/linux/kthread.h)
/*
 * kthreadd - body of the kernel thread that creates other kernel threads.
 *
 * @unused: thread argument, not used.
 *
 * After normalizing its own task settings, loops forever: it sleeps while
 * kthread_create_list is empty, and otherwise removes the queued
 * kthread_create_info entries one by one and passes each to
 * create_kthread(). The trailing "return 0" is never reached because the
 * loop has no exit.
 */
int kthreadd(void *unused)
{
    struct task_struct *tsk = current;

    /* Setup a clean context for our children to inherit. */
    /*
     * Give kthreadd itself a neutral baseline - its own name, signals
     * ignored, all CPUs and all memory nodes allowed - since the threads
     * it creates start out from this context.
     */
    set_task_comm(tsk, "kthreadd");
    ignore_signals(tsk);
    set_cpus_allowed_ptr(tsk, cpu_all_mask);
    set_mems_allowed(node_states[N_MEMORY]);

    current->flags |= PF_NOFREEZE;
    cgroup_init_kthreadd();

    for (;;) {
        /*
         * Mark ourselves sleeping before checking the list.
         * NOTE(review): presumably this ordering avoids missing a wakeup
         * from a creator that queues a request in between - confirm.
         */
        set_current_state(TASK_INTERRUPTIBLE);
        if (list_empty(&kthread_create_list))
            /*
             * Nothing queued: voluntarily give up the CPU.
             * NOTE(review): the original annotation says the context
             * switch is implemented in assembly - confirm.
             */
            schedule();
        __set_current_state(TASK_RUNNING);

        spin_lock(&kthread_create_lock);
        while (!list_empty(&kthread_create_list)) {
            struct kthread_create_info *create;

            create = list_entry(kthread_create_list.next,
                        struct kthread_create_info, list);
            list_del_init(&create->list);
            /* The lock is dropped while the thread is being created. */
            spin_unlock(&kthread_create_lock);
            /* Create one kernel thread for this request. */
            create_kthread(create);

            spin_lock(&kthread_create_lock);
        }
        spin_unlock(&kthread_create_lock);
    }

    return 0;
}

// /kernel/kthread.c   创建一个内核线程(进程)
/*
 * create_kthread - create the kernel thread described by one request.
 *
 * @create: the creation request taken off kthread_create_list.
 *
 * Starts a new thread running kthread() with @create as its argument.
 * On success nothing else is done here. If kernel_thread() fails, the
 * error is either reported to the requester through create->result and
 * create->done, or - when create->done was already NULL - @create is
 * freed here.
 */
static void create_kthread(struct kthread_create_info *create)
{
    int pid;

#ifdef CONFIG_NUMA
    /* Record the requested node on the current task before creating. */
    current->pref_node_fork = create->node;
#endif
    /* We want our own signal handler (we take no signals by default). */
    pid = kernel_thread(kthread, create, CLONE_FS | CLONE_FILES | SIGCHLD);
    if (pid < 0) {
        /* If user was SIGKILLed, I release the structure. */
        /* Atomically take the completion pointer, leaving NULL behind. */
        struct completion *done = xchg(&create->done, NULL);

        if (!done) {
            kfree(create);
            return;
        }
        /* Hand the negative pid back as an error pointer and signal done. */
        create->result = ERR_PTR(pid);
        complete(done);
    }
}

  可见 kthreadd 的作用就是不停地根据需要,创建一个个的内核线程。

4. idle进程

  idle进程是启动流程的最后一步:rest_init() 末尾调用 cpu_startup_entry() 后,启动线程自身就变成了idle进程。它的作用是:在不需要重新调度(没有其他任务要运行)时占住cpu并让其进入空闲状态,一旦需要调度就让出cpu。

// kernel/sched/idle.c
/*
 * cpu_startup_entry - enter the idle loop on the calling CPU.
 *
 * @state: hotplug state passed on to cpuhp_online_idle().
 *
 * Does not return: after the preparation calls it invokes do_idle() in
 * an endless loop.
 */
void cpu_startup_entry(enum cpuhp_state state)
{
    /*
     * This #ifdef needs to die, but it's too late in the cycle to
     * make this generic (ARM and SH have never invoked the canary
     * init for the non boot CPUs!). Will be fixed in 3.11
     */
#ifdef CONFIG_X86
    /*
     * If we're the non-boot CPU, nothing set the stack canary up
     * for us. The boot CPU already has it initialized but no harm
     * in doing it again. This is a good place for updating it, as
     * we wont ever return from this function (so the invalid
     * canaries already on the stack wont ever trigger).
     */
    boot_init_stack_canary();
#endif
    arch_cpu_idle_prepare();
    cpuhp_online_idle(state);
    /* Run do_idle() forever; this loop never terminates. */
    while (1)
        do_idle();
}

/*
 * Generic idle loop implementation
 *
 * Called with polling cleared.
 *
 * One pass of the idle task: stay in the inner loop for as long as
 * need_resched() is false, either polling or calling into
 * cpuidle_idle_call(), then leave idle and call schedule_idle().
 */
static void do_idle(void)
{
    int cpu = smp_processor_id();
    /*
     * If the arch has a polling bit, we maintain an invariant:
     *
     * Our polling bit is clear if we're not scheduled (i.e. if rq->curr !=
     * rq->idle). This means that, if rq->idle has the polling bit set,
     * then setting need_resched is guaranteed to cause the CPU to
     * reschedule.
     */

    __current_set_polling();
    tick_nohz_idle_enter();

    while (!need_resched()) {
        check_pgt_cache();
        rmb();

        /*
         * This CPU has been taken offline: report it and hand over to
         * the arch "dead" hook.
         */
        if (cpu_is_offline(cpu)) {
            tick_nohz_idle_stop_tick_protected();
            cpuhp_report_idle_dead();
            arch_cpu_idle_dead();
        }

        local_irq_disable();
        arch_cpu_idle_enter();

        /*
         * In poll mode we reenable interrupts and spin. Also if we
         * detected in the wakeup from idle path that the tick
         * broadcast device expired for us, we don't want to go deep
         * idle as we know that the IPI is going to arrive right away.
         */
        if (cpu_idle_force_poll || tick_check_broadcast_expired()) {
            tick_nohz_idle_restart_tick();
            /* Busy-poll instead of entering an idle state. */
            cpu_idle_poll();
        } else {
            cpuidle_idle_call();
        }
        arch_cpu_idle_exit();
    }

    /*
     * Since we fell out of the loop above, we know TIF_NEED_RESCHED must
     * be set, propagate it into PREEMPT_NEED_RESCHED.
     *
     * This is required because for polling idle loops we will not have had
     * an IPI to fold the state for us.
     */
    preempt_set_need_resched();
    tick_nohz_idle_exit();
    __current_clr_polling();

    /*
     * We promise to call sched_ttwu_pending() and reschedule if
     * need_resched() is set while polling is set. That means that clearing
     * polling needs to be visible before doing these things.
     */
    smp_mb__after_atomic();

    sched_ttwu_pending();
    schedule_idle();

    /* Apply a pending live-patch state change to the idle task, if any. */
    if (unlikely(klp_patch_pending(current)))
        klp_update_patch_state(current);
}

/*
 * cpu_idle_poll - idle by spinning instead of entering an idle state.
 *
 * Enables interrupts, then calls cpu_relax() repeatedly until a
 * reschedule is needed or neither of the two poll conditions
 * (cpu_idle_force_poll, tick_check_broadcast_expired()) holds any more.
 * The spin is bracketed by RCU idle enter/exit, tracing and
 * critical-timings calls.
 *
 * Always returns 1.
 */
static noinline int __cpuidle cpu_idle_poll(void)
{
    rcu_idle_enter();
    trace_cpu_idle_rcuidle(0, smp_processor_id());
    local_irq_enable();
    stop_critical_timings();

    /* Spin: each iteration only executes cpu_relax(). */
    while (!tif_need_resched() &&
        (cpu_idle_force_poll || tick_check_broadcast_expired()))
        cpu_relax();
    start_critical_timings();
    trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
    rcu_idle_exit();

    return 1;
}

// arch/sh/include/asm/processor.h
/*
 * cpu_relax() as quoted here is just barrier().
 * NOTE(review): other architectures may define cpu_relax() differently
 * (e.g. with a dedicated pause instruction) - confirm for the target arch.
 */
#define cpu_relax()    barrier()

// arch/powerpc/boot/io.h
/*
 * barrier - compiler-only barrier.
 *
 * The asm statement is empty, so it emits no instruction. The "memory"
 * clobber tells the compiler that memory may have changed, so it must not
 * cache memory values in registers or reorder memory accesses across
 * this point.
 */
static inline void barrier(void)
{
    asm volatile("" : : : "memory");
}

  idle 进程就是不停地运行检测,然后调用cpu命令进行休眠。

  当然了,在有的精简系统中,idle进程并非是必须的,但其思想却是值得一学的。