You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
/*
 * CPU hotplug states (excerpt). The enumerators are listed in order, so the
 * position of an entry in this list determines its numeric value. Lines
 * consisting of "..." mark entries elided from this excerpt.
 */
enum cpuhp_state {
CPUHP_INVALID = -1,
/* PREPARE section invoked on a control CPU */
CPUHP_OFFLINE = 0,
CPUHP_CREATE_THREADS,
CPUHP_PERF_PREPARE,
...
CPUHP_BRINGUP_CPU,
/*
 * STARTING section invoked on the hotplugged CPU in low level
 * bringup and teardown code.
 */
CPUHP_AP_IDLE_DEAD,
CPUHP_AP_OFFLINE,
...
CPUHP_AP_ONLINE,
CPUHP_TEARDOWN_CPU,
/* Online section invoked on the hotplugged CPU from the hotplug thread */
CPUHP_AP_ONLINE_IDLE,
...
CPUHP_AP_ONLINE_DYN,
/* Leaves a gap of 30 values after CPUHP_AP_ONLINE_DYN (presumably the dynamically registered states). */
CPUHP_AP_ONLINE_DYN_END = CPUHP_AP_ONLINE_DYN + 30,
/* Must be after CPUHP_AP_ONLINE_DYN for node_states[N_CPU] update */
CPUHP_AP_MM_DEMOTION_ONLINE,
CPUHP_AP_X86_HPET_ONLINE,
CPUHP_AP_X86_KVM_CLK_ONLINE,
CPUHP_AP_ACTIVE,
CPUHP_ONLINE,
};
在适当的位置将状态插入枚举中,这样就满足了排序要求。状态常量必须被用于状态的设置和移除。
当状态回调不是在运行时设置的,并且是 kernel/cpu.c 中 CPU 热插拔状态数组初始化的一部分时,也需要静态分配。
/* Boot processor state steps */staticstructcpuhp_step cpuhp_hp_states[] = {
[CPUHP_OFFLINE] = {
.name = "offline",
.startup.single = NULL,
.teardown.single = NULL,
},
...
/* Entry state on starting. Interrupts enabled from here on. Transient * state for synchronsization */
[CPUHP_AP_ONLINE] = {
.name = "ap:online",
},
/* * Handled on control processor until the plugged processor manages * this itself.*/
[CPUHP_TEARDOWN_CPU] = {
.name = "cpu:teardown",
.startup.single = NULL,
.teardown.single = takedown_cpu,
.cant_stop = true,
},
[CPUHP_AP_SCHED_WAIT_EMPTY] = {
.name = "sched:waitempty",
.startup.single = NULL,
.teardown.single = sched_cpu_wait_empty,
},
...
/* * The dynamically registered state space is here*/
#ifdef CONFIG_SMP
/* Last state is scheduler control setting the cpu active */
[CPUHP_AP_ACTIVE] = {
.name = "sched:active",
.startup.single = sched_cpu_activate,
.teardown.single = sched_cpu_deactivate,
},
#endif/* CPU is fully up and running. */
[CPUHP_ONLINE] = {
.name = "online",
.startup.single = NULL,
.teardown.single = NULL,
},
};
/* * Get the next state to run. Empty ones will be skipped. Returns true if a * state must be run. * * st->state will be modified ahead of time, to match state_to_run, as if it * has already ran.*/staticboolcpuhp_next_state(bool bringup,
enum cpuhp_state *state_to_run,
structcpuhp_cpu_state *st,
enum cpuhp_state target)
{
do {
if (bringup) { //如果是 bringup,状态向前(下)迁移if (st->state >= target) //到达目标状态了,返回 false 让调用者停止它的循环returnfalse;
*state_to_run = ++st->state;
} else { //否则,状态向后(上)迁移if (st->state <= target) //到达目标状态了,返回 false 让调用者停止它的循环returnfalse;
*state_to_run = st->state--;
}
//空步骤将会被跳过if (!cpuhp_step_empty(bringup, cpuhp_get_step(*state_to_run)))
break;
} while (true);
//如果该步骤必须被运行,返回 true 让调用者继续循环returntrue;
}
/* * The cpu hotplug threads manage the bringup and teardown of the cpus*/staticintcpuhp_should_run(unsignedint cpu)
{
structcpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state);
return st->should_run;
}
/* * Execute teardown/startup callbacks on the plugged cpu. Also used to invoke * callbacks when a state gets [un]installed at runtime. * * Each invocation of this function by the smpboot thread does a single AP * state callback. * * It has 3 modes of operation: * - single: runs st->cb_state * - up: runs ++st->state, while st->state < st->target * - down: runs st->state--, while st->state > st->target * * When complete or on error, should_run is cleared and the completion is fired.*/staticvoidcpuhp_thread_fun(unsignedint cpu)
{
structcpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state);
bool bringup = st->bringup;
enum cpuhp_state state;
//不应该以“不应该运行”的标志唤醒该线程if (WARN_ON_ONCE(!st->should_run))
return;
/* * ACQUIRE for the cpuhp_should_run() load of ->should_run. Ensures * that if we see ->should_run we also see the rest of the state.*/smp_mb();
/* * The BP holds the hotplug lock, but we're now running on the AP, * ensure that anybody asserting the lock is held, will actually find * it so.*/lockdep_acquire_cpus_lock();
cpuhp_lock_acquire(bringup);
//mode 1,在运行时状态被 [un]installedif (st->single) {
state = st->cb_state;
st->should_run = false;
} else { //mode 2、3,在插入的 CPU 上执行 teardown/startup 回调
st->should_run = cpuhp_next_state(bringup, &state, st, st->target); //递进状态,并决定是否需要终止循环if (!st->should_run)
goto end;
}
//本线程仅负责处理 CPUHP_BRINGUP_CPU 之前的状态WARN_ON_ONCE(!cpuhp_is_ap_state(state));
//前面的 STARTING/DYING 状态需要在禁用 IRQ 的情况下运行,并且不得失败。if (cpuhp_is_atomic_state(state)) {
local_irq_disable();
st->result = cpuhp_invoke_callback(cpu, state, bringup, st->node, &st->last);
local_irq_enable();
/* * STARTING/DYING must not fail!*/WARN_ON_ONCE(st->result);
} else {
st->result = cpuhp_invoke_callback(cpu, state, bringup, st->node, &st->last);
}
//如果出错发生了回滚,然而回滚也失败了,也不要再继续运行了,内核文档有解释过这个场景if (st->result) {
/* * If we fail on a rollback, we're up a creek without no * paddle, no way forward, no way back. We loose, thanks for * playing.*/WARN_ON_ONCE(st->rollback);
st->should_run = false;
}
end:
cpuhp_lock_release(bringup);
lockdep_release_cpus_lock();
//结束循环,设置完成量,让 CPU offline 的发起 CPU 继续运行if (!st->should_run)
complete_ap_thread(st, bringup);
}
CPUHP_AP_ACTIVE 的 .teardown.single = sched_cpu_deactivate 就负责将任务推到其他 CPU 上
CPUHP_AP_SCHED_WAIT_EMPTY 状态等待调度完成的事件
这两个函数会在 CPU 热插拔状态迁移的过程中分别被 cpuhp_thread_fun() 调用,完成被 offlined CPU 上的任务迁移
/* Teardown-only step: there is no startup callback; sched_cpu_wait_empty is the teardown callback. */
[CPUHP_AP_SCHED_WAIT_EMPTY] = {
.name = "sched:waitempty",
.startup.single = NULL,
.teardown.single = sched_cpu_wait_empty,
},
...
#ifdef CONFIG_SMP
/*
 * Last state is scheduler control setting the cpu active.
 * sched_cpu_activate is the startup callback, sched_cpu_deactivate the
 * teardown callback.
 */
[CPUHP_AP_ACTIVE] = {
.name = "sched:active",
.startup.single = sched_cpu_activate,
.teardown.single = sched_cpu_deactivate,
},
#endif
路径如下:
(gdb) bt
#0  sched_cpu_deactivate (cpu=3) at kernel/sched/core.c:9560
#1  0xffffffff81140b39 in cpuhp_invoke_callback (cpu=cpu@entry=3, state=CPUHP_AP_ACTIVE, bringup=bringup@entry=false,
    node=0x0 <fixed_percpu_data>, lastp=0xff1100017bb9b988) at kernel/cpu.c:192
#2  0xffffffff81141b6f in cpuhp_thread_fun (cpu=3) at kernel/cpu.c:825
#3  0xffffffff81175aa3 in smpboot_thread_fn (data=0xff11000100073210) at kernel/smpboot.c:164
#4  0xffffffff8116b066 in kthread (_create=0xff110001005aab80) at kernel/kthread.c:376
#5  0xffffffff81006ff9 in ret_from_fork () at arch/x86/entry/entry_64.S:311
#6  0x0000000000000000 in ?? ()
核心函数 __cpuhp_invoke_callback_range()
__cpuhp_invoke_callback_range() 则是在发起 offline 的 CPU 上通过 cpuhp_down_callbacks() 调用的核心函数
它会让 cpuhp_state 状态机发生变化,并且调用对应状态的回调函数
staticint__cpuhp_invoke_callback_range(bool bringup,
unsignedint cpu,
structcpuhp_cpu_state *st,
enum cpuhp_state target,
bool nofail)
{
enum cpuhp_state state;
int ret = 0;
while (cpuhp_next_state(bringup, &state, st, target)) {
int err;
//调用对应状态的回调函数
err = cpuhp_invoke_callback(cpu, state, bringup, NULL, NULL);
if (!err)
continue; //如果没出错,迁移到下一个状态//如果出错了,调用者要求不管失败,则只是打印在某个状态遇到了错误的信息if (nofail) {
pr_warn("CPU %u %s state %s (%d) failed (%d)\n",
cpu, bringup ? "UP" : "DOWN",
cpuhp_get_step(st->state)->name,
st->state, err);
ret = -1;
} else { //调用者关心失败,则停止状态迁移,返回出错码
ret = err;
break;
}
}
return ret;
}
运行在发起 CPU Offline 进程上的 takedown_cpu()
当发起 CPU Offline 的进程进入到 CPUHP_TEARDOWN_CPU 的时候就会调用其回调函数 takedown_cpu()
kernel/cpu.c
staticinttakedown_cpu(unsignedint cpu)
{
structcpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
int err;
//停放 "cpuhp/%u"/* Park the smpboot threads */kthread_park(st->thread);
/* * Prevent irq alloc/free while the dying cpu reorganizes the * interrupt affinities.*/irq_lock_sparse();
//给 offline CPU 安排一个 stop machine 的 take_cpu_down() 流程,完成第二阶段的 offline/* * So now all preempt/rcu users must observe !cpu_active().*/
err = stop_machine_cpuslocked(take_cpu_down, NULL, cpumask_of(cpu));
if (err) {
/* CPU refused to die */irq_unlock_sparse();
/* Unpark the hotplug thread so we can rollback there */kthread_unpark(st->thread);
return err;
}
BUG_ON(cpu_online(cpu));
//等待 stop machine 的 take_cpu_down() 流程走完/* * The teardown callback for CPUHP_AP_SCHED_STARTING will have removed * all runnable tasks from the CPU, there's only the idle task left now * that the migration thread is done doing the stop_machine thing. * * Wait for the stop thread to go away.*/wait_for_ap_thread(st, false);
BUG_ON(st->state != CPUHP_AP_IDLE_DEAD);
/* Interrupts are moved away from the dying cpu, reenable alloc/free */irq_unlock_sparse();
hotplug_cpu__broadcast_tick_pull(cpu);
/* This actually kills the CPU. */__cpu_die(cpu);
cpuhp_bp_sync_dead(cpu);
tick_cleanup_dead_cpu(cpu);
rcutree_migrate_callbacks(cpu);
return0;
}
运行在 Stop Machine 进程中的 take_cpu_down()
这个 stop machine 的 work 是发起 CPU Offline 的进程安排进来的,完成第二阶段的 offline
kernel/cpu.c
staticinttake_cpu_down(void *_param)
{
structcpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state);
enum cpuhp_state target = max((int)st->target, CPUHP_AP_OFFLINE);
int err, cpu = smp_processor_id();
/* Ensure this CPU doesn't handle any more interrupts. */
err = __cpu_disable();
if (err < 0)
return err;
/* * Must be called from CPUHP_TEARDOWN_CPU, which means, as we are going * down, that the current state is CPUHP_TEARDOWN_CPU - 1.*/WARN_ON(st->state != (CPUHP_TEARDOWN_CPU - 1));
/* * Invoke the former CPU_DYING callbacks. DYING must not fail!*/cpuhp_invoke_callback_range_nofail(false, cpu, st, target);
/* Give up timekeeping duties */tick_handover_do_timer();
/* Remove CPU from timer broadcasting */tick_offline_cpu(cpu);
/* Park the stopper thread */stop_machine_park(cpu);
return0;
}