本文基于SPDK代码,nvmf_tgt -m 0x0F(3个IO reactor)运行环境
主要内容如下:
NVMe-oF Target启动
NVMe-oF Discovery Subsystem
NVMe-oF Subsystem创建
NVMe-oF bdev创建
NVMe-oF bdev添加到Subsystem
NVMe-oF Admin/IO/Discovery ctrlr
NVMe-oF Admin/IO Queue
SPDK dev/channel机制
NVMe-oF Target启动
Reacotor启动
- 使用
eal_worker_thread_create
为除当前核以外的CPU核创建一个woker_thread
运行eal_thread_loop
rte_atomic_store_explicit
存储reactor_run
到变量lcore_config
,再通过pipe写eal_thread_wake_worker
唤醒worker
,之后该reactor
会一直运行reactor_run
- 直接运行
reactor_run
启动main reactor
#0 eal_worker_thread_create (lcore_id=1) at ../lib/eal/linux/eal.c:921
#1 0x00005555557bfb8f in rte_eal_init (argc=12, argv=0x555555dce550) at ../lib/eal/linux/eal.c:1250
#2 0x0000555555645e13 in spdk_env_init (opts=0x7fffffffe020) at init.c:667
#3 0x00005555556f2bf3 in app_setup_env (opts=0x7fffffffe0f0) at app.c:419
#4 0x00005555556f3ff5 in spdk_app_start (opts_user=0x7fffffffe3a0, start_fn=0x55555557729a <nvmf_tgt_started>, arg1=0x0) at app.c:791
#5 0x00005555555773bc in main (argc=2, argv=0x7fffffffe5a8) at nvmf_main.c:47#0 eal_thread_wait_command () at ../lib/eal/unix/eal_unix_thread.c:36
#1 0x00005555557a201f in eal_thread_loop (arg=0x1) at ../lib/eal/common/eal_common_thread.c:189
#2 0x00005555557befad in eal_worker_thread_loop (arg=0x1) at ../lib/eal/linux/eal.c:916
#3 0x00007ffff77b2ac3 in start_thread (arg=<optimized out>) at ./nptl/pthread_create.c:442
#4 0x00007ffff7844850 in clone3 () at ../sysdeps/unix/sysv/linux/x86_64/clone3.S:81/*** Macro to browse all running lcores except the main lcore.*/
#define RTE_LCORE_FOREACH_WORKER(i) \for (i = rte_get_next_lcore(-1, 1, 0); \i < RTE_MAX_LCORE; \i = rte_get_next_lcore(i, 1, 0))if (pthread_create((pthread_t *)&lcore_config[lcore_id].thread_id.opaque_id,attrp, eal_worker_thread_loop, (void *)(uintptr_t)lcore_id) == 0)/* main loop of threads */
__rte_noreturn uint32_t
eal_thread_loop(void *arg)
{/* read on our pipe to get commands */while (1) {eal_thread_wait_command();/* Set the state to 'RUNNING'. Use release order* since 'state' variable is used as the guard variable.*/rte_atomic_store_explicit(&lcore_config[lcore_id].state, RUNNING,rte_memory_order_release);eal_thread_ack_command();/* Load 'f' with acquire order to ensure that* the memory operations from the main thread* are accessed only after update to 'f' is visible.* Wait till the update to 'f' is visible to the worker.*/while ((f = rte_atomic_load_explicit(&lcore_config[lcore_id].f,rte_memory_order_acquire)) == NULL)rte_pause();/* ... */}
}/* app thread,注意spdk thread需等到reactor_run运行才启动 */
spdk_cpuset_set_cpu(&tmp_cpumask, spdk_env_get_current_core(), true);
/* Now that the reactors have been initialized, we can create the app thread. */
spdk_thread_create("app_thread", &tmp_cpumask);#0 eal_thread_wake_worker (worker_id=2) at ../lib/eal/unix/eal_unix_thread.c:14
#1 0x0000555555791797 in rte_eal_remote_launch (f=0x5555557bee67 <sync_func>, arg=0x0, worker_id=2) at ../lib/eal/common/eal_common_launch.c:52
#2 0x0000555555791a6a in rte_eal_mp_remote_launch (f=0x5555557bee67 <sync_func>, arg=0x0, call_main=SKIP_MAIN) at ../lib/eal/common/eal_common_launch.c:79
#3 0x00005555557bfce9 in rte_eal_init (argc=12, argv=0x555555dce550) at ../lib/eal/linux/eal.c:1269
#4 0x0000555555645e13 in spdk_env_init (opts=0x7fffffffe020) at init.c:667
#5 0x00005555556f2bf3 in app_setup_env (opts=0x7fffffffe0f0) at app.c:419
#6 0x00005555556f3ff5 in spdk_app_start (opts_user=0x7fffffffe3a0, start_fn=0x55555557729a <nvmf_tgt_started>, arg1=0x0) at app.c:791
#7 0x00005555555773bc in main (argc=2, argv=0x7fffffffe5a8) at nvmf_main.c:47/* 向reactor发送消息,启动reactor */
SPDK_ENV_FOREACH_CORE(i) {if (i != current_core) {rc = spdk_env_thread_launch_pinned(reactor->lcore, reactor_run, reactor);}spdk_cpuset_set_cpu(&g_reactor_core_mask, i, true);}/* Start the main reactor */
reactor = spdk_reactor_get(current_core);
reactor_run(reactor);
spdk_env_thread_wait_all();
NVMe-oF Target/PG/SUBSYSTEM创建
- NVMe-oF注册
spdk_subsystem
, 启动时调用nvmf_subsystem_init
NVMF_TGT_INIT_CREATE_TARGET
:nvmf_subsystem_init
先创建target
,spdk_io_device_register
注册io_device
,之后为其添加Discovery Subsystem
(nvmf_add_discovery_subsystem
),进入下一阶段NVMF_TGT_INIT_CREATE_POLL_GROUPS
: 为每个CPU核创建spdk_thread, 发送msg
创建nvmf_poll_group
(spdk_nvmf_poll_group_create
->spdk_get_io_channel(tgt)
), 所有CPU核均创建完成后,进入下一阶段NVMF_TGT_INIT_START_SUBSYSTEMS
: 创建Subsystem
更新上下文,spdk_for_each_channel
调用subsystem_state_change_on_pg
为各个nvmf_poll_group
更新Discovery Subsystem
(namespace信息
),进入下一阶段NVMF_TGT_RUNNING
:Target
启动完成,初始化其他spdk subsystem
,spdk_subsystem_init_next(0)
, 初始化后,退出状态机,初始化完毕。进程运行期间,Target
均处于RUNNING
状态。
static struct spdk_subsystem g_spdk_subsystem_nvmf = {.name = "nvmf",.init = nvmf_subsystem_init,.fini = nvmf_subsystem_fini,.write_config_json = nvmf_subsystem_write_config_json,
};SPDK_SUBSYSTEM_REGISTER(g_spdk_subsystem_nvmf)
SPDK_SUBSYSTEM_DEPEND(nvmf, bdev)
SPDK_SUBSYSTEM_DEPEND(nvmf, sock)#0 spdk_nvmf_tgt_create (opts=0x7fffffffda30) at nvmf.c:359
#1 0x000055555564fc9f in nvmf_tgt_create_target () at nvmf_tgt.c:311
#2 0x000055555565004d in nvmf_tgt_advance_state () at nvmf_tgt.c:418
#3 0x0000555555650243 in nvmf_subsystem_init () at nvmf_tgt.c:492
#4 0x0000555555734943 in spdk_subsystem_init_next (rc=0) at subsystem.c:166
#5 0x00005555556ff108 in bdev_initialize_complete (cb_arg=0x0, rc=0) at bdev.c:17
#6 0x0000555555705055 in bdev_init_complete (rc=0) at bdev.c:2033
#7 0x00005555557050d3 in bdev_module_action_complete () at bdev.c:2075
#8 0x00005555557055c3 in spdk_bdev_initialize (cb_fn=0x5555556ff0eb <bdev_initialize_complete>, cb_arg=0x0) at bdev.c:2205
#9 0x00005555556ff127 in bdev_subsystem_initialize () at bdev.c:23
#10 0x0000555555734943 in spdk_subsystem_init_next (rc=0) at subsystem.c:166
#11 0x000055555571e4c6 in accel_subsystem_initialize () at accel.c:20
#12 0x0000555555734943 in spdk_subsystem_init_next (rc=0) at subsystem.c:166
#13 0x0000555555732865 in iobuf_subsystem_initialize () at iobuf.c:22
#14 0x0000555555734943 in spdk_subsystem_init_next (rc=0) at subsystem.c:166
#15 0x000055555572fd80 in sock_subsystem_init () at sock.c:13
#16 0x0000555555734943 in spdk_subsystem_init_next (rc=0) at subsystem.c:166
#17 0x000055555572b0e8 in vmd_subsystem_init () at vmd.c:63
#18 0x0000555555734943 in spdk_subsystem_init_next (rc=0) at subsystem.c:166
#19 0x00005555556f1f71 in scheduler_subsystem_init () at scheduler.c:23
#20 0x0000555555734943 in spdk_subsystem_init_next (rc=0) at subsystem.c:166
#21 0x0000555555734ad0 in spdk_subsystem_init (cb_fn=0x5555556f277b <app_subsystem_init_done>, cb_arg=0x0) at subsystem.c:199
#22 0x00005555556f2922 in app_start_rpc (rc=0, arg1=0x0) at app.c:356
#23 0x00005555556f3260 in bootstrap_fn (arg1=0x0) at app.c:545spdk_thread_send_msg(spdk_thread_get_app_thread(), bootstrap_fn, NULL);enum nvmf_tgt_state {NVMF_TGT_INIT_NONE = 0,NVMF_TGT_INIT_CREATE_TARGET,NVMF_TGT_INIT_CREATE_POLL_GROUPS,NVMF_TGT_INIT_START_SUBSYSTEMS,NVMF_TGT_RUNNING,NVMF_TGT_FINI_STOP_SUBSYSTEMS,NVMF_TGT_FINI_DESTROY_SUBSYSTEMS,NVMF_TGT_FINI_DESTROY_POLL_GROUPS,NVMF_TGT_FINI_DESTROY_TARGET,NVMF_TGT_STOPPED,NVMF_TGT_ERROR,
};SPDK_ENV_FOREACH_CORE(cpu) {if (g_poll_groups_mask && !spdk_cpuset_get_cpu(g_poll_groups_mask, cpu)) {continue;}snprintf(thread_name, sizeof(thread_name), "nvmf_tgt_poll_group_%u", count++);thread = spdk_thread_create(thread_name, g_poll_groups_mask);assert(thread != NULL);spdk_thread_send_msg(thread, nvmf_tgt_create_poll_group, NULL);}spdk_io_device_register(tgt,nvmf_tgt_create_poll_group,nvmf_tgt_destroy_poll_group,sizeof(struct spdk_nvmf_poll_group),tgt->name);struct spdk_nvmf_poll_group *
spdk_nvmf_poll_group_create(struct spdk_nvmf_tgt *tgt)
{struct spdk_io_channel *ch;ch = spdk_get_io_channel(tgt);if (!ch) {SPDK_ERRLOG("Unable to get I/O channel for target\n");return NULL;}return spdk_io_channel_get_ctx(ch);
}if (++g_num_poll_groups == nvmf_get_cpuset_count()) {if (g_tgt_state != NVMF_TGT_ERROR) {g_tgt_state = NVMF_TGT_INIT_START_SUBSYSTEMS;}nvmf_tgt_advance_state();}spdk_for_each_channel(subsystem->tgt,subsystem_state_change_on_pg,ctx,subsystem_state_change_done);
NVMe-oF Discovery Subsystem
- 客户端nvme discover -t rdma -a 192.168.2.30 -s 4420时,统一指定Subsystem为nqn.2014-08.org.nvmexpress.discovery,Target默认创建该subsystem,其根据trid匹配subsystem,返回对应的Subsystem nqn及其他信息
Discovery log page
。
scripts/rpc.py nvmf_create_transport -t RDMA -u 8192 -i 131072 -c 8192
scripts/rpc.py nvmf_create_subsystem nqn.2016-06.io.spdk:cnode1 -a -s SPDK00000000000001 -d SPDK_Controller1
scripts/rpc.py bdev_malloc_create -b Malloc0 512 512
scripts/rpc.py nvmf_subsystem_add_ns nqn.2016-06.io.spdk:cnode1 Malloc0