s1eep123's blog.

记一次虚拟机启动失败调试过程

Word count: 2.2kReading time: 11 min
2023/06/07

记一次虚拟机启动失败调试过程

实习过程中被分配了一个小任务:替换iso镜像中的sqfs文件并重新打包为iso。在测试iso是否打包成功的过程中,用vmware启动时发现无法启动并报panic,具体输出如下图。之后用centos7镜像做同样的测试,发现也会报错,于是深入centos7内核代码进行查看和调试,最终发现是由于虚拟机配置初始化时默认硬盘空间太小,导致无法挂载系统盘。

image-20230719225511210

image-20230719225531527

linux kernel源代码和iso镜像的关系

梳理一下如何从linux kernel代码来制作iso的过程

  1. 下载kernel代码
  2. 修改配置文件
  3. 编译内核
  4. 安装内核
  5. 将文件系统打包为iso镜像

在安装好的系统下执行 sudo cp -r /boot iso,然后创建isolinux.cfg配置文件,之后使用mkisofs工具将文件系统打包为iso即可完成。

解压原本的iso文件可以看到以下目录。可以理解为:iso是一个包含了系统的文件系统信息(如目录结构、文件属性和引导代码)的压缩文件。

image-20230719225552391

image-20230719225611925

image-20230719225626653

image-20230719225639644

虚拟机启动panic过程

image-20230719225653745

能想到的调试方法有两种。一种是通过qemu模拟虚拟机,再通过外部串口连接,使用kgdb进行调试。但此方法需要对linux代码添加相关调试信息,且需要重新编译源码,较为麻烦。

因此采用另一种方法:通过报错字符串定位内核代码,再向上回溯调用栈来进行debug。

下载该linux发行版对应的内核版本,根据屏幕显示的debug信息全局搜索"Decompressing Linux..."关键字,定位到具体函数。(linux内核中包含了不同架构实现的代码,由于centos7使用的是x86架构,因此后续分析如不特别说明,皆指x86架构下的代码。)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
//Users/s1eep/kernel/linux/arch/x86/boot/compressed/misc.c

/*
 * decompress_kernel() - excerpt with the author's annotations.
 *
 * Saves the rmode pointer, chooses the text-console memory/port, records the
 * heap range, prints "Decompressing Linux... ", calls decompress() on
 * input_data/input_len with output as the destination, then parse_elf(output).
 * Part of the original function is elided (see the placeholder line below),
 * so this excerpt is not compilable as shown.
 */
asmlinkage void decompress_kernel(void *rmode, memptr heap,
unsigned char *input_data,
unsigned long input_len,
unsigned char *output)
{
// Keep the rmode argument in real_mode and pass it to sanitize_boot_params().
real_mode = rmode;
sanitize_boot_params(real_mode);

// Console setup: choose the video memory base (vidmem) and port (vidport)
// from orig_video_mode (mode 7 -> 0xb0000/0x3b4, otherwise 0xb8000/0x3d4),
// then read the number of screen lines and columns.
if (real_mode->screen_info.orig_video_mode == 7) {
vidmem = (char *) 0xb0000;
vidport = 0x3b4;
} else {
vidmem = (char *) 0xb8000;
vidport = 0x3d4;
}
lines = real_mode->screen_info.orig_video_lines;
cols = real_mode->screen_info.orig_video_cols;

console_init();
// See __putstr(const char *s) in x86/boot/compressed/misc.c for how the
// characters are actually written to the screen.
debug_putstr("early console in decompress_kernel\n");

// Record the free-memory (heap) range: [heap, heap + BOOT_HEAP_SIZE).
free_mem_ptr = heap; /* Heap */
free_mem_end_ptr = heap + BOOT_HEAP_SIZE;

// The next line is the author's placeholder (not C code) for the elided
// checks on whether the memory ranges are out of bounds.
...内存区间是否越界相关判断

// The strings below are the ones seen on screen while the kernel boots.
debug_putstr("\nDecompressing Linux... ");
decompress(input_data, input_len, NULL, NULL, output, NULL, error);
parse_elf(output);
debug_putstr("done.\nBooting the kernel.\n");
return;
}

继续分析该函数的调用者,可以看到它是在下面的汇编代码中被调用的(call decompress_kernel):

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
//linux/arch/x86/boot/compressed/head_64.S

.code32
.text

#include <linux/init.h>
#include <linux/linkage.h>
#include <asm/segment.h>
#include <asm/pgtable_types.h>
#include <asm/page_types.h>
#include <asm/boot.h>
#include <asm/msr.h>
#include <asm/processor-flags.h>
#include <asm/asm-offsets.h>

// The 32-bit handling logic has been removed from this excerpt; only the
// 64-bit path is shown.
.code64
.org 0x200
ENTRY(startup_64)

#ifdef CONFIG_EFI_STUB
jmp preferred_addr

ENTRY(efi_pe_entry)
mov %rcx, %rdi
mov %rdx, %rsi
pushq %rdi
pushq %rsi
call make_boot_params
cmpq $0,%rax
je 1f
mov %rax, %rdx
popq %rsi
popq %rdi

ENTRY(efi_stub_entry)
call efi_main
movq %rax,%rsi
cmpq $0,%rax
jne 2f
1:
/* EFI init failed, so hang. */
hlt
jmp 1b
2:
call 3f
3:
popq %rax
subq $3b, %rax
subq BP_pref_address(%rsi), %rax
add BP_code32_start(%esi), %eax
leaq preferred_addr(%rax), %rax
jmp *%rax

preferred_addr:
#endif

// Zero the segment registers (ds, es, ss, fs, gs).
xorl %eax, %eax
movl %eax, %ds
movl %eax, %es
movl %eax, %ss
movl %eax, %fs
movl %eax, %gs

/*
* If the kernel is relocatable, take the runtime address of startup_32 and
* round it up to the alignment read from BP_kernel_alignment(%rsi); the
* result in rbp is where the kernel is decompressed and run.
* Otherwise decompress and run the kernel from LOAD_PHYSICAL_ADDR.
*/
#ifdef CONFIG_RELOCATABLE
leaq startup_32(%rip) /* - $startup_32 */, %rbp
movl BP_kernel_alignment(%rsi), %eax
decl %eax
addq %rax, %rbp
notq %rax
andq %rax, %rbp
#else
movq $LOAD_PHYSICAL_ADDR, %rbp
#endif

/*
* rbx = rbp + z_extract_offset. rbx is the base used below for the stack,
* for the destination of the copy of the compressed image, and for the
* jump to `relocated`.
*/
leaq z_extract_offset(%rbp), %rbx

/* Set up the stack */
leaq boot_stack_end(%rbx), %rsp

/* Zero EFLAGS */
pushq $0
popfq

/*
* Copy the compressed kernel to the end of our buffer
* where decompression in place becomes safe.
* (std + rep movsq copies 8 bytes at a time, from the last quadword
* before _bss downwards; rsi is saved and restored around the copy.)
*/
pushq %rsi
leaq (_bss-8)(%rip), %rsi
leaq (_bss-8)(%rbx), %rdi
movq $_bss /* - $startup_32 */, %rcx
shrq $3, %rcx
std
rep movsq
cld
popq %rsi

/*
* Jump to the relocated address.
*/
leaq relocated(%rbx), %rax
jmp *%rax

/*
* relocated: reached through the `jmp *%rax` at the end of startup_64.
* Zeroes the range _bss.._ebss, adds rbx to every 8-byte entry in
* _got.._egot, calls decompress_kernel and finally jumps to the address
* held in rbp.
*/
.text
relocated:

/*
* Clear BSS (stack is currently empty)
*/
xorl %eax, %eax
leaq _bss(%rip), %rdi
leaq _ebss(%rip), %rcx
subq %rdi, %rcx
shrq $3, %rcx
rep stosq

/*
* Adjust our own GOT
*/
leaq _got(%rip), %rdx
leaq _egot(%rip), %rcx
1:
cmpq %rcx, %rdx
jae 2f
addq %rbx, (%rdx)
addq $8, %rdx
jmp 1b
2:

/*
* Do the decompression, and jump to the new kernel..
*/
pushq %rsi /* Save the real mode argument */
movq %rsi, %rdi /* real mode address */
leaq boot_heap(%rip), %rsi /* malloc area for uncompression */
leaq input_data(%rip), %rdx /* input_data */
movl $z_input_len, %ecx /* input_len */
movq %rbp, %r8 /* output target address */
// Call decompress_kernel with the five arguments loaded above.
call decompress_kernel
popq %rsi

/*
* Jump to the decompressed kernel.
*/
jmp *%rbp

/* no_longmode: halt forever; if hlt returns, jump back and halt again. */
.code32
no_longmode:
/* This isn't an x86-64 CPU so hang */
1:
hlt
jmp 1b

#include "../../kernel/verify_cpu.S"

/*
* gdt: starts with a 16-bit size (gdt_end - gdt), the 32-bit address of gdt
* and a 16-bit zero, followed by five quadword entries labelled by the
* inline comments below. gdt_end marks the end of the table.
*/
.data
gdt:
.word gdt_end - gdt
.long gdt
.word 0
.quad 0x0000000000000000 /* NULL descriptor */
.quad 0x00af9a000000ffff /* __KERNEL_CS */
.quad 0x00cf92000000ffff /* __KERNEL_DS */
.quad 0x0080890000000000 /* TS descriptor */
.quad 0x0000000000000000 /* TS continued */
gdt_end:

/*
* Stack and heap for uncompression
*
* boot_heap is BOOT_HEAP_SIZE bytes and boot_stack is BOOT_STACK_SIZE bytes,
* both in .bss. boot_stack_end labels the end of the stack area; startup_64
* loads %rsp from boot_stack_end(%rbx), and `relocated` passes boot_heap to
* decompress_kernel.
*/
.bss
.balign 4
boot_heap:
.fill BOOT_HEAP_SIZE, 1, 0
boot_stack:
.fill BOOT_STACK_SIZE, 1, 0
boot_stack_end:

/*
* Space for page tables (not in .bss so not zeroed)
*
* Reserves 6*4096 bytes, aligned to 4096, in the ".pgtable" section.
*/
.section ".pgtable","a",@nobits
.balign 4096
pgtable:
.fill 6*4096, 1, 0

接着根据虚拟机启动时的报错信息,在内核代码中向上溯源

/initrd.image: incomplete write (-28!=xxxx)

找到以下函数

1
2
3
4
5
6
7
8
9
10
11
12
13
14
///Users/s1eep/kernel/linux/init/do_mounts_rd.c
/*
 * compr_flush() - write one chunk of decompressed data to the output fd.
 * @window: buffer holding the data to write
 * @outcnt: number of bytes in @window to write
 *
 * Writes @outcnt bytes from @window to crd_outfd with sys_write(). If the
 * value returned by sys_write() differs from @outcnt (short write or a
 * negative error code), the "RAMDISK: incomplete write" message is printed
 * (only while decompress_error is still 0), decompress_error is set to 1
 * and -1 is returned.
 *
 * Return: @outcnt on success, -1 on an incomplete write.
 */
static int __init compr_flush(void *window, unsigned int outcnt)
{
	int written = sys_write(crd_outfd, window, outcnt);

	if (written != outcnt) {
		/* Report only the first failure. */
		if (decompress_error == 0)
			printk(KERN_ERR
			       "RAMDISK: incomplete write (%d != %d)\n",
			       written, outcnt);
		decompress_error = 1;
		return -1;
	}
	return outcnt;
}

向上查看函数调用链

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
/Users/s1eep/kernel/linux/init/do_mounts_rd.c
static int __init crd_load(int in_fd, int out_fd, decompress_fn deco)

/Users/s1eep/kernel/linux/init/do_mounts_rd.c
int __init rd_load_image(char *from)

/Users/s1eep/kernel/linux/init/do_mounts_initrd.c
int __init initrd_load(void)

/Users/s1eep/kernel/linux/init/do_mounts.c
void __init prepare_namespace(void)

/Users/s1eep/kernel/linux/init/main.c
static noinline void __init kernel_init_freeable(void)

/Users/s1eep/kernel/linux/init/main.c
static int __ref kernel_init(void *unused)

/Users/s1eep/kernel/linux/init/main.c
static noinline void __init_refok rest_init(void)
asmlinkage void __init start_kernel(void)
/*
 * start_kernel() - the outermost function in the call chain traced in this
 * article.
 *
 * Runs a long, order-dependent sequence of initialisation calls: it disables
 * interrupts early (local_irq_disable()), performs architecture, memory,
 * scheduler, IRQ and timer setup, re-enables interrupts
 * (local_irq_enable()), initialises the console, continues with further
 * subsystem setup and finally calls rest_init().
 */
asmlinkage void __init start_kernel(void)
{
char * command_line;
extern const struct kernel_param __start___param[], __stop___param[];

/*
* Need to run as early as possible, to initialize the
* lockdep hash:
*/
lockdep_init();
smp_setup_processor_id();
debug_objects_early_init();

/*
* Set up the the initial canary ASAP:
*/
boot_init_stack_canary();

cgroup_init_early();

local_irq_disable();
early_boot_irqs_disabled = true;

/*
* Interrupts are still disabled. Do necessary setups, then
* enable them
*/
boot_cpu_init();
page_address_init();
pr_notice("%s", linux_banner);
setup_arch(&command_line);
mm_init_owner(&init_mm, &init_task);
mm_init_cpumask(&init_mm);
setup_command_line(command_line);
setup_nr_cpu_ids();
setup_per_cpu_areas();
smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */

build_all_zonelists(NULL, NULL);
page_alloc_init();

pr_notice("Kernel command line: %s\n", boot_command_line);
parse_early_param();
parse_args("Booting kernel", static_command_line, __start___param,
__stop___param - __start___param,
-1, -1, &unknown_bootoption);

jump_label_init();

/*
* These use large bootmem allocations and must precede
* kmem_cache_init()
*/
setup_log_buf(0);
pidhash_init();
vfs_caches_init_early();
sort_main_extable();
trap_init();
mm_init();

/*
* Set up the scheduler prior starting any interrupts (such as the
* timer interrupt). Full topology setup happens at smp_init()
* time - but meanwhile we still have a functioning scheduler.
*/
sched_init();
/*
* Disable preemption - early bootup scheduling is extremely
* fragile until we cpu_idle() for the first time.
*/
preempt_disable();
if (WARN(!irqs_disabled(), "Interrupts were enabled *very* early, fixing it\n"))
local_irq_disable();
idr_init_cache();
perf_event_init();
rcu_init();
tick_nohz_init();
radix_tree_init();
/* init some links before init_ISA_irqs() */
early_irq_init();
init_IRQ();
tick_init();
init_timers();
hrtimers_init();
softirq_init();
timekeeping_init();
time_init();
profile_init();
call_function_init();
WARN(!irqs_disabled(), "Interrupts were enabled early\n");
early_boot_irqs_disabled = false;
local_irq_enable();

kmem_cache_init_late();

/*
* HACK ALERT! This is early. We're enabling the console before
* we've done PCI setups etc, and console_init() must be aware of
* this. But we do want output early, in case something goes wrong.
*/
console_init();
if (panic_later)
panic(panic_later, panic_param);

lockdep_info();

/*
* Need to run this when irqs are enabled, because it wants
* to self-test [hard/soft]-irqs on/off lock inversion bugs
* too:
*/
locking_selftest();

/*
* If an initrd address is set, initrd_below_start_ok is not set, and the
* page frame number of initrd_start is below min_low_pfn, report it and
* disable the initrd by zeroing initrd_start.
*/
#ifdef CONFIG_BLK_DEV_INITRD
if (initrd_start && !initrd_below_start_ok &&
page_to_pfn(virt_to_page((void *)initrd_start)) < min_low_pfn) {
pr_crit("initrd overwritten (0x%08lx < 0x%08lx) - disabling it.\n",
page_to_pfn(virt_to_page((void *)initrd_start)),
min_low_pfn);
initrd_start = 0;
}
#endif
page_cgroup_init();
debug_objects_mem_init();
kmemleak_init();
setup_per_cpu_pageset();
numa_policy_init();
if (late_time_init)
late_time_init();
sched_clock_init();
calibrate_delay();
pidmap_init();
anon_vma_init();
#ifdef CONFIG_X86
if (efi_enabled(EFI_RUNTIME_SERVICES))
efi_enter_virtual_mode();
#endif
thread_info_cache_init();
cred_init();
fork_init(totalram_pages);
proc_caches_init();
buffer_init();
key_init();
security_init();
dbg_late_init();
vfs_caches_init(totalram_pages);
signals_init();
/* rootfs populating might need page-writeback */
page_writeback_init();
#ifdef CONFIG_PROC_FS
proc_root_init();
#endif
cgroup_init();
cpuset_init();
taskstats_init_early();
delayacct_init();

check_bugs();

acpi_early_init(); /* before LAPIC and SMP init */
sfi_init_late();

if (efi_enabled(EFI_RUNTIME_SERVICES)) {
efi_late_init();
efi_free_boot_services();
}

ftrace_init();

/* Do the rest non-__init'ed, we're now alive */
rest_init();
}

分析第二行报错信息

Assuming drive cache: write through

1
2
3
4
5
6
7
8
9
10
defaults:
if (sdp->wce_default_on) {
sd_printk(KERN_NOTICE, sdkp, "Assuming drive cache: write back\n");
sdkp->WCE = 1;
} else {
sd_printk(KERN_ERR, sdkp, "Assuming drive cache: write through\n");
sdkp->WCE = 0;
}
sdkp->RCD = 0;
sdkp->DPOFUA = 0;

分析第三行报错信息

VFS: Unable to mount root fs on unknown-block

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
///Users/s1eep/kernel/linux/init/do_mounts.c

/*
 * mount_block_root() - try to mount the root device with each known
 * filesystem type.
 * @name:  device name passed to do_mount_root()
 * @flags: mount flags; MS_RDONLY is added if a mount attempt returns -EACCES
 *
 * Fills a freshly allocated page with filesystem names via get_fs_names()
 * and calls do_mount_root() for each one. The function returns (after
 * releasing the page) only when a mount succeeds; on any other outcome it
 * ends in panic("VFS: Unable to mount root fs on %s").
 */
void __init mount_block_root(char *name, int flags)
{
struct page *page = alloc_page(GFP_KERNEL |
__GFP_NOTRACK_FALSE_POSITIVE);
char *fs_names = page_address(page);
char *p;
#ifdef CONFIG_BLOCK
char b[BDEVNAME_SIZE];
#else
const char *b = name;
#endif

get_fs_names(fs_names);
retry:
/* Entries are walked by strlen(p)+1; an empty string ends the list. */
for (p = fs_names; *p; p += strlen(p)+1) {
int err = do_mount_root(name, p, flags, root_mount_data);
switch (err) {
case 0:
/* Mounted successfully. */
goto out;
case -EACCES:
/* Retry from the first filesystem name, this time read-only. */
flags |= MS_RDONLY;
goto retry;
case -EINVAL:
/* Try the next filesystem name. */
continue;
}
/* Any other error code: report it and panic. */
/*
* Allow the user to distinguish between failed sys_open
* and bad superblock on root device.
* and give them a list of the available devices
*/
#ifdef CONFIG_BLOCK
__bdevname(ROOT_DEV, b);
#endif
printk("VFS: Cannot open root device \"%s\" or %s: error %d\n",
root_device_name, b, err);
printk("Please append a correct \"root=\" boot option; here are the available partitions:\n");

printk_all_partitions();
#ifdef CONFIG_DEBUG_BLOCK_EXT_DEVT
printk("DEBUG_BLOCK_EXT_DEVT is enabled, you need to specify "
"explicit textual name for \"root=\" boot option.\n");
#endif
panic("VFS: Unable to mount root fs on %s", b);
}

/* Reached when every listed filesystem was skipped via -EINVAL. */
printk("List of all partitions:\n");
printk_all_partitions();
printk("No filesystem could mount root, tried: ");
for (p = fs_names; *p; p += strlen(p)+1)
printk(" %s", p);
printk("\n");
#ifdef CONFIG_BLOCK
__bdevname(ROOT_DEV, b);
#endif
panic("VFS: Unable to mount root fs on %s", b);
out:
put_page(page);
}

CATALOG
  1. 1. 记一次虚拟机启动失败调试过程
    1. 1.1. linux kernel源代码和iso镜像的关系
    2. 1.2. 虚拟机启动panic过程