Erofs-utils 中制作 EROFS 压缩镜像的代码逻辑

1. erofs-utils 是什么

erofs-utils 是一组工具,用于处理 EROFS(Enhanced Read-Only File System,增强型只读文件系统)的文件系统镜像。这包括创建、检查和解包 EROFS 镜像。EROFS 是由华为开发,主要用于 Android 和其他嵌入式系统中,特别强调高效的读取性能和对压缩数据的支持。

构建 erofs-utils 的基本过程如下:

# git clone https://git.kernel.org/pub/scm/linux/kernel/git/xiang/erofs-utils.git
# cd erofs-utils
# ./autogen.sh
# ./configure --prefix=$(pwd)/build --enable-lzma --enable-fuse
# make -j$(nproc)
# make install
  • 最后能够把对应的二进制程序都放入到 $(pwd)/build 目录下

2. 制作镜像

制作镜像使用的是 erofs-utils 工具组中的 mkfs.erofs,这个工具会将一个目录树转换成一个 EROFS 镜像,该镜像可以挂载为只读文件系统。

假设我们想将 /path/to/source/dir 目录下的内容制作成 EROFS 镜像文件 /path/to/erofs.img,可以使用以下指令:

mkfs.erofs /path/to/erofs.img /path/to/source/dir

这样,就能把目录下的数据都打包进 erofs 的镜像了(注意:不指定 -z 选项时文件数据并不会被压缩,需要压缩时可加上 -zlzma 之类的参数):

[root@fedora erofs]# ./build/bin/mkfs.erofs /erofs/erofs_disk /erofs/tmp/
mkfs.erofs 1.7.1
Build completed.
------
Filesystem UUID: 92fd7f8f-9c8d-4a43-bbfa-ef3b579f551b
Filesystem total blocks: 1 (of 4096-byte blocks)
Filesystem total inodes: 2
Filesystem total metadata blocks: 1
Filesystem total deduplicated bytes (of source files): 0

3. 挂载镜像

挂载镜像要求 kernel 是支持 erofs 的,可以通过 lsmod | grep erofs 检查。

假设我们想将 /path/to/erofs.img EROFS 镜像文件挂载到 /path/to/mount,可以使用以下指令:

mount -t erofs -o loop /path/to/erofs.img /path/to/mount

4. 制作镜像的时候都发生了什么?

制作镜像的操作都由 mkfs.erofs 来实现,我们可以将其划分为几个不同的阶段:

  1. 解析命令行参数
  2. 扫描输入目录
  3. 文件压缩
  4. 构建文件系统元数据
  5. 生成镜像文件

4.1 解析命令行参数

Erofs 专门使用了一个数据结构来保存所有涉及的配置:

/*
 * Configuration shared by the erofs-utils tools through the global `cfg`
 * instance. erofs_init_configure() resets it, and command-line parsing
 * then fills in the user-selected values.
 */
struct erofs_configure {
	const char *c_version;
	int c_dbg_lvl;
	bool c_dry_run;
	bool c_legacy_compress;
#ifndef NDEBUG
	bool c_random_pclusterblks;
	bool c_random_algorithms;
#endif
	char c_timeinherit;
	char c_chunkbits;
	bool c_inline_data;
	bool c_ztailpacking;
	bool c_fragments;
	bool c_all_fragments;
	bool c_dedupe;
	bool c_ignore_mtime;
	bool c_showprogress;
	bool c_extra_ea_name_prefixes;
	bool c_xattr_name_filter;
	bool c_ovlfs_strip;
 
#ifdef HAVE_LIBSELINUX
	struct selabel_handle *sehnd;
#endif
	/* related arguments for mkfs.erofs */
	char *c_img_path;	/* output image path (the FILE argument) */
	char *c_src_path;	/* resolved source path (the SOURCE argument) */
	char *c_blobdev_path;
	char *c_compress_hints_file;
	/* configured algorithm names; walked until the first NULL entry */
	char *c_compr_alg[EROFS_MAX_COMPR_CFGS];
	/* compression level matching each c_compr_alg[] slot */
	int c_compr_level[EROFS_MAX_COMPR_CFGS];
	char c_force_inodeversion;
	char c_force_chunkformat;
	/* < 0, xattr disabled and INT_MAX, always use inline xattrs */
	int c_inline_xattr_tolerance;
 
	u32 c_pclusterblks_max, c_pclusterblks_def, c_pclusterblks_packed;
	u32 c_max_decompressed_extent_bytes;
	u32 c_dict_size;
	u64 c_unix_timestamp;
	u32 c_uid, c_gid;
	const char *mount_point;
	long long c_uid_offset, c_gid_offset;
#ifdef WITH_ANDROID
	char *target_out_path;
	char *fs_config_file;
	char *block_list_file;
#endif
 
	/* offset when reading multi partition images */
	u64 c_offset;
};

在程序中将其定义为一个全局变量,在开始时,首先需要对其进行初始化:

struct erofs_configure cfg;

/*
 * Reset the global configuration to its built-in defaults.
 *
 * The whole structure is zeroed first and the individual defaults are
 * applied afterwards. Assigning -1 to an unsigned field stores that
 * type's all-ones value (presumably a "not set" marker - confirm against
 * the users of these fields).
 */
void erofs_init_configure(void)
{
	struct erofs_configure *const c = &cfg;

	memset(c, 0, sizeof(*c));

	/* identification and verbosity */
	c->c_version = PACKAGE_VERSION;
	c->c_dbg_lvl = EROFS_WARN;

	/* behaviour switches */
	c->c_dry_run = false;
	c->c_ignore_mtime = false;
	c->c_force_inodeversion = 0;
	c->c_inline_xattr_tolerance = 2;

	/* ownership and timestamp overrides */
	c->c_uid = -1;
	c->c_gid = -1;
	c->c_unix_timestamp = -1;

	/* compression geometry */
	c->c_pclusterblks_def = 1;
	c->c_pclusterblks_max = 1;
	c->c_max_decompressed_extent_bytes = -1;
}

然后把可选的配置先初始化到默认配置:

static void erofs_mkfs_default_options(void)
{
	cfg.c_showprogress = true;
	cfg.c_legacy_compress = false;
	cfg.c_inline_data = true;
	cfg.c_xattr_name_filter = true;
	sbi.blkszbits = ilog2(min_t(u32, getpagesize(), EROFS_MAX_BLOCK_SIZE));
	sbi.feature_incompat = EROFS_FEATURE_INCOMPAT_ZERO_PADDING;
	sbi.feature_compat = EROFS_FEATURE_COMPAT_SB_CHKSUM |
			     EROFS_FEATURE_COMPAT_MTIME;
 
	/* generate a default uuid first */
	erofs_uuid_generate(sbi.uuid);
}

值得注意的是,这里出现了一个 sbi 的变量,他代表着 erofs 的 superblock,也是定义为一个全局变量:

/*
 * In-memory superblock information for one EROFS image. mkfs.erofs keeps
 * a single global instance named `sbi`.
 */
struct erofs_sb_info {
	struct erofs_device_info *devs;
	char *devname;
 
	u64 total_blocks;
	u64 primarydevice_blocks;
 
	erofs_blk_t meta_blkaddr;
	erofs_blk_t xattr_blkaddr;
 
	u32 feature_compat;
	u32 feature_incompat;
	u64 build_time;
	u32 build_time_nsec;
 
	u8  extslots;
	unsigned char islotbits;
	/* log2 of the block size (set via ilog2() in the mkfs defaults) */
	unsigned char blkszbits;
 
	/* what we really care is nid, rather than ino.. */
	erofs_nid_t root_nid;
	/* used for statfs, f_files - f_favail */
	u64 inos;
 
	u8 uuid[16];
	char volume_name[16];
 
	/* bitmap with one bit per enabled compression algorithm id */
	u16 available_compr_algs;
	u16 lz4_max_distance;
 
	u32 checksum;
	u16 extra_devices;
	union {
		u16 devt_slotoff;		/* used for mkfs */
		u16 device_id_mask;		/* used for others */
	};
	erofs_nid_t packed_nid;
 
	u32 xattr_prefix_start;
	u8 xattr_prefix_count;
	struct erofs_xattr_prefix_item *xattr_prefixes;
 
	/* devfd is the descriptor image data is written to */
	int devfd, devblksz;
	u64 devsz;
	dev_t dev;
	unsigned int nblobs;
	unsigned int blobfd[256];
 
	struct list_head list;
 
	u64 saved_by_deduplication;
};

接下来就是解析命令行参数了,这里把所有的逻辑都写到 mkfs_parse_options_cfg 里面了,有种力大砖飞的美感 : (

/*
 * Parse the mkfs.erofs command line.
 *
 * Each option returned by getopt_long() is handled by one case of the
 * switch. Returns 0 on success, -EINVAL for an unrecognized option, or
 * the non-zero value reported by a per-option helper such as
 * mkfs_parse_compress_algs().
 *
 * The `...` lines mark code elided from this excerpt.
 */
static int mkfs_parse_options_cfg(int argc, char *argv[])
{
	char *endptr;
	int opt, i, err;
	bool quiet = false;
 
	while ((opt = getopt_long(argc, argv, "C:E:L:T:U:b:d:x:z:",
				  long_options, NULL)) != -1) {
		switch (opt) {
		case 'z':
			/* -z: compression algorithm list */
			i = mkfs_parse_compress_algs(optarg);
			if (i)
				return i;
			break;
		...
		default: /* '?' */
			return -EINVAL;
		}
	}
	...
 
	return 0;
}

为了更好地理解这一坨代码在解析什么,我们可以先看一下 mkfs.erofs 的 help 文档:

[root@0f5ed24efcd4 erofs]# ./build/bin/mkfs.erofs --help
usage: [options] FILE SOURCE(s)
Generate EROFS image (FILE) from DIRECTORY, TARBALL and/or EROFS images.  And [options] are:
 -b#                   set block size to # (# = page size by default)
 -d#                   set output message level to # (maximum 9)
 -x#                   set xattr tolerance to # (< 0, disable xattrs; default 2)
 -zX[,Y][:..]          X=compressor (Y=compression level, optional)
                       alternative algorithms can be separated by colons(:)
 -C#                   specify the size of compress physical cluster in bytes
 -EX[,...]             X=extended options
 -L volume-label       set the volume label (maximum 16)
 -T#                   set a fixed UNIX timestamp # to all files
 -UX                   use a given filesystem UUID
 --all-root            make all files owned by root
 --blobdev=X           specify an extra device X to store chunked data
 --chunksize=#         generate chunk-based files with #-byte chunks
 --compress-hints=X    specify a file to configure per-file compression strategy
 --exclude-path=X      avoid including file X (X = exact literal path)
 --exclude-regex=X     avoid including files that match X (X = regular expression)
 --force-uid=#         set all file uids to # (# = UID)
 --force-gid=#         set all file gids to # (# = GID)
 --uid-offset=#        add offset # to all file uids (# = id offset)
 --gid-offset=#        add offset # to all file gids (# = id offset)
 --gzip                try to filter the tarball stream through gzip
 --help                display this help and exit
 --ignore-mtime        use build time instead of strict per-file modification time
 --max-extent-bytes=#  set maximum decompressed extent size # in bytes
 --preserve-mtime      keep per-file modification time strictly
 --aufs                replace aufs special files with overlayfs metadata
 --tar=[fi]            generate an image from tarball(s)
 --ovlfs-strip=[01]    strip overlayfs metadata in the target image (e.g. whiteouts)
 --quiet               quiet execution (do not write anything to standard output.)
 --xattr-prefix=X      X=extra xattr name prefix
 --mount-point=X       X=prefix of target fs path (default: /)
 
Available compressors are: lzma, deflate

可以看到,mkfs.erofs 的使用形式是:

mkfs.erofs [OPTS] FILE SOURCE(S)

基本上,这里的每一个可配置的参数会被映射到程序的数据结构中,解析这一系列的参数使用的是 getopt_long,这是一个 GNU getopt 函数的扩展,getopt 是 POSIX 标准的一部分。使用时,getopt_long 会逐步解析每一个参数,然后将其转化为对应的返回值。getopt_long 会接受一个短参数(字符串形式)和长参数(结构体形式),并返回匹配到的参数的短参数值(匹配到对应的短参数或长参数指定的短参数形式)。短参数的每个字符代表一个短选项,如果选项需要参数,则在该字符后面加上冒号。

mkfs.erofs 的长参数定义如下:

/*
 * Long-option table handed to getopt_long(). Every entry has a NULL flag
 * pointer, so the last field is the value getopt_long() returns when that
 * option matches. Short options are described separately by the
 * optstring argument.
 */
static struct option long_options[] = {
	{"help", no_argument, 0, 1},
	{"exclude-path", required_argument, NULL, 2},
	{"exclude-regex", required_argument, NULL, 3},
#ifdef HAVE_LIBSELINUX
	{"file-contexts", required_argument, NULL, 4},
#endif
	{"force-uid", required_argument, NULL, 5},
	{"force-gid", required_argument, NULL, 6},
	{"all-root", no_argument, NULL, 7},
#ifndef NDEBUG
	{"random-pclusterblks", no_argument, NULL, 8},
	{"random-algorithms", no_argument, NULL, 18},
#endif
	{"max-extent-bytes", required_argument, NULL, 9},
	{"compress-hints", required_argument, NULL, 10},
	{"chunksize", required_argument, NULL, 11},
	{"quiet", no_argument, 0, 12},
	{"blobdev", required_argument, NULL, 13},
	{"ignore-mtime", no_argument, NULL, 14},
	{"preserve-mtime", no_argument, NULL, 15},
	{"uid-offset", required_argument, NULL, 16},
	{"gid-offset", required_argument, NULL, 17},
	{"tar", optional_argument, NULL, 20},
	{"aufs", no_argument, NULL, 21},
	{"mount-point", required_argument, NULL, 512},
	{"xattr-prefix", required_argument, NULL, 19},
#ifdef WITH_ANDROID
	{"product-out", required_argument, NULL, 513},
	{"fs-config-file", required_argument, NULL, 514},
	{"block-list-file", required_argument, NULL, 515},
#endif
	{"ovlfs-strip", optional_argument, NULL, 516},
#ifdef HAVE_ZLIB
	{"gzip", no_argument, NULL, 517},
#endif
	/* all-zero terminator entry */
	{0, 0, 0, 0},
};

根据这样的定义进行解析:

// parse
	while ((opt = getopt_long(argc, argv, "C:E:L:T:U:b:d:x:z:",
				  long_options, NULL)) != -1) { ... }

注意

这里的长参数和短参数是完全分开的,也就是说各自表达不同的含义,具体可以结合上述的手册比对。

在解析完一系列的参数之后,读取传入的 FILE 和 SOURCE(S):

// 解析 FILE
	cfg.c_img_path = strdup(argv[optind++]);
	if (!cfg.c_img_path)
		return -ENOMEM;
 
// 解析 SOURCES
	cfg.c_src_path = realpath(argv[optind++], NULL);
	if (!cfg.c_src_path) {
		erofs_err("failed to parse source directory: %s",
			  erofs_strerror(-errno));
		return -ENOENT;
	}

可重复构建

除了解析一般的命令行参数之外,erofs-utils 还会检查 SOURCE_DATE_EPOCH,以确保构建产物的时间戳是一致的。在构建产物中嵌入相同的时间戳是为了实现可重复构建(reproducible builds),这是一种确保软件构建过程可靠性和安全性的重要实践。

4.2 扫描输入目录

接下来,mkfs.erofs 会打开传入的 FILE(即输出镜像的路径),这在函数 dev_open 中实现:

// 打开 FILE
fd = open(dev, O_RDWR | O_CREAT | O_BINARY, 0644);
 
// 获取文件或设备的元数据
ret = fstat(fd, &st);
 
// 根据文件类型处理
switch (st.st_mode & S_IFMT) {
 
// 如果是块设备
case S_IFBLK:
	ret = dev_get_blkdev_size(fd, &sbi->devsz);
 
// 如果是普通文件
case S_IFREG:
	... // 处理ext4和btrfs的特殊情况
	ret = ftruncate(fd, 0);
	...
	// 设置块大小
	sbi->devsz = INT64_MAX;
	sbi->devblksz = st.st_blksize;

4.3 压缩文件

首先解释一下 erofs 的布局。erofs 采用的是 fixed-sized output compression,简而言之就是压缩的时候,原数据从头开始压缩,直到压缩得到的数据填满 4 K(即压缩时的单位大小,可配置),填满后再从新的原数据头开始压缩,周而复始得到一系列的 4 K 压缩数据和末尾的数据。

与之相对应的是 fixed-sized input compression,即每次取固定大小的数据做压缩,压缩出的数据大小不固定。如下图所示:

mkfs.erofs 中,需要先初始化出一系列的 bucket 来放置这些数据:

/* The maximum block size which erofs-utils supports */
#define EROFS_MAX_BLOCK_SIZE 4096
 
/* buckets for all mapped buffer blocks to boost up allocation */
static struct list_head mapped_buckets[META + 1][EROFS_MAX_BLOCK_SIZE];
 
/* return buffer_head of erofs super block (with size 0) */
struct erofs_buffer_head *erofs_buffer_init(void)
{
	int i, j;
	struct erofs_buffer_head *bh = erofs_balloc(META, 0, 0, 0);
 
	if (IS_ERR(bh))
		return bh;
 
	bh->op = &erofs_skip_write_bhops;
 
	for (i = 0; i < ARRAY_SIZE(mapped_buckets); i++)
		for (j = 0; j < ARRAY_SIZE(mapped_buckets[0]); j++)
			init_list_head(&mapped_buckets[i][j]);
	return bh;
}
 
// 为 superblock 扩充空间
err = erofs_bh_balloon(sb_bh, EROFS_SUPER_END);
 
/*
 * Grow the buffer block that owns @bh by @incr bytes.
 *
 * Only the last buffer head of a block may be extended; any other head
 * is rejected with -EINVAL. Otherwise the result of __erofs_battach()
 * is returned.
 */
int erofs_bh_balloon(struct erofs_buffer_head *bh, erofs_off_t incr)
{
	struct erofs_buffer_block *const owner = bh->block;
	/* the tail bh is the one whose successor is the list head itself */
	const bool is_tail = (bh->list.next == &owner->buffers.list);

	if (!is_tail)
		return -EINVAL;

	return __erofs_battach(owner, NULL, incr, 1, 0, false);
}
 
/*
 * Attach @incr more bytes (and optionally the buffer head @bh) to the
 * buffer block @bb.
 *
 * @bb:        buffer block to extend
 * @bh:        buffer head to link at the tail of @bb, or NULL to only grow
 * @incr:      number of bytes appended after the aligned offset
 * @alignsize: alignment applied to the current end offset of @bb
 * @extrasize: extra bytes counted only in the out-of-block check
 * @dryrun:    when true, run the checks but leave @bb untouched
 *
 * return occupied bytes in specific buffer block if succeed,
 * or -EINVAL if the data would cross the block boundary where that is
 * not allowed.
 */
static int __erofs_battach(struct erofs_buffer_block *bb,
			   struct erofs_buffer_head *bh,
			   erofs_off_t incr,
			   unsigned int alignsize,
			   unsigned int extrasize,
			   bool dryrun)
{
	const unsigned int blksiz = erofs_blksiz(&sbi);
	const unsigned int blkmask = blksiz - 1;
	const erofs_off_t alignedoffset = roundup(bb->buffers.off, alignsize);
	/*
	 * Compares the bytes used in the last block after this attach
	 * against the block size. NOTE(review): presumably cmpsgn() yields
	 * the sign of the comparison (< 0, 0, > 0) - confirm.
	 */
	const int oob = cmpsgn(roundup(((bb->buffers.off - 1) & blkmask) + 1,
				       alignsize) + incr + extrasize, blksiz);
	bool tailupdate = false;
	erofs_blk_t blkaddr;
 
		if (oob >= 0) {
		/* the next buffer block should be NULL_ADDR all the time */
		if (oob && list_next_entry(bb, list)->blkaddr != NULL_ADDR)
			return -EINVAL;
 
		blkaddr = bb->blkaddr;
		if (blkaddr != NULL_ADDR) {
			/* true when this block currently ends at tail_blkaddr */
			tailupdate = (tail_blkaddr == blkaddr +
				      DIV_ROUND_UP(bb->buffers.off, blksiz));
			if (oob && !tailupdate)
				return -EINVAL;
		}
	}
 
	if (!dryrun) {
		if (bh) {
			bh->off = alignedoffset;
			bh->block = bb;
			list_add_tail(&bh->list, &bb->buffers.list);
		}
		bb->buffers.off = alignedoffset + incr;
		/* need to update the tail_blkaddr */
		if (tailupdate)
			tail_blkaddr = blkaddr +
					DIV_ROUND_UP(bb->buffers.off, blksiz);
		/* the used-bytes remainder changed, so re-bucket the block */
		erofs_bupdate_mapped(bb);
	}
	return ((alignedoffset + incr - 1) & blkmask) + 1;
}

接着,先写入 superblock:

/* make sure that the super block should be the very first blocks */
(void)erofs_mapbh(sb_bh->block);
if (erofs_btell(sb_bh, false) != 0) {
	erofs_err("failed to reserve erofs_super_block");
	goto exit;
}
 
/*
 * Assign block addresses to buffer blocks up to and including @bb.
 *
 * If @bb already has an address, that address is returned directly.
 * Otherwise the blocks following last_mapped_block are mapped one by one
 * through __erofs_mapbh() until @bb has been mapped or the list head
 * `blkh` is reached, and tail_blkaddr is returned.
 */
erofs_blk_t erofs_mapbh(struct erofs_buffer_block *bb)
{
	struct erofs_buffer_block *t = last_mapped_block;
 
	if (bb && bb->blkaddr != NULL_ADDR)
		return bb->blkaddr;
	do {
		t = list_next_entry(t, list);
		/* wrapped around to the list head: nothing left to map */
		if (t == &blkh)
			break;
 
		DBG_BUGON(t->blkaddr != NULL_ADDR);
		(void)__erofs_mapbh(t);
	} while (t != bb);
	return tail_blkaddr;
}
 
/*
 * Map a single buffer block and return the block address just past it.
 *
 * A block that has no address yet is placed at the current tail_blkaddr
 * and remembered as last_mapped_block. tail_blkaddr is then advanced if
 * the end of this block lies beyond it.
 */
static erofs_blk_t __erofs_mapbh(struct erofs_buffer_block *bb)
{
	const bool unmapped = (bb->blkaddr == NULL_ADDR);
	erofs_blk_t end;

	if (unmapped) {
		bb->blkaddr = tail_blkaddr;
		last_mapped_block = bb;
		erofs_bupdate_mapped(bb);
	}

	end = bb->blkaddr + BLK_ROUND_UP(&sbi, bb->buffers.off);
	if (tail_blkaddr < end)
		tail_blkaddr = end;

	return end;
}
 
/*
 * Move a mapped buffer block into the bucket that matches its type and
 * the offset of its end within a block. Unmapped blocks are left alone.
 */
static void erofs_bupdate_mapped(struct erofs_buffer_block *bb)
{
	if (bb->blkaddr != NULL_ADDR) {
		struct list_head *const bucket =
			&mapped_buckets[bb->type][bb->buffers.off &
						  (erofs_blksiz(&sbi) - 1)];

		list_del(&bb->mapped_list);
		list_add_tail(&bb->mapped_list, bucket);
	}
}

完成 superblock 的写入之后,开始提取压缩的一些配置:

	err = erofs_load_compress_hints(&sbi);
	if (err) {
		erofs_err("failed to load compress hints %s",
			  cfg.c_compress_hints_file);
		goto exit;
	}

erofs_load_compress_hints 函数用于从配置文件中加载压缩提示(compress hints)信息,这些信息用于指导 EROFS(Enhanced Read-Only File System)文件系统如何处理特定文件或文件模式的压缩。函数通过解析一个给定的文件来设置压缩配置,这些配置可以指定哪些文件应该被压缩以及使用什么算法进行压缩。

下一步就是初始化 compressor 了,后续压缩都是调用 compressor 的 compress_destsize 方法实现的。

// struct 关系图
erofs_compress
	-> erofs_algorithm
		-> erofs_compressor
 
/*
 * Operation table of one compression backend, together with its default
 * and best compression levels. Every hook receives the erofs_compress
 * handle it operates on.
 */
struct erofs_compressor {
	int default_level;
	int best_level;
 
	int (*init)(struct erofs_compress *c);
	int (*exit)(struct erofs_compress *c);
	int (*setlevel)(struct erofs_compress *c, int compression_level);
 
	/* compress from src into a dst buffer of at most dstsize bytes */
	int (*compress_destsize)(const struct erofs_compress *c,
				 const void *src, unsigned int *srcsize,
				 void *dst, unsigned int dstsize);
};
 
/*
 * Table of supported compression algorithms. Which backends are compiled
 * in is decided at build time; for example configuring with --enable-lzma
 * turns on lzma support, and likewise for the others. An entry whose
 * backend is not built keeps a NULL compressor pointer.
 */
static const struct erofs_algorithm {
	char *name;
	const struct erofs_compressor *c;
	unsigned int id;
 
	/* its name won't be shown as a supported algorithm */
	bool optimisor;
} erofs_algs[] = {
	{ "lz4",
#if LZ4_ENABLED
		&erofs_compressor_lz4,
#else
		NULL,
#endif
	  Z_EROFS_COMPRESSION_LZ4, false },
 
#if LZ4HC_ENABLED
	{ "lz4hc", &erofs_compressor_lz4hc,
	  Z_EROFS_COMPRESSION_LZ4, true },
#endif
 
	{ "lzma",
#if HAVE_LIBLZMA
		&erofs_compressor_lzma,
#else
		NULL,
#endif
	  Z_EROFS_COMPRESSION_LZMA, false },
 
	{ "deflate", &erofs_compressor_deflate,
	  Z_EROFS_COMPRESSION_DEFLATE, false },
 
#if HAVE_LIBDEFLATE
	{ "libdeflate", &erofs_compressor_libdeflate,
	  Z_EROFS_COMPRESSION_DEFLATE, true },
#endif
};
 
/*
 * Compressor operations for the "lzma" algorithm; the erofs_algs[] table
 * points at this object when HAVE_LIBLZMA is set.
 */
const struct erofs_compressor erofs_compressor_lzma = {
	.default_level = LZMA_PRESET_DEFAULT,
	.best_level = 109,
	.init = erofs_compressor_liblzma_init,
	.exit = erofs_compressor_liblzma_exit,
	.setlevel = erofs_compressor_liblzma_setlevel,
	.compress_destsize = erofs_liblzma_compress_destsize,
};
 
/*
 * Set up one compressor for every configured algorithm.
 *
 * Walks cfg.c_compr_alg[] until the first NULL entry. For each name the
 * compressor handle is initialized, the configured level is applied, and
 * the algorithm id is recorded in sbi->available_compr_algs. Returns 0 on
 * success, or the first non-zero value returned by the init / setlevel
 * helpers.
 *
 * The `...` line marks code elided from this excerpt.
 */
int z_erofs_compress_init(struct erofs_sb_info *sbi, struct erofs_buffer_head *sb_bh)
{
	int i, ret;
 
	for (i = 0; cfg.c_compr_alg[i]; ++i) {
		struct erofs_compress *c = &erofs_ccfg[i].handle;
 
		ret = erofs_compressor_init(sbi, c, cfg.c_compr_alg[i]);
		if (ret)
			return ret;
 
		ret = erofs_compressor_setlevel(c, cfg.c_compr_level[i]);
		if (ret)
			return ret;
 
		erofs_ccfg[i].algorithmtype =
			z_erofs_get_compress_algorithm_id(c);
		erofs_ccfg[i].enable = true;
		sbi->available_compr_algs |= 1 << erofs_ccfg[i].algorithmtype;
		/* any algorithm other than LZ4 turns on the compr_cfgs flag */
		if (erofs_ccfg[i].algorithmtype != Z_EROFS_COMPRESSION_LZ4)
			erofs_sb_set_compr_cfgs(sbi);
	}
 
	...	
	return 0;
}

最后,如果有去重的需求,会调用 z_erofs_dedupe_init 做初始化:

if (cfg.c_dedupe) {
	if (!cfg.c_compr_alg[0]) {
		erofs_err("Compression is not enabled.  Turn on chunk-based data deduplication instead.");
		cfg.c_chunkbits = sbi.blkszbits;
	} else {
		err = z_erofs_dedupe_init(erofs_blksiz(&sbi));
		if (err) {
			erofs_err("failed to initialize deduplication: %s",
				  erofs_strerror(err));
			goto exit;
		}
	}
}

4.4 构建文件系统

构建文件系统分为两步:

  1. 构建 xattrs:这是一些扩展属性,在这里就不过多介绍
  2. 构建文件系统:主要是构建以镜像目录为根的 inode 树
err = erofs_build_shared_xattrs_from_path(&sbi, cfg.c_src_path);
if (err) {
	erofs_err("failed to build shared xattrs: %s",
		  erofs_strerror(err));
	goto exit;
}
 
if (cfg.c_extra_ea_name_prefixes)
	erofs_xattr_write_name_prefixes(&sbi, packedfile);
 
root_inode = erofs_mkfs_build_tree_from_path(cfg.c_src_path);
if (IS_ERR(root_inode)) {
	err = PTR_ERR(root_inode);
	goto exit;
}

这里的 erofs_mkfs_build_tree_from_path 是最核心的部分,它将 source 文件夹下的文件构造成一棵树,并在后续进行压缩,最终写入镜像。

/*
 * Build the inode tree for the source directory @path.
 *
 * The root inode is created first and marked by pointing i_parent at
 * itself. Inodes waiting to be processed sit on the local `dirs` list:
 * each iteration pops one and hands it to erofs_mkfs_build_tree(), which
 * may append newly discovered children to the same list. Processed
 * directories are chained through next_dirwrite with `dumpdir` as the
 * chain head; every other inode is released with erofs_iput().
 *
 * Returns the root inode, or an ERR_PTR() value on failure.
 */
struct erofs_inode *erofs_mkfs_build_tree_from_path(const char *path)
{
	LIST_HEAD(dirs);
	struct erofs_inode *inode, *root;
	/*
	 * Must start out NULL: the first processed directory stores this
	 * value into next_dirwrite, so an uninitialized pointer would leave
	 * the chain without a valid terminator.
	 */
	struct erofs_inode *dumpdir = NULL;

	/* look up (or create) the inode of the source root */
	root = erofs_iget_from_path(path, true);
	if (IS_ERR(root))
		return root;

	(void)erofs_igrab(root);
	root->i_parent = root;	/* rootdir mark */
	list_add(&root->i_subdirs, &dirs);

	do {
		int err;
		char *trimmed;

		inode = list_first_entry(&dirs, struct erofs_inode, i_subdirs);
		list_del(&inode->i_subdirs);
		init_list_head(&inode->i_subdirs);

		/* process this inode; children are queued onto `dirs` */
		err = erofs_mkfs_build_tree(inode, &dirs);
		if (err) {
			root = ERR_PTR(err);
			break;
		}

		if (S_ISDIR(inode->i_mode)) {
			/* push the directory onto the dumpdir chain */
			inode->next_dirwrite = dumpdir;
			dumpdir = inode;
		} else {
			erofs_iput(inode);
		}
	} while (!list_empty(&dirs));

	/* ... (the remaining steps are elided in this excerpt) */
	return root;
}
  • 通过 erofs_iget_from_path 为传入的目录文件创建目录文件 inode。该目录文件对应的是 erofs 文件系统的根目录 /
  • 将该 inode 的 parent 指向自己,说明自己是根目录
  • 调用 erofs_mkfs_build_tree 递归地为根目录创建子目录及文件,并一一对应当前目录下的子目录和文件

具体来说,在执行 erofs_iget_from_path 的过程中,有如下流程:

  1. 通过 lstat 解析 path,可以快速获知当前 path 是目录还是文件
  2. 传入的是目录,因此不会执行 erofs_iget 而直接调用 erofs_new_inode 创建一个新的 inode
  3. 通过 erofs_fill_inode 对新 inode 进行初始化
/*
 * Get the inode that represents the (source) @path.
 *
 * Non-directories are looked up by (st_dev, st_ino) first, so an
 * additional hard link to a known file yields the already existing
 * inode. Directories are never looked up since hard-linked directories
 * are not allowed. When nothing is found, a new inode is allocated and
 * filled from the lstat() result.
 *
 * Returns the inode, or an ERR_PTR() value: -EINVAL when @is_src is
 * false (only source paths are supported), -errno when lstat() fails, or
 * the error reported by inode allocation / filling.
 */
static struct erofs_inode *erofs_iget_from_path(const char *path, bool is_src)
{
	struct erofs_inode *found;
	struct stat sb;
	int err;

	if (!is_src)
		return ERR_PTR(-EINVAL);

	if (lstat(path, &sb))
		return ERR_PTR(-errno);

	if (!S_ISDIR(sb.st_mode)) {
		found = erofs_iget(sb.st_dev, sb.st_ino);
		if (found)
			return found;
	}

	/* not in the inode cache: create a fresh inode */
	found = erofs_new_inode();
	if (IS_ERR(found))
		return found;

	err = erofs_fill_inode(found, &sb, path);
	if (!err)
		return found;

	erofs_iput(found);
	return ERR_PTR(err);
}

erofs_fill_inode 中,主要就是装填 inode 的属性。此时,也将 path 设入 inode 的 srcpath 中,建立了源文件系统与目标文件系统的映射关系。

最后,由于是新的 inode 。需要将其插入 inode_hashtable 中,用来加速查询。

/*
 * Fill a newly allocated inode from the stat data of the source @path.
 *
 * Sets mode, link count and size according to the file type, records the
 * source path in i_srcpath, and finally inserts the inode into the inode
 * hash keyed by (st_dev, st_ino).
 *
 * Returns 0 on success, -EINVAL for an unsupported file type, -ENOMEM if
 * the path cannot be duplicated, or the error from __erofs_fill_inode().
 *
 * The `...` line marks code elided from this excerpt.
 */
static int erofs_fill_inode(struct erofs_inode *inode, struct stat *st,
			    const char *path)
{
	int err = __erofs_fill_inode(inode, st, path);
 
	if (err)
		return err;
 
	inode->i_mode = st->st_mode;
	inode->i_nlink = 1;	/* fix up later if needed */
 
	switch (inode->i_mode & S_IFMT) {
	case S_IFCHR:
	case S_IFBLK:
	case S_IFIFO:
	case S_IFSOCK:
		inode->u.i_rdev = erofs_new_encode_dev(st->st_rdev);
		/* fallthrough: special files also get a zero size */
	case S_IFDIR:
		inode->i_size = 0;
		break;
	case S_IFREG:
	case S_IFLNK:
		inode->i_size = st->st_size;
		break;
	default:
		return -EINVAL;
	}
 
	// remember the source path this inode was created from
	inode->i_srcpath = strdup(path);
	if (!inode->i_srcpath)
		return -ENOMEM;
 
	if (!S_ISDIR(inode->i_mode)) {
		inode->dev = st->st_dev;
		inode->i_ino[1] = st->st_ino;
	}
 
	...
 
	erofs_insert_ihash(inode, st->st_dev, st->st_ino);
	return 0;
}

完成之后,进入到 erofs_mkfs_build_tree 函数,这个函数负责初始化 root 目录下的目录项,然后递归地向下进行构建。遍历时有两种情况:

  1. 如果遍历到文件,就调用 erofs_write_file 写入
  2. 如果遍历到目录,就为其下的每个目录项创建 inode 并加入 dirs 队列,再由 erofs_mkfs_build_tree_from_path 中的循环取出,重新调用 erofs_mkfs_build_tree 逐层处理
/*
 * Process one inode taken from the work list.
 *
 * @dir:  the inode to process (despite the name it may be a regular file
 *        or a symlink as well as a directory)
 * @dirs: work list that newly discovered child inodes are appended to
 *
 * A non-directory has its data written into the image right away. For a
 * directory, the entries are read with readdir() into dentries first,
 * then an inode is obtained for every dentry and, unless it is a hard
 * link to an inode seen before, queued on @dirs.
 *
 * Returns 0 on success or a negative errno value.
 *
 * The `...` lines mark code elided from this excerpt.
 */
static int erofs_mkfs_build_tree(struct erofs_inode *dir, struct list_head *dirs)
{
	int ret;
	DIR *_dir;
	struct dirent *dp;
	struct erofs_dentry *d;
	unsigned int nr_subdirs, i_nlink;
 
	...
 
	// not a directory: write the file contents
	if (!S_ISDIR(dir->i_mode)) {
		if (S_ISLNK(dir->i_mode)) {
			/* a symlink stores its target string as file data */
			char *const symlink = malloc(dir->i_size);
 
			if (!symlink)
				return -ENOMEM;
			ret = readlink(dir->i_srcpath, symlink, dir->i_size);
			if (ret < 0) {
				free(symlink);
				return -errno;
			}
			ret = erofs_write_file_from_buffer(dir, symlink);
			free(symlink);
		} else if (dir->i_size) {
			int fd = open(dir->i_srcpath, O_RDONLY | O_BINARY);
			if (fd < 0)
				return -errno;
 
			// write the file data into the image
			ret = erofs_write_file(dir, fd, 0);
			close(fd);
		} else {
			/* empty file: no data to write */
			ret = 0;
		}
		if (ret)
			return ret;
 
		erofs_prepare_inode_buffer(dir);
		erofs_write_tail_end(dir);
		return 0;
	}
 
	// it is a directory: open it and collect its entries
	_dir = opendir(dir->i_srcpath);
	if (!_dir) {
		erofs_err("failed to opendir at %s: %s",
			  dir->i_srcpath, erofs_strerror(errno));
		return -errno;
	}
	nr_subdirs = 0;
	while (1) {
		/*
		 * set errno to 0 before calling readdir() in order to
		 * distinguish end of stream and from an error.
		 */
		errno = 0;
		dp = readdir(_dir);
		if (!dp)
			break;
 
		if (is_dot_dotdot(dp->d_name))
			continue;
 
		/* skip if it's a exclude file */
		if (erofs_is_exclude_path(dir->i_srcpath, dp->d_name))
			continue;
 
		d = erofs_d_alloc(dir, dp->d_name);
		if (IS_ERR(d)) {
			ret = PTR_ERR(d);
			goto err_closedir;
		}
		nr_subdirs++;
	}
 
	...
 
	/*
	 * NOTE(review): the early returns below do not close _dir;
	 * presumably the elided code above already closed it - confirm
	 * against the full function.
	 */
	ret = erofs_prepare_dir_file(dir, nr_subdirs);
	if (ret)
		return ret;
 
	ret = erofs_prepare_inode_buffer(dir);
	if (ret)
		return ret;
	dir->bh->op = &erofs_skip_write_bhops;
 
	if (IS_ROOT(dir))
		erofs_fixup_meta_blkaddr(dir);
 
	// walk the dentries and obtain an inode for each of them
	i_nlink = 0;
	list_for_each_entry(d, &dir->i_subdirs, d_child) {
		char buf[PATH_MAX];
		unsigned char ftype;
		struct erofs_inode *inode;
 
		if (is_dot_dotdot(d->name)) {
			++i_nlink;
			continue;
		}
 
		...
 
		// get the inode of this child path
		/*
		 * NOTE(review): no IS_ERR() check on the result is visible
		 * in this excerpt; presumably elided - confirm.
		 */
		inode = erofs_iget_from_path(buf, true);
 
		/* a hardlink to the existed inode */
		if (inode->i_parent) {
			++inode->i_nlink;
		} else {
			/* first sighting: link to parent and queue it */
			inode->i_parent = dir;
			erofs_igrab(inode);
			list_add_tail(&inode->i_subdirs, dirs);
		}
 
		// record the inode and file type in the dentry
		ftype = erofs_mode_to_ftype(inode->i_mode);
		i_nlink += (ftype == EROFS_FT_DIR);
		d->inode = inode;
		d->type = ftype;
		erofs_info("file %s/%s dumped (type %u)",
			   dir->i_srcpath, d->name, d->type);
	}
	...
	return 0;
 
err_closedir:
	closedir(_dir);
	return ret;
}

4.5 生成镜像文件

erofs_mkfs_build_tree 中,遍历到文件就会写入镜像。写入时调用函数 erofs_write_file,这个函数也会判断两种情况:

  1. 需要压缩,调用压缩函数 erofs_write_compressed_file 写入
  2. 不需要压缩,直接写入
/*
 * Write the data of @inode, read from @fd, into the image.
 *
 * @inode: inode whose data is written
 * @fd:    open descriptor of the source data
 * @fpos:  offset in @fd where the data of this inode starts
 *
 * When a compression algorithm is configured and the file is
 * compressible, the compressed path is tried first. Only a -ENOSPC
 * result makes it fall back to plain storage: @fd is rewound to @fpos
 * and the data is written uncompressed. Any other result of the
 * compressed path (success or error) is returned unchanged.
 *
 * Returns 0 on success or a negative errno value.
 */
int erofs_write_file(struct erofs_inode *inode, int fd, u64 fpos)
{
	if (cfg.c_compr_alg[0] && erofs_file_is_compressible(inode)) {
		int ret = erofs_write_compressed_file(inode, fd);

		if (ret != -ENOSPC)
			return ret;

		/*
		 * Compare against (off_t)-1 rather than storing the result
		 * in an int: an offset of 2 GiB or more would be truncated
		 * and could look negative, i.e. like a failure.
		 */
		if (lseek(fd, fpos, SEEK_SET) == (off_t)-1)
			return -errno;
	}

	/* fallback to all data uncompressed */
	return write_uncompressed_file_from_fd(inode, fd);
}

不压缩写入并不重要,我们主要看压缩的逻辑:

/*
 * Compress the data of @inode (read from @fd) and write it to the image.
 *
 * After the compression context is set up, the data is either packed as
 * a whole (when all-fragments mode applies) or read into the context
 * queue chunk by chunk, with vle_compress_one() consuming each chunk.
 *
 * The `...` lines mark code elided from this excerpt.
 */
int erofs_write_compressed_file(struct erofs_inode *inode, int fd)
{
	...
 
	blkaddr = erofs_mapbh(bh->block);	/* start_blkaddr */
	// initialize the compression context
	ctx.inode = inode;
	ctx.pclustersize = z_erofs_get_max_pclustersize(inode);
	ctx.blkaddr = blkaddr;
	ctx.metacur = compressmeta + Z_EROFS_LEGACY_MAP_HEADER_SIZE;
	ctx.head = ctx.tail = 0;
	ctx.clusterofs = 0;
	ctx.e.length = 0;
	ctx.remaining = inode->i_size - inode->fragment_size;
	ctx.fix_dedupedfrag = false;
	ctx.fragemitted = false;
	if (cfg.c_all_fragments && !erofs_is_packed_inode(inode) &&
	    !inode->fragment_size) {
		ret = z_erofs_pack_file_from_fd(inode, fd, ctx.tof_chksum);
		if (ret)
			goto err_free_idata;
	} else {
		// read the file data of this inode into the ctx queue
		while (ctx.remaining) {
			/* read as much as still fits behind the queue tail */
			const u64 rx = min_t(u64, ctx.remaining,
					     sizeof(ctx.queue) - ctx.tail);
 
			ret = read(fd, ctx.queue + ctx.tail, rx);
			/*
			 * NOTE(review): a short read (ret >= 0 but != rx)
			 * also lands here, where errno is not set by read().
			 */
			if (ret != rx) {
				ret = -errno;
				goto err_bdrop;
			}
			ctx.remaining -= rx;
			ctx.tail += rx;
 
			// compress the queued file data
			ret = vle_compress_one(&ctx);
			if (ret)
				goto err_free_idata;
		}
	}
	DBG_BUGON(ctx.head != ctx.tail);
 
	/* fall back to no compression mode */
	...
}
 
/*
 * Compress the data currently queued in @ctx (the bytes between
 * ctx->head and ctx->tail).
 *
 * Each loop iteration handles one extent: it is deduplicated, packed as
 * tail data, kept inline, stored uncompressed, or compressed and written
 * out with blk_write().
 *
 * The `...` lines mark code elided from this excerpt; the branches around
 * them and the frag_packing / nocompression labels are incomplete as
 * shown.
 */
static int vle_compress_one(struct z_erofs_vle_compress_ctx *ctx)
{
	// setup: dstbuf holds the compressed output
	static char dstbuf[EROFS_CONFIG_COMPR_MAX_SZ + EROFS_MAX_BLOCK_SIZE];
	struct erofs_inode *inode = ctx->inode;
	struct erofs_sb_info *sbi = inode->sbi;
	char *const dst = dstbuf + erofs_blksiz(sbi);
	struct erofs_compress *const h = &ctx->ccfg->handle;
	unsigned int len = ctx->tail - ctx->head;
	bool is_packed_inode = erofs_is_packed_inode(inode);
	/* true once no more input remains to be read for this file */
	bool final = !ctx->remaining;
	int ret;
 
	while (len) {
		bool may_packing = (cfg.c_fragments && final &&
				   !is_packed_inode);
		bool may_inline = (cfg.c_ztailpacking && final &&
				  !may_packing);
		bool fix_dedupedfrag = ctx->fix_dedupedfrag;
 
		// deduplication
		if (z_erofs_compress_dedupe(ctx, &len) && !final)
			break;
 
		// the remaining data fits in one pcluster: consider other strategies
		if (len <= ctx->pclustersize) {
			if (!final || !len)
				break;
			// tail data packing
			if (may_packing) {
				if (inode->fragment_size && !fix_dedupedfrag) {
					ctx->pclustersize =
						roundup(len, erofs_blksiz(sbi));
					goto fix_dedupedfrag;
				}
				ctx->e.length = len;
				goto frag_packing;
			}
			// no inlining and at most one block: store it uncompressed
			if (!may_inline && len <= erofs_blksiz(sbi))
				goto nocompression;
		}
 
		ctx->e.length = min(len,
				cfg.c_max_decompressed_extent_bytes);
		// run the compressor; the output goes into dst
		ret = erofs_compress_destsize(h, ctx->queue + ctx->head,
				&ctx->e.length, dst, ctx->pclustersize,
				!(final && len == ctx->e.length));
		if (ret <= 0) {
		...
		} else if (may_packing && len == ctx->e.length &&
			// tail data packing
		} else if (may_inline && len == ctx->e.length &&
			   ret < erofs_blksiz(sbi)) {
			// keep the inline data together with the inode
		} else {
			// compression succeeded: write out the compressed data
			ret = blk_write(sbi, dst - padding, ctx->blkaddr,
				ctx->e.compressedblks);
		}
 
		...
	}
	return 0;
 
fix_dedupedfrag:
	DBG_BUGON(!inode->fragment_size);
	ctx->remaining += inode->fragment_size;
	ctx->e.length = 0;
	ctx->fix_dedupedfrag = true;
	return 0;
}
 
/*
 * Write @nblocks blocks from @buf starting at block address @blkaddr.
 * Block numbers are converted to byte quantities with erofs_pos() and
 * the request is forwarded to dev_write(), whose result is returned.
 */
static inline int blk_write(struct erofs_sb_info *sbi, const void *buf,
			    erofs_blk_t blkaddr, u32 nblocks)
{
	const u64 byte_offset = erofs_pos(sbi, blkaddr);
	const size_t byte_len = erofs_pos(sbi, nblocks);

	return dev_write(sbi, buf, byte_offset, byte_len);
}
 
/*
 * Write @len bytes from @buf into the image at byte @offset.
 *
 * @sbi:    superblock info holding the image descriptor (devfd)
 * @buf:    data to write
 * @offset: byte offset inside the image
 * @len:    number of bytes to write
 *
 * Returns 0 on success, -errno if the write fails, or -EIO if fewer than
 * @len bytes were written.
 */
int dev_write(struct erofs_sb_info *sbi, const void *buf, u64 offset, size_t len)
{
	ssize_t written;

	/* positioned write straight into the image file */
	written = pwrite64(sbi->devfd, buf, len, (off64_t)offset);
	if (written < 0)
		return -errno;

	/* a short write would silently leave the image truncated */
	if ((size_t)written != len)
		return -EIO;

	return 0;
}

完整的流程大致如下:

erofs_mkfs_build_tree
	--- erofs_write_file
		--- erofs_write_compressed_file
			--- vle_compress_one
				--- erofs_compress_destsize
				--- blk_write

最后,镜像的布局如下图: