UNIX环境高级编程APUE练习4.6-实现类似cp(1)的程序,保留文件中的空洞

1 题面

编写类似cp(1)的程序,它复制包含空洞的文件,但是不将字节0写到输出文件中去。

2 基本思路

  • 首先要搞清楚空洞的性质以判断一个文件是否有空洞,以及空洞的位置
  • 知道了空洞的位置之后,读到源文件中的空洞部分时,在目标文件中lseek相应的长度

3 创建空洞文件,同时探索空洞性质

交替lseekwrite,逐渐增大间隔长度。比较文件的大小和实际占用的block数目

  • 测试源码
点击展开代码
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <string.h>

int holesize[]={1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32*1024};
int filesize = 64*1024;

int main()
{
int i = 0;
int count = 0;
int ret = 0, fd = 0;
char filename[32]={0};
unsigned char buf[32*1024]={0};
memset(buf, 1, 32*1024);
for (; i< sizeof(holesize)/ sizeof(int); ++i) {
count = 0;
memset(filename, 0, 32);
sprintf(filename, "%s%d", "holesize", holesize[i]);
fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC, S_IRUSR|S_IWUSR|S_IRGRP|S_IROTH);
if(fd < 0) {
printf("open file fail\n");
return -1;
}
while(count < filesize) {
ret = lseek(fd, holesize[i], SEEK_CUR);
if(ret < 0) {
printf("lseek fail\n");
return -1;
}
int remain = holesize[i];
while(remain) {
ret = write(fd, buf, remain);
if(ret < 0 ) {
perror("write fail\n");
return -1;
}
remain -= ret;
}
count += holesize[i] * 2;
}
close(fd);
}
return 0;
}
  • MAC OSX 10.1.4.6测试结果
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
^_^$ ll -s
128 -rw-r--r-- 1 chenzf staff 65536 12 28 20:08 holesize1
128 -rw-r--r-- 1 chenzf staff 65536 12 28 20:08 holesize1024
128 -rw-r--r-- 1 chenzf staff 65536 12 28 20:08 holesize128
128 -rw-r--r-- 1 chenzf staff 65536 12 28 20:08 holesize16
128 -rw-r--r-- 1 chenzf staff 65536 12 28 20:08 holesize16384
128 -rw-r--r-- 1 chenzf staff 65536 12 28 20:08 holesize2
128 -rw-r--r-- 1 chenzf staff 65536 12 28 20:08 holesize2048
128 -rw-r--r-- 1 chenzf staff 65536 12 28 20:08 holesize256
128 -rw-r--r-- 1 chenzf staff 65536 12 28 20:08 holesize32
128 -rw-r--r-- 1 chenzf staff 65536 12 28 20:08 holesize32768
128 -rw-r--r-- 1 chenzf staff 65536 12 28 20:08 holesize4
128 -rw-r--r-- 1 chenzf staff 65536 12 28 20:08 holesize4096
128 -rw-r--r-- 1 chenzf staff 65536 12 28 20:08 holesize512
128 -rw-r--r-- 1 chenzf staff 65536 12 28 20:08 holesize64
128 -rw-r--r-- 1 chenzf staff 65536 12 28 20:08 holesize8
128 -rw-r--r-- 1 chenzf staff 65536 12 28 20:08 holesize8192

Mac OSX上创建不了空洞文件,因为默认的文件系统是HFS +,不支持稀疏文件

  • Ubuntu18 4.15.0-60-generic测试结果
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
^_^$ ll -s
64 -rw-r--r-- 1 chen chen 65536 12月 25 00:08 holesize1
64 -rw-r--r-- 1 chen chen 65536 12月 25 00:08 holesize1024
64 -rw-r--r-- 1 chen chen 65536 12月 25 00:08 holesize128
64 -rw-r--r-- 1 chen chen 65536 12月 25 00:08 holesize16
32 -rw-r--r-- 1 chen chen 65536 12月 25 00:08 holesize16384
64 -rw-r--r-- 1 chen chen 65536 12月 25 00:08 holesize2
64 -rw-r--r-- 1 chen chen 65536 12月 25 00:08 holesize2048
64 -rw-r--r-- 1 chen chen 65536 12月 25 00:08 holesize256
64 -rw-r--r-- 1 chen chen 65536 12月 25 00:08 holesize32
32 -rw-r--r-- 1 chen chen 65536 12月 25 00:08 holesize32768
64 -rw-r--r-- 1 chen chen 65536 12月 25 00:08 holesize4
32 -rw-r--r-- 1 chen chen 65536 12月 25 00:08 holesize4096
64 -rw-r--r-- 1 chen chen 65536 12月 25 00:08 holesize512
64 -rw-r--r-- 1 chen chen 65536 12月 25 00:08 holesize64
64 -rw-r--r-- 1 chen chen 65536 12月 25 00:08 holesize8
32 -rw-r--r-- 1 chen chen 65536 12月 25 00:08 holesize8192

4KB以上才实际创建空洞。
因为在linux的文件系统中,磁盘分配的最小物理单元为簇。(即使文件大小不足以占用满一簇,该簇空余的磁盘存储仍旧是该文件的)

所以可以根据这个性质,判断文件是否是空洞文件。有空洞的文件,用文件大小计算的block数至少比实际占用的block数大1个簇的block数

如何可移植地获取簇的大小

1
pagesize = sysconf(_SC_PAGESIZE);

初步实现功能

  • 源码
点击展开代码
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
#include <stdio.h>
#include <unistd.h>
#include <fcntl.h>
#include <string.h>
#include <stdlib.h>
#include <sys/stat.h>
#include <sys/errno.h>

int my_cp(const char *from, const char *to)
{
int fd1 = -1, fd2 = -1;
int rev = -1;
unsigned char *buffer = NULL;
unsigned char *start_pos = NULL;
long pagesize = 0;
long long blocks, blksize, size;
int read_num, write_num, remain_num, current_pos = 0, last_zero = -1, last_nonzero = -1, have_holes = 0;
struct stat st;

fd1 = open(from, O_RDONLY);
if(-1 == fd1){
perror("open file1 faild");
goto err;
}

if(fstat(fd1, &st) !=0) {
perror("fstat: ");
goto err;
}
else{
#ifdef _SC_PAGESIZE
pagesize = sysconf(_SC_PAGESIZE);
if (pagesize < 0) {
if (errno != 0) {
if (errno == EINVAL) {
fputs(" (not supported)\n", stdout);
pagesize = st.st_blksize;
}
else {
perror("sysconf error");
goto err;
}
} else {
fputs(" (no limit)\n", stdout);
pagesize = st.st_blksize;
}
}
printf("pagesize: %ld\n", pagesize);
#else
pagesize = st.st_blksize;
#endif
blocks = st.st_blocks;
blksize = st.st_blksize;
size = st.st_size;
printf("st.st_blocks: %lld\n", blocks);
printf("st.st_blksize: %lld\n", blksize);
printf("st.st_size: %lld\n", size);
/*块大小512,在不同平台上可能不兼容*/
if(S_ISREG(st.st_mode) && (size / pagesize + (size%pagesize?1:0)) * pagesize > 512 * blocks) {
have_holes = 1;
printf("%s is a sparse-block file!\n", from);
} else{
have_holes = 0;
printf("%s is not a sparse-block file!\n", from);
}
}
fd2 = open(to, O_WRONLY | O_CREAT | O_TRUNC, S_IRUSR|S_IWUSR|S_IRGRP|S_IROTH);
if ( -1 == fd2) {
perror ("open file2 faild");
goto err;
}

buffer = malloc(pagesize);
if(buffer == NULL) {
perror ("malloc fail");
goto err;
}
memset(buffer, '\0', pagesize);
while((read_num = read(fd1, buffer, pagesize)) > 0) {
/* 源文件有空洞 */
if(have_holes){
last_zero = -1;
last_nonzero = -1;
for(current_pos = 0; current_pos < read_num; current_pos++){
/* 逐字节判断,效率较低*/
if(buffer[current_pos] == 0){
if(last_nonzero > last_zero){
remain_num = last_nonzero - last_zero;
start_pos = buffer + last_zero + 1;
while(remain_num){
write_num = write(fd2, start_pos, remain_num);
if ( -1 == write_num){
perror( "write file2 error");
goto err;
}
remain_num -= write_num;
start_pos += write_num;
}
}
last_zero = current_pos;
}
else{
if(last_zero > last_nonzero){
remain_num = last_zero - last_nonzero;
if(-1 == lseek(fd2, remain_num, SEEK_CUR)){
perror("lseek file2 fail");
goto err;
}
}
last_nonzero = current_pos;
}
}
/* 处理最后剩余数据*/
remain_num = (last_nonzero > last_zero)?(last_nonzero - last_zero):(last_zero - last_nonzero);
start_pos = buffer + current_pos - remain_num;
if(last_nonzero > last_zero){
while(remain_num){
write_num = write(fd2, start_pos, remain_num);
if ( -1 == write_num){
perror( "write file2 error");
goto err;
}
remain_num -= write_num;
start_pos += write_num;
}
}
else{
if(-1 == lseek(fd2, remain_num, SEEK_CUR)){
perror("lseek file2 fail");
goto err;
}
}
}
/* 源文件无空洞 */
else {
remain_num = read_num;
start_pos = buffer;
while(remain_num){
write_num = write(fd2, start_pos, remain_num);
if ( -1 == write_num){
perror( "write file2 error");
goto err;
}
remain_num -= write_num;
start_pos += write_num;
}
}
}
if(-1 == read_num) {
perror("read file1 error");
goto err;
}
rev = 0;
err:
if(buffer) free(buffer);
close(fd1);
close(fd2);
return rev;
}

int main(int argc, char *argv[])
{
if(argc < 3) {
printf("Usage: %s file1 file2\n", argv[0]);
return -1;
}
my_cp(argv[1], argv[2]);
return 0;
}
  • 测试结果
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
^_^$ ./my_cp holesize2048 holesize2048.cp
pagesize: 4096
st.st_blocks: 128
st.st_blksize: 4096
st.st_size: 65536
holesize2048 is not a sparse-block file!
chen@ubuntu18:~/study/apue.3e/exercises/4
^_^$ ./my_cp holesize4096 holesize4096.cp
pagesize: 4096
st.st_blocks: 72
st.st_blksize: 4096
st.st_size: 65536
holesize4096 is a sparse-block file!

^_^$ ll -s
total 1708
64 -rw-r--r-- 1 chen chen 65536 1月 6 17:27 holesize2048
64 -rw-r--r-- 1 chen chen 65536 1月 6 17:27 holesize2048.cp
36 -rw-r--r-- 1 chen chen 65536 1月 6 17:27 holesize4096
32 -rw-r--r-- 1 chen chen 65536 1月 6 17:27 holesize4096.cp

空洞文件可以正常拷贝

尝试优化程序

上面的程序仅在判断文件是否含有空洞时利用的空洞的最小限制。而在实际读写时并没有利用该性质。

这样较短的0字节也会当成是空洞,导致系统调用次数的增加,性能的降低

要优化性能,必须进一步探究空洞的性质。在什么样的情况下才创建空洞(不实际占用磁盘空间的块)?

  • 测试程序源码

此程序创建了3个文件:

- 文件1先`write`了1K的非零数据,然后`lseek` 7K-1字节。循环2次。
- 文件2先`write`了1K的非零数据,然后`lseek` 7K字节。循环2次
- 文件3先`write`了1K的非零数据,然后`lseek` 7K+1字节。循环2次
点击展开代码
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <string.h>

int holesize[]={4096};
int filesize = 64*1024;

int main()
{
int i = 0;
int count = 0;
int ret = 0, fd1 = 0, fd2 = 0, fd3 = 0;
char filename1[32]={0};
char filename2[32]={0};
char filename3[32]={0};
unsigned char buf[32*1024]={0};
memset(buf, 1, 32*1024);
for (; i< sizeof(holesize)/ sizeof(int); ++i) {
count = 0;
memset(filename1, 0, 32);
memset(filename2, 0, 32);
memset(filename3, 0, 32);
sprintf(filename1, "%s%d-1", "holesize", holesize[i]);
sprintf(filename2, "%s%d-2", "holesize", holesize[i]);
sprintf(filename3, "%s%d-3", "holesize", holesize[i]);
fd1 = open(filename1, O_WRONLY | O_CREAT | O_TRUNC, S_IRUSR|S_IWUSR|S_IRGRP|S_IROTH);
fd2 = open(filename2, O_WRONLY | O_CREAT | O_TRUNC, S_IRUSR|S_IWUSR|S_IRGRP|S_IROTH);
fd3 = open(filename3, O_WRONLY | O_CREAT | O_TRUNC, S_IRUSR|S_IWUSR|S_IRGRP|S_IROTH);
if(fd1 < 0 || fd2 < 0 || fd3 < 0) {
printf("open file fail\n");
return -1;
}
count = 0;
while(count < 2) {
int remain = holesize[i] * 1 / 4;
while(remain) {
ret = write(fd1, buf, remain);
if(ret < 0 ) {
perror("write fail\n");
return -1;
}
remain -= ret;
}
ret = lseek(fd1, holesize[i] * 7 / 4 - 1, SEEK_CUR);
if(ret < 0) {
printf("lseek fail\n");
return -1;
}
++count;
}
count = 0;
while(count < 2) {
int remain = holesize[i] * 1 / 4;
while(remain) {
ret = write(fd2, buf, remain);
if(ret < 0 ) {
perror("write fail\n");
return -1;
}
remain -= ret;
}
ret = lseek(fd2, holesize[i] * 7 / 4, SEEK_CUR);
if(ret < 0) {
printf("lseek fail\n");
return -1;
}
++count;
}
count = 0;
while(count < 2) {
int remain = holesize[i] * 1 / 4;
while(remain) {
ret = write(fd3, buf, remain);
if(ret < 0 ) {
perror("write fail\n");
return -1;
}
remain -= ret;
}
ret = lseek(fd3, holesize[i] * 7 / 4 + 1, SEEK_CUR);
if(ret < 0) {
printf("lseek fail\n");
return -1;
}
++count;
}
close(fd1);
close(fd2);
close(fd3);
}
return 0;
}
  • 测试结果
1
2
3
4
^_^$ ll -s
12 -rw-r--r-- 1 chen chen 9215 1月 6 15:07 holesize4096-1
8 -rw-r--r-- 1 chen chen 9216 1月 6 15:07 holesize4096-2
8 -rw-r--r-- 1 chen chen 9217 1月 6 15:07 holesize4096-3

可见空洞必须从一页的起始位置开始计算,并且等于或超过pagesize,才不占用实际磁盘空间

优化后程序

  • 源码
点击展开代码
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
#include <stdio.h>
#include <unistd.h>
#include <fcntl.h>
#include <string.h>
#include <stdlib.h>
#include <sys/stat.h>
#include <sys/errno.h>

ssize_t read_ex(int fd, void *buf, size_t nbyte){
size_t read_remain = nbyte;
unsigned char *read_start = (unsigned char*)buf;
ssize_t read_num = -1;
ssize_t total_num = 0;
while(read_remain) {
read_num = read(fd, read_start, read_remain);
if(-1 == read_num){
return -1;
}
else if(0 == read_num){
break;
}
else{
read_remain -= read_num;
read_start += read_num;
total_num += read_num;
}
}
return total_num;
}

ssize_t write_ex(int fd, const void *buf, size_t nbyte){
size_t write_remain = nbyte;
unsigned char *write_start = (unsigned char*)buf;
ssize_t write_num = -1;
ssize_t total_num = 0;
while(write_remain) {
write_num = write(fd, write_start, write_remain);
if(-1 == write_num){
return -1;
}
else{
write_remain -= write_num;
write_start += write_num;
total_num += write_num;
}
}
return total_num;
}
int my_cp(const char *from, const char *to)
{
int fd1 = -1, fd2 = -1;
int rev = -1;
unsigned char *buffer = NULL, *buffer_zero = NULL;
long pagesize = 0;
long long blocks, blksize, size;
int read_num, write_num, write_remain, have_holes = 0;
struct stat st;

fd1 = open(from, O_RDONLY);
if(-1 == fd1){
perror("open file1 faild");
goto err;
}

if(fstat(fd1, &st) !=0) {
perror("fstat: ");
goto err;
}
else{
#ifdef _SC_PAGESIZE
pagesize = sysconf(_SC_PAGESIZE);
if (pagesize < 0) {
if (errno != 0) {
if (errno == EINVAL) {
fputs(" (not supported)\n", stdout);
pagesize = st.st_blksize;
}
else {
perror("sysconf error");
goto err;
}
} else {
fputs(" (no limit)\n", stdout);
pagesize = st.st_blksize;
}
}
printf("pagesize: %ld\n", pagesize);
#else
pagesize = st.st_blksize;
#endif
blocks = st.st_blocks;
blksize = st.st_blksize;
size = st.st_size;
printf("st.st_blocks: %lld\n", blocks);
printf("st.st_blksize: %lld\n", blksize);
printf("st.st_size: %lld\n", size);
/*块大小512,在不同平台上可能不兼容*/
if(S_ISREG(st.st_mode) && (size / pagesize + (size%pagesize?1:0)) * pagesize > 512 * blocks) {
have_holes = 1;
printf("%s is a sparse-block file!\n", from);
} else{
have_holes = 0;
printf("%s is not a sparse-block file!\n", from);
}
}
buffer = malloc(pagesize);
buffer_zero = malloc(pagesize);
if(buffer == NULL || buffer_zero == NULL) {
perror ("malloc fail");
goto err;
}
memset(buffer, '\0', pagesize);
memset(buffer_zero, '\0', pagesize);

fd2 = open(to, O_WRONLY | O_CREAT | O_TRUNC, S_IRUSR|S_IWUSR|S_IRGRP|S_IROTH);
if (-1 == fd2) {
perror ("open file2 faild");
goto err;
}

while((read_num = read_ex(fd1, buffer, pagesize)) > 0) {
/* 读取到空洞 */
if(have_holes && !memcmp(buffer_zero, buffer, read_num)){
if(-1 == lseek(fd2, read_num, SEEK_CUR)){
perror("lseek file2 fail");
goto err;
}
}
/* 非空洞 */
else{
write_num = write_ex(fd2, buffer, read_num);
if (-1 == write_num){
perror( "write file2 error");
goto err;
}
}
}
if(-1 == read_num){
perror("read file1 error");
goto err;
}
rev = 0;
err:
if(buffer) free(buffer);
if(buffer_zero) free(buffer_zero);
close(fd1);
close(fd2);
return rev;
}

int main(int argc, char *argv[])
{
if(argc < 3) {
printf("Usage: %s file1 file2\n", argv[0]);
return -1;
}
my_cp(argv[1], argv[2]);
return 0;
}
  • 对比测试

构造一个文件,除了开头一个空洞,其余数据为0x00,0x01的100000次重复

用优化前的程序拷贝该文件10000次,大约2000s

用优化后的程序拷贝该文件10000次,大约30s

-------------本文结束感谢您的阅读-------------

欢迎关注我的其它发布渠道