linux - mmap speed compared to read and write
问题描述
I use mmap, read and write system calls to copy file. I want to see the advantage of mmap speed.
I consider four ways to copy:
- read+write
- read+mmap
- write+mmap
- mmap+mmap
However, the result is that regardless of the block size, mmap is always slower than read and write.
Source code:
#include <pthread.h>
#include <unistd.h>
#include <iostream>
#include <sys/mman.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <string.h>
#include <time.h>
constexpr long MAP_SIZE = 1L*1024*1024*1024;
constexpr int BLOCK_SIZE = 256;
using namespace std;
void test1(int fd1, int fd2){
    // read+write: copy MAP_SIZE bytes in BLOCK_SIZE chunks with the plain
    // read/write syscall pair; reports process CPU time via clock().
    char buf[BLOCK_SIZE];
    lseek(fd1, 0, SEEK_SET);
    lseek(fd2, 0, SEEK_SET);
    clock_t t1 = clock();
    for(long i = 0; i < MAP_SIZE / BLOCK_SIZE; i++){
        // Perform the same offset arithmetic as the mmap variants so the
        // per-iteration CPU overhead is comparable ("fair" benchmark).
        off_t file_offset = i*BLOCK_SIZE;
        off_t page_align_offset = file_offset & ~(sysconf(_SC_PAGE_SIZE)-1);
        off_t in_page_offset = file_offset - page_align_offset;
        int mm_len = in_page_offset + BLOCK_SIZE;
        (void)mm_len; // intentionally unused; kept only for fairness
        // Check the syscalls: a short read/write would silently corrupt
        // the copy and skew the timing.
        if (read(fd1, buf, BLOCK_SIZE) != BLOCK_SIZE ||
            write(fd2, buf, BLOCK_SIZE) != BLOCK_SIZE){
            cout << "read/write failed" << endl;
            return;
        }
    }
    clock_t t2 = clock();
    cout << "total size is: " << MAP_SIZE << endl;
    cout << "block size is: " << BLOCK_SIZE << endl;
    // Use t2 (captured immediately after the loop) instead of calling
    // clock() again, so the cout statements above are not timed.
    cout << "read+write use " << (t2 - t1) * 1.0 / CLOCKS_PER_SEC * 1000 << "ms" << endl;
}
void test2(int fd1, int fd2){
    // mmap+write: map a page-aligned window of fd1 per block, then write
    // the block into fd2 with the write syscall.
    char *buf;
    lseek(fd1, 0, SEEK_SET);
    lseek(fd2, 0, SEEK_SET);
    clock_t t1 = clock();
    for(long i = 0; i < MAP_SIZE / BLOCK_SIZE; i++){
        off_t file_offset = i*BLOCK_SIZE;
        // mmap offsets must be page-aligned; round down and keep the
        // intra-page remainder to locate the block inside the mapping.
        off_t page_align_offset = file_offset & ~(sysconf(_SC_PAGE_SIZE)-1);
        off_t in_page_offset = file_offset - page_align_offset;
        int mm_len = in_page_offset + BLOCK_SIZE;
        buf = (char *)mmap(NULL, mm_len, PROT_READ|PROT_WRITE, MAP_SHARED, fd1, page_align_offset);
        // Dereferencing a failed mapping is undefined behavior; bail out.
        if (buf == MAP_FAILED){
            cout << "mmap failed" << endl;
            return;
        }
        if (write(fd2, buf+in_page_offset, BLOCK_SIZE) != BLOCK_SIZE){
            munmap(buf, mm_len);
            cout << "write failed" << endl;
            return;
        }
        munmap(buf, mm_len);
    }
    clock_t t2 = clock();
    cout << "total size is: " << MAP_SIZE << endl;
    cout << "block size is: " << BLOCK_SIZE << endl;
    // t2 excludes the cout calls above from the measurement.
    cout << "mmap+write use " << (t2 - t1) * 1.0 / CLOCKS_PER_SEC * 1000 << "ms" << endl;
}
void test3(int fd1, int fd2){
    // read+mmap: read each block from fd1 with the read syscall, then
    // memcpy it into a per-block mapping of fd2.
    char buf1[BLOCK_SIZE];
    char* buf2;
    lseek(fd1, 0, SEEK_SET);
    lseek(fd2, 0, SEEK_SET);
    clock_t t1 = clock();
    for(long i = 0; i < MAP_SIZE / BLOCK_SIZE; i++){
        off_t file_offset = i*BLOCK_SIZE;
        // Round down to a page boundary (mmap requirement) and remember
        // where the block sits inside the mapping.
        off_t page_align_offset = file_offset & ~(sysconf(_SC_PAGE_SIZE)-1);
        off_t in_page_offset = file_offset - page_align_offset;
        int mm_len = in_page_offset + BLOCK_SIZE;
        if (read(fd1, buf1, BLOCK_SIZE) != BLOCK_SIZE){
            cout << "read failed" << endl;
            return;
        }
        buf2 = (char *)mmap(NULL, mm_len, PROT_READ|PROT_WRITE, MAP_SHARED, fd2, page_align_offset);
        // Writing through a failed mapping is undefined behavior.
        if (buf2 == MAP_FAILED){
            cout << "mmap failed" << endl;
            return;
        }
        memcpy(buf2+in_page_offset, buf1, BLOCK_SIZE);
        munmap(buf2, mm_len);
    }
    clock_t t2 = clock();
    cout << "total size is: " << MAP_SIZE << endl;
    cout << "block size is: " << BLOCK_SIZE << endl;
    // t2 keeps the cout calls above out of the timed interval.
    cout << "read+mmap use " << (t2 - t1) * 1.0 / CLOCKS_PER_SEC * 1000 << "ms" << endl;
}
void test4(int fd1, int fd2){
    // mmap+mmap: map matching windows of both files per block and copy
    // with memcpy; no read/write syscalls at all.
    char* buf1;
    char* buf2;
    clock_t t1 = clock();
    for(long i = 0; i < MAP_SIZE / BLOCK_SIZE; i++){
        off_t file_offset = i*BLOCK_SIZE;
        // mmap offsets must be page-aligned; track the in-page remainder.
        off_t page_align_offset = file_offset & ~(sysconf(_SC_PAGE_SIZE)-1);
        off_t in_page_offset = file_offset - page_align_offset;
        int mm_len = in_page_offset + BLOCK_SIZE;
        buf1 = (char *)mmap(NULL, mm_len, PROT_READ|PROT_WRITE, MAP_SHARED, fd1, page_align_offset);
        if (buf1 == MAP_FAILED){
            cout << "mmap fd1 failed" << endl;
            return;
        }
        buf2 = (char *)mmap(NULL, mm_len, PROT_READ|PROT_WRITE, MAP_SHARED, fd2, page_align_offset);
        if (buf2 == MAP_FAILED){
            // Don't leak the first mapping on the error path.
            munmap(buf1, mm_len);
            cout << "mmap fd2 failed" << endl;
            return;
        }
        memcpy(buf2+in_page_offset, buf1+in_page_offset, BLOCK_SIZE);
        munmap(buf1, mm_len);
        munmap(buf2, mm_len);
    }
    clock_t t2 = clock();
    cout << "total size is: " << MAP_SIZE << endl;
    cout << "block size is: " << BLOCK_SIZE << endl;
    // t2 excludes the preceding cout calls from the measurement.
    cout << "mmap+mmap use " << (t2 - t1) * 1.0 / CLOCKS_PER_SEC * 1000 << "ms" << endl;
}
int main(int argc, char *argv[])
{
    // Create two 1 GiB POSIX shared-memory objects and run the four
    // copy benchmarks against them.
    // POSIX requires shm object names to begin with '/' for portable
    // behavior (shm_open(3)).
    int fd1 = shm_open("/shm1", O_RDWR|O_CREAT, 0644);
    int fd2 = shm_open("/shm2", O_RDWR|O_CREAT, 0644);
    // Alternative: benchmark against regular files instead of shm.
    // fd1 = open("f1", O_RDWR|O_CREAT, 0644);
    // fd2 = open("f2", O_RDWR|O_CREAT, 0644);
    if(fd1 < 0 || fd2 < 0){
        printf("shm_open failed\n");
        exit(1);
    }
    // Size both objects before the benchmarks read/map them.
    if (ftruncate(fd1, MAP_SIZE) < 0 || ftruncate(fd2, MAP_SIZE) < 0){
        printf("ftruncate failed\n");
        exit(1);
    }
    test1(fd1, fd2);
    test2(fd1, fd2);
    test3(fd1, fd2);
    test4(fd1, fd2);
    // Clean up: close the descriptors and remove the shm objects so
    // repeated runs don't leave 1 GiB objects behind in /dev/shm.
    close(fd1);
    close(fd2);
    shm_unlink("/shm1");
    shm_unlink("/shm2");
    return 0;
}
One of results showing that mmap is slower:
total size is: 1073741824
block size is: 4096
read+write use 254.571ms
total size is: 1073741824
block size is: 4096
mmap+write use 486.342ms
total size is: 1073741824
block size is: 4096
read+mmap use 537.277ms
total size is: 1073741824
block size is: 4096
mmap+mmap use 737.734ms
解决方案
clock 函数测量的是进程使用的 CPU 时间。它对“工作是在进程上下文还是内核上下文中完成”非常敏感。
当一个线程从文件中读取数据时，无论是通过调用 read 还是访问 mmap 映射的内存，在读取操作完成之前，该线程都无法继续向前推进。最有效的做法是立即执行与该操作相关的所有 CPU 工作，以便进程可以尽快恢复运行。由于您衡量的是在进程上下文中消耗了多少 CPU 时间，您实际上是在惩罚更高效的实现——因为它以最有效的方式在进程上下文中完成了更多的工作。
所以这种类型的测量实际上奖励了低效率。在流程上下文之外完成的工作不计算在内,因此该方法效率低下的工作越多(同时使流程等待更长的时间),您说它使用的时间就越少。
我认为主要的问题是你真的没有一个明确的问题。为什么测量“速度”意味着测量使用的进程 CPU 时间?为什么不测量挂墙时间?
推荐阅读
- reactjs - 无法访问 AG Grid cellRenderer/valueGetter 函数中更新的反应组件状态值
- python - 我的网页抓取脚本的输出有问题
- sql - 如何在我的 SQL 查询中使用 FIND_IN_SET 和 sum 列
- r - 在ggplot中更改图例线型的颜色
- php - 如何显示错误但仍获得正确的 HTTP 状态?
- r - R 或 Mplus 中的中介分析
- javascript - 通过数组 reactjs 处理多个循环复选框
- python - 如何查看concurrent.futures.ThreadPoolExecutor线程执行结果?
- javascript - 如何将布尔值从一个功能组件传递到另一个功能组件?
- python - 计算两个向量的加法、乘法和点积