首页 > 解决方案 > mmap speed compared to read and write

问题描述

I use the mmap, read and write system calls to copy a file. I want to see the speed advantage of mmap.

I consider four ways to copy:

  1. read+write
  2. mmap+write
  3. read+mmap
  4. mmap+mmap

However, the result is that regardless of the block size, mmap is always slower than read and write.

Source code:

#include <fcntl.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <time.h>
#include <unistd.h>

#include <iostream>

// Total number of bytes each benchmark copies (1 GiB); main() also truncates
// both shared-memory objects to this length.
constexpr long MAP_SIZE = 1L*1024*1024*1024;
// Number of bytes transferred per loop iteration in every test.
constexpr int BLOCK_SIZE = 256;

// Lets the code below refer to cout/endl without the std:: prefix.
using namespace std;

// Benchmark 1 (read+write): copies MAP_SIZE bytes from fd1 to fd2 in
// BLOCK_SIZE chunks through a stack buffer, then prints the time spent.
//
// fd1: source descriptor, must be readable and at least MAP_SIZE bytes long.
// fd2: destination descriptor, must be writable.
//
// Exits the process with status 1 if a read or write fails or is short, so a
// broken run can never be reported as a valid timing.
//
// Note: clock() reports processor time consumed by the process, not elapsed
// wall-clock time.
void test1(int fd1, int fd2){
    // read+write
    char buf[BLOCK_SIZE];
    lseek(fd1, 0, SEEK_SET);
    lseek(fd2, 0, SEEK_SET);

    const clock_t t1 = clock();
    for(long i = 0; i < MAP_SIZE / BLOCK_SIZE; i++){
        // read()/write() do not need these values; they are computed only so
        // that this loop carries the same per-iteration bookkeeping as the
        // mmap-based tests and the comparison stays fair.
        const off_t file_offset = i*BLOCK_SIZE;
        const off_t page_align_offset = file_offset & ~(sysconf(_SC_PAGE_SIZE)-1);
        const off_t in_page_offset = file_offset-page_align_offset;
        const int mm_len = in_page_offset+BLOCK_SIZE;
        (void)mm_len;

        if(read(fd1, buf, BLOCK_SIZE) != BLOCK_SIZE){
            fprintf(stderr, "read failed or was short at offset %lld\n", (long long)file_offset);
            exit(1);
        }
        if(write(fd2, buf, BLOCK_SIZE) != BLOCK_SIZE){
            fprintf(stderr, "write failed or was short at offset %lld\n", (long long)file_offset);
            exit(1);
        }
    }
    const clock_t t2 = clock();

    cout << "total size is: " << MAP_SIZE << endl;
    cout << "block size is: " << BLOCK_SIZE << endl;
    // Use the timestamp taken right after the loop so that the console output
    // above is not counted as part of the copy.
    cout << "read+write use " << (t2 - t1) * 1.0 / CLOCKS_PER_SEC * 1000 << "ms" << endl;
}
// Benchmark 2 (mmap+write): for every BLOCK_SIZE chunk, maps the page-aligned
// region of fd1 that contains the chunk, write()s the chunk to fd2, and unmaps
// it again. Prints the time spent once MAP_SIZE bytes have been copied.
//
// fd1: source descriptor, must be mappable with PROT_READ|PROT_WRITE.
// fd2: destination descriptor, must be writable.
//
// Exits the process with status 1 if mmap fails or a write fails or is short.
//
// Note: clock() reports processor time consumed by the process, not elapsed
// wall-clock time.
void test2(int fd1, int fd2){
    // mmap+write
    lseek(fd1, 0, SEEK_SET);
    lseek(fd2, 0, SEEK_SET);

    const clock_t t1 = clock();
    for(long i = 0; i < MAP_SIZE / BLOCK_SIZE; i++){
        // mmap offsets must be page aligned: map from the start of the page
        // holding this chunk and address the chunk relative to that start.
        const off_t file_offset = i*BLOCK_SIZE;
        const off_t page_align_offset = file_offset & ~(sysconf(_SC_PAGE_SIZE)-1);
        const off_t in_page_offset = file_offset - page_align_offset;
        const size_t mm_len = static_cast<size_t>(in_page_offset) + BLOCK_SIZE;

        void *mapping = mmap(nullptr, mm_len, PROT_READ|PROT_WRITE, MAP_SHARED, fd1, page_align_offset);
        if(mapping == MAP_FAILED){
            perror("mmap");
            exit(1);
        }
        const char *buf = static_cast<const char *>(mapping);
        if(write(fd2, buf+in_page_offset, BLOCK_SIZE) != BLOCK_SIZE){
            fprintf(stderr, "write failed or was short at offset %lld\n", (long long)file_offset);
            exit(1);
        }
        munmap(mapping, mm_len);
    }
    const clock_t t2 = clock();

    cout << "total size is: " << MAP_SIZE << endl;
    cout << "block size is: " << BLOCK_SIZE << endl;
    // Use the timestamp taken right after the loop so that the console output
    // above is not counted as part of the copy.
    cout << "mmap+write use " << (t2 - t1) * 1.0 / CLOCKS_PER_SEC * 1000 << "ms" << endl;
}
void test3(int fd1, int fd2){
    // read+mmap
    char buf1[BLOCK_SIZE];
    char* buf2;
    lseek(fd1, 0, SEEK_SET);
    lseek(fd2, 0, SEEK_SET);

    clock_t t1 = clock();
    for(long i = 0; i < MAP_SIZE / BLOCK_SIZE; i++){
        off_t file_offset = i*BLOCK_SIZE;
        off_t page_align_offset = i*BLOCK_SIZE & ~(sysconf(_SC_PAGE_SIZE)-1);
        off_t in_page_offset = file_offset-page_align_offset;
        int mm_len = in_page_offset+BLOCK_SIZE;

        read(fd1, buf1, BLOCK_SIZE);
        buf2 = (char *)mmap(NULL, mm_len, PROT_READ|PROT_WRITE, MAP_SHARED, fd2, page_align_offset);
        memcpy(buf2+in_page_offset, buf1, BLOCK_SIZE);
        munmap(buf2, mm_len);
    }
    clock_t t2 = clock();
    cout << "total size is: " << MAP_SIZE << endl;
    cout << "block size is: " << BLOCK_SIZE << endl;
    cout << "read+mmap use " << (clock() - t1) * 1.0 / CLOCKS_PER_SEC * 1000 << "ms" << endl;
}
// Benchmark 4 (mmap+mmap): for every BLOCK_SIZE chunk, maps the page-aligned
// region containing the chunk in both fd1 and fd2, memcpy()s the chunk from
// the source mapping to the destination mapping, and unmaps both. Prints the
// time spent once MAP_SIZE bytes have been copied.
//
// fd1: source descriptor, must be mappable with PROT_READ|PROT_WRITE.
// fd2: destination descriptor, must be mappable with PROT_READ|PROT_WRITE.
//
// Exits the process with status 1 if either mmap call fails.
//
// Note: clock() reports processor time consumed by the process, not elapsed
// wall-clock time.
void test4(int fd1, int fd2){
    // mmap+mmap
    const clock_t t1 = clock();
    for(long i = 0; i < MAP_SIZE / BLOCK_SIZE; i++){
        // mmap offsets must be page aligned: map from the start of the page
        // holding this chunk and address the chunk relative to that start.
        const off_t file_offset = i*BLOCK_SIZE;
        const off_t page_align_offset = file_offset & ~(sysconf(_SC_PAGE_SIZE)-1);
        const off_t in_page_offset = file_offset-page_align_offset;
        const size_t mm_len = static_cast<size_t>(in_page_offset) + BLOCK_SIZE;

        void *src_mapping = mmap(nullptr, mm_len, PROT_READ|PROT_WRITE, MAP_SHARED, fd1, page_align_offset);
        if(src_mapping == MAP_FAILED){
            perror("mmap (source)");
            exit(1);
        }
        void *dst_mapping = mmap(nullptr, mm_len, PROT_READ|PROT_WRITE, MAP_SHARED, fd2, page_align_offset);
        if(dst_mapping == MAP_FAILED){
            perror("mmap (destination)");
            exit(1);
        }
        const char *buf1 = static_cast<const char *>(src_mapping);
        char *buf2 = static_cast<char *>(dst_mapping);
        memcpy(buf2+in_page_offset, buf1+in_page_offset, BLOCK_SIZE);

        munmap(src_mapping, mm_len);
        munmap(dst_mapping, mm_len);
    }
    const clock_t t2 = clock();

    cout << "total size is: " << MAP_SIZE << endl;
    cout << "block size is: " << BLOCK_SIZE << endl;
    // Use the timestamp taken right after the loop so that the console output
    // above is not counted as part of the copy.
    cout << "mmap+mmap use " << (t2 - t1) * 1.0 / CLOCKS_PER_SEC * 1000 << "ms" << endl;
}


// Creates two POSIX shared-memory objects of MAP_SIZE bytes, runs the four
// copy benchmarks from the first object to the second, and then releases the
// descriptors and removes the objects again.
//
// Returns 0 on success; exits with status 1 if the objects cannot be created
// or resized.
int main(int argc, char *argv[])
{
    (void)argc;
    (void)argv;

    // POSIX only defines the behaviour of shm_open() for names that start
    // with a slash, so use that form for portability.
    const char *src_name = "/shm1";
    const char *dst_name = "/shm2";

    int fd1 = shm_open(src_name, O_RDWR|O_CREAT, 0644);
    int fd2 = shm_open(dst_name, O_RDWR|O_CREAT, 0644);
    // Alternative: benchmark regular files instead of shared memory. If this
    // is enabled, the shm_unlink() calls below must become unlink().
    // fd1 = open("f1", O_RDWR|O_CREAT, 0644);
    // fd2 = open("f2", O_RDWR|O_CREAT, 0644);

    // Closes whatever was opened and removes both objects so that a run does
    // not leave two MAP_SIZE-sized shared-memory objects behind.
    auto release = [&]{
        if(fd1 >= 0) close(fd1);
        if(fd2 >= 0) close(fd2);
        shm_unlink(src_name);
        shm_unlink(dst_name);
    };

    if(fd1 < 0 || fd2 < 0){
        printf("shm_open failed\n");
        release();
        exit(1);
    }

    if (ftruncate(fd1, MAP_SIZE) < 0){
        printf("ftruncate failed\n");
        release();
        exit(1);
    }
    if (ftruncate(fd2, MAP_SIZE) < 0){
        printf("ftruncate failed\n");
        release();
        exit(1);
    }

    test1(fd1, fd2);
    test2(fd1, fd2);
    test3(fd1, fd2);
    test4(fd1, fd2);

    release();
    return 0;
}

One of the results, showing that mmap is slower:

total size is: 1073741824

block size is: 4096

read+write use 254.571ms

total size is: 1073741824

block size is: 4096

mmap+write use 486.342ms

total size is: 1073741824

block size is: 4096

read+mmap use 537.277ms

total size is: 1073741824

block size is: 4096

mmap+mmap use 737.734ms

标签: linuxfilecopymmap

解决方案


clock函数测量进程使用的 CPU 时间。它对工作是在进程上下文还是内核上下文中完成很敏感。

当一个线程要从文件中读取数据时,无论是通过调用 read 还是访问 mmap 映射的内存,在读取操作完成之前,该线程都无法继续向前推进。最有效的做法是立即执行与该操作相关的所有 CPU 工作,以便进程可以尽快恢复运行。由于您衡量的是在该上下文中完成了多少工作,因此您实际上惩罚了更高效的操作,因为它以最有效的方式完成了更多的工作。

所以这种类型的测量实际上奖励了低效率。在进程上下文之外完成的工作不计算在内,因此一种方法以低效方式完成的工作越多(同时让进程等待的时间更长),您测得它使用的时间反而越少。

我认为主要的问题是您并没有一个明确的问题。为什么测量“速度”就意味着测量进程使用的 CPU 时间?为什么不测量挂钟时间(wall-clock time)?


推荐阅读