首页 > 解决方案 > 大内存区域的“clflush”不刷新?

问题描述

我想尝试测量从缓存访问和从主内存访问时内存访问的时序差异。

考虑这个程序:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <malloc.h>
#include <time.h>

#include <x86intrin.h>

// Step, in bytes, between successive clflush targets (presumably the cache-line size of the target CPU -- TODO confirm)
#define CL_SIZE 64
#define REGION_SIZE 0x40000000  // Size in bytes requested for the test region: 0x40000000 = 1 GiB

/**
 * Fill arr_ptr[0..N-1] with the integers 0..N-1 in a randomly shuffled order.
 *
 * The C library generator is re-seeded from the wall clock on every call.
 */
void gen_perm(int* arr_ptr,int N)
{
  srand(time(NULL));

  /* Start from the identity sequence 0, 1, ..., N-1. */
  int pos = 0;
  while (pos < N) {
    arr_ptr[pos] = pos;
    pos++;
  }

  /* Walk from the back, swapping each slot with a randomly picked slot at or before it. */
  int last = N - 1;
  while (last >= 0) {
    int pick = rand() % (last + 1);
    int held = arr_ptr[pick];
    arr_ptr[pick] = arr_ptr[last];
    arr_ptr[last] = held;
    last--;
  }
}

/**
 * Force a one-byte read of the given memory address.
 *
 * The byte at `address` is handed to an empty asm statement as a register
 * input, so the compiler has to load it. `volatile` and the "memory" clobber
 * keep the compiler from dropping the statement or moving other memory
 * accesses across it.
 *
 * Declared `static inline`: a plain `inline` definition in C99/C11 provides no
 * external definition, so a call that is not inlined fails at link time.
 * `__asm__` is used instead of `asm` so the code also builds in strict ISO
 * modes such as -std=c11.
 *
 * @param address  location to read; must point to a readable byte
 */
static inline void force_read(const char *address)
{
  __asm__ __volatile__("" : : "r"(*address) : "memory");
}

/**
 * Return the current 64-bit timestamp-counter value (x86, via RDTSCP).
 *
 * Instruction sequence:
 *   - mfence: issued before the counter is read, as in the original sequence.
 *   - rdtscp: reads the counter into EDX:EAX (and writes ECX, hence the clobber).
 *   - lfence: keeps instructions that follow (e.g. the load being timed) from
 *     starting before the counter has actually been read.
 *
 * The "=a"/"=d" constraints bind the outputs directly to EAX/EDX, so no extra
 * register moves are needed. The "memory" clobber stops the compiler from
 * moving memory accesses across the timestamp.
 *
 * Declared `static inline` so that a non-inlined call still links in C99/C11,
 * and written with `__asm__` so it builds in strict ISO modes as well.
 *
 * @return the counter value, high half from EDX and low half from EAX
 */
static inline uint64_t timing(void)
{
  uint32_t time_lo, time_hi;
  __asm__ __volatile__(
    "mfence \n"
    "rdtscp \n"
    "lfence \n"
    : "=a" (time_lo), "=d" (time_hi)
    :
    : "rcx", "memory");
  return ( time_lo | ((uint64_t)time_hi << 32) );
}

/* Base address of the memory region under test; assigned in main() and indexed by clean_cache() and profile(). */
char* mapped_area;

void clean_cache()
/**Objective is to flush the mapped_area completely from the cache
 * */
{
  for (int i=0;i<512*100;i+=CL_SIZE) // ---> NOTE THE !!! 512*100 !!! I'm not even flushing the entire mapped_area
    asm volatile ("clflush %[target]"::[target]"m"(mapped_area[i]));
    //_mm_clflush(&mapped_area[i]);  ---> You can use this intrinsic function too

}

/**
 * Time one read at each sampled offset of mapped_area and print the results.
 *
 * Sample k is located at byte offset k*stride inside mapped_area. The samples
 * are visited in a random order produced by gen_perm(), after clean_cache()
 * has been called. Each read is bracketed by two timing() calls and the
 * difference is stored under the sample's own index, so the printed table is
 * in index order regardless of the visiting order.
 *
 * @param stride  distance in bytes between consecutive sampled offsets
 * @param range   number of offsets to sample
 * @return 0 on success, -1 if range is negative or a work buffer cannot be
 *         allocated
 */
int profile(int stride,int range)
{
  if (range < 0)
    return -1;

  size_t count = (size_t)range;
  uint64_t *result_array = malloc(count * sizeof *result_array);
  int *perm_array = malloc(count * sizeof *perm_array);

  /* malloc(0) may legitimately return NULL, so only treat NULL as failure when count > 0. */
  if (count > 0 && (result_array == NULL || perm_array == NULL)) {
    free(perm_array);
    free(result_array);
    return -1;
  }

  gen_perm(perm_array,range);
  clean_cache();

  for (int i = 0; i < range; i++) {
    /* Random visiting order: trying to remove the prefetcher influence. */
    int mixed_index = perm_array[i];
    /* Widen before multiplying so the byte offset cannot overflow int. */
    int64_t offset = (int64_t)mixed_index * stride;

    uint64_t started = timing();
    force_read(&mapped_area[offset]);
    uint64_t finished = timing();

    result_array[mixed_index] = finished - started;
  }

  printf("\nLineNo\tTime");
  for (int i = 0; i < range; i++) {
    /* uint64_t is not guaranteed to be unsigned long; cast to match the format. */
    printf("\n%d\t%llu", i, (unsigned long long)result_array[i]);
  }

  free(perm_array);
  free(result_array);
  return 0;
}

/**
 * Allocate the 4096-byte-aligned test region, profile 100 reads spaced 512
 * bytes apart, then release the region.
 *
 * Nothing writes to the region before it is profiled.
 *
 * @return 0 on success, EXIT_FAILURE if the region cannot be allocated
 */
int main(void)
{
  mapped_area = memalign(4096,REGION_SIZE);
  if (mapped_area == NULL) {
    perror("memalign");
    return EXIT_FAILURE;
  }

  profile(512, 100);
  free(mapped_area);
  return 0;
}

我得到的输出是:

LineNo  Time
0   76
1   76
2   76
3   76
4   692
5   76
6   76
7   76
8   280
9   76
10  76
11  76
12  76

....

97  76
98  76
99  76

显然,如果这些值真的是从主内存中读取的,那么这些时间太短了,不可能是正确的(我认为应该在 200-300 个周期左右)。我哪里做错了?

标签: c performance assembly caching x86

解决方案


看起来代码是正确的(据我所知)。我认为问题出在:每当我尝试从 mapped_area 读取时,操作系统都会介入。只要先对每个页面执行一次写入,强制操作系统真正把这些页面分配给我们,就可以很容易地缓解这个问题。

/**
 * Allocate the test region, write to every page of it, profile 100 reads
 * spaced 512 bytes apart, then release the region.
 *
 * The write pass is intended to force the OS to actually hand over the pages
 * before any timing is done.
 *
 * @return 0 on success, EXIT_FAILURE if the region cannot be allocated
 */
int main(void)
{
  /* Assumed page size; sysconf(_SC_PAGESIZE) would query the real value. */
  const size_t page_bytes = 4096;

  mapped_area = memalign(page_bytes, REGION_SIZE);
  if (mapped_area == NULL) {
    perror("memalign");
    return EXIT_FAILURE;
  }

  /* Write one byte per page; size_t keeps the index safe if REGION_SIZE grows. */
  for (size_t offset = 0; offset < REGION_SIZE; offset += page_bytes) {
    mapped_area[offset] = 123;
  }

  profile(512, 100);
  free(mapped_area);
  return 0;
}

这给了我:

LineNo  Time
0   272
1   266
2   422
3   234
4   234
5   254
6   220
7   230
8   266
...

97  212
98  264
99  268

相当一致!

PS:我不确定为什么即使发生了缺页异常(page fault),读取时间也没有增加……有什么想法吗?


推荐阅读