首页 > 解决方案 > 英特尔 FPGA 的 OpenCL 中本地内存阵列的 RAM 消耗量如此之大

问题描述

我在 OpenCL 中为 FPGA 板编写了一个简单的代码。我使用 DE10 nano 仅共享板和 Intel SDK 18.1 。主要问题是内存消耗过多。HTML 报告主要显示本地内存数组中的问题。在 ND 范围内核中,这个问题会变得更糟!

另一个问题是:所有本地数组都有一个编译器警告,即:

(积极的编译器优化:移除不必要的存储到本地内存)

顺便说一句,在 LOOPS ANALYSIS 选项卡中有 II : ~1 并在 Details 窗格中提到:

(由于以下可停顿指令,II 是一个近似值:加载操作#no,存储操作#no)。我怎样才能解决它并达到 II 的确切 1 ?!
编码:

#define IDX(i, j, n) ((i) * (n) + (j))
//#include<stdlib.h>

__kernel void PushKernel( uint column,__global int * restrict height,
__global int * restrict excessFlow,__global int * restrict netFlowOutS,
__global int * restrict netFlowInT,uint s,uint t,uint row,
__global int * restrict residualFlow_up,__global int * restrict residualFlow_down,
__global int * restrict residualFlow_right,__global int * restrict residualFlow_left)
{
    const uint num_column=6;
    const uint num_row=4;
    int FlowOutS=*netFlowOutS;
    int FlowInT=*netFlowInT;
    uint source=s;
    uint destination=t;
    uint index;
    __local int heights_horizontal_cache[6];
    __local int excessFlow_horizontal_cache[6];
    __local int excessFlow_horizontal_cache_temp[6];
    __local int residualFlow_right_cache[6];
    __local int residualFlow_left_cache[6];
    __local int outS_cache;
    //#pragma unroll
    //#pragma loop_coalesce
    #pragma ivdep
    //#pragma ii 1
    for(int i=0; i<num_row; i++){index=IDX(i, 0, num_column);
        #pragma unroll
        #pragma ivdep
        for(int j=0; j<num_column; j++){//index=IDX(i, 0, num_column);
            heights_horizontal_cache[j]=height[index+j];
            excessFlow_horizontal_cache[j]=excessFlow[index+j];
            excessFlow_horizontal_cache_temp[j]=0;
            residualFlow_right_cache[j]=residualFlow_right[index+j];
            residualFlow_left_cache[j]=residualFlow_left[index+j];
            outS_cache=0;
        }
    
//mem_fence(CLK_GLOBAL_MEM_FENCE);
///////////////////////////////////////////////////////////////////////push to right
     
        //#pragma ivdep array (residualFlow_right_cache)  
        #pragma ivdep
        #pragma unroll
        for(int j=0; j<num_column-1; j++){
            //index=IDX(i, j, num_column);
            
            if(index+j != source && index+j != destination && excessFlow_horizontal_cache[j]>0 && residualFlow_right_cache[j]>0 && heights_horizontal_cache[j]==heights_horizontal_cache[j+1]+1){
                int delta = min(excessFlow_horizontal_cache[j], residualFlow_right_cache[j]);
                residualFlow_right_cache[j]-=delta; 
                residualFlow_left_cache[j+1]+=delta;    
                excessFlow_horizontal_cache[j]-=delta;
                
                //excessFlow_horizontal_cache[j+1]+=delta;
                excessFlow_horizontal_cache_temp[j+1]=delta;

                if (IDX(i, j+1, num_column) == s) {
                    //FlowOutS-=delta;
                    outS_cache=delta;
                } 
                else if (IDX(i, j+1, num_column) == t) {
                    FlowInT+=delta;}
            }
        
///////////////////////////////////////////////////////////////////////results back to global
//mem_fence(CLK_GLOBAL_MEM_FENCE);
        }
        #pragma unroll
        #pragma ivdep
        for(int j=0; j<num_column; j++){
            excessFlow_horizontal_cache[j]+=excessFlow_horizontal_cache_temp[j];
        }
        #pragma unroll
        #pragma ivdep
        for(int j=0; j<num_column; j++){
            //index=IDX(i, 0, num_column);
            excessFlow[index+j]=excessFlow_horizontal_cache[j];
            
            residualFlow_right[index+j]=residualFlow_right_cache[j];
            residualFlow_left[index+j]=residualFlow_left_cache[j];  
        }
    }
    FlowOutS-=outS_cache;
    *netFlowOutS=FlowOutS;
    *netFlowInT=FlowInT;
}

这是 HTML 报告:
HTML 报告

标签: openclintel-fpga

解决方案


推荐阅读