首页 > 解决方案 > 结构中的指针是否会减慢我的代码速度?

问题描述

我正在寻找一些帮助或提示来加速我的代码。

我已经实现了一个从一组球谐系数 C_{n,m} 和 S_{n,m} 计算空间中一点 (r,phi,lambda) 的引力势的例程。该等式显示在下面的链接中:

正在实施的方程式

并且包括与纬度 (phi) 相关的关联勒让德多项式 P_{n,m} 的递归计算,从前两个值 P{0,0} 和 P_{1,1} 开始。

起初,我将其实现为 MATLAB C-MEX 代码,而我的代码的核心部分只有 C 语言。我想制作一个纯 C 例程,但发现代码运行速度慢了 3-5 倍,这让我想知道为什么。这可能是我定义我的结构并在中央代码中使用指向指针的方式吗?似乎是核心计算部分花费了额外的时间,但这部分并没有改变,尽管在我使用指针直接传递给变量之前,现在我在结构内部使用指针。任何帮助表示赞赏!

在下文中,我将尝试解释我的代码并展示一些摘录:

在程序开始时,我定义了三个结构。一个保存球谐系数,C_{n,m} 和 S_{n,m}, (ggm_struct),一个保存计算坐标 (comp_struct),一个保存结果 (func_struct):

// Define constant variables
const double deg2rad = M_PI/180.0;                 // degrees to radians conversion factor
const double sfac = 1.0000E-280;                   // scaling factor
const double sqrt2 = 1.414213562373095;            // sqrt(2)
const double sqrt3 = 1.732050807568877;            // sqrt(3)

// Define structure to hold geopotential model
struct ggm_struct {
    char   product_type[100], modelname[100], errors[100], norm[100], tide_system[100];
    double GM, R, *C, *S;
    int    max_degree, ncoef;
};
// Define structure to hold computation coordinates
struct comp_struct {
    double *lat, *lon, *h;
    double *r, *phi;
    int    nlat, nlon;
};
/* Define structure to hold results */
struct func_struct {
    double *rval;
    int    npoints;
};

然后我有一个(子)函数,它首先分配空间,然后从 ascii 文件中加载系数

int read_gfc(char mfile[100], int *nmax, int *mmax, struct ggm_struct *ggm)
{

    // Set file identifier
    FILE *fid;

    // Declare variables
    char   str[200], var[100];
    int    n, m, nid, l00 = 0, l10 = 0, l11 = 0;
    double c, s;

    // Determine number of coefficients
    ggm->ncoef = (*nmax+2)*(*nmax+1)/2;

    // Allocate memory for coefficients
    ggm->C = (double*) malloc(ggm->ncoef*sizeof(double));
    if (ggm->C == NULL){
        printf("Error: Memory for C not allocated!");
        return -ENOMEM;
    }
    ggm->S = (double*) malloc(ggm->ncoef*sizeof(double));
    if (ggm->S == NULL){
        printf("Error: Memory for S not allocated!");
        return -ENOMEM;
    }

    // Open file
    fid = fopen(mfile,"r");

    // Check that file was opened correctly
    if (fid == NULL){
        printf("Error: opening file %s!",mfile);
        return -ENOMEM;
    }

    // Read file header
    while (fgets(str,200,fid) != NULL && strncmp(str,"end_of_head",11) != 0){

        // Extract model parameters
        if (strncmp(str,"product_type",12) == 0){ sscanf(str,"%s %s",var,ggm->product_type); }
        if (strncmp(str,"modelname",9) == 0){ sscanf(str,"%s %s",var,ggm->modelname); }
        if (strncmp(str,"earth_gravity_constant",22) == 0){ sscanf(str,"%s %lf",var,&ggm->GM); }
        if (strncmp(str,"radius",6) == 0){ sscanf(str,"%s %lf",var,&ggm->R); }
        if (strncmp(str,"max_degree",10) == 0){ sscanf(str,"%s %d",var,&ggm->max_degree); }
        if (strncmp(str,"errors",6) == 0){ sscanf(str,"%s %s",var,ggm->errors); }
        if (strncmp(str,"norm",4) == 0){ sscanf(str,"%s %s",var,ggm->norm); }
        if (strncmp(str,"tide_system",11) == 0){ sscanf(str,"%s %s",var,ggm->tide_system); }

    }

    // Read coefficients
    while (fgets(str,200,fid) != NULL){

        // Extract parameters
        sscanf(str,"%s %d %d %lf %lf",var,&n,&m,&c,&s);

        // Store parameters
        if (n <= *nmax && m <= *mmax) {

            // Determine index
            nid = (n+1)*n/2 + m;

            // Store values
            *(ggm->C+nid) = c;
            *(ggm->S+nid) = s;

        }

    }

    // Close fil
    fclose(fid);

    // Return from function
    return 0;

}

之后,计算网格由一个由七个组件组成的数组定义。例如,数组 [-90 90 -180 180 1 1 0] 定义了一个从 -90 到 90 度以 1 度为增量的网格,从 -180 到 180 度以 1 度为增量定义了一个网格。高度为零。从此数组中,在(子)函数中生成计算网格:

int make_grid(double *grid, struct comp_struct *inp)
{

    // Declare variables
    int n;

    /* Echo routine */
    printf("Creating grid of coordinates\n");
    printf("             [lat1,lat2,dlat] = [%f,%f,%f]\n", *grid, *(grid+1), *(grid+4) );
    printf("             [lon1,lon2,dlon] = [%f,%f,%f]\n", *(grid+2), *(grid+3), *(grid+5) );
    printf("             h                = %f\n", *(grid+6));

    /* Latitude ------------------------------------------------------------- */

    // Determine number of increments
    inp->nlat = ceil( ( *(grid+1) - *grid + *(grid+4) ) / *(grid+4) );

    // Allocate memory
    inp->lat = (double*) malloc(inp->nlat*sizeof(double));
    if (inp->lat== NULL){
        printf("Error: Memory for LATITUDE (inp.lat) points not allocated!");
        return -ENOMEM;
    }

    // Fill in values
    *(inp->lat) = *(grid+1);
    for (n = 1; n < inp->nlat-1; n++) {
        *(inp->lat+n) = *(inp->lat+n-1) - *(grid+4);
    }
    *(inp->lat+inp->nlat-1) = *grid;

    /* Longitude ------------------------------------------------------------ */

    // Determine number of increments
    inp->nlon = ceil( ( *(grid+3) - *(grid+2) + *(grid+5) ) / *(grid+5) );

    // Allocate memory
    inp->lon = (double*) malloc(inp->nlon*sizeof(double));
    if (inp->lon== NULL){
        printf("Error: Memory for LONGITUDE (inp.lon) points not allocated!");
        return -ENOMEM;
    }

    // Fill in values
    *(inp->lon) = *(grid+2);
    for (n = 1; n < inp->nlon-1; n++) {
        *(inp->lon+n) = *(inp->lon+n-1) + *(grid+5);
    }
    *(inp->lon+inp->nlon-1) = *(grid+3);

    /* Height --------------------------------------------------------------- */

    // Allocate memory
    inp->h = (double*) malloc(inp->nlat*sizeof(double));
    if (inp->h== NULL){
        printf("Error: Memory for HEIGHT (inp.h) points not allocated!");
        return -ENOMEM;
    }

    // Fill in values
    for (n = 0; n < inp->nlat; n++) {
        *(inp->h+n) = *(inp->h+n-1) + *(grid+6);
    }

    // Return from function
    return 0;

}

然后将这些地理坐标转换为球坐标以使用另一个(子)例程进行计算

int geo2sph(struct comp_struct *inp, int *lgrid)
{

    // Declare variables
    double a = 6378137.0, e2 = 6.69437999014E-3; /* WGS84 parameters */
    double x, y, z, sinlat, coslat, sinlon, coslon, R_E;
    int    i, j, nid;

    /* Allocate space ------------------------------------------------------- */
    
    // radius
    inp->r = (double*) malloc(inp->nlat*sizeof(double));
    if (inp->r== NULL){
        printf("Error: Memory for SPHERICAL DISTANCE (inp.r) points not allocated!");
        return -ENOMEM;
    }

    // phi
    inp->phi = (double*) malloc(inp->nlat*sizeof(double));
    if (inp->phi== NULL){
        printf("Error: Memory for SPHERICAL LATITUDE (inp.phi) points not allocated!");
        return -ENOMEM;
    }

    /* Loop over latitude =================================================== */
    for (i = 0; i < inp->nlat; i++) {

        // Compute sine and cosine of latitude
        sinlat = sin(*(inp->lat+i));
        coslat = cos(*(inp->lat+i));

        // Compute radius of curvature
        R_E = a / sqrt( 1.0 - e2*sinlat*sinlat );

        // Compute sine and cosine of longitude
        sinlon = sin(*(inp->lon));
        coslon = cos(*(inp->lon));

        // Compute rectangular coordinates
        x = ( R_E + *(inp->h+i) ) * coslat * coslon;
        y = ( R_E + *(inp->h+i) ) * coslat * sinlon;
        z = ( R_E*(1.0-e2) + *(inp->h+i) ) * sinlat;

        // Compute sqrt( x^2 + y^2 )
        R_E = sqrt( x*x + y*y );

        // Derive radial distance
        *(inp->r+i) = sqrt( R_E * R_E + z*z );

        // Derive spherical latitude
        if (R_E < 1) {
            if (z > 0) { *(inp->phi+i) = M_PI/2.0; }
            else { *(inp->phi+i) = -M_PI/2.0; }
        }
        else {
            *(inp->phi+i) = asin( z / *(inp->r+i) );
        }

    }

    // Return from function
    return 0;

}

最后,引力势在自身(子)函数内计算。这是代码的核心部分,它或多或少与 MATLAB C-MEX 函数相同。唯一的区别似乎是之前(在 MATLAB MEX 中)所有内容都被定义为(简单)双变量 - 现在变量位于包含指针的结构内。

int gravpot(struct comp_struct *inp, struct ggm_struct *ggm, int *nmax,
    int *mmax, int *lgrid, struct func_struct *out)
{

    // Declare variables
    double GMr, ar, t, u, u2, arn, gnm, hnm, P, Pp1, Pp2, msum;
    double Pmm[*nmax+1], CPnm[*mmax+1], SPnm[*mmax+1];
    int i, j, n, m, id;

    // Allocate memory
    out->rval = (double*) malloc(inp->nlat*inp->nlon*sizeof(double));
    if (out->rval== NULL){
        printf("Error: Memory for OUTPUT (out.rval) not allocated!");
        return -ENOMEM;
    }

    /* Compute sectorial values of associated Legendre polynomials ========== */

    // Define seed values ( divided by u^m )
    Pmm[0] = sfac;
    Pmm[1] = sqrt3 * sfac;

    // Compute sectorial values, [1] eq. 13 and 28 ( divided by u^m )
    for (m = 2; m <= *nmax; m++) {
        Pmm[m] = sqrt( (2.0*m+1.0) / (2.0*m) ) * Pmm[m-1];
    }

    /* ====================================================================== */

    /* Loop over latitude =================================================== */
    for (i = 0; i < inp->nlat; i++) {
      
        // Compute ratios to be used in summation
        GMr = ggm->GM / *(inp->r+i);
        ar  = ggm->R / *(inp->r+i);

        /* ---------------------------------------------------------------------
         * Compute product of Legendre values and spherical harmonic coefficients.
         * Products of similar degree are summed together, resulting in mmax
         * values. The degree terms are latitude dependent, such that these mmax
         * sums are valid for every point with the same latitude.
         * The values of the associated Legendre polynomials, Pnm, are scaled by
         * sfac = 10^(-280) and divided by u^m in order to prevent underflow and
         * overflow of the coefficients.
         * ------------------------------------------------------------------ */

        // Form coefficients for Legendre recursive algorithm
        t   = sin(*(inp->phi+i));
        u   = cos(*(inp->phi+i));
        u2  = u * u;
        arn = ar;

        /* Degree n = 0 terms ----------------------------------------------- */

        // Compute order m = 0 term (S term is zero)
        CPnm[0] = Pmm[0] * *(ggm->C);

        /* Degree n = 1 terms ----------------------------------------------- */

        // Compute (1,1) terms, [1] eq. 3
        CPnm[1] = ar * Pmm[1] * *(ggm->C+2);
        SPnm[1] = ar * Pmm[1] * *(ggm->S+2);

        // Compute (1,0) Legendre value, [1] eq. 18 and 27
        P = t * Pmm[1];

        // Add (1,0) terms to sum (S term is zero), [1] eq. 3
        CPnm[0] = CPnm[0] + ar * P * *(ggm->C+1);

        /* Degree n = [2,n_max] --------------------------------------------- */
        for (n = 2; n <= *nmax; n++) {

            // Compute power term
            arn = arn * ar;

            /* Compute sectorial (m=n) terms ++++++++++++++++++++++++++++++++ */
        
            // Extract associated Legendre value
            Pp1 = Pmm[n];

            // Compute product terms, [1] eq. 3
            if (n <= *mmax) {
                id = (n+1)*n/2 + n;
                CPnm[n] = arn * Pp1 * *(ggm->C+id);
                SPnm[n] = arn * Pp1 * *(ggm->S+id);
            }

            /* Compute first non-sectorial terms (m=n-1) ++++++++++++++++++++ */

            // Compute associated Legendre value, [1] eq. 18 and 27
            gnm = sqrt( 2.0*n );
            P   = gnm * t * Pp1;
        
            // Add terms to summation, eq. 3 in [1]
            if (n-1 <= *mmax) {
                id = (n+1)*n/2 + n - 1;
                CPnm[n-1] = CPnm[n-1] + arn * P * *(ggm->C+id);
                SPnm[n-1] = SPnm[n-1] + arn * P * *(ggm->S+id);
            }

            /* Compute terms of order m = [n-2,1] +++++++++++++++++++++++++++ */
            for (m = n-2; m > 0; m--) {

                // Set previous values
                Pp2 = Pp1;
                Pp1 = P;
        
                // Compute associated Legendre value, [1] eq. 18, 19 and 27
                gnm = 2.0*(m+1.0) / sqrt( (n-m)*(n+m+1.0) );
                hnm = sqrt( (n+m+2.0)*(n-m-1.0)/(n-m)/(n+m+1.0) );
                P   = gnm * t * Pp1 - hnm * u2 * Pp2;

                // Add product terms to summation, eq. 3 in [1]
                if (m <= *mmax) {
                    id = (n+1)*n/2 + m;
                    CPnm[m] = CPnm[m] + arn * P * *(ggm->C+id);
                    SPnm[m] = SPnm[m] + arn * P * *(ggm->S+id);
                }

            }

            /* Compute zonal terms (m=0) ++++++++++++++++++++++++++++++++++++ */

            // Compute associated Legendre value, [1] eq. 18, 19 and 27
            gnm = 2.0 / sqrt( n*(n+1.0) );
            hnm = sqrt( (n+2.0)*(n-1.0)/n/(n+1.0) );
            P   = ( gnm * t * P - hnm * u2 * Pp1 ) / sqrt2;

            // Add terms to summation (S term is zero), [1] eq. 3
            id = (n+1)*n/2;
            CPnm[0] = CPnm[0] + arn * P * *(ggm->C+id);

        } /* ---------------------------------------------------------------- */

        /* Loop over longitude ============================================== */
        for (j = 0; j < inp->nlon; j++) {
        
            /* -----------------------------------------------------------------
             * All associated Legendre polynomials (latitude dependent) are now
             * computed and multiplied by the corresponding spherical harmonic
             * coefficient. These products are scaled by u^m, meaning that
             * Horner's scheme is used in the following summation.
             * -------------------------------------------------------------- */
      
            // Initialise order-dependent sum
            msum = 0.0;

            // Derive longitude id
            id = j + i * *lgrid;
            
            // Loop over order (m > 0)
            for (m = *mmax; m > 0; m--) {

                // Add to order-dependent sum using Horner's scheme, [1] eq. 2, 3 and 31
                msum = ( msum + cos( m * *(inp->lon+id) ) * CPnm[m] 
                    + sin( m * *(inp->lon+id) ) * SPnm[m] ) * u;
      
            }

            // Add zero order term to sum
            msum = msum + CPnm[0];

            // Rescale value into gravitational potential, [1] eq. 1
            *(out->rval+i+j*inp->nlat) = GMr * msum / sfac;
    
        } /* ================================================================ */

    } /* ==================================================================== */

    // Return from function
    return 0;

}

同样,非常感谢任何帮助,如果相关,可以提供其他信息,但这已经成为一个很长的帖子。我很难接受纯 c 代码的运行速度比 MATLAB C-MEX 代码慢。

标签: cperformancepointersstructure

解决方案


简而言之,是的,指针可以阻止某些编译器优化,从而导致潜在的减速。至少,这显然是 ICC 和 GCC 的情况。程序的性能受到指针别名和矢量化的强烈影响。

实际上,编译器不能轻易知道所提供的指针是相互别名还是与所提供数据结构的某些字段的地址别名。结果,编译器倾向于保守,并假设指向的值可以随时更改并经常重新加载它们。这可以防止优化,例如拆分某些循环gravpot(使用 GCC - 参见此修改代码的第 119 行)。此外,间接和别名往往会阻止热循环的矢量化(即使用目标处理器提供的 SIMD 指令)。矢量化可以强烈影响代码的性能。

举个例子,这里是初始代码,geo2sph这里稍微修改的实现。在第一种情况下,ICC 生成缓慢的标量实现,而在第二种情况下,ICC 生成明显更快的矢量化实现。两种实现之间的唯一区别是restrict关键字的使用。这个关键字告诉编译器,在指针的生命周期内,只有指针本身或直接派生自它的值(例如指针+1)将用于访问它指向的对象(有关更多信息,请参见此处) . 请注意,restrict关键字的使用是危险的,应该非常小心在使用它时,如果限制提示错误(很难调试),编译器可能会生成错误的代码。或者,您可以帮助编译器使用OpenMP SIMD 指令#pragma omp simd生成矢量化代码(结果请参见此处)。请注意,您应该确保目标代码可以安全地矢量化(例如,迭代必须是独立的)。


推荐阅读