首页 > 解决方案 > 使用内在函数时出现错误双重释放或损坏

问题描述

下面的代码用于检查add_prod. 我在 add_prod 中编写了代码。它使计算正确,但显示错误double free or corruption (!prev). Aborted (core dumped)。使用valgrind,它打印

==2462== Invalid write of size 8
==2462==    at 0x1097A5: _mm_storeu_si128 (emmintrin.h:727)

谁能帮助我为什么会出现这个错误?

static void add_prod(const short* src, short* dst, short x, int n) {
    __m128i _src, _dst,
            _scalar = _mm_set_epi16(x,x,x,x,x,x,x,x);
    
    for(int i = 0; i < n; i += 8) {
        _src = _mm_loadu_si128((const __m128i*) (src+i));
        _dst = _mm_loadu_si128((const __m128i*) (dst+i));
        
        _src = _mm_mullo_epi16(_src, _scalar);
        _dst = _mm_add_epi16(_src, _dst);
        
        _mm_storeu_si128((__m128i*) (dst+i), _dst);
    }
        
}
#define N1 1001
#define M1 1
#define N2 16
#define M2 100000

void matmul(const short** a, const short** b, short** c, int n) {
    int i, j, k;
    for (i=0; i<n; ++i)
        for (j=0; j<n; ++j) c[i][j] = 0;
    for (i=0; i<n; ++i)
        for (k=0; k<n; ++k)
            add_prod(b[k], c[i], a[i][k], n);
}

static long mat_sum(const short** m, int n) {
    int i, j;
    long sum = 0;
    for (i=0; i<n; ++i)
        for (j=0; j<n; ++j) sum += m[i][j];        
    return sum;
}

static short** alloc_mat(int n) {
    int i;
    short** m = malloc(n*sizeof(short*));
    assert(m != NULL);
    for (i=0; i<n; ++i) {
        m[i] = malloc(n*sizeof(short));
        assert(m[i] != NULL);
    }
    return m;
}

static void init_mat(short** m, int n, int max) {
    int i, j;
    for (i=0; i<n; ++i) 
        for (j=0; j<n; ++j) m[i][j] = 1 + (i+j) % max; 
}

static void free_mat(short** m, int n) {
    int i;
    for (i=0; i<n; ++i) free(m[i]);
    free(m);
}

static int do_test(const short** a, const short** b, short** c, 
                    int n, int m, int test_no) {

    double start, tseq, tsse;
    int i;
    long rseq, rsse;

    printf("\nTest #%d\n", test_no);

    // sequential
    start = get_real_time();
    for (i=0; i<m; ++i) matmul_seq(a, b, c, n);
    tseq  = get_real_time()-start;
    rseq = mat_sum((const short**)c, n);

    // SSE
    start = get_real_time();
    for (i=0; i<m; ++i) matmul(a, b, c, n);
    tsse  = get_real_time()-start;
    rsse = mat_sum((const short**)c, n);

    printf("- result: %ld [expected: %ld]\n", rsse, rseq);
    printf("- sequential version: %.2f msec\n", tseq*1000);
    printf("- SSE version: %.2f msec\n", tsse*1000);
    printf("- speedup: %.2fx\n", tseq/(tsse==0.0 ? 1E-9 : tsse));

    return rsse == rseq;
}

int main() {

    int points = 0;

    short** a1 = alloc_mat(N1);
    short** b1 = alloc_mat(N1);
    short** c1 = alloc_mat(N1);
    short** a2 = alloc_mat(N2);
    short** b2 = alloc_mat(N2);
    short** c2 = alloc_mat(N2);

    init_mat(a1, N1, 5);
    init_mat(b1, N1, 3);
    init_mat(a2, N2, 7);
    init_mat(b2, N2, 5);

    points += do_test((const short**)a1, (const short**)b1, c1, N1, M1, 1);
    points += do_test((const short**)a2, (const short**)b2, c2, N2, M2, 2);

    free_mat(a1, N1);
    free_mat(b1, N1);
    free_mat(c1, N1);
    free_mat(a2, N2);
    free_mat(b2, N2);
    free_mat(c2, N2);

    printf("\nPoints: %d out of 2\n", points);

    return 0;
}

标签: cvectorizationdynamic-memory-allocationintrinsics

解决方案


您不会进入未映射的页面,这将直接导致段错误,而不是损坏 malloc 簿记信息。但正如评论中所指出的,您的循环绑定允许循环开始一次迭代,该迭代触及 16 个字节 if i < n,但这仅足以证明一个short是安全的,而不是 8 个。这就是为什么 valgrind 正确报告您在对象外部访问的原因。

使用i-7 < n,或倒数剩余的元素数量,如n >= 8/ n -= 8

然后进行标量清理,或在数组末尾结束的最终向量,如果长度不是向量宽度的倍数,则部分重叠。(后一种策略仅在您从 早期加载时才有效dst,否则您将重做一些元素。)


推荐阅读