首页 > 解决方案 > 在 C 中实现了简单的线性回归,但使用 p_thread 慢得多

问题描述

我在 C 中实现了简单的线性回归,一个是单线程,另一个是多线程,运行速度要慢得多。

下面附上单线程版本。这很简单。首先,它从同一文件夹中的两个文件中读取 xs 和 ys。那么,直线的系数和截距的计算就是来自xs和ys的数据的加法和乘法。

// gcc simple_regression.cpp -lstdc++ -o simple_regression

#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>
#include <stdlib.h>
#include <stdio.h>
#include <time.h> 

const long long int N = 8000000*10;

typedef struct {
    double coef;
    double intercept;
} lr_res;

lr_res lr(double *xs, double *ys) {
    double sumxy = 0.0, sumx = 0.0, sumy = 0.0, sumx2 = 0.0;
    int i;
    for(i=0; i<N; ++i) {
        sumxy += xs[i]*ys[i];
        sumx += xs[i];
        sumy += ys[i];
        sumx2 += xs[i]*xs[i];
        //printf("after adding x[i]=%f, sumx=%f\n", xs[i], sumx);
    }
    //printf("avg(x): %f, avg(y): %f\n", sumx/N, sumy/N);
    lr_res res;
    res.coef = (N*sumxy - sumx*sumy)/(N*sumx2 - sumx*sumx);
    res.intercept = (sumy - res.coef*sumx)/N;
    return res;
}

int main(int argc, char**argv) {

    FILE *x_file, *y_file;
    char *num_cptr = NULL;
    size_t len = 0;
    ssize_t read;

    char *ptr;
    float data;

    //double xs[N], ys[N]; 
    double *xs = new double[N];
    double *ys = new double[N];
    int i;
    //printf("hello!\n");
    i = 0;    
    x_file = fopen("xs.data", "r");    
    while ((read = getline(&num_cptr, &len, x_file)) != -1) {
        xs[i++] = strtod(num_cptr, &ptr);
        //printf("%f\n", xs[i++]);
        if (i == N) break;
    }

    i = 0;
    y_file = fopen("ys.data", "r");
    while ((read = getline(&num_cptr, &len, y_file)) != -1) {
        ys[i++] = strtod(num_cptr, &ptr);
        //printf("%f\n", ys[i++]);
        if (i == N) break;
    }

    printf("start fitting...\n");

    clock_t t; 
    t = clock(); 

    lr_res res = lr(xs, ys);

    t = clock() - t; 
    double time_taken = ((double)t)/CLOCKS_PER_SEC; // in seconds 


    printf("coef: %f, intercept: %f\n", res.coef, res.intercept);

    printf("fitting took %f seconds\n", time_taken); 

}

下面附上多线程版本。在计算系数和截距时,将求和工作分配到不同的线程中,希望与单线程版本相比,速度可以提高大约 8 倍。

// gcc simple_regression_mt.cpp -lstdc++ -lpthread  -o simple_regression_mt
// https://www.statisticshowto.com/what-is-a-regression-equation/

#include <sys/types.h>
#include <sys/wait.h>
#include <sys/sysinfo.h>
#include <unistd.h>
#include <stdlib.h>
#include <stdio.h>
#include <sys/time.h> 
#include <pthread.h>

#include<string.h> 

const long long int N = 8000000*10;
//const long long int N = 800;
const int nprocs = 8;
int chunksize = N/nprocs;

//double xs[N], ys[N]; 
double *xs = new double[N];
double *ys = new double[N];
double sumxy[nprocs], sumx[nprocs], sumy[nprocs], sumx2[nprocs];


void *lr_calc(void *arg) {
    char* coreid_str = (char *) arg;
    int coreid = strtol(coreid_str, (char **)NULL, 10) - 1;
    int offset = coreid*chunksize;
    //printf("core id:%d starting up...\n", coreid);

    for(int i=offset; i<offset+chunksize; ++i) {
        sumxy[coreid] += xs[i]*ys[i];
        sumx[coreid] += xs[i];
        sumy[coreid] += ys[i];
        sumx2[coreid] += xs[i]*xs[i];
        //printf("after adding x[i]=%f, sumxy=%f\n", xs[i], sumxy[coreid]);
    }
    //printf("avg(x): %f, avg(y): %f\n", sumx[coreid]/chunksize, sumy[coreid]/chunksize);

    //printf("core id:%d finishing...\n", coreid);

    return NULL;  
}

char* itoa(int val, int base){  
    static char buf[32] = {0};  
    int i = 30; 
    for(; val && i ; --i, val /= base)  
        buf[i] = "0123456789abcdef"[val % base];    
    return &buf[i+1];   
}


int main(int argc, char**argv) {

    printf("This system has %d processors configured and "
        "%d processors available.\n",
        get_nprocs_conf(), get_nprocs());



    FILE *x_file, *y_file;
    char *num_cptr = NULL;
    size_t len = 0;
    ssize_t read;

    char *ptr;
    float data;


    int i;

    // read x, y into dynamic arrays xs, ys
    i = 0;    
    x_file = fopen("xs.data", "r");    
    while ((read = getline(&num_cptr, &len, x_file)) != -1) {
        xs[i++] = strtod(num_cptr, &ptr);
        //printf("%f\n", xs[i++]);
        if (i == N) break;
    }

    i = 0;
    y_file = fopen("ys.data", "r");
    while ((read = getline(&num_cptr, &len, y_file)) != -1) {
        ys[i++] = strtod(num_cptr, &ptr);
        //printf("%f\n", ys[i++]);
        if (i == N) break;
    }

    // multithreaded version of linear regression
    printf("start fitting...\n");
    struct timeval tp;
    gettimeofday(&tp, NULL);
    int ms = tp.tv_sec * 1000 + tp.tv_usec / 1000;

    pthread_t tid[nprocs];
    for (int i=0; i<nprocs; ++i) {
        char* scoreid = itoa(i+1, 10);
        char* scoreid_dup = strdup(scoreid);  

        pthread_create(&tid[i], NULL, lr_calc, (void*)(scoreid_dup));
    }

    void* v;
    for (int i=0; i<nprocs; ++i) {
        pthread_join(tid[i], &v);
    }

    double ssumxy = 0.0, ssumx = 0.0, ssumy = 0.0, ssumx2 = 0.0;
    for (int i=0; i<nprocs; ++i) {
        ssumxy += sumxy[i];
        ssumx += sumx[i];
        ssumy += sumy[i];
        ssumx2 += sumx2[i];
        //printf("sumxy[i]: %f\n", sumxy[i]);
        //printf("ssumxy: %f\n", ssumxy);
    }
    //printf("avg(x): %f, avg(y): %f\n", ssumx/N, ssumy/N);

    double coef = (N*ssumxy - ssumx*ssumy)/(N*ssumx2 - ssumx*ssumx);
    //printf("N*ssumxy is: %f\n", ssumx);
    double intercept = (ssumy - coef*ssumx)/N;

    struct timeval tp1;
    gettimeofday(&tp1, NULL);
    int ms2 = tp1.tv_sec * 1000 + tp1.tv_usec / 1000;

    int time_taken = (ms2 - ms); // in million seconds 


    printf("coef: %f, intercept: %f\n", coef, intercept);    
    printf("fitting took %d milli seconds\n", time_taken); 

}

但是,结果表明,如下所示,多线程版本要慢得多。请注意,时间仅用于计算/拟合线性回归,不计入从文件中读取数据的部分。时差似乎也不是创建线程的开销。(创建线程不会花费 0.5 秒左右的时间。)

有人可以给我一些建议吗?太感谢了!

peterlqchen@system76-pc:~/Documents/python_scripts$ ./simple_regression
start fitting...
coef: 2.081660, intercept: 0.075544
fitting took 0.440704 seconds
peterlqchen@system76-pc:~/Documents/python_scripts$ ./simple_regression_mt 
This system has 8 processors configured and 8 processors available.
start fitting...
coef: 2.081660, intercept: 0.075544
fitting took 2370 milli seconds

标签: c++cparallel-processingpthreadslinear-regression

解决方案


你有一个虚假分享的问题。就在这儿:

for(int i=offset; i<offset+chunksize; ++i) {
    sumxy[coreid] += xs[i]*ys[i];
    sumx[coreid] += xs[i];
    sumy[coreid] += ys[i];
    sumx2[coreid] += xs[i]*xs[i];
    //printf("after adding x[i]=%f, sumxy=%f\n", xs[i], sumxy[coreid]);
}

内存不会以位或字节的形式传输,而是以缓存线的形式传输——目前它们是 64 字节。如果两个内核修改同一高速缓存行上的数据,则会运行一个复杂的例程来解决该问题。即使他们修改了完全不同的缓存行。由于您的所有内核都在或多或少相同的缓存行上运行并修改它们 - 使用多个线程预计会大大降低速度。

要修复它,只需sum**_local为线程计算创建局部变量,最后才将它们放入共享缓存中。

double sumxy_local = 0., sumx_local = 0., sumy_local = 0., sumx2_local = 0.;
for(int i=offset; i<offset+chunksize; ++i) {
    sumxy_local += xs[i]*ys[i];
    sumx_local  += xs[i];
    sumy_local  += ys[i];
    sumx2_local  += xs[i]*xs[i];
    //printf("after adding x[i]=%f, sumxy=%f\n", xs[i], sumxy[coreid]);
}
sumxy[coreid] = sumxy_local;
sumx[coreid] = sumx_local;
sumy[coreid] = sumy_local;
sumx2[coreid] = sumx2_local;

这应该可以解决问题。

免责声明:可能还有其他关于多线程的问题可能会导致速度变慢,但这似乎是最关键的。


推荐阅读