c++ - 在 C 中实现了简单的线性回归,但使用 p_thread 慢得多
问题描述
我在 C 中实现了简单的线性回归,一个是单线程,另一个是多线程,运行速度要慢得多。
下面附上单线程版本。这很简单。首先,它从同一文件夹中的两个文件中读取 xs 和 ys。那么,直线的系数和截距的计算就是来自xs和ys的数据的加法和乘法。
// gcc simple_regression.cpp -lstdc++ -o simple_regression
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>
#include <stdlib.h>
#include <stdio.h>
#include <time.h>
const long long int N = 8000000*10;
typedef struct {
double coef;
double intercept;
} lr_res;
lr_res lr(double *xs, double *ys) {
double sumxy = 0.0, sumx = 0.0, sumy = 0.0, sumx2 = 0.0;
int i;
for(i=0; i<N; ++i) {
sumxy += xs[i]*ys[i];
sumx += xs[i];
sumy += ys[i];
sumx2 += xs[i]*xs[i];
//printf("after adding x[i]=%f, sumx=%f\n", xs[i], sumx);
}
//printf("avg(x): %f, avg(y): %f\n", sumx/N, sumy/N);
lr_res res;
res.coef = (N*sumxy - sumx*sumy)/(N*sumx2 - sumx*sumx);
res.intercept = (sumy - res.coef*sumx)/N;
return res;
}
int main(int argc, char**argv) {
FILE *x_file, *y_file;
char *num_cptr = NULL;
size_t len = 0;
ssize_t read;
char *ptr;
float data;
//double xs[N], ys[N];
double *xs = new double[N];
double *ys = new double[N];
int i;
//printf("hello!\n");
i = 0;
x_file = fopen("xs.data", "r");
while ((read = getline(&num_cptr, &len, x_file)) != -1) {
xs[i++] = strtod(num_cptr, &ptr);
//printf("%f\n", xs[i++]);
if (i == N) break;
}
i = 0;
y_file = fopen("ys.data", "r");
while ((read = getline(&num_cptr, &len, y_file)) != -1) {
ys[i++] = strtod(num_cptr, &ptr);
//printf("%f\n", ys[i++]);
if (i == N) break;
}
printf("start fitting...\n");
clock_t t;
t = clock();
lr_res res = lr(xs, ys);
t = clock() - t;
double time_taken = ((double)t)/CLOCKS_PER_SEC; // in seconds
printf("coef: %f, intercept: %f\n", res.coef, res.intercept);
printf("fitting took %f seconds\n", time_taken);
}
下面附上多线程版本。在计算系数和截距时,将求和工作分配到不同的线程中,希望与单线程版本相比,速度可以提高大约 8 倍。
// gcc simple_regression_mt.cpp -lstdc++ -lpthread -o simple_regression_mt
// https://www.statisticshowto.com/what-is-a-regression-equation/
#include <sys/types.h>
#include <sys/wait.h>
#include <sys/sysinfo.h>
#include <unistd.h>
#include <stdlib.h>
#include <stdio.h>
#include <sys/time.h>
#include <pthread.h>
#include<string.h>
const long long int N = 8000000*10;
//const long long int N = 800;
const int nprocs = 8;
int chunksize = N/nprocs;
//double xs[N], ys[N];
double *xs = new double[N];
double *ys = new double[N];
double sumxy[nprocs], sumx[nprocs], sumy[nprocs], sumx2[nprocs];
void *lr_calc(void *arg) {
char* coreid_str = (char *) arg;
int coreid = strtol(coreid_str, (char **)NULL, 10) - 1;
int offset = coreid*chunksize;
//printf("core id:%d starting up...\n", coreid);
for(int i=offset; i<offset+chunksize; ++i) {
sumxy[coreid] += xs[i]*ys[i];
sumx[coreid] += xs[i];
sumy[coreid] += ys[i];
sumx2[coreid] += xs[i]*xs[i];
//printf("after adding x[i]=%f, sumxy=%f\n", xs[i], sumxy[coreid]);
}
//printf("avg(x): %f, avg(y): %f\n", sumx[coreid]/chunksize, sumy[coreid]/chunksize);
//printf("core id:%d finishing...\n", coreid);
return NULL;
}
char* itoa(int val, int base){
static char buf[32] = {0};
int i = 30;
for(; val && i ; --i, val /= base)
buf[i] = "0123456789abcdef"[val % base];
return &buf[i+1];
}
int main(int argc, char**argv) {
printf("This system has %d processors configured and "
"%d processors available.\n",
get_nprocs_conf(), get_nprocs());
FILE *x_file, *y_file;
char *num_cptr = NULL;
size_t len = 0;
ssize_t read;
char *ptr;
float data;
int i;
// read x, y into dynamic arrays xs, ys
i = 0;
x_file = fopen("xs.data", "r");
while ((read = getline(&num_cptr, &len, x_file)) != -1) {
xs[i++] = strtod(num_cptr, &ptr);
//printf("%f\n", xs[i++]);
if (i == N) break;
}
i = 0;
y_file = fopen("ys.data", "r");
while ((read = getline(&num_cptr, &len, y_file)) != -1) {
ys[i++] = strtod(num_cptr, &ptr);
//printf("%f\n", ys[i++]);
if (i == N) break;
}
// multithreaded version of linear regression
printf("start fitting...\n");
struct timeval tp;
gettimeofday(&tp, NULL);
int ms = tp.tv_sec * 1000 + tp.tv_usec / 1000;
pthread_t tid[nprocs];
for (int i=0; i<nprocs; ++i) {
char* scoreid = itoa(i+1, 10);
char* scoreid_dup = strdup(scoreid);
pthread_create(&tid[i], NULL, lr_calc, (void*)(scoreid_dup));
}
void* v;
for (int i=0; i<nprocs; ++i) {
pthread_join(tid[i], &v);
}
double ssumxy = 0.0, ssumx = 0.0, ssumy = 0.0, ssumx2 = 0.0;
for (int i=0; i<nprocs; ++i) {
ssumxy += sumxy[i];
ssumx += sumx[i];
ssumy += sumy[i];
ssumx2 += sumx2[i];
//printf("sumxy[i]: %f\n", sumxy[i]);
//printf("ssumxy: %f\n", ssumxy);
}
//printf("avg(x): %f, avg(y): %f\n", ssumx/N, ssumy/N);
double coef = (N*ssumxy - ssumx*ssumy)/(N*ssumx2 - ssumx*ssumx);
//printf("N*ssumxy is: %f\n", ssumx);
double intercept = (ssumy - coef*ssumx)/N;
struct timeval tp1;
gettimeofday(&tp1, NULL);
int ms2 = tp1.tv_sec * 1000 + tp1.tv_usec / 1000;
int time_taken = (ms2 - ms); // in million seconds
printf("coef: %f, intercept: %f\n", coef, intercept);
printf("fitting took %d milli seconds\n", time_taken);
}
但是,结果表明,如下所示,多线程版本要慢得多。请注意,时间仅用于计算/拟合线性回归,不计入从文件中读取数据的部分。时差似乎也不是创建线程的开销。(创建线程不会花费 0.5 秒左右的时间。)
有人可以给我一些建议吗?太感谢了!
peterlqchen@system76-pc:~/Documents/python_scripts$ ./simple_regression
start fitting...
coef: 2.081660, intercept: 0.075544
fitting took 0.440704 seconds
peterlqchen@system76-pc:~/Documents/python_scripts$ ./simple_regression_mt
This system has 8 processors configured and 8 processors available.
start fitting...
coef: 2.081660, intercept: 0.075544
fitting took 2370 milli seconds
解决方案
你有一个虚假分享的问题。就在这儿:
for(int i=offset; i<offset+chunksize; ++i) {
sumxy[coreid] += xs[i]*ys[i];
sumx[coreid] += xs[i];
sumy[coreid] += ys[i];
sumx2[coreid] += xs[i]*xs[i];
//printf("after adding x[i]=%f, sumxy=%f\n", xs[i], sumxy[coreid]);
}
内存不会以位或字节的形式传输,而是以缓存线的形式传输——目前它们是 64 字节。如果两个内核修改同一高速缓存行上的数据,则会运行一个复杂的例程来解决该问题。即使他们修改了完全不同的缓存行。由于您的所有内核都在或多或少相同的缓存行上运行并修改它们 - 使用多个线程预计会大大降低速度。
要修复它,只需sum**_local
为线程计算创建局部变量,最后才将它们放入共享缓存中。
double sumxy_local = 0., sumx_local = 0., sumy_local = 0., sumx2_local = 0.;
for(int i=offset; i<offset+chunksize; ++i) {
sumxy_local += xs[i]*ys[i];
sumx_local += xs[i];
sumy_local += ys[i];
sumx2_local += xs[i]*xs[i];
//printf("after adding x[i]=%f, sumxy=%f\n", xs[i], sumxy[coreid]);
}
sumxy[coreid] = sumxy_local;
sumx[coreid] = sumx_local;
sumy[coreid] = sumy_local;
sumx2[coreid] = sumx2_local;
这应该可以解决问题。
免责声明:可能还有其他关于多线程的问题可能会导致速度变慢,但这似乎是最关键的。
推荐阅读
- python - Pandas 删除换行符并将文本文件转换为 CSV
- r - 增加ggplot2中图例键之间的垂直间距
- go-gorm - Golang - GORM 如何为现有表创建模型?
- audio - 正确的 WAVE_FORMAT_1S16 PCM 双通道格式?
- python - Django admin:根据另一个字段的值显示模型的字段
- python - 如何在 Kivy 中从 python 代码更改屏幕
- javascript - 意外的令牌“导出”
- functional-programming - 将包括列表在内的多个参数传递给函数
- python - 如何仅使用numpy python中的矩阵运算(无for循环)计算两个矩阵之间的欧几里得距离?
- swiftui - SwiftUI 请求将 App 更新到最新版本