我写了这个小程序,用概率和比率来计算 pi。为了让它运行得更快,我决定尝试用 pthreads 实现多线程。不幸的是,即使经过大量搜索,我仍然无法解决下面这个问题:当我只用一个线程运行 threadFunc 函数时(无论是通过 pthread 创建线程,还是直接从 calculate_pi_mt 函数中正常调用),性能都比在双核机器上用两个线程运行要好得多(至少快两倍,甚至三倍)。我试过禁用优化,但无济于事。据我所知,线程运行时使用的都是局部变量,只有在最后我才使用互斥锁来累加命中总数……
首先是否有任何创建代码的技巧可以在这里运行得更好?(即风格),因为我只是通过尝试这些东西来学习.
其次,这些明显的性能问题有什么原因吗?当线程数设置为 1 运行时,我的一个 CPU 占用率达到 100%。设置为 2 时,第二个 CPU 的占用率上升到大约 80%-90%,但它所做的这些额外工作显然毫无用处!会不会是因为使用了 rand() 函数?
/*
 * Work description shared by every worker thread: all threads receive a
 * pointer to the same instance.
 */
struct arguments {
    int n_threads;          /* count of threads the launcher has tried to start */
    int rays;               /* number of random points each worker samples */
    int hits_in;            /* running total of points with radius < 1.0 */
    pthread_mutex_t *mutex; /* guards hits_in while a worker adds its tally */
};

/*
 * Worker entry point.
 *
 * arg points to the shared struct arguments. The worker draws args->rays
 * points with rand(), counts those whose distance from the origin is below
 * 1.0 in a local counter, and adds that counter to args->hits_in while
 * holding args->mutex. Always returns NULL.
 */
void *threadFunc(void *arg)
{
    struct arguments *args = arg;
    int local_hits_in = 0;

    for (int cast = 0; cast < args->rays; cast++) {
        const double x = (double)rand() / (double)RAND_MAX;
        const double y = (double)rand() / (double)RAND_MAX;
        const double r = sqrt(pow(x, 2) + pow(y, 2));

        if (r < 1.0) {
            local_hits_in += 1;
        }
    }

    /* Publish the local tally; the shared total is only touched under the lock. */
    pthread_mutex_lock(args->mutex);
    args->hits_in += local_hits_in;
    pthread_mutex_unlock(args->mutex);

    return NULL;
}

/*
 * Estimate pi by sampling `rays` random points spread over `threads` workers.
 *
 * Seeds rand() from the current time, starts `threads` threads that all share
 * one struct arguments, waits for each of them, and returns
 * 4 * hits / rays. Progress and error messages are written with printf().
 */
double calculate_pi_mt(int rays, int threads)
{
    srand((unsigned int)time(NULL));

    /* Complain (but carry on) when the rays cannot be split evenly. */
    if ((float)(rays / threads) != ((float)rays) / ((float)threads)) {
        printf("Error: number of rays is not evenly divisible by threads\n");
    }

    /* argument initialization */
    struct arguments *args = malloc(sizeof(struct arguments));
    args->hits_in = 0;
    args->rays = rays / threads;
    args->n_threads = 0;
    args->mutex = malloc(sizeof(pthread_mutex_t));
    if (pthread_mutex_init(args->mutex, NULL) != 0) {
        printf("Error creating mutex!\n");
    }

    pthread_t thread_ary[MAXTHREADS];

    /* Launch phase: every worker is handed the same args pointer. */
    for (int idx = 0; idx < threads; idx++) {
        args->n_threads += 1;
        if (pthread_create(&thread_ary[idx], NULL, threadFunc, args) != 0) {
            printf("Error when creating thread\n");
        }
        printf("Created Thread: %d\n", args->n_threads);
    }

    /* Join phase: threads are reported with 1-based numbers. */
    for (int idx = 0; idx < threads; idx++) {
        const int label = idx + 1;

        printf("main waiting for thread %d to terminate...\n", label);
        if (pthread_join(thread_ary[idx], NULL) != 0) {
            printf("Error while waiting for thread to join\n");
        }
        printf("Destroyed Thread: %d\n", label);
    }

    printf("Hits in %d\n", args->hits_in);
    printf("Rays: %d\n", rays);

    const double answer = 4.0 * (double)(args->hits_in) / (double)(rays);

    /* Release the mutex and the shared argument block. */
    pthread_mutex_destroy(args->mutex);
    free(args->mutex);
    free(args);

    return answer;
}
caf.. 11
我可以看到几个问题:
rand() 不是线程安全的。请改用 drand48_r()(它直接生成 [0.0, 1.0) 范围内的 double,这正是你想要的)。
您只创建了一个 struct arguments 结构,却试图把它用于多个线程。您需要为每个线程单独创建一个这样的结构(使用数组即可)。
这是我如何清理你的方法.注意我们不需要使用任何互斥锁 - 每个线程只是在一个单独的位置存储它自己的返回值,并且主线程在其他线程完成后添加它们:
/* glibc only declares drand48_r(), srand48_r() and struct drand48_data when
 * this feature-test macro is defined before the first system header. */
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>

/* Per-thread work order and result slot; each worker only writes its own. */
struct thread_info {
    int thread_n;        /* index of the worker, also used to vary its seed */
    pthread_t thread_id; /* meaningful only when started is non-zero */
    int rays;            /* number of random points this worker samples */
    int hits_in;         /* points that landed inside the unit circle */
    int started;         /* non-zero once pthread_create() has succeeded */
};

/*
 * Seed a private drand48 state from the wall clock and the thread index so
 * that workers started close together still get different sequences.
 */
void seed_rand(int thread_n, struct drand48_data *buffer)
{
    struct timeval tv;

    gettimeofday(&tv, NULL);

    /* Mix in unsigned arithmetic: tv_sec * thread_n can exceed a 32-bit
     * signed long, and signed overflow is undefined behaviour. */
    unsigned long seed = (unsigned long)tv.tv_sec * (unsigned long)thread_n
                         + (unsigned long)tv.tv_usec;
    srand48_r((long)seed, buffer);
}

/*
 * Worker entry point. arg points to this worker's struct thread_info.
 * Samples info->rays points in [0,1) x [0,1) with a thread-private generator
 * and stores the number of points with x*x + y*y < 1.0 in info->hits_in.
 * Always returns NULL.
 */
void *threadFunc(void *arg)
{
    struct thread_info *info = arg;
    struct drand48_data drand_buffer;
    const int rays = info->rays;
    int hits_in = 0;

    seed_rand(info->thread_n, &drand_buffer);

    for (int n = 0; n < rays; n++) {
        double x;
        double y;

        drand48_r(&drand_buffer, &x);
        drand48_r(&drand_buffer, &y);

        /* Comparing the squared radius avoids a sqrt() per sample. */
        if (x * x + y * y < 1.0) {
            hits_in++;
        }
    }

    /* No lock needed: this slot belongs to this worker alone and the
     * launcher reads it only after pthread_join(). */
    info->hits_in = hits_in;
    return NULL;
}

/*
 * Estimate pi by Monte Carlo sampling spread over `threads` worker threads.
 *
 * rays    - total number of points to sample; rounded down to a multiple of
 *           `threads` (with a message) when it does not divide evenly.
 * threads - number of worker threads to start; must be >= 1.
 *
 * Returns 4 * hits / samples, where samples counts only the rays of workers
 * that were started and joined successfully. Returns 0.0 when the arguments
 * are unusable (threads <= 0 or fewer rays than threads), when the thread
 * table cannot be allocated, or when no worker completed.
 */
double calculate_pi_mt(int rays, int threads)
{
    if (threads <= 0 || rays < threads) {
        printf("Error: need at least one thread and one ray per thread\n");
        return 0.0;
    }

    if (rays % threads) {
        printf("Error: number of rays is not evenly divisible by threads\n");
        rays = (rays / threads) * threads;
    }

    /* argument initialization: calloc leaves every `started` flag at zero */
    struct thread_info *thr = calloc((size_t)threads, sizeof thr[0]);
    if (thr == NULL) {
        printf("Error allocating thread table\n");
        return 0.0;
    }

    const int rays_per_thread = rays / threads;

    for (int c = 0; c < threads; c++) {
        thr[c].thread_n = c;
        thr[c].rays = rays_per_thread;
        thr[c].hits_in = 0;

        if (pthread_create(&thr[c].thread_id, NULL, threadFunc, &thr[c])) {
            /* thread_id is indeterminate here, so it must not be joined. */
            printf("Error when creating thread\n");
            continue;
        }
        thr[c].started = 1;
        printf("Created Thread: %d\n", thr[c].thread_n);
    }

    int hits_in = 0;
    int rays_cast = 0;

    for (int c = 0; c < threads; c++) {
        if (!thr[c].started) {
            continue;
        }

        printf("main waiting for thread %d to terminate...\n", c);
        if (pthread_join(thr[c].thread_id, NULL)) {
            printf("Error while waiting for thread to join\n");
            continue; /* result slot cannot be trusted */
        }

        hits_in += thr[c].hits_in;
        rays_cast += thr[c].rays;
        printf("Destroyed Thread: %d\n", c);
    }

    printf("Hits in %d\n", hits_in);
    printf("Rays: %d\n", rays_cast);

    free(thr);

    if (rays_cast == 0) {
        return 0.0;
    }
    return (4.0 * hits_in) / rays_cast;
}
Puppy.. 8
您正在使用太多的同步原语.您应该在主线程的末尾对local_hits求和,而不是使用互斥锁以异步方式更新它.或者,至少,您可以使用原子操作(它只是一个int)来执行它而不是锁定整个互斥锁来更新一个int.
我可以看到几个问题:
rand() 不是线程安全的。请改用 drand48_r()(它直接生成 [0.0, 1.0) 范围内的 double,这正是你想要的)。
您只创建了一个 struct arguments 结构,却试图把它用于多个线程。您需要为每个线程单独创建一个这样的结构(使用数组即可)。
这是我如何清理你的方法.注意我们不需要使用任何互斥锁 - 每个线程只是在一个单独的位置存储它自己的返回值,并且主线程在其他线程完成后添加它们:
/* glibc only declares drand48_r(), srand48_r() and struct drand48_data when
 * this feature-test macro is defined before the first system header. */
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>

/* Per-thread work order and result slot; each worker only writes its own. */
struct thread_info {
    int thread_n;        /* index of the worker, also used to vary its seed */
    pthread_t thread_id; /* meaningful only when started is non-zero */
    int rays;            /* number of random points this worker samples */
    int hits_in;         /* points that landed inside the unit circle */
    int started;         /* non-zero once pthread_create() has succeeded */
};

/*
 * Seed a private drand48 state from the wall clock and the thread index so
 * that workers started close together still get different sequences.
 */
void seed_rand(int thread_n, struct drand48_data *buffer)
{
    struct timeval tv;

    gettimeofday(&tv, NULL);

    /* Mix in unsigned arithmetic: tv_sec * thread_n can exceed a 32-bit
     * signed long, and signed overflow is undefined behaviour. */
    unsigned long seed = (unsigned long)tv.tv_sec * (unsigned long)thread_n
                         + (unsigned long)tv.tv_usec;
    srand48_r((long)seed, buffer);
}

/*
 * Worker entry point. arg points to this worker's struct thread_info.
 * Samples info->rays points in [0,1) x [0,1) with a thread-private generator
 * and stores the number of points with x*x + y*y < 1.0 in info->hits_in.
 * Always returns NULL.
 */
void *threadFunc(void *arg)
{
    struct thread_info *info = arg;
    struct drand48_data drand_buffer;
    const int rays = info->rays;
    int hits_in = 0;

    seed_rand(info->thread_n, &drand_buffer);

    for (int n = 0; n < rays; n++) {
        double x;
        double y;

        drand48_r(&drand_buffer, &x);
        drand48_r(&drand_buffer, &y);

        /* Comparing the squared radius avoids a sqrt() per sample. */
        if (x * x + y * y < 1.0) {
            hits_in++;
        }
    }

    /* No lock needed: this slot belongs to this worker alone and the
     * launcher reads it only after pthread_join(). */
    info->hits_in = hits_in;
    return NULL;
}

/*
 * Estimate pi by Monte Carlo sampling spread over `threads` worker threads.
 *
 * rays    - total number of points to sample; rounded down to a multiple of
 *           `threads` (with a message) when it does not divide evenly.
 * threads - number of worker threads to start; must be >= 1.
 *
 * Returns 4 * hits / samples, where samples counts only the rays of workers
 * that were started and joined successfully. Returns 0.0 when the arguments
 * are unusable (threads <= 0 or fewer rays than threads), when the thread
 * table cannot be allocated, or when no worker completed.
 */
double calculate_pi_mt(int rays, int threads)
{
    if (threads <= 0 || rays < threads) {
        printf("Error: need at least one thread and one ray per thread\n");
        return 0.0;
    }

    if (rays % threads) {
        printf("Error: number of rays is not evenly divisible by threads\n");
        rays = (rays / threads) * threads;
    }

    /* argument initialization: calloc leaves every `started` flag at zero */
    struct thread_info *thr = calloc((size_t)threads, sizeof thr[0]);
    if (thr == NULL) {
        printf("Error allocating thread table\n");
        return 0.0;
    }

    const int rays_per_thread = rays / threads;

    for (int c = 0; c < threads; c++) {
        thr[c].thread_n = c;
        thr[c].rays = rays_per_thread;
        thr[c].hits_in = 0;

        if (pthread_create(&thr[c].thread_id, NULL, threadFunc, &thr[c])) {
            /* thread_id is indeterminate here, so it must not be joined. */
            printf("Error when creating thread\n");
            continue;
        }
        thr[c].started = 1;
        printf("Created Thread: %d\n", thr[c].thread_n);
    }

    int hits_in = 0;
    int rays_cast = 0;

    for (int c = 0; c < threads; c++) {
        if (!thr[c].started) {
            continue;
        }

        printf("main waiting for thread %d to terminate...\n", c);
        if (pthread_join(thr[c].thread_id, NULL)) {
            printf("Error while waiting for thread to join\n");
            continue; /* result slot cannot be trusted */
        }

        hits_in += thr[c].hits_in;
        rays_cast += thr[c].rays;
        printf("Destroyed Thread: %d\n", c);
    }

    printf("Hits in %d\n", hits_in);
    printf("Rays: %d\n", rays_cast);

    free(thr);

    if (rays_cast == 0) {
        return 0.0;
    }
    return (4.0 * hits_in) / rays_cast;
}
您正在使用太多的同步原语.您应该在主线程的末尾对local_hits求和,而不是使用互斥锁以异步方式更新它.或者,至少,您可以使用原子操作(它只是一个int)来执行它而不是锁定整个互斥锁来更新一个int.