[英]pthread and multicore on windows
我的問題與 pthread 庫以及如何利用多核系統有關。在適當的參數和較小的數據量下,系統似乎有所改進,最大的改進出現在數據量約為 65000 時。數據表明,當增加線程數時,所需時間起初會減少,但此後不久又會增加。當線程數為 1、2、4(有時還包括 8)時,時間可能會緩慢增加,但到 16 時,時間又開始減少。在大數據量下則沒有任何改善,所有耗時都相當接近。如果有人能告訴我,是否有什麼原因迫使我的線程按順序執行,或者是否存在其他問題,我將非常感激。
以下是數據:
1395525080 0 num thread: 1 data size: 1024 0 1395525080
1395525080 0 num thread: 2 data size: 1024 0 1395525080
1395525080 0 num thread: 4 data size: 1024 0 1395525080
1395525080 15 num thread: 8 data size: 1024 0 1395525080
1395525080 47 num thread: 16 data size: 1024 0 1395525080
1395525080 31 num thread: 32 data size: 1024 0 1395525080
1395525080 16 num thread: 1 data size: 4096 0 1395525080
1395525080 0 num thread: 2 data size: 4096 0 1395525080
1395525080 0 num thread: 4 data size: 4096 0 1395525080
1395525080 15 num thread: 8 data size: 4096 0 1395525080
1395525080 78 num thread: 16 data size: 4096 0 1395525080
1395525080 31 num thread: 32 data size: 4096 0 1395525080
1395525080 140 num thread: 1 data size: 65536 0 1395525080
1395525081 156 num thread: 2 data size: 65536 0 1395525081
1395525081 109 num thread: 4 data size: 65536 0 1395525081
1395525081 94 num thread: 8 data size: 65536 0 1395525081
1395525081 93 num thread: 16 data size: 65536 0 1395525081
1395525081 187 num thread: 32 data size: 65536 0 1395525082
1395525082 171 num thread: 1 data size: 75536 0 1395525082
1395525082 172 num thread: 2 data size: 75536 0 1395525082
1395525082 141 num thread: 4 data size: 75536 0 1395525083
1395525083 109 num thread: 8 data size: 75536 0 1395525083
1395525083 140 num thread: 16 data size: 75536 0 1395525083
1395525083 234 num thread: 32 data size: 75536 0 1395525084
1395525084 203 num thread: 1 data size: 85536 0 1395525084
1395525084 203 num thread: 2 data size: 85536 0 1395525084
1395525084 172 num thread: 4 data size: 85536 0 1395525085
1395525085 202 num thread: 8 data size: 85536 0 1395525085
1395525085 125 num thread: 16 data size: 85536 0 1395525085
1395525085 187 num thread: 32 data size: 85536 0 1395525086
1395525086 125 num thread: 1 data size: 55536 0 1395525086
1395525086 109 num thread: 2 data size: 55536 0 1395525086
1395525086 141 num thread: 4 data size: 55536 0 1395525086
1395525086 78 num thread: 8 data size: 55536 0 1395525086
1395525087 140 num thread: 16 data size: 55536 0 1395525087
1395525087 156 num thread: 32 data size: 55536 0 1395525087
1395525120 153271 num thread: 1 data size: 70000000 153 1395525274
1395525398 152630 num thread: 2 data size: 70000000 152 1395525551
1395525675 154846 num thread: 4 data size: 70000000 154 1395525830
1395525956 153988 num thread: 8 data size: 70000000 153 1395526110
1395526236 153956 num thread: 16 data size: 70000000 153 1395526390
1395526515 157935 num thread: 32 data size: 70000000 157 1395526673
這是代碼,它執行傳統的桶排序。我還有另外兩個類似的程序,使用相似的數據,同樣執行桶排序;順序版本的代碼得到的數值也幾乎完全相同。
// One bucket of the bucket sort. It only wraps the vector of values that
// were assigned to it; a pointer to a bucket is what gets handed to a
// sorting thread.
struct bucket {
    std::vector<int> data;  // values that fall into this bucket's range
};
// Thread start routine (pthread signature): receives a bucket through the
// void* argument and sorts its data. Declared here so that _tmain can pass
// it to pthread_create before the definition appears.
void *sort_bucket(void *unsorted_bucket);
int _tmain(int argc, _TCHAR* argv[])
{
int array_N[] = {1024, 4096, 65536,75536,85536,55536, 70000000, 16777216};
int array_number_of_threads[] = {1, 2, 4, 8, 16, 32};
std::vector<int> N;
std::vector<int> number_of_threads;
number_of_threads.assign(array_number_of_threads, array_number_of_threads+6);
N.assign(array_N, array_N+7);
for(int size_index = 0; size_index < N.size(); size_index++)
{
for(int thread_index = 0; thread_index < number_of_threads.size(); thread_index++)
{
std::vector<int> unsorted_data;
std::vector<int> sorted_data;
std::vector<std::thread> thread_array;
std::vector<bucket> buckets;
std::vector<pthread_t> thread;
while(buckets.size() < number_of_threads[thread_index]){ // checks against the number of threads and creates the number of buckets
bucket new_bucket;
pthread_t new_thread;
buckets.push_back(new_bucket);
thread.push_back(new_thread);
}
for(int index = 0; index < N[size_index]; index++) // gathers the data
{
unsorted_data.push_back(rand() % N[size_index]);
}
clock_t t = 0;
t = clock();
time_t start = 0;
time_t end = 0;
time(&start);
std::cout << start << " ";
int difference = N[size_index]/number_of_threads[thread_index];
int placeholder = 0;
for(int index = 0; index < N[size_index]; index++) {//calculates which bucket the data belong in and places the data in that bucket
//std::cout << unsorted_data[index] << " " << difference << " ";
placeholder = unsorted_data[index]/difference;
//std::cout << placeholder << std::endl;
buckets[placeholder].data.push_back(unsorted_data[index]);
}
for(int index = 0; index < number_of_threads[thread_index]; index++){ // sends the data to the threads
//thread_array.push_back(std::thread(sort_bucket ,buckets[index]));
pthread_create(&thread[index],
NULL,
sort_bucket ,
(void*) &buckets[index].data);
}
// bring the data back to the root process
for(int index = 0; index < number_of_threads[thread_index]; index++) {
void *data;
struct bucket *ret_bucket;
pthread_join(thread[index],(void**) &data);
ret_bucket = (struct bucket *) data;
sorted_data.insert(sorted_data.end(), ret_bucket->data.begin(), ret_bucket->data.end());
//sorted_data.assign(ret_bucket->data.begin(), ret_bucket->data.end());
}
/*
for(int index = 0; index < sorted_data.size(); index++)
{
std::cout << sorted_data[index] << " ";
}
*/
t = clock() - t;
std::cout << t << " ";
t = t/CLOCKS_PER_SEC;
std::cout << "num thread: " << number_of_threads[thread_index] << " ";
std::cout << "data size: " << N[size_index] << " ";
std::cout << t << " ";
time(&end);
std::cout << end << std::endl;
sort(unsorted_data.begin(), unsorted_data.end());
for(int index = 0; index < unsorted_data.size(); index++)
{
if(unsorted_data[index] != sorted_data[index])
{
std::cout << "data sorting failed" << std::endl;
}
}
}
}
int placeholder;
std::cin >> placeholder;
return 0;
}
void *sort_bucket(void *unsorted_bucket)
{
bucket *temp_sorted_bucket = (struct bucket *) unsorted_bucket;
std::sort(temp_sorted_bucket->data.begin(), temp_sorted_bucket->data.end());
/*for(int index = 0; index < temp_sorted_bucket->data.size(); index++)
{
std::cout << temp_sorted_bucket->data.at(index) << " ";
}*/
pthread_exit(temp_sorted_bucket);
return 0;
}
請記住,您的線程受 CPU 物理內核數量的限制。達到該上限後,系統必須耗費資源在同一內核上的多個線程之間切換,而這需要時間。例如,一個 i3 處理器有 2 個支持超線程的物理內核,超線程使 CPU 呈現出 4 個虛擬內核,因此使用超過 4 個線程通常不會帶來任何好處。
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.