转载请注明出处:
http://www.cnblogs.com/darkknightzh/p/4988264.html
参考网址:
关于mt19937:http://www.cnblogs.com/egmkang/archive/2012/09/06/2673253.html
代码如下:
1 #include "stdafx.h" 2 #include <iostream> 3 #include <random> // mt19937的头文件 4 #include <ppl.h> // parfor的头文件 5 #include <windows.h> // QueryPerformanceFrequency等函数的头文件 6 7 using namespace concurrency; // parfor使用 8 using namespace std; 9 10 11 // 分配内存 12 void AllocMatrix(double** m, size_t n) 13 { 14 *m = new double[n*n]; 15 memset(*m, 0, sizeof(double)*n*n); 16 } 17 18 19 // 初始化矩阵内容 20 template <class Gen> 21 void IniMatrix(double* m, size_t n, Gen& gen) 22 { 23 for (size_t i = 0; i < n; ++i) 24 { 25 for (size_t j = 0; j < n; ++j) 26 { 27 m[i*n + j] = static_cast<double>(gen()); 28 } 29 } 30 } 31 32 33 // 释放内存 34 void FreeMatrix(double** m) 35 { 36 if (nullptr != *m) 37 { 38 delete[](*m); 39 (*m) = nullptr; 40 } 41 } 42 43 44 // 矩阵相乘,使用for 45 void matrixMultiplyFor(double* res, const double* m1, const double* m2, size_t n) 46 { 47 for (size_t i = 0; i < n; i++) 48 { 49 for (size_t j = i; j < n; j++) 50 { 51 double temp = 0; 52 for (size_t k = 0; k < n; k++) 53 { 54 temp += m1[i * n + k] * m2[k * n + j]; 55 } 56 res[i*n + j] = temp; 57 } 58 } 59 } 60 61 62 // 矩阵相乘,外层使用parfor 63 void matrixMultiplyParForOuter(double* res, const double* m1, const double* m2, size_t n) 64 { 65 parallel_for(size_t(0), n, [&](size_t i) 66 { 67 for (size_t j = i; j < n; j++) 68 { 69 double temp = 0; 70 for (size_t k = 0; k < n; k++) 71 { 72 temp += m1[i * n + k] * m2[k * n + j]; 73 } 74 res[i*n + j] = temp; 75 } 76 }); 77 } 78 79 80 // 矩阵相乘,内层使用parfor 81 void matrixMultiplyParForInner(double* res, const double* m1, const double* m2, size_t n) 82 { 83 for (size_t i = 0; i < n; i++) 84 { 85 parallel_for(size_t(i), n, [&](size_t j) 86 { 87 double temp = 0; 88 for (size_t k = 0; k < n; k++) 89 { 90 temp += m1[i * n + k] * m2[k * n + j]; 91 } 92 res[i*n + j] = temp; 93 }); 94 } 95 } 96 97 98 // 测试矩阵相乘,使用for的时间 99 double testmatrixMultiplyFor(double* res, const double* m1, const double* m2, size_t n) 100 { 101 LARGE_INTEGER nFreq, nBeginTime, nEndTime; 102 QueryPerformanceFrequency(&nFreq); 
103 QueryPerformanceCounter(&nBeginTime); 104 105 matrixMultiplyFor(res, m1, m2, n); 106 107 QueryPerformanceCounter(&nEndTime); 108 return (double)(nEndTime.QuadPart - nBeginTime.QuadPart) * 1000 / (double)nFreq.QuadPart; 109 } 110 111 112 // 测试矩阵相乘,外层使用parfor的时间 113 double testmatrixMultiplyParForOuter(double* res, const double* m1, const double* m2, size_t n) 114 { 115 LARGE_INTEGER nFreq, nBeginTime, nEndTime; 116 QueryPerformanceFrequency(&nFreq); 117 QueryPerformanceCounter(&nBeginTime); 118 119 matrixMultiplyParForOuter(res, m1, m2, n); 120 121 QueryPerformanceCounter(&nEndTime); 122 return (double)(nEndTime.QuadPart - nBeginTime.QuadPart) * 1000 / (double)nFreq.QuadPart; 123 } 124 125 126 // 测试矩阵相乘,内层使用parfor的时间 127 double testmatrixMultiplyParForInner(double* res, const double* m1, const double* m2, size_t n) 128 { 129 LARGE_INTEGER nFreq, nBeginTime, nEndTime; 130 QueryPerformanceFrequency(&nFreq); 131 QueryPerformanceCounter(&nBeginTime); 132 133 matrixMultiplyParForInner(res, m1, m2, n); 134 135 QueryPerformanceCounter(&nEndTime); 136 return (double)(nEndTime.QuadPart - nBeginTime.QuadPart) * 1000 / (double)nFreq.QuadPart; 137 } 138 139 140 // 主函数 141 int _tmain(int argc, _TCHAR* argv[]) 142 { 143 const size_t n = 1024; 144 double* dM1 = NULL; 145 double* dM2 = NULL; 146 double* dRes1 = NULL; 147 double* dRes2 = NULL; 148 double* dRes3 = NULL; 149 150 random_device rd; 151 mt19937 gen(rd()); 152 153 AllocMatrix(&dM1, n); 154 AllocMatrix(&dM2, n); 155 IniMatrix(dM1, n, gen); 156 IniMatrix(dM2, n, gen); 157 158 AllocMatrix(&dRes1, n); 159 AllocMatrix(&dRes2, n); 160 AllocMatrix(&dRes3, n); 161 162 double dTimeFor = testmatrixMultiplyFor(dRes1, dM1, dM2, n); 163 double dTimeParForOuter = testmatrixMultiplyParForOuter(dRes2, dM1, dM2, n); 164 double dTimeParForInner = testmatrixMultiplyParForInner(dRes3, dM1, dM2, n); 165 166 printf("time(ms)\nfor: %f \nparforOunter: %f \nparforInner: %f\n", dTimeFor, dTimeParForOuter, dTimeParForInner); 167 168 
FreeMatrix(&dM1); 169 FreeMatrix(&dM2); 170 FreeMatrix(&dRes1); 171 FreeMatrix(&dRes2); 172 FreeMatrix(&dRes3); 173 174 return 0; 175 }
debug:
time(ms)
for: 7761.769099
parforOunter: 3416.670736
parforInner: 3423.701265
release:
time(ms)
for: 3884.167485
parforOunter: 1062.581817
parforInner: 1083.642302
说明:此处测试outer和inner是因为,在matlab里面对外层(outer)循环使用parfor做并行计算时,如果循环比对类似这种三角形式(内层循环从j=i开始,各次外层迭代的计算量不相等),最终有些核先跑完结果,有些核后跑完结果,导致出现一个核累死累活地跑程序、另外N-1个核围观的状态,使最终的计算时间变长(不过在matlab中未测试outer和inner使用parfor的时间对比)。
但是,在C++里面,不知道是否优化的原因,outer使用parfor比inner使用parfor要快。此处测试了n=2048,结果也是outer比inner的形式要快。
来源:https://www.cnblogs.com/darkknightzh/p/4988264.html