问题
I got a task to write a Strassen Vinograd algorithm in C++. I have written it twice, but first version of my code don't even works. The result is correct in the lower left corner of result matrix. And my second version is running slower than naive algorithm, even with N = 64+. So i need help, what am i doing wrong? Important note: i'm not allowed to use dinamic matrix in recursion and structures. In addition it is better to do the multiplication without copying, using the coordinates of the corner element of the submatrices. Incorrect:
#include "pch.h"
#include<iostream>
#include<cstdio>
#include<conio.h>
#include<cstdlib>
#include<cmath>
#include<ctime>
#pragma comment(linker, "/STACK:5813243000")
using namespace std;
const int sizs = 256;
void vivod(int matrix[][256], int n);
void Matrix_Add(int a[][256], int b[][256], int c[][256], int n, int x1, int y1, int x2, int y2);
void Matrix_Sub(int a[][256], int b[][256], int c[][256], int n, int x1, int y1, int x2, int y2);
void Matrix_Multiply(int a[][256], int b[][256], int c[][256], int x1, int y1, int x2, int y2, int n);
void strassen(int a[][256], int b[][256], int c[][256], int m, int n, int x1, int y1, int x2, int y2);
void Naive_Multiply(int a[][256], int b[][256], int c[][256], int n)
{
for (int i = 0; i < n; i++) {
for (int j = 0; j < n; j++) {
c[i][j] = 0;
for (int t = 0; t < n; t++) {
c[i][j] = c[i][j] + a[i][t] * b[t][j];
}
}
}
}
void Multiply(int a[][256], int b[][256], int c[][256], int x1, int y1, int x2, int y2)
{
for (int i = 0; i < 2; i++) {
for (int j = 0; j < 2; j++) {
c[i][j] = 0;
for (int t = 0; t < 2; t++) {
c[i][j] = c[i][j] + a[i][t] * b[t][j];
}
}
}
}
int main()
{
setlocale(LC_ALL, "Russian");
int n;
cout << "Введите число n:";
cin >> n;
const int m = 256;
int A[m][m];
int B[m][m];
int C[m][m];
int k[m][m];
for (int i = 0; i < n; i++)
for (int j = 0; j < n; j++)
A[i][j] = 0;
for (int i = 0; i < n; i++)
for (int j = 0; j < n; j++)
B[i][j] = 0;
for (int i = 0; i < n; i++)
for (int j = 0; j < n; j++)
A[i][j] = rand() % 10;
for (int i = 0; i < n; i++)
for (int j = 0; j < n; j++)
B[i][j] = rand() % 10;
cout << "First Matrix:" << endl;
vivod(A, n);
cout << "Second Matrix:" << endl;
vivod(B, n);
int begin = clock();
//for (int i =0; i < 100; i++)
Naive_Multiply(A, B, k, n);
int end = clock();
cout << "Naive Multiply time: " << end - begin << endl;
vivod(k, n);
int begin2 = clock();
//for (int i = 0; i < 100; i++)
strassen(A, B, C, n, n, 0, 0, 0, 0);
int end2 = clock();
cout << "time: " << end2 - begin2 << endl;
vivod(C, n);
system("pause");
return 0;
}
void strassen(int a[][256], int b[][256], int c[][256], int m, int n, int x1, int y1, int x2, int y2) {
m = n / 2;
if (m != 1)
{
int s1[sizs][sizs];
int s2[sizs][sizs];
int s3[sizs][sizs];
int s4[sizs][sizs];
int s5[sizs][sizs];
int s6[sizs][sizs];
int s7[sizs][sizs];
int s8[sizs][sizs];
int m1[sizs][sizs];
int m2[sizs][sizs];
int m3[sizs][sizs];
int m4[sizs][sizs];
int m5[sizs][sizs];
int m6[sizs][sizs];
int m7[sizs][sizs];
int t1[sizs][sizs];
int t2[sizs][sizs];
int c11[sizs][sizs];
int c12[sizs][sizs];
int c21[sizs][sizs];
int c22[sizs][sizs];
Matrix_Add(a, a, s1, m, n - m, n - 2 * m, n - m, n - m);
Matrix_Sub(s1, a, s2, m, 0, 0, n - 2 * m, n - 2 * m);
Matrix_Sub(a, a, s3, m, n - 2 * m, n - 2 * m, n - m, n - 2 * m);
Matrix_Sub(a, s2, s4, m, n - 2 * m, n - m, 0, 0);
Matrix_Sub(b, b, s5, m, n - 2 * m, n - m, n - 2 * m, n - 2 * m);
Matrix_Sub(b, s5, s6, m, n - m, n - m, 0, 0);
Matrix_Sub(b, b, s7, m, n - m, n - m, n - 2 * m, n - m);
Matrix_Sub(s6, b, s8, m, 0, 0, n - m, n - 2 * m);
strassen(s2, s6, m1, m, m, 0, 0, 0, 0);
strassen(a, b, m2, m, m, n - 2 * m, n - 2 * m, n - 2 * m, n - 2 * m);
strassen(a, b, m3, m, m, n - 2 * m, n - m, n - m, n - 2 * m);
strassen(s3, s7, m4, m, m, 0, 0, 0, 0);
strassen(s1, s5, m5, m, m, 0, 0, 0, 0);
strassen(s4, b, m6, m, m, 0, 0, n - m, n - m);
strassen(a, s8, m7, m, m, n - m, n - m, 0, 0);
Matrix_Add(m1, m2, t1, m, 0, 0, 0, 0);
Matrix_Add(t1, m4, t2, m, 0, 0, 0, 0);
Matrix_Add(m2, m3, c11, m, 0, 0, 0, 0);
Matrix_Sub(t2, m7, c21, m, 0, 0, 0, 0);
Matrix_Add(t1, m5, c12, m, 0, 0, 0, 0);
Matrix_Add(c12, m6, c12, m, 0, 0, 0, 0);
Matrix_Add(t2, m5, c22, m, 0, 0, 0, 0);
for (int i = 0; i < n / 2; i++)
{
for (int j = 0; j < n / 2; j++)
{
c[i + n - 2 * m][j + n - 2 * m] = c11[i][j];
c[i + n - 2 * m][j + n - m] = c12[i][j];
c[i + n - m][j + n - 2 * m] = c21[i][j];
c[i + n - m][j + n - m] = c22[i][j];
}
}
}
else
{
Matrix_Multiply(a, b, c, x1, y1, x2, y2, n);
}
}
void vivod(int matrix[][256], int n)
{
for (int i = 0; i < n; i++)
{
for (int j = 0; j < n; j++)
{
cout << matrix[i][j] << " ";
}
cout << endl;
}
cout << endl;
}
void Matrix_Add(int a[][256], int b[][256], int c[][256], int n, int x1, int y1, int x2, int y2)
{
for (int i = 0; i < n; i++) {
for (int j = 0; j < n; j++) {
c[i][j] = a[i + x1][j + y1] + b[i + x2][j + y2];
}
}
}
void Matrix_Sub(int a[][256], int b[][256], int c[][256], int n, int x1, int y1, int x2, int y2)
{
for (int i = 0; i < n; i++) {
for (int j = 0; j < n; j++) {
c[i][j] = a[i + x1][j + y1] - b[i + x2][j + y2];
}
}
}
void Matrix_Multiply(int a[][256], int b[][256], int c[][256], int x1, int y1, int x2, int y2, int n)
{
for (int i = 0; i < n; i++) {
for (int j = 0; j < n; j++) {
c[i][j] = 0;
for (int t = 0; t < n; t++) {
c[i][j] = c[i][j] + a[x1 + i][y1 + t] * b[x2 + t][y2 + j];
}
}
}
}
Slow, deadline default 1:
#include "pch.h"
#include <stdio.h>
#include <iostream>
#include<cstdlib>
#include<ctime>
#pragma comment(linker, "/STACK:44333338")
int deadline;
using namespace std;
const int n = 16;
void StrVin(int(&A)[n][n], int(&B)[n][n], int(&C)[n][n], int size);
void add(int(&A)[n][n], int(&B)[n][n], int(&C)[n][n], int size);
void sub(int(&A)[n][n], int(&B)[n][n], int(&C)[n][n], int size);
void Multiply(int a[n][n], int b[n][n], int c[n][n]);
void vivod(int a[n][n]);
void naivemultiplication(int a[n][n], int b[n][n], int c[n][n], int size);
int main()
{
setlocale(LC_ALL, "Russian");
cout << "Введите предел: ";
cin >> deadline;
int a[n][n];
int b[n][n];
int c[n][n];
for (int i = 0; i < n; i++)
{
for (int j = 0; j < n; j++)
{
a[i][j] = rand() % 10;
b[i][j] = rand() % 10;
c[i][j] = 0;
}
}
cout << "First Matrix" << endl;
//vivod(a);
cout << "Second Matrix" << endl;
//vivod(b);
cout << "Strassen-Vinograd ";
long double begin2 = clock();
StrVin(a, b, c, n);
long double end2 = clock();
cout << "Time: " << (end2 - begin2) / CLOCKS_PER_SEC << endl;
//vivod(c);
cout << "Standart method ";
long double begin1 = clock();
naivemultiplication(a, b, c, n);
long double end1 = clock();
cout << "Time: " << (end1 - begin1) / CLOCKS_PER_SEC << endl;
//vivod(c);
system("pause");
return 0;
}
void vivod(int a[n][n])
{
for (int i = 0; i < n; i++)
{
for (int j = 0; j < n; j++)
cout << a[i][j] << " ";
cout << endl;
}
cout << endl;
}
void add(int(&A)[n][n], int(&B)[n][n], int(&C)[n][n], int size)
{
for (int i = 0; i < size; i++)
{
for (int j = 0; j < size; j++)
{
C[i][j] = A[i][j] + B[i][j];
}
}
}
void sub(int(&A)[n][n], int(&B)[n][n], int(&C)[n][n], int size)
{
for (int i = 0; i < size; i++)
{
for (int j = 0; j < size; j++)
{
C[i][j] = A[i][j] - B[i][j];
}
}
}
void Multiply(int a[n][n], int b[n][n], int c[n][n])
{
for (int i = 0; i < deadline*2; i++) {
for (int j = 0; j < deadline*2; j++) {
c[i][j] = 0;
for (int t = 0; t < deadline*2; t++) {
c[i][j] = c[i][j] + a[i][t] * b[t][j];
}
}
}
}
void StrVin(int(&A)[n][n], int(&B)[n][n], int(&C)[n][n], int size)
{
int half = size / 2;
if (half != deadline)
{
int A11[n][n], A12[n][n], A21[n][n], A22[n][n], B11[n][n], B12[n][n], B21[n][n], B22[n][n], C11[n][n], C12[n][n], C21[n][n], C22[n][n];
int S1[n][n], S2[n][n], S3[n][n], S4[n][n], S5[n][n], S6[n][n], S7[n][n], S8[n][n];
int P1[n][n], P2[n][n], P3[n][n], P4[n][n], P5[n][n], P6[n][n], P7[n][n];
int T1[n][n], T2[n][n];
int K[n][n];
for (int i = 0; i < half; i++) {
for (int j = 0; j < half; j++) {
A11[i][j] = A[i][j];
A12[i][j] = A[i][j + half];
A21[i][j] = A[i + half][j];
A22[i][j] = A[i + half][j + half];
B11[i][j] = B[i][j];
B12[i][j] = B[i][j + half];
B21[i][j] = B[i + half][j];
B22[i][j] = B[i + half][j + half];
}
}
for (int i = 0; i < half; i++)
{
for (int j = 0; j < half; j++)
{
C11[i][j] = C[i][j];
}
}
for (int i = 0; i < half; i++)
{
for (int j = half; j < size; j++)
{
C12[i][j - half] = C[i][j];
}
}
for (int i = half; i < size; i++)
{
for (int j = 0; j < half; j++)
{
C21[i - half][j] = C[i][j];
}
}
for (int i = half; i < size; i++)
{
for (int j = half; j < size; j++)
{
C22[i - half][j - half] = C[i][j];
}
}
add(A21, A22, S1, half);
sub(S1, A11, S2, half);
sub(A11, A21, S3, half);
sub(A12, S2, S4, half);
sub(B12, B11, S5, half);
sub(B22, S5, S6, half);
sub(B22, B12, S7, half);
sub(S6, B21, S8, half);
StrVin(S2, S6, P1, half);
StrVin(A11, B11, P2, half);
StrVin(A12, B21, P3, half);
StrVin(S3, S7, P4, half);
StrVin(S1, S5, P5, half);
StrVin(S4, B22, P6, half);
StrVin(A22, S8, P7, half);
add(P1, P2, T1, half);
add(T1, P4, T2, half);
add(P2, P3, C11, half);
add(T1, P5, K, half);
add(K, P6, C12, half);
sub(T2, P7, C21, half);
add(T2, P5, C22, half);
for (int i = 0; i < half; i++)
{
for (int j = 0; j < half; j++)
{
C[i][j] = C11[i][j];
C[i][j + half] = C12[i][j];
C[i + half][j] = C21[i][j];
C[i + half][j + half] = C22[i][j];
}
}
}
else
{
Multiply(A, B, C);
}
}
void naivemultiplication(int a[n][n], int b[n][n], int c[n][n], int size)
{
for (int k = 0; k < size; k++)
{
for (int i = 0; i < size; i++)
{
c[k][i] = 0;
for (int j = 0; j < size; j++)
{
c[k][i] += a[k][j] * b[j][i];
}
}
}
}
回答1:
While it is asymptotically faster than a classical matrix-matrix product, even a very well written Strassen Vinograd algorithm won't be faster than a naïve implementation for small sizes (and on modern architectures 64x64 is very small). This is because there is considerable overhead of n x n
matrix additions during the recursion.
That is why all optimized implementations have a cut-off size under which they will switch to an optimized classical matrix-matrix product. The cut-off size depends strongly on the architecture and the quality of the underlying algorithm, but it can be for n
way above 512.
If you want to make a serious implementation, I recommend reading some detailed description on the algorithm (you can start at wikipedia, then continue with their references). If you are doing this as a toy-project, at least stop the recursion under a certain threshold (which you need to tune), and try bigger sizes until you find measurable performance gains.
Regarding memory allocation, since you are allowed to allocate memory in the main function, you can compute an upper bound of memory required for temporary matrices as T*N^2*(1/4+1/16+...)
(where T
is the number of temporaries per recursion depth and N
is the size of the matrix) and re-use that memory at each recursion step.
To simplify things, start with just one recursion step which immediately switches to a classical algorithm, until you manage to get any speed improvements -- and again, read existing literature about implementation details of the algorithm.
来源:https://stackoverflow.com/questions/53802871/strassen-vinograd-algorithm