How to parallelize a program that needs 0.005 seconds to terminate sequentially?

心已入冬 提交于 2021-01-28 08:03:21

问题


For a project I need to parallelize a sudoku solver that, run sequentially, terminates in 0.005 seconds on average. For testing I also count how many "computations" the program performs to solve the sudoku. The problem is this: when the multithreaded run performs far fewer "computations" than the sequential version, its completion time is lower than that of the sequential version. But if the sequential version needs, for instance, 2000 computations to solve the sudoku, and the multithreaded version with 2 threads performs the same 2000 computations in total (so in theory 1000 per thread, and the time should be lower than the sequential version), the time is not lower than the sequential version, probably because managing and synchronizing the two threads costs more than the 1000 computations each thread performs (if the whole program needs 0.005 seconds to start and terminate sequentially, 1000 computations take very little time to complete). Now the question is: I must parallelize it, and I must do it using only the C++ standard library. How can I do it so that, at least for a fixed number of threads (I don't expect it to work even with 64-128 threads), the multithreaded version is always faster (lower completion time) than the sequential version?

For the moment the program is based on a tree structure where each node is a possible solution, and on a queue used as a task pool: each thread takes a work item, computes it and, if it is not a solution, pushes back the results it has found and goes on to take another work item from the queue. The code is the following (you can compile it with -pthread and run it with ./prog num_of_threads 0 0 0, where the last three zeros are not important for now).

Note: the only relevant code is the last part, getWork(), pushWork(), solveSudoku() and infiniteLoop(); the other functions are only there to make the example runnable and are not useful to analyze.

#include <chrono>
#include <iostream>
#include <thread>
#include <vector>
#include <atomic>
#include <math.h>
#include <list>
#include <mutex>

// Cell value that marks an empty (not yet filled) position in a grid.
#define UNASSIGNED 0
// Side length of the sudoku; a grid is stored as a flat array of N*N cells.
#define N 9
// Sentinel returned by findCell() when the grid has no empty cell left.
#define ERROR_PAIR std::make_pair(-1, -1)

using namespace std;

// Set to true by solveSudoku() when a solution is found; infiniteLoop() uses it
// as its exit condition and main() reads it to report a missing solution.
atomic<bool> solutionFound{false};
// Locked by getWork() and pushWork() around their accesses to queueWork.
mutex mtx;
// Incremented by the worker threads in infiniteLoop(); main() prints it divided
// by the number of workers.
atomic<int> worksDone{0};


// One vertex of the search tree: a (possibly partial) sudoku grid plus the
// nodes derived from it.
struct Node {
    // Flat N*N cell array; the code in this file addresses a cell as grid[row + col*N].
    array<unsigned  char, N*N> grid;
    // Raw pointers to the nodes created by addChoices(). Node declares no
    // destructor, so deleting a node does not delete the nodes listed here.
    vector<Node *> child;
};


// Heap-allocates a node that holds a copy of newGrid and has no children.
// The caller receives a raw pointer and is responsible for deleting it.
Node *newNode(const array<unsigned  char, N*N> &newGrid) {
    return new Node{newGrid, {}};
}

// Shared pool of pending work: every entry is a batch of nodes waiting to be
// expanded. It starts empty; getWork() and pushWork() lock mtx around their
// accesses to it.
list<vector<Node *>> queueWork;

// Writes the grid to standard output, one line per value of the first index,
// with a dashed separator before lines 3 and 6 and a "| " marker before
// positions 3 and 6 of every line. Cells are read as grid[row + col*N] and
// printed as integers.
void printGrid(const array<unsigned  char, N*N> &grid) {
    for (int r = 0; r < N; ++r) {
        const bool startsBand = (r == 3 || r == 6);
        if (startsBand) {
            cout << "---------------------" << endl;
        }
        for (int c = 0; c < N; ++c) {
            const bool startsStack = (c == 3 || c == 6);
            if (startsStack) {
                cout << "| ";
            }
            // Cast so the cell is printed as a number, not as a character.
            cout << static_cast<int>(grid[r + c * N]) << " ";
        }
        cout << endl;
    }
}

// Reports whether val can be placed at (row_, col_) without duplicating a value
// already present in the same row, the same column or the same 3x3 box.
// Cells are addressed as grid[row + col*N]. The target cell itself is part of
// all three scans, so a cell that already holds val makes the function return
// false. row_ and col_ are expected to be in [0, N).
bool canInsert(const int &val, const int &row_, const int &col_,
               const array<unsigned  char, N*N> &grid) {
    //Check column
    for (int row = 0; row < N; row++) {
        if (grid[row+col_*N] == val) return false;
    }
    //Check row
    for (int col = 0; col < N; col++) {
        if (grid[row_+col*N] == val) return false;
    }
    //Check box 3x3: visit only the nine cells of the box that contains
    //(row_, col_) instead of walking all N*N cells and filtering them.
    const int boxRow = row_ - row_ % 3;
    const int boxCol = col_ - col_ % 3;
    for (int row = boxRow; row < boxRow + 3; row++) {
        for (int col = boxCol; col < boxCol + 3; col++) {
            if (grid[row+col*N] == val) return false;
        }
    }
    return true;
}

// Seeds rand() with seed and then writes n random values into grid.
// Each attempt draws a row, a column and a value (in that order); the value is
// stored only when the cell is still UNASSIGNED and canInsert() accepts it.
// The loop ends only after n successful placements, so it does not terminate
// if that many placements can never be reached.
void generateMatrix(const int &seed, const int &n, array<unsigned  char, N*N> &grid) {
    srand(seed);
    for (int placed = 0; placed < n; ) {
        const int row = rand() % 9;
        const int col = rand() % 9;
        const int val = rand() % 9 + 1;
        const bool cellFree = (grid[row+col*N] == UNASSIGNED);
        if (!cellFree || !canInsert(val, row, col, grid)) {
            continue; // rejected draw: try again without counting it
        }
        grid[row+col*N] = val;
        ++placed;
    }
}

//Check if the sudoku is solved
bool isSolution(const array<unsigned char, N*N> &grid)  {
    char row_[N][N+1] = {0};
    char column_[N][N+1] = {0};
    char box[3][3][N+1] = {0};
    
    for (int row = 0; row < N; row++) {
        for (int col = 0; col < N; col++) {
            //Mark the element in row column and box
            row_[row][grid[row+col*N]] += 1;
            column_[col][grid[row+col*N]] += 1;
            box[row / 3][col / 3][grid[row+col*N]] += 1;
            //If an element is already present
            if (box[row / 3][col / 3][grid[row+col*N]] > 1 ||
                column_[col][grid[row+col*N]] > 1 ||
                row_[row][grid[row+col*N]] > 1)
                return false;
        }
    }
    return true;
}

// Returns the coordinates (i, j) of the first cell for which grid[i + j*N] is
// UNASSIGNED, scanning i from 0 upward and, for each i, j from 0 upward.
// Returns ERROR_PAIR when no cell is empty.
pair<int, int> findCell(const array<unsigned  char, N*N> &grid) {
    for (int idx = 0; idx < N * N; ++idx) {
        // idx enumerates (i, j) in the same order as two nested loops, i outermost.
        const int i = idx / N;
        const int j = idx % N;
        if (grid[i + j * N] == UNASSIGNED) {
            return {i, j};
        }
    }
    return ERROR_PAIR;
}

// Builds one grid per value in 1..9 that canInsert() accepts at (row, col):
// each returned grid is a copy of the input with that value written into the
// cell. The grids are returned in increasing order of the inserted value; the
// vector is empty when no value fits.
vector<array<unsigned char, N*N>> getChoices(const int &row, const int &col, const array<unsigned  char, N*N> &grid) {
    vector<array<unsigned char, N*N>> candidates;
    const int cell = row + col * N;
    for (int digit = 1; digit <= 9; ++digit) {
        if (!canInsert(digit, row, col, grid)) {
            continue;
        }
        candidates.push_back(grid);
        candidates.back()[cell] = digit;
    }
    return candidates;
}

// Appends one newly allocated child node to node.child for every grid in
// choices, taking the grids from the last to the first, and leaves choices
// empty.
void addChoices(vector<array<unsigned char, N*N>> &choices, Node &node) {
    for (auto it = choices.rbegin(); it != choices.rend(); ++it) {
        node.child.push_back(newNode(*it));
    }
    choices.clear();
}

// Takes work from the back of queueWork while holding mtx.
//
// Returns an empty vector when the queue is empty. Otherwise, when the queue
// holds at least chunkSize entries, the last chunkSize entries are removed and
// their nodes concatenated (in removal order) into one batch; when it holds
// fewer, only the last entry is removed and returned.
//
// A chunkSize below 1 is treated as 1, so a non-empty queue always yields work
// (a chunkSize of 0 would otherwise return an empty batch without removing
// anything).
vector<Node *> getWork(const int chunkSize) {
    lock_guard<mutex> lck(mtx);

    vector<Node *> result;
    if (queueWork.empty()) {
        return result;
    }

    const size_t wanted = chunkSize > 0 ? static_cast<size_t>(chunkSize) : 1;
    const size_t entries = queueWork.size() >= wanted ? wanted : 1;
    for (size_t taken = 0; taken < entries; ++taken) {
        vector<Node *> &last = queueWork.back();
        if (result.empty()) {
            // Steal the entry's buffer instead of copying it: the entry is
            // discarded right below, and the lock is held for less time.
            result = move(last);
        } else {
            result.insert(result.end(), last.begin(), last.end());
        }
        queueWork.pop_back();
    }
    return result;
}

// Appends a copy of work to the back of queueWork while holding mtx.
// The caller's vector is left unchanged.
void pushWork(vector<Node *> &work) {
    const lock_guard<mutex> guard(mtx);
    queueWork.emplace_back(work);
}


// Performs one expansion step for every node in nodes.
//
// For a node that still has an empty cell, one child is created per value that
// fits the first empty cell and the children are pushed to the work queue as a
// single batch; a node with no fitting value produces nothing. For a node with
// no empty cell, the grid is checked with isSolution(); the first thread to
// find a solution sets solutionFound, prints the grid and returns immediately,
// leaving any later entries of nodes unprocessed.
//
// Every node this function visits is deleted before it moves on, so the
// pointers stored in nodes must not be used by the caller afterwards.
void solveSudoku(vector<Node *> &nodes) {
    for (Node *node : nodes) {
        // Look the empty cell up once and reuse the result.
        const pair<int, int> freeCell = findCell(node->grid);
        if (freeCell != ERROR_PAIR) {
            vector<array<unsigned char, N*N>> choices = getChoices(freeCell.first, freeCell.second, node->grid);
            if (!choices.empty()) {
                addChoices(choices, *node);
                // pushWork() stores its own copy of the child list, so the
                // parent can be released right after.
                pushWork(node->child);
            }
            delete node;
            continue;
        }

        // exchange() tests and sets the flag in one atomic step, so when
        // several threads reach a solution at the same time exactly one of
        // them sees "false" and prints.
        const bool firstSolution = isSolution(node->grid) && !solutionFound.exchange(true);
        if (firstSolution) {
            printGrid(node->grid);
            cout << "That's the first solution found !" << endl;
        }
        // A completed grid is never expanded further: release it on every path.
        delete node;
        if (firstSolution) {
            return;
        }
    }
}

void infiniteLoop(const int chunkSize){
    while(!solutionFound){
        if(!queueWork.empty()){
            auto part = getWork(chunkSize);
             
            if(!part.empty()) {
                solveSudoku(part);
            } 
            worksDone++;    
        }
    } 
}



int main(int argc, char *argv[]) {
    if (argc < 4) {
        std::cerr << "use: " << argv[0]  << " nw seed initial_values " << endl;
        return (-1);
    }

    chrono::high_resolution_clock::time_point t1 = chrono::high_resolution_clock::now();

    const int nw = atoi(argv[1]); //Number of worker
    const int seed = atoi(argv[2]); //Seed for matrix generator
    const int initialValues = atoi(argv[3]); //Number of values to generate at the beginning
    const int chunkSize = 1;

    array<unsigned char, N*N> grid = {0}; 
    vector<thread> tids;

    if(seed != 0 && initialValues != 0){ 
        generateMatrix(seed,initialValues,grid);
    } else {
        grid = 
                                      {9, 0, 0, 0, 3, 0, 0, 0, 5, 
                                       0, 4, 0, 0, 0, 0, 0, 0, 0, 
                                       6, 0, 0, 0, 0, 1, 0, 0, 3, 
                                       0, 3, 0, 0, 0, 4, 0, 0, 0, 
                                       5, 0, 2, 0, 8, 0, 9, 0, 0, 
                                       0, 0, 0, 3, 0, 0, 0, 0, 6, 
                                       0, 0, 0, 0, 0, 9, 0, 0, 0, 
                                       1, 0, 0, 0, 0, 0, 0, 0, 0, 
                                       0, 0, 8, 0, 2, 0, 0, 0, 7};
    }
    
    Node *root = newNode(grid);
    vector<Node *> primoLavoro ;
    primoLavoro.push_back(root);
    solveSudoku(primoLavoro);
    

        for(int i=0;i<nw;i++){
            tids.push_back(thread(infiniteLoop,chunkSize));
        }

    cout << "tids.size()" << tids.size() << endl;
        for(thread &t : tids){
            t.join();  
        }
    


   if(!solutionFound) cout << "No solution found ! " << endl;
   
   
   chrono::high_resolution_clock::time_point t2 = chrono::high_resolution_clock::now(); 
   chrono::duration<double> time_span2 = chrono::duration_cast<chrono::duration<double>>(t2 - t1);

   cout << "WorksDone = " << worksDone / nw << endl;
   cout << "Tempo vecchio " << time_span2.count() << " seconds with " << nw << " threads !" << endl;
   return(0);
    
}
    
}

来源:https://stackoverflow.com/questions/63519121/how-to-parallelize-a-program-that-need-0-005-seconds-to-terminate-sequentially

易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!