1. Write a parallel program that computes the sum of all elements in a matrix. Use a binary tree for computing the sum. The program shall use a number of threads specified as input.

#include <iostream>
#include <fstream>
#include <vector>
#include <thread>
#include <mutex>

using namespace std;

int t, n, m;
vector<vector<int>> matrix;
int sum = 0;
mutex mtx;

void partial_matrix_sum(int row, int col, int size) {
    if (size == 1) {
        unique_lock<mutex> lck(mtx);
        sum += matrix[row][col];
        return;
    }

    int l_size = size / 2 + size % 2;
    int l_row = row;
    int l_col = col;

    int r_size = size / 2;
    int r_row = row + (l_size / m);
    int r_col = col + (l_size % m);
    if (r_col >= m) r_row++, r_col = r_col % m;

    partial_matrix_sum(l_row, l_col, l_size);
    partial_matrix_sum(r_row, r_col, r_size);
}

void parallel_matrix_sum(int row, int col, int size, int t) {
    if (size == 1) {
        unique_lock<mutex> lck(mtx);
        sum += matrix[row][col];
        return;
    }

    if (t == 0) 
        partial_matrix_sum(row, col, size);
    else {
        int l_size = size / 2 + size % 2;
        int l_row = row;
        int l_col = col;

        int r_size = size / 2;
        int r_row = row + (l_size / m);
        int r_col = col + (l_size % m);
        if (r_col >= m) r_row++, r_col = r_col % m;

        if (t == 1) {
            thread th(parallel_matrix_sum, l_row, l_col, l_size, 0);
            partial_matrix_sum(r_row, r_col, r_size);
            th.join();
        } else {
            thread l_th(parallel_matrix_sum, l_row, l_col, l_size, t / 2 + t % 2 - 1);
            thread r_th(parallel_matrix_sum, r_row, r_col, r_size, t / 2 - 1);
            l_th.join();
            r_th.join();
        }
    }
}

int main() {
    ifstream fin("input.in");
    fin >> t;
    fin >> n >> m;
    matrix.resize(n, vector<int>(m));
    for (int i = 0; i < n; ++i) {
        for (int j = 0; j < m; ++j) {
            fin >> matrix[i][j];
        }
    }
    if (t == 0) partial_matrix_sum(0, 0, n * m);
    else {
        thread th(parallel_matrix_sum, 0, 0, n * m, t - 1);
        th.join();
    }
    cout << sum;
}

2. Write a parallel program that computes the n-th power of a square matrix. Use a binary tree for computing the result. The program shall use a number of threads specified as input.

#include <iostream>
#include <fstream>
#include <vector>
#include <thread>
#include <mutex>
#include <cmath>

using namespace std;

mutex mtx;

vector<vector<int>> matrix_multiply(const vector<vector<int>> &a, const vector<vector<int>> &b) {
    int n = a.size();
    int m = b[0].size();
    int p = b.size();
    vector<vector<int>> result(n, vector<int>(m, 0));

    for (int i = 0; i < n; i++) {
        for (int j = 0; j < m; j++) {
            for (int k = 0; k < p; k++) {
                result[i][j] += a[i][k] * b[k][j];
            }
        }
    }

    return result;
}

vector<vector<int>> matrix_power_recursive(const vector<vector<int>> &matrix, int n) {
    int size = matrix.size();

    if (n == 0) {
        vector<vector<int>> result(size, vector<int>(size, 0));
        for (int i = 0; i < size; i++) {
            result[i][i] = 1;
        }
        return result;
    }

    if (n % 2 == 0) {
        vector<vector<int>> half_power = matrix_power_recursive(matrix, n / 2);
        return matrix_multiply(half_power, half_power);
    } else {
        vector<vector<int>> half_power = matrix_power_recursive(matrix, (n - 1) / 2);
        vector<vector<int>> temp = matrix_multiply(half_power, half_power);
        return matrix_multiply(temp, matrix);
    }
}

vector<vector<int>> matrix_power(const vector<vector<int>> &matrix, int n, int t) {
    int size = matrix.size();
    vector<vector<int>> result(size, vector<int>(size, 0));

    if (t <= 0) {
        result = matrix_power_recursive(matrix, n);
    } else {
        vector<thread> threads(t);

        for (int i = 0; i < t; i++) {
            threads[i] = thread([&result, &matrix, size, n, i, t]() {
                for (int k = i; k < n; k += t) {
                    vector<vector<int>> temp_result = matrix_power_recursive(matrix, k + 1);
                    mtx.lock();
                    for (int j = 0; j < size; j++) {
                        for (int l = 0; l < size; l++) {
                            result[j][l] += temp_result[j][l];
                        }
                    }
                    mtx.unlock();
                }
            });
        }

        for (int i = 0; i < t; i++) {
            threads[i].join();
        }
    }

    return result;
}

int main() {
    ifstream fin("input.in");
    int t, n, exponent;
    vector<vector<int>> matrix;
    fin >> t;
    fin >> n;
    matrix.resize(n, vector<int>(n));
    for (int i = 0; i < n; ++i) {
        for (int j = 0; j < n; ++j) {
            fin >> matrix[i][j];
        }
    }
    fin >> exponent;
    fin.close();

    vector<vector<int>> result = matrix_power(matrix, exponent, t);

    for (int i = 0; i < n; i++) {
        for (int j = 0; j < n; j++) {
            cout << result[i][j] << " ";
        }
        cout << endl;
    }

    return 0;
}

3. Write a parallel program that computes the scalar product of 2 vectors of the same length. Use a binary tree for computing the sum. The program shall use a number of threads specified as input.

#include <iostream>
#include <fstream>
#include <vector>
#include <thread>
#include <mutex>

using namespace std;

int t, n;
vector<int> a, b;
int prod = 0;
mutex mtx;

void compute_scalar_product(int start, int end, int &result) {
    int partial_prod = 0;
    for (int i = start; i < end; ++i) {
        partial_prod += a[i] * b[i];
    }
    lock_guard<mutex> lock(mtx);
    result = partial_prod;
}

void parallel_scalar_product(int start, int end, int &result, int thread_count) {
    if (thread_count == 1) {
        compute_scalar_product(start, end, result);
    } else {
        int mid = (start + end) / 2;
        int left_result = 0;
        int right_result = 0;
        thread left_thread(parallel_scalar_product, start, mid, ref(left_result), thread_count / 2);
        thread right_thread(parallel_scalar_product, mid, end, ref(right_result), thread_count / 2);
        left_thread.join();
        right_thread.join();
        
        lock_guard<mutex> lock(mtx);
        result = left_result + right_result;
    }
}

int main() {
    ifstream fin("input.in");
    fin >> t;
    fin >> n;
    a.resize(n);
    b.resize(n);
    for (int i = 0; i < n; i++) {
        fin >> a[i];
    }
    for (int i = 0; i < n; i++) {
        fin >> b[i];
    }

    if (t == 0) {
        compute_scalar_product(0, n, prod);
    } else {
        parallel_scalar_product(0, n, prod, t);
    }
    cout << prod << endl;
}

4. Write a parallel program that computes the product of 2 matrices. The program shall use a number of threads specified as input.

#include <iostream>
#include <fstream>
#include <vector>
#include <thread>
#include <mutex>

using namespace std;

mutex mtx;

vector<vector<int>> matrix_multiply(const vector<vector<int>> &a, const vector<vector<int>> &b) {
    int n = a.size();
    int m = b[0].size();
    int p = b.size();
    vector<vector<int>> result(n, vector<int>(m, 0));

    for (int i = 0; i < n; i++) {
        for (int j = 0; j < m; j++) {
            for (int k = 0; k < p; k++) {
                result[i][j] += a[i][k] * b[k][j];
            }
        }
    }

    return result;
}

vector<vector<int>> matrix_product_recursive(const vector<vector<int>> &matrixA, const vector<vector<int>> &matrixB) {
    int size = matrixA.size();

    if (size == 1) {
        vector<vector<int>> result(1, vector<int>(1, 0));
        result[0][0] = matrixA[0][0] * matrixB[0][0];
        return result;
    }

    int half_size = size / 2;

    vector<vector<int>> A11(half_size, vector<int>(half_size));
    vector<vector<int>> A12(half_size, vector<int>(half_size));
    vector<vector<int>> A21(half_size, vector<int>(half_size));
    vector<vector<int>> A22(half_size, vector<int>(half_size));

    vector<vector<int>> B11(half_size, vector<int>(half_size));
    vector<vector<int>> B12(half_size, vector<int>(half_size));
    vector<vector<int>> B21(half_size, vector<int>(half_size));
    vector<vector<int>> B22(half_size, vector<int>(half_size));

    for (int i = 0; i < half_size; i++) {
        for (int j = 0; j < half_size; j++) {
            A11[i][j] = matrixA[i][j];
            A12[i][j] = matrixA[i][j + half_size];
            A21[i][j] = matrixA[i + half_size][j];
            A22[i][j] = matrixA[i + half_size][j + half_size];

            B11[i][j] = matrixB[i][j];
            B12[i][j] = matrixB[i][j + half_size];
            B21[i][j] = matrixB[i + half_size][j];
            B22[i][j] = matrixB[i + half_size][j + half_size];
        }
    }

    vector<vector<int>> P1 = matrix_product_recursive(A11, B11);
    vector<vector<int>> P2 = matrix_product_recursive(A12, B21);
    vector<vector<int>> P3 = matrix_product_recursive(A11, B12);
    vector<vector<int>> P4 = matrix_product_recursive(A12, B22);
    vector<vector<int>> P5 = matrix_product_recursive(A21, B11);
    vector<vector<int>> P6 = matrix_product_recursive(A22, B21);
    vector<vector<int>> P7 = matrix_product_recursive(A21, B12);
    vector<vector<int>> P8 = matrix_product_recursive(A22, B22);

    vector<vector<int>> C11(half_size, vector<int>(half_size));
    vector<vector<int>> C12(half_size, vector<int>(half_size));
    vector<vector<int>> C21(half_size, vector<int>(half_size));
    vector<vector<int>> C22(half_size, vector<int>(half_size));

    for (int i = 0; i < half_size; i++) {
        for (int j = 0; j < half_size; j++) {
            C11[i][j] = P1[i][j] + P2[i][j];
            C12[i][j] = P3[i][j] + P4[i][j];
            C21[i][j] = P5[i][j] + P6[i][j];
            C22[i][j] = P7[i][j] + P8[i][j];
        }
    }

    vector<vector<int>> result(size, vector<int>(size));

    for (int i = 0; i < half_size; i++) {
        for (int j = 0; j < half_size; j++) {
            result[i][j] = C11[i][j];
            result[i][j + half_size] = C12[i][j];
            result[i + half_size][j] = C21[i][j];
            result[i + half_size][j + half_size] = C22[i][j];
        }
    }

    return result;
}

vector<vector<int>> matrix_product(const vector<vector<int>> &matrixA, const vector<vector<int>> &matrixB, int t) {
    int size = matrixA.size();
    vector<vector<int>> result(size, vector<int>(size, 0));

    if (t <= 0) {
        result = matrix_multiply(matrixA, matrixB);
    } else {
        vector<thread> threads(t);

        for (int i = 0; i < t; i++) {
            threads[i] = thread([&result, &matrixA, &matrixB, size, i, t]() {
                for (int k = i; k < size; k += t) {
                    for (int j = 0; j < size; j++) {
                        for (int l = 0; l < size; l++) {
                            lock_guard<mutex> lock(mtx);
                            result[k][j] += matrixA[k][l] * matrixB[l][j];
                        }
                    }
                }
            });
        }

        for (int i = 0; i < t; i++) {
            threads[i].join();
        }
    }

    return result;
}

int main() {
    ifstream fin("input.in");
    int t, n;
    vector<vector<int>> matrixA, matrixB;
    fin >> t;
    fin >> n;

    matrixA.resize(n, vector<int>(n));
    matrixB.resize(n, vector<int>(n));

    for (int i = 0; i < n; ++i) {
        for (int j = 0; j < n; ++j) {
            fin >> matrixA[i][j];
        }
    }

    for (int i = 0; i < n; ++i) {
        for (int j = 0; j < n; ++j) {
            fin >> matrixB[i][j];
        }
    }

    fin.close();

    vector<vector<int>> result = matrix_product(matrixA, matrixB, t);

    for (int i = 0; i < n; i++) {
        for (int j = 0; j < n; j++) {
            cout << result[i][j] << " ";
        }
        cout << endl;
    }

    return 0;
}
#include <iostream>
#include <fstream>
#include <vector>
#include <thread>
#include <mutex>
#include <queue>
#include <condition_variable>

using namespace std;

mutex mtx;
condition_variable cv;

void multiply_matrices(const vector<vector<int>>& a, const vector<vector<int>>& b, vector<vector<int>>& c, int start, int end) {
    int n = a.size();
    int m = a[0].size();
    int p = b[0].size();

    for (int i = start; i < end; ++i) {
        for (int j = 0; j < n; ++j) {
            for (int k = 0; k < m; ++k) {
                c[i][j] += a[i][k] * b[k][j];
            }
        }
    }
    cv.notify_one();
}

vector<vector<int>> matrix_multiply(const vector<vector<int>> &a, const vector<vector<int>> &b, int t) {
    int n = a.size();
    vector<vector<int>> c(n, vector<int>(n, 0));

    if (t <= 0) {
        lock_guard<mutex> lock(mtx);
        multiply_matrices(a, b, c, 0, n);
    } else {
        vector<thread> threads;

        queue<int> task_queue;
        for (int i = 0; i < t; ++i) {
            int start = (i * n) / t;
            int end = ((i + 1) * n) / t;
            task_queue.push(start);
            task_queue.push(end);
        }

        for (int i = 0; i < t; ++i) {
            threads.push_back(thread([&a, &b, &c, &task_queue]() {
                while (true) {
                    int start, end;
                    {
                        unique_lock<mutex> lock(mtx);
                        if (task_queue.empty()) {
                            break;
                        }
                        start = task_queue.front();
                        task_queue.pop();
                        end = task_queue.front();
                        task_queue.pop();
                    }
                    multiply_matrices(a, b, c, start, end);
                }
            }));
        }

        for (int i = 0; i < t; ++i) {
            threads[i].join();
        }
    }

    return c;
}

int main() {
    ifstream fin("input.in");
    int t;
    fin >> t;
    int n;
    fin >> n;

    vector<vector<int>> a(n, vector<int>(n));
    vector<vector<int>> b(n, vector<int>(n));

    for (int i = 0; i < n; ++i) {
        for (int j = 0; j < n; ++j) {
            fin >> a[i][j];
        }
    }

    for (int i = 0; i < n; ++i) {
        for (int j = 0; j < n; ++j) {
            fin >> b[i][j];
        }
    }

    fin.close();

    vector<vector<int>> result = matrix_multiply(a, b, t);

    for (int i = 0; i < n; ++i) {
        for (int j = 0; j < n; ++j) {
            cout << result[i][j] << ' ';
        }
        cout << '\\n';
    }

    return 0;
}

5. Write a parallel program that computes the product of two big numbers. The program shall use a number of threads specified as input.

#include <iostream>
#include <fstream>
#include <vector>
#include <thread>
#include <mutex>

using namespace std;

mutex mtx;

vector<int> solve(vector<int> a, vector<int> b, int t) {
    int n = a.size();
    int m = 2 * n - 1;
    vector<int> c;
    c.resize(m, 0);

    if (t <= 0) {
        unique_lock<mutex> lock(mtx);
        for (int x = 0; x < m; x++) {
            for (int i = 0; i < n; ++i) {
                if (x - i >= n || x - i < 0) {
                    continue;
                }
                c[x] += a[i] * b[x - i];
            }
        }
    } else {
        vector<thread> thr;
        for (int idx = 0; idx < t; ++idx) {
            thr.push_back(thread([&, idx]() {
                for (int x = idx; x < m; x += t) {
                    for (int i = 0; i < n; ++i) {
                        if (x - i >= n || x - i < 0) {
                            continue;
                        }
                        unique_lock<mutex> lock(mtx);
                        c[x] += a[i] * b[x - i];
                    }
                }
            }));
        }

        for (int i = 0; i < thr.size(); ++i) {
            thr[i].join();
        }
    }

    return c;
}

int main() {
    ifstream fin("input.in");
    int t;
    fin >> t;
    int n;
    fin >> n;
    vector<int> a, b;
    for (int i = 0; i < n; ++i) {
        int x;
        fin >> x;
        a.push_back(x);
    }
    for (int i = 0; i < n; ++i) {
        int x;
        fin >> x;
        b.push_back(x);
    }
    fin.close();

    vector<int> res = solve(a, b, t);

    for (auto it : res) {
        cout << it;
    }

    return 0;
}

6. Write a parallel program that computes the prime numbers up to N. It is assumed to have the list of primes up to √N, and will check each of the other numbers if it is divisible with a number from the initial list.

#include <iostream>
#include <fstream>
#include <vector>
#include <thread>
#include <cmath>
#include <mutex>

using namespace std;

mutex mtx;

bool is_prime(int num, const vector<int> &primes) {
    int sqrt_num = sqrt(num);

    for (int prime : primes) {
        if (prime > sqrt_num)
            break;

        if (num % prime == 0)
            return false;
    }

    return true;
}

void find_primes(int start, int end, const vector<int> &initial_primes, vector<int> &result) {
    for (int i = start; i <= end; ++i) {
        if (is_prime(i, initial_primes)) {
            lock_guard<mutex> lock(mtx);
            result.push_back(i);
        }
    }
}

int main() {
    ifstream fin("input.in");
    int t;
    fin >> t;
    int n;
    fin >> n;
    int sqrt_n = sqrt(n);
    vector<int> initial_primes;
    vector<int> primes;
    primes.push_back(2);

    initial_primes.push_back(2);

    for (int i = 3; i <= sqrt_n; i += 2) {
        if (is_prime(i, initial_primes)) {
            initial_primes.push_back(i);
            primes.push_back(i);
        }
    }

    vector<thread> threads;
    vector<vector<int>> thread_results(t);

    for (int i = 0; i < t; ++i) {
        int start = sqrt_n + 1 + (i * (n - sqrt_n)) / t;
        int end = sqrt_n + 1 + ((i + 1) * (n - sqrt_n)) / t - 1;

        threads.push_back(thread(find_primes, start, end, ref(initial_primes), ref(thread_results[i])));
    }

    for (auto& thread : threads) {
        thread.join();
    }

    for (int i = 0; i < t; ++i) {
        primes.insert(primes.end(), thread_results[i].begin(), thread_results[i].end());
    }

    for (int prime : primes) {
        cout << prime << " ";
    }
    cout << endl;

    return 0;
}

7. Write a parallel that computes the discrete convolution of a vector with another vector. The convolution is defined as: r(i) = sum(a(i) * b(i - j), j = 0..n)

#include <iostream>
#include <vector>
#include <thread>

using namespace std;

inline void solve(vector <int> a, vector <int> b, int T) {
  vector <thread> threads;
  int n = a.size();
  vector <int> sol(n, 0);
  for(int idx = 0; idx < T; ++ idx) {
    threads.push_back(thread([a, b, idx, n, &sol, T](){
      for(int i = idx; i < n; i += T) {
        for(int j = 0; j < n; ++ j) {
          sol[i] += a[j] * b[(i - j + n) % n];
        }
      }
    }));
  }

  for(int i = 0; i < threads.size(); ++ i) {
    threads[i].join();
  }

  for(auto it : sol) {
    cerr << it << '\\n';
  }

}

int main() {
  solve({1, 2, 3}, {1, 2, 3}, 3);
  // r[0] = a[0] * b[0] + a[1] * b[2] + a[2] * b[1] = 1 * 1 + 2 * 3 + 3 * 2 = 1 + 6 + 6 = 13
  // r[1] = a[0] * b[1] + a[1] * b[0] + a[2] * b[2] = 1 * 2 + 2 * 1 + 3 * 3 = 2 + 2 + 9 = 13
  // r[2] = a[0] * b[2] + a[1] * b[1] + a[2] * b[0] = 1 * 3 + 2 * 2 + 3 * 1 = 3 + 4 + 3 = 10
}

8. Write a parallel program that counts the number of permutations of N that satisfy a given property. You have a function (bool pred(vector <int> const& v)) that verifies if a given permutation satisfies the property. Your program shall call that function once for each permutation and count the number of times it returns true.

#include <iostream>
#include <thread>
#include <vector>
#include <atomic>

using namespace std;

bool check(vector <int> v) {
  if(v[0] % 2 == 0) {
    return true;
  }
  return false;
}

bool contains(vector <int> v, int n) {
  for(auto it : v) {
    if(it == n) {
      return true;
    }
  }
  return false;
}

atomic <int> cnt;

void back(vector <int> sol, int T, int n) {
  if(sol.size() == n) {
    if(check(sol)) {
      cnt ++;
    }
    return;
  }
  if(T == 1) {
    for (int i = 1; i <= n; ++ i) {
      if(contains(sol, i)) continue;
      sol.push_back(i);
      back(sol, T, n);
      sol.pop_back();
    }
  } else {
    vector <int> x(sol);
    thread t([&](){
      for(int i = 1; i <= n; i += 2) {
        if(contains(x, i)) continue;
        x.push_back(i);
        back(x, T / 2, n);
        x.pop_back();
      }
    });
    for(int i = 2; i <= n; i += 2) {
      if(contains(sol, i)) continue;
      sol.push_back(i);
      back(sol, T - T / 2, n);
      sol.pop_back();
    }
    t.join();
  }
}

int main() {
  back(vector <int>(), 2, 3);
  cout << cnt.load() << '\\n';
}

9. Write a parallel program that counts the number of subsets of k out of N that satisfy a given property. You have a function (bool pred(vector <int> const& v)) that verifies if a given subset satisfies the property. Your program shall call that function once for each subset of k elements and count the number of times it returns true.

#include <iostream>
#include <fstream>
#include <vector>
#include <thread>
#include <atomic>

using namespace std;

atomic <int> cnt;

inline bool check(vector <int> const &v) {
  if(v.size() == 0) {
    return false;
  }
  return v[0] % 2 == 0;
}

inline void generate(vector <int> v, int k, int n, int T) {
  if(v.size() == k) {
    for(auto it : v) {
      cerr << it << ' ';
    }
    cerr << '\\n';
    if(check(v)) {
      cnt ++;
    }
    return ;
  }
  int lst = 0;
  if(v.size() > 0) {
    lst = v.back();
  }
  if(T == 1) {
    for(int i = lst + 1; i <= n; ++ i) {
      v.push_back(i);
      generate(v, k, n, T);
      v.pop_back();
    }
  } else {
    thread t([&]() {
      vector <int> newv(v);
      for(int i = lst + 1; i <= n; i += 2) {
        newv.push_back(i);
        generate(newv, k, n, T / 2);
        newv.pop_back();
      }
    });
    vector <int> aux(v);
    for(int i = lst + 2; i <= n; i += 2) {
      aux.push_back(i);
      generate(aux, k, n, T - T / 2);
      aux.pop_back();
    }
    t.join();
  }
}

int main() {
  generate(vector <int> (), 3, 5, 2);
  cout << cnt << '\\n';
}