Multithreaded matrix multiplication (C++11, Qt)

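This code performs a multithreaded matrix multiplication. The work is split across the threads of Qt's global thread pool, whose size defaults to the number of available cores, so it scales automatically. The code is written in C++11 and requires Qt (QtConcurrent). It also relies on two helper functions, MatrixOperations::columnVector and MatrixOperations::inplaceTranspose, which are not part of this listing.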

[pastacode lang="c++" message="" highlight="" provider="manual"]
#include <QVector>
#include <QtConcurrent>
#include <QThreadPool>
#include <QReadWriteLock>

#include <functional>

/**
 * Computes the inner (dot) product of two vectors of equal length.
 *
 * The sum is accumulated in the product type decltype(T() * U()), i.e. the
 * same type that is returned. This keeps integer products exact, and lets the
 * function work with any element types whose product supports operator+=
 * (not only types convertible to and from double).
 *
 * @param vector1 left operand
 * @param vector2 right operand; must have the same size as vector1
 *                (checked with Q_ASSERT_X)
 * @return the sum over i of vector1[i] * vector2[i]; a value-initialized
 *         product type if both vectors are empty
 */
template <typename T, typename U>
auto innerProduct(QVector<T> const &vector1, QVector<U> const &vector2) -> decltype(T() * U())
{
    Q_ASSERT_X(vector1.size() == vector2.size(), "template <typename T, typename U> auto innerProduct(QVector<T> const &vector1, QVector<U> const &vector2) -> decltype(T() * U())", qPrintable("incompatible sizes of vectors, size of vector1 = " + QString::number(vector1.size()) + ", size of vector2 = " + QString::number(vector2.size())));

    typedef decltype(T() * U()) ProductType;

    // Value-initialize so that arithmetic types start at zero.
    ProductType sum = ProductType();

    for (int i = 0; i < vector1.size(); ++i)
        sum += vector1.at(i) * vector2.at(i);

    return sum;
}

/**
 * Worker routine used by multiplication(): fills the entries of resultMatrix
 * selected by indexesToProcess.
 *
 * For every index c in indexesToProcess, the vector returned by
 * MatrixOperations::columnVector(c, matrix2) is fetched once (presumably
 * column c of matrix2 -- the helper is defined elsewhere), and
 * resultMatrix[c][j] is set to the inner product of matrix1[j] with that
 * vector, for every row j of matrix1.
 *
 * @param indexesToProcess indexes into resultMatrix handled by this call
 * @param matrix1          left factor, stored as a vector of rows
 * @param matrix2          right factor
 * @param resultMatrix     output; resultMatrix[c] must already hold at least
 *                         matrix1.size() elements for every processed c
 */
template <typename T, typename U>
void _multiplicationAuxiliaryFunction(QVector<int> const &indexesToProcess, QVector<QVector<T> > const &matrix1, QVector<QVector<U> > const &matrix2, QVector<QVector<decltype(T() * U())> > &resultMatrix)
{
    typedef decltype(T() * U()) Product;

    const int rowCount = matrix1.size();

    for (int column : indexesToProcess) {
        // Extract the vector once per index; it is reused for every row of matrix1.
        const QVector<U> columnOfMatrix2 = MatrixOperations::columnVector(column, matrix2);

        QVector<Product> &target = resultMatrix[column];

        for (int row = 0; row < rowCount; ++row)
            target[row] = innerProduct(matrix1.at(row), columnOfMatrix2);
    }
}

/**
 * Multiplies two matrices (stored as vectors of rows) using the threads of
 * Qt's global thread pool.
 *
 * The n columns of matrix2 (n is taken from the size of matrix2's first row)
 * are distributed round-robin over a set of buckets; each bucket is processed
 * by _multiplicationAuxiliaryFunction via QtConcurrent::blockingMap, which
 * returns only after all buckets are done. The intermediate result is indexed
 * by [column][row] and is passed through MatrixOperations::inplaceTranspose
 * before being returned (presumably yielding [row][column] -- the helper is
 * defined elsewhere).
 *
 * @param matrix1 left factor; the size of its first row must equal
 *                matrix2.size() (checked with Q_ASSERT_X; other rows are not
 *                checked here)
 * @param matrix2 right factor
 * @return the product matrix with elements of type decltype(T() * U())
 */
template <typename T, typename U>
auto multiplication(QVector<QVector<T> > const &matrix1, QVector<QVector<U> > const &matrix2) -> QVector<QVector<decltype(T() * U())> >
{
    Q_ASSERT_X((matrix1.isEmpty() ? 0 : matrix1.at(0).size()) == matrix2.size(), "template <typename T, typename U> auto multiplication(QVector<QVector<T> > const &matrix1, QVector<QVector<U> > const &matrix2) -> QVector<QVector<decltype(T() * U())> >", qPrintable("incompatible sizes of matrices, column size of matrix1 = " + QString::number((matrix1.isEmpty() ? 0 : matrix1.at(0).size())) + ", row size of matrix2 = " + QString::number(matrix2.size())));

    typedef decltype(T() * U()) Product;

    int n = matrix2.isEmpty() ? 0 : matrix2.at(0).size();

    // maxThreadCount() may have been set to zero or a negative value by the
    // application; clamp to at least one bucket so the modulo below is well
    // defined, and never create more buckets than there are columns.
    const int bucketCount = qMax(1, qMin(QThreadPool::globalInstance()->maxThreadCount(), n));

    // Indexed [column][row] while the workers run; transposed afterwards.
    QVector<QVector<Product> > resultMatrix(n, QVector<Product>(matrix1.size()));

    QVector<QVector<int> > scheduler(bucketCount);

    // Round-robin assignment of column indexes to buckets.
    for (int i = 0; i < n; ++i)
        scheduler[i % bucketCount] << i;

    // Each bucket writes only to its own entries of resultMatrix.
    QtConcurrent::blockingMap(scheduler, std::bind(_multiplicationAuxiliaryFunction<T, U>, std::placeholders::_1, std::cref(matrix1), std::cref(matrix2), std::ref(resultMatrix)));

    MatrixOperations::inplaceTranspose(resultMatrix);

    return resultMatrix;
}
[/pastacode]