Fast distance covariance / fast distance correlation

C++ implementation for fast computing of the distance covariance or distance correlation. Implementation based on the article “Fast Computing for Distance Covariance” written by Xiaoming Huo and Gábor J. Székely and published in Technometrics (Volumne 58, 2016 – Issue 4).

#ifndef FASTDISTANCECORRELATION_H
#define FASTDISTANCECORRELATION_H

#include <tuple>
#include <cmath>
#include <algorithm>
#include <vector>

namespace FastDistanceCorrelation {

    double fastDistanceCorrelation(const std::vector<double> &x, const std::vector<double> &y, bool unbiased = false);

    std::vector<double> fastDistanceCovariance(const std::vector<double> &x, const std::vector<double> &y, bool unbiased = false, bool allParameters = false);

    std::vector<double> fastDistanceCovarianceSums(const std::vector<double> &x, const std::vector<double> &y, bool allSums = false);

    std::vector<double> partialSum2D(const std::vector<double> &x, const std::vector<double> &y, const std::vector<double> &c, const std::vector<std::tuple<double, unsigned long, unsigned long, double> > &x_orderStatistics_orderIndices_rank_cummulativeSum, const std::vector<std::tuple<double, unsigned long, unsigned long, double> > &y_orderStatistics_orderIndices_rank_cummulativeSum);

    std::vector<double> binaryTreeSums(const std::vector<double> &y, const std::vector<double> &c);

    bool sortAscendingOnFirstElementOfTupleAndThenOnSecondElementOfTuple(const std::tuple<double, unsigned long, unsigned long, double> &element1, const std::tuple<double, unsigned long, unsigned long, double> &element2);

    std::vector<std::tuple<double, unsigned long, unsigned long, double> > createVector_orderStatistics_orderIndices_rank_cummulativeSum(const std::vector<double> vector);

    double variance(const std::vector<double> &vector);

    double mean(const std::vector<double> &vector);

}

#endif // FASTDISTANCECORRELATION_H
fastdistancecorrelation.h
#include "fastdistancecorrelation.h"

double FastDistanceCorrelation::fastDistanceCorrelation(const std::vector<double> &x, const std::vector<double> &y, bool unbiased)
{

    std::vector<double> parameters = FastDistanceCorrelation::fastDistanceCovariance(x, y, unbiased, true);

    double xDistanceVariance = parameters.at(1);

    double yDistanceVariance = parameters.at(2);

    if (std::fabs(xDistanceVariance * yDistanceVariance) < (std::numerical_limits<double>::epsilon()))
        return 0;
    else
        return std::sqrt(parameters.at(0) / std::sqrt(xDistanceVariance * yDistanceVariance));

}

std::vector<double> FastDistanceCorrelation::fastDistanceCovariance(const std::vector<double> &x, const std::vector<double> &y, bool unbiased, bool allParameters)
{

    if (x.size() != y.size()) {

        if (allParameters)
            return std::vector<double>(3, std::numeric_limits<double>::quiet_NaN());
        else
            return std::vector<double>(1, std::numeric_limits<double>::quiet_NaN());

    }

    std::vector<double> sums = FastDistanceCorrelation::fastDistanceCovarianceSums(x, y, allParameters);

    double n = static_cast<double>(x.size());

    double d1;

    double d2;

    double d3;

    if (unbiased) {

        d1 = n * (n - 3.0);

        d2 = d1 * (n - 2.0);

        d3 = d2 * (n - 1.0);

    } else {

        d1 = n * n;

        d2 = n * n * n;

        d3 = n * n * n *n;

    }

    if (allParameters) {

        std::vector<double> results(3);

        results[0] = (sums.at(0) / d1) - (2.0 * sums.at(1) / d2) + (sums.at(2) / d3);

        results[1] = (sums.at(3) / d1) - (2.0 * sums.at(5) / d2) + (sums.at(7) / d3);

        results[2] = (sums.at(4) / d1) - (2.0 * sums.at(6) / d2) + (sums.at(8) / d3);

        return results;

    } else {

        std::vector<double> results(1);

        results[0] = (sums.at(0) / d1) - (2.0 * sums.at(1) / d2) + (sums.at(2) / d3);

        return results;

    }
}

std::vector<double> FastDistanceCorrelation::fastDistanceCovarianceSums(const std::vector<double> &x, const std::vector<double> &y, bool allSums)
{

    if (x.size() != y.size()) {

        if (allSums)
            return std::vector<double>(9, std::numeric_limits<double>::quiet_NaN());
        else
            return std::vector<double>(3, std::numeric_limits<double>::quiet_NaN());

    }

    std::vector<std::tuple<double, unsigned long, unsigned long, double> > xSortedOrderedRanked = FastDistanceCorrelation::createVector_orderStatistics_orderIndices_rank_cummulativeSum(x);

    std::vector<std::tuple<double, unsigned long, unsigned long, double> > ySortedOrderedRanked = FastDistanceCorrelation::createVector_orderStatistics_orderIndices_rank_cummulativeSum(y);

    std::vector<double> x1(x.size());

    std::vector<double> y1(y.size());

    x1[0] = std::get<0>(xSortedOrderedRanked.at(0));

    y1[0] = y.at(std::get<1>(xSortedOrderedRanked.at(0)));

    for (unsigned long i = 1; i < x.size(); ++i) {

        x1[i] = std::get<0>(xSortedOrderedRanked.at(i));

        y1[i] = y.at(std::get<1>(xSortedOrderedRanked.at(i)));

    }

    std::vector<double> adot(x.size());

    std::vector<double> bdot(x.size());

    double adotdot = 0.0;

    double bdotdot = 0.0;

    double sum2 = 0.0;

    for (unsigned long i = 0; i < x.size(); ++i) {

        unsigned long xalphai = std::get<2>(xSortedOrderedRanked.at(i)) - 1;

        unsigned long yalphai = std::get<2>(ySortedOrderedRanked.at(i)) - 1;


        const std::tuple<double, unsigned long, unsigned long, double> &xTuple(xSortedOrderedRanked.at(xalphai));

        const std::tuple<double, unsigned long, unsigned long, double> &yTuple(ySortedOrderedRanked.at(yalphai));


        double adoti = std::get<3>(xSortedOrderedRanked.back()) + (2.0 * static_cast<double>(xalphai) - static_cast<double>(x.size())) * x.at(i) - 2.0 * (std::get<3>(xTuple) - std::get<0>(xTuple));

        double bdoti = std::get<3>(ySortedOrderedRanked.back()) + (2.0 * static_cast<double>(yalphai) - static_cast<double>(y.size())) * y.at(i) - 2.0 * (std::get<3>(yTuple) - std::get<0>(yTuple));

        adot[i] = adoti;

        bdot[i] = bdoti;

        sum2 += adoti * bdoti;

        adotdot += adoti;

        bdotdot += bdoti;

    }

    double sum3 = adotdot * bdotdot;

    std::vector<std::tuple<double, unsigned long, unsigned long, double> > y1SortedOrderedRanked = FastDistanceCorrelation::createVector_orderStatistics_orderIndices_rank_cummulativeSum(y1);

    std::vector<double> gamma_1 = FastDistanceCorrelation::partialSum2D(x1, y1, std::vector<double>(x.size(), 1.0), xSortedOrderedRanked, y1SortedOrderedRanked);

    std::vector<double> gamma_x = FastDistanceCorrelation::partialSum2D(x1, y1, x1, xSortedOrderedRanked, y1SortedOrderedRanked);

    std::vector<double> gamma_y = FastDistanceCorrelation::partialSum2D(x1, y1, y1, xSortedOrderedRanked, y1SortedOrderedRanked);

    std::vector<double> x1y1(x.size());

    for (unsigned long i = 0; i < x1y1.size(); ++i)
        x1y1[i] = x1.at(i) * y1.at(i);

    std::vector<double> gamma_xy = FastDistanceCorrelation::partialSum2D(x1, y1, x1y1, xSortedOrderedRanked, y1SortedOrderedRanked);

    double sum1 = 0.0;

    for (unsigned long i = 0; i < x.size(); ++i)
        sum1 += (x.at(i) * y.at(i) * gamma_1.at(i)) + gamma_xy.at(i) - (x.at(i) * gamma_y.at(i)) - (y.at(i) * gamma_x.at(i));

    std::vector<double> sums(9, std::numeric_limits<double>::quiet_NaN());

    sums[0] = sum1;

    sums[1] = sum2;

    sums[2] = sum3;

    if (allSums) {

        double n = static_cast<double>(x.size());

        sums[3] = 2 * n * (n - 1) * FastDistanceCorrelation::variance(x);

        sums[4] = 2 * n * (n - 1) * FastDistanceCorrelation::variance(y);

        double sum1 = 0.0;

        double sum2 = 0.0;

        for (unsigned long i = 0; i < x.size(); ++i) {

            sum1 += std::pow(adot.at(i), 2);

            sum2 += std::pow(bdot.at(i), 2);

        }

        sums[5] = sum1;

        sums[6] = sum2;

        sums[7] = std::pow(adotdot, 2.0);

        sums[8] = std::pow(bdotdot, 2.0);

    }

    return sums;

}

std::vector<double> FastDistanceCorrelation::partialSum2D(const std::vector<double> &x, const std::vector<double> &y, const std::vector<double> &c, const std::vector<std::tuple<double, unsigned long, unsigned long, double> > &x_orderStatistics_orderIndices_rank_cummulativeSum, const std::vector<std::tuple<double, unsigned long, unsigned long, double> > &y_orderStatistics_orderIndices_rank_cummulativeSum)
{

    std::vector<double> xPartialSums(x.size(), 0.0);

    std::vector<double> yPartialSums(y.size(), 0.0);

    std::vector<double> yRanked(y.size());

    yRanked[0] = std::get<2>(y_orderStatistics_orderIndices_rank_cummulativeSum.at(0));

    double cdot = c.at(0);

    for (unsigned long i = 1; i < c.size(); ++i) {

        cdot += c.at(i);

        xPartialSums[i] = xPartialSums.at(i - 1) + c.at(i - 1);

        yPartialSums[i] = yPartialSums.at(i - 1) + c.at( std::get<1>(y_orderStatistics_orderIndices_rank_cummulativeSum.at(i - 1)));

        yRanked[i] = std::get<2>(y_orderStatistics_orderIndices_rank_cummulativeSum.at(i));

    }

    std::vector<double> binaryTreeSums = FastDistanceCorrelation::binaryTreeSums(yRanked, c);

    std::vector<double> gamma(x.size());

    for (unsigned long i = 0; i < x.size(); ++i) {

        unsigned long xI = std::get<2>(x_orderStatistics_orderIndices_rank_cummulativeSum.at(i)) - 1;

        gamma[i] = cdot - c.at(xI) - 2.0 * yPartialSums.at(std::get<2>(y_orderStatistics_orderIndices_rank_cummulativeSum.at(xI)) - 1) - 2.0 * xPartialSums.at(xI) + 4.0 * binaryTreeSums.at(xI);

    }

    return gamma;
}

std::vector<double> FastDistanceCorrelation::binaryTreeSums(const std::vector<double> &y, const std::vector<double> &c)
{

    unsigned long n = y.size();

    unsigned long L = static_cast<unsigned long>(std::ceil(std::log2(static_cast<double>(n))));

    std::vector<double> s(static_cast<unsigned long>(std::pow(2.0, static_cast<double>(L + 1))), 0.0);

    std::vector<double> binaryTreeSums(n, 0.0);

    for (unsigned long i = 1; i < n; ++i) {

        for (unsigned long l = 0; l < L; ++l) {

            unsigned long pos = static_cast<unsigned long>(std::ceil(y.at(i - 1) / static_cast<unsigned long>((std::pow(static_cast<double>(2), static_cast<double>(l))))));

            if (l > 0) {

                unsigned long scale = l;

                while (scale != 0) {

                    --scale;

                    pos += static_cast<unsigned long>(std::pow(static_cast<double>(2), static_cast<double>(L - scale)));

                }

            }

            s[pos - 1] += c.at(i - 1);

        }

        for (unsigned long l = 0; l < L; ++l) {

            unsigned long pos = static_cast<unsigned long>(std::floor((static_cast<double>(y.at(i) - 1)) / static_cast<unsigned long>((std::pow(static_cast<double>(2.0), static_cast<double>(l))))));

            if ((static_cast<double>(pos) / 2) > static_cast<unsigned long>(std::floor(static_cast<double>(pos) / 2))) {

                if(l > 0) {

                    unsigned long scale = l;

                    while (scale != 0) {

                        --scale;

                        pos += static_cast<unsigned long>(std::pow(2.0, static_cast<double>(L - scale)));

                    }

                }

                binaryTreeSums[i] += s.at(pos - 1);

           }

        }

    }

    return binaryTreeSums;
	
}

bool FastDistanceCorrelation::sortAscendingOnFirstElementOfTupleAndThenOnSecondElementOfTuple(const std::tuple<double, unsigned long, unsigned long, double> &element1, const std::tuple<double, unsigned long, unsigned long, double> &element2)
{

    if (std::get<0>(element1) < std::get<0>(element2))
        return true;

    if (std::get<0>(element1) == std::get<0>(element2))
        return std::get<1>(element1) < std::get<1>(element2);
    else
        return false;

}

std::vector<std::tuple<double, unsigned long, unsigned long, double> > FastDistanceCorrelation::createVector_orderStatistics_orderIndices_rank_cummulativeSum(const std::vector<double> vector)
{

    std::vector<std::tuple<double, unsigned long, unsigned long, double> > vector_orderStatistics_orderIndices_rank_cummulativeSum;

    vector_orderStatistics_orderIndices_rank_cummulativeSum.reserve(vector.size());

    for (unsigned long i = 0; i < vector.size(); ++i) {

        const double &value(vector.at(i));

        vector_orderStatistics_orderIndices_rank_cummulativeSum.push_back({value, i, 1, value});

    }

    std::sort(vector_orderStatistics_orderIndices_rank_cummulativeSum.begin(), vector_orderStatistics_orderIndices_rank_cummulativeSum.end(), FastDistanceCorrelation::sortAscendingOnFirstElementOfTupleAndThenOnSecondElementOfTuple);

    std::get<2>(vector_orderStatistics_orderIndices_rank_cummulativeSum[std::get<1>(vector_orderStatistics_orderIndices_rank_cummulativeSum.first())]) = 1;

    for (unsigned long i = 1; i < vector_orderStatistics_orderIndices_rank_cummulativeSum.size(); ++i) {

        std::tuple<double, unsigned long, unsigned long, double> &tuple(vector_orderStatistics_orderIndices_rank_cummulativeSum[i]);

        std::get<2>(vector_orderStatistics_orderIndices_rank_cummulativeSum[std::get<1>(tuple)]) = i + 1;

        std::get<3>(tuple) = std::get<3>(vector_orderStatistics_orderIndices_rank_cummulativeSum.at(i - 1)) + std::get<0>(tuple);

    }

    return vector_orderStatistics_orderIndices_rank_cummulativeSum;
	
}

double FastDistanceCorrelation::variance(const std::vector<double> &vector)
{

    if (vector.empty() || (vector.size() == 1))
        return std::numeric_limits<double>::quiet_NaN();

    double m = mean(vector);

    double ep = 0.0;

    double variance = 0.0;

    double n = vector.size();

    for (unsigned long i = 0; i < vector.size(); ++i) {

        double sum = vector.at(i) - m;

        ep += sum;

        variance += sum * sum;

    }

    return (variance - ep * ep / n) / (n - 1.0);

}

double FastDistanceCorrelation::mean(const std::vector<double> &vector)
{
    double sum = 0.0;

    for (const double &value : vector)
        sum += value;

    return sum / static_cast<double>(vector.size());

}
fastdistancecorrelation.cpp

Leave a Reply

Your email address will not be published. Required fields are marked *