BREAKING: improved speed of median/quantile calculation, by only partially sorting

This commit is contained in:
jkriege2 2024-03-18 10:52:29 +01:00
parent be48ed84ba
commit b2aad7ca20
4 changed files with 159 additions and 29 deletions

View File

@ -811,7 +811,75 @@ inline double jkqtpstatMedianOfSortedVector(const TVector& data, size_t* Noutput
/*! \brief calculates the median of a given data range \a first ... \a last, this version partially sorts the range (i.e. no internal copy and checking for NAN-values is performed!)
\ingroup jkqtptools_math_statistics_basic
\tparam InputIt standard iterator type of \a first and \a last.
\param first iterator pointing to the first item in the dataset to use \f$ X_1 \f$
\param last iterator pointing behind the last item in the dataset to use \f$ X_N \f$
\return the median of the data returned between \a first and \a last (excluding invalid doubles).
If the given range \a first ... \a last is empty, NAN is returned
\note This operation implies uses std::nth_element() to calculate the median in linear time O(N)!
The given range is partially sorted after the call!
*/
template <class InputIt>
inline double jkqtpstatMedianAndPartialSort(InputIt first, InputIt last) {
// filter out any NAN-values:
const size_t n=std::distance(first,last);
// handle empty range
if (n<=0) {
return JKQTP_DOUBLE_NAN;
}
// calculate median using nth_element() in linear timme!
const auto middleItr = first + n / 2;
std::nth_element(first, middleItr, last);
if (n % 2 == 0) {
// since nth_element() performs partial sorting,
// "All of the elements before this new nth element are less than or equal to the elements after the new nth element." (from cppreference.com)
const auto leftMiddleItr = std::max_element(first, middleItr);
// the second elemnt to average is the maximum on the left of middleItr!
return (*leftMiddleItr + *middleItr) / 2.0;
} else {
return *middleItr;
}
}
/*! \brief calculates the median of a given data range \a first ... \a last
\ingroup jkqtptools_math_statistics_basic
\tparam InputIt standard iterator type of \a first and \a last.
\param first iterator pointing to the first item in the dataset to use \f$ X_1 \f$
\param last iterator pointing behind the last item in the dataset to use \f$ X_N \f$
\param[out] Noutput optionally returns the number of accumulated valid values in this variable
\return the median of the data returned between \a first and \a last (excluding invalid doubles).
If the given range \a first ... \a last is empty, NAN is returned
\note This operation implies an internal copy of the data, and then uses std::nth_element() to calculate the median in linear time O(N)!
\note Each value is the specified range is converted to a double using jkqtp_todouble().
Entries in the range that are invalid double (using JKQTPIsOKFloat() )
are ignored when calculating.
*/
template <class InputIt>
inline double jkqtpstatMedian(InputIt first, InputIt last, size_t* Noutput=nullptr) {
// filter out any NAN-values:
std::vector<double> dataFiltered;
jkqtpstatFilterGoodFloat(first, last, std::back_inserter(dataFiltered));
const size_t n=dataFiltered.size();
// handle empty range
if (n<=0) {
if (Noutput) *Noutput=0;
return JKQTP_DOUBLE_NAN;
}
// calculate median using nth_element() in linear timme!
return jkqtpstatMedianAndPartialSort(dataFiltered.begin(), dataFiltered.end());
}
/*! \brief calculates the Five-Number Statistical Summary (minimum, median, maximum and two user-defined quantiles (as well as derived from these the inter quartile range)) of a sorted vector /*! \brief calculates the Five-Number Statistical Summary (minimum, median, maximum and two user-defined quantiles (as well as derived from these the inter quartile range)) of a sorted vector
\ingroup jkqtptools_math_statistics_basic \ingroup jkqtptools_math_statistics_basic
@ -1079,29 +1147,6 @@ inline JKQTPStat5NumberStatistics jkqtpstat5NumberStatistics(InputIt first, Inpu
} }
/*! \brief calculates the median of a given data range \a first ... \a last
\ingroup jkqtptools_math_statistics_basic
\tparam InputIt standard iterator type of \a first and \a last.
\param first iterator pointing to the first item in the dataset to use \f$ X_1 \f$
\param last iterator pointing behind the last item in the dataset to use \f$ X_N \f$
\param[out] Noutput optionally returns the number of accumulated valid values in this variable
\return the median of the data returned between \a first and \a last (excluding invalid doubles).
If the given range \a first ... \a last is empty, NAN is returned
\note This operation implies an internal copy of the data, as well as sorting it!
\note Each value is the specified range is converted to a double using jkqtp_todouble().
Entries in the range that are invalid double (using JKQTPIsOKFloat() )
are ignored when calculating.
*/
template <class InputIt>
inline double jkqtpstatMedian(InputIt first, InputIt last, size_t* Noutput=nullptr) {
std::vector<double> dataFiltered;
jkqtpstatFilterGoodFloat(first, last, std::back_inserter(dataFiltered));
std::sort(dataFiltered.begin(), dataFiltered.end());
return jkqtpstatMedianOfSortedVector(dataFiltered, Noutput);
}
/*! \brief calculates the \a quantile -th quantile of a given data range \a first ... \a last /*! \brief calculates the \a quantile -th quantile of a given data range \a first ... \a last
@ -1125,13 +1170,14 @@ template <class InputIt>
inline double jkqtpstatQuantile(InputIt first, InputIt last, double quantile, size_t* Noutput=nullptr) { inline double jkqtpstatQuantile(InputIt first, InputIt last, double quantile, size_t* Noutput=nullptr) {
std::vector<double> dataFiltered; std::vector<double> dataFiltered;
jkqtpstatFilterGoodFloat(first, last, std::back_inserter(dataFiltered)); jkqtpstatFilterGoodFloat(first, last, std::back_inserter(dataFiltered));
std::sort(dataFiltered.begin(), dataFiltered.end());
if (dataFiltered.size()<=0) { if (dataFiltered.size()<=0) {
if (Noutput) *Noutput=0; if (Noutput) *Noutput=0;
return JKQTP_DOUBLE_NAN; return JKQTP_DOUBLE_NAN;
} else { } else {
if (Noutput) *Noutput=dataFiltered.size(); if (Noutput) *Noutput=dataFiltered.size();
return dataFiltered[jkqtp_bounded<size_t>(0, static_cast<size_t>(quantile*static_cast<double>(dataFiltered.size()-1)), dataFiltered.size()-1)]; auto qelement=dataFiltered.begin()+jkqtp_bounded<size_t>(0, static_cast<size_t>(quantile*static_cast<double>(dataFiltered.size()-1)), dataFiltered.size()-1);
std::nth_element(dataFiltered.begin(), qelement, dataFiltered.end());
return *qelement;
} }
} }
@ -1164,20 +1210,18 @@ template <class InputIt>
inline double jkqtpstatMAD(InputIt first, InputIt last, double* median=nullptr, size_t* Noutput=nullptr) { inline double jkqtpstatMAD(InputIt first, InputIt last, double* median=nullptr, size_t* Noutput=nullptr) {
std::vector<double> dataFiltered; std::vector<double> dataFiltered;
jkqtpstatFilterGoodFloat(first, last, std::back_inserter(dataFiltered)); jkqtpstatFilterGoodFloat(first, last, std::back_inserter(dataFiltered));
std::sort(dataFiltered.begin(), dataFiltered.end());
if (dataFiltered.size()<=0) { if (dataFiltered.size()<=0) {
if (Noutput) *Noutput=0; if (Noutput) *Noutput=0;
if (median) *median=JKQTP_DOUBLE_NAN; if (median) *median=JKQTP_DOUBLE_NAN;
return JKQTP_DOUBLE_NAN; return JKQTP_DOUBLE_NAN;
} else { } else {
if (Noutput) *Noutput=dataFiltered.size(); if (Noutput) *Noutput=dataFiltered.size();
double med=jkqtpstatMedianOfSortedVector(dataFiltered); const double med=jkqtpstatMedianAndPartialSort(dataFiltered.begin(), dataFiltered.end());
if (median) *median=med; if (median) *median=med;
for(double& v: dataFiltered) { for(double& v: dataFiltered) {
v=fabs(v-med); v=fabs(v-med);
} }
std::sort(dataFiltered.begin(), dataFiltered.end()); return jkqtpstatMedianAndPartialSort(dataFiltered.begin(), dataFiltered.end());
return jkqtpstatMedianOfSortedVector(dataFiltered);
} }
} }

View File

@ -16,4 +16,5 @@ find_package(Qt${QT_VERSION_MAJOR} COMPONENTS Test REQUIRED)
message( STATUS ".. BUILDING UNIT TESTS FOR JKQTCommon:" ) message( STATUS ".. BUILDING UNIT TESTS FOR JKQTCommon:" )
add_subdirectory(jkqtcommmon) add_subdirectory(jkqtcommmon)
add_subdirectory(jkqtmath)

View File

@ -0,0 +1,7 @@
cmake_minimum_required(VERSION 3.23)
set(CMAKE_INCLUDE_CURRENT_DIR ON)
set(CMAKE_AUTOMOC ON)
jkqtplotter_add_jkqtcommmon_test(jkqtpstatisticstools_test)

View File

@ -0,0 +1,78 @@
#include <QObject>
#include <QtTest>
#include "jkqtmath/jkqtpstatisticstools.h"
#ifndef QCOMPARE_EQ
#define QCOMPARE_EQ(A,B) if (!static_cast<bool>((A)==(B))) {qDebug()<<QTest::toString(A)<< "!=" << QTest::toString(B); } QVERIFY((A)==(B))
#endif
#ifndef QVERIFY_THROWS_NO_EXCEPTION
#define QVERIFY_THROWS_NO_EXCEPTION(B) B
#endif
#ifndef QVERIFY_THROWS_EXCEPTION
#define QVERIFY_THROWS_EXCEPTION(type, A) QVERIFY_EXCEPTION_THROWN(A, type)
#endif
class JKQTPStatisticsToolsTest : public QObject
{
Q_OBJECT
public:
inline JKQTPStatisticsToolsTest() {
}
inline ~JKQTPStatisticsToolsTest() {
}
private slots:
inline void test_jkqtpstatMedian() {
const std::vector<double> empty;
const std::vector<double> oneel = {1.23};
const std::vector<double> twoel = {2, 1};
const std::vector<double> threeel = {3, 2, 1};
const std::vector<double> fourel = {4, 2, 1, 3};
const std::vector<double> fourelnan = {4, 1, JKQTP_NAN, 2, JKQTP_NAN, JKQTP_NAN, 3};
QCOMPARE_EQ(JKQTPIsOKFloat(jkqtpstatMedian(empty.begin(), empty.end())),false);
QCOMPARE_EQ(jkqtpstatMedian(oneel.begin(), oneel.end()),1.23);
QCOMPARE_EQ(jkqtpstatMedian(twoel.begin(), twoel.end()),(1.0+2.0)/2.0);
QCOMPARE_EQ(jkqtpstatMedian(threeel.begin(), threeel.end()),2.0);
QCOMPARE_EQ(jkqtpstatMedian(fourel.begin(), fourel.end()),(2.0+3.0)/2.0);
QCOMPARE_EQ(jkqtpstatMedian(fourelnan.begin(), fourelnan.end()),(2.0+3.0)/2.0);
}
inline void test_jkqtpstatMedianOfSortedVector() {
const std::vector<double> empty;
const std::vector<double> oneel = {1.23};
const std::vector<double> twoel = {1, 2};
const std::vector<double> threeel = {1, 2, 3};
const std::vector<double> fourel = {1, 2, 3, 4};
const std::vector<double> fourelnan = {1, JKQTP_NAN, 2, JKQTP_NAN, 3, 4, JKQTP_NAN};
QCOMPARE_EQ(JKQTPIsOKFloat(jkqtpstatMedianOfSortedVector(empty)),false);
QCOMPARE_EQ(jkqtpstatMedianOfSortedVector(oneel),oneel[0]);
QCOMPARE_EQ(jkqtpstatMedianOfSortedVector(twoel),(twoel[0]+twoel[1])/2.0);
QCOMPARE_EQ(jkqtpstatMedianOfSortedVector(threeel),threeel[1]);
QCOMPARE_EQ(jkqtpstatMedianOfSortedVector(fourel),(fourel[1]+fourel[2])/2.0);
QCOMPARE_EQ(JKQTPIsOKFloat(jkqtpstatMedianOfSortedVector(fourelnan)),false);
}
inline void test_jkqtpstatQuantile() {
const std::vector<double> empty;
const std::vector<double> oneel = {1.23};
const std::vector<double> twoel = {2, 1};
const std::vector<double> threeel = {3, 2, 1};
const std::vector<double> fourel = {4, 2, 1, 3};
const std::vector<double> fourelnan = {4, 1, JKQTP_NAN, 2, JKQTP_NAN, JKQTP_NAN, 3};
QCOMPARE_EQ(JKQTPIsOKFloat(jkqtpstatQuantile(empty.begin(), empty.end(),0.25)), false);
QCOMPARE_EQ(jkqtpstatQuantile(oneel.begin(), oneel.end(),0.25),1.23);
QCOMPARE_EQ(jkqtpstatQuantile(twoel.begin(), twoel.end(),0.33),1);
QCOMPARE_EQ(jkqtpstatQuantile(threeel.begin(), threeel.end(), 0.33),1.0);
QCOMPARE_EQ(jkqtpstatQuantile(fourel.begin(), fourel.end(), 0.25),1);
QCOMPARE_EQ(jkqtpstatQuantile(fourelnan.begin(), fourelnan.end(), 0.25),1);
}
};
QTEST_APPLESS_MAIN(JKQTPStatisticsToolsTest)
#include "jkqtpstatisticstools_test.moc"