JKQtPlotter/examples/datastore_regression/datastore_regression.cpp

345 lines
19 KiB
C++
Raw Normal View History

/** \example datastore_regression.cpp
* Explains how to use the internal statistics library (see \ref jkqtptools_statistics ) together with JKQTPDatastore to perform different types of regression and polynomial fitting.
*
* \ref JKQTPlotterBasicJKQTPDatastoreRegression
*/
#include <QApplication>
#include "jkqtplotter/jkqtplotter.h"
2019-06-20 22:06:31 +08:00
#include "jkqtplotter/graphs/jkqtppeakstream.h"
#include "jkqtplotter/graphs/jkqtpboxplot.h"
#include "jkqtplotter/graphs/jkqtpstatisticsadaptors.h"
#include "jkqtplotter/graphs/jkqtpevaluatedfunction.h"
#include "jkqtcommon/jkqtpstatisticstools.h"
#include "jkqtcommon/jkqtpstringtools.h"
#include <random>
#include <cmath>
int main(int argc, char* argv[])
{
#if QT_VERSION >= 0x050600
QApplication::setAttribute(Qt::AA_EnableHighDpiScaling); // DPI support
QCoreApplication::setAttribute(Qt::AA_UseHighDpiPixmaps); //HiDPI pixmaps
#endif
QApplication app(argc, argv);
// 1. create a window with several plotters and get a pointer to the internal datastores (for convenience)
QWidget mainWidget;
QGridLayout* lay;
mainWidget.setLayout(lay=new QGridLayout);
JKQTPlotter* plot1=new JKQTPlotter(&mainWidget);
plot1->getPlotter()->setPlotLabel("Simple Linear Regression");
JKQTPDatastore* datastore1=plot1->getDatastore();
lay->addWidget(plot1,0,0);
JKQTPlotter *plot2=new JKQTPlotter(datastore1, &mainWidget);
plot2->getPlotter()->setPlotLabel("Weighted Linear Regression");
lay->addWidget(plot2,1,0);
JKQTPlotter* plot3=new JKQTPlotter(datastore1, &mainWidget);
plot3->getPlotter()->setPlotLabel("Robust Linear Regression");
lay->addWidget(plot3,0,1);
JKQTPlotter *plot6=new JKQTPlotter(datastore1, &mainWidget);
plot6->getPlotter()->setPlotLabel("Polynomial Fitting");
lay->addWidget(plot6,1,1);
JKQTPlotter* plot4=new JKQTPlotter(datastore1, &mainWidget);
plot4->getPlotter()->setPlotLabel("Exponential Regression");
lay->addWidget(plot4,0,2);
JKQTPlotter* plot5=new JKQTPlotter(datastore1, &mainWidget);
plot5->getPlotter()->setPlotLabel("Power-Law Regression");
lay->addWidget(plot5,1,2);
// 2.1. To demonstrate linear regression, we create a dataset with a linear dependence between two
// columns and added gaussian noise
std::random_device rd; // random number generators:
std::mt19937 gen{rd()};
std::normal_distribution<> d1{0,1};
double a0=-5;
double b0=2;
size_t colLinX=datastore1->addColumn("lin data, x");
size_t colLinY=datastore1->addColumn("lin data, y");
for (double x=-5; x<=10; x++) {
datastore1->appendToColumn(colLinX, x);
datastore1->appendToColumn(colLinY, a0+b0*x+d1(gen));
}
// we visualize this data with a simple scatter graph:
JKQTPXYLineGraph* graphD;
plot1->addGraph(graphD=new JKQTPXYLineGraph(plot1));
graphD->setXYColumns(colLinX, colLinY);
graphD->setDrawLine(false);
graphD->setTitle(QString("data $f(x)=%1+%2\\cdot x+\\mathcal{N}(0,1)$").arg(jkqtp_floattolatexqstr(a0,1)).arg(jkqtp_floattolatexqstr(b0,1)));
// 2.2. Now we calculate the regression line and add a plot to the graph:
/*double coeffA=0, coeffB=0;
jkqtpstatLinearRegression(datastore1->begin(colLinX), datastore1->end(colLinX), datastore1->begin(colLinY), datastore1->end(colLinY), coeffA, coeffB);
JKQTPXFunctionLineGraph *graphRegLine=new JKQTPXFunctionLineGraph(plot1);
graphRegLine->setSpecialFunction(JKQTPXFunctionLineGraph::SpecialFunction::Line);
graphRegLine->setParamsV(coeffA, coeffB);
graphRegLine->setTitle(QString("regression: $f(x) = %1 + %2 \\cdot x$").arg(jkqtp_floattolatexqstr(coeffA)).arg(jkqtp_floattolatexqstr(coeffB)));
plot1->addGraph(graphRegLine);*/
// this code can also be written with one function call, using the "adaptor" jkqtpstatAddLinearRegression():
//jkqtpstatAddLinearRegression(plot1->getPlotter(), datastore1->begin(colLinX), datastore1->end(colLinX), datastore1->begin(colLinY), datastore1->end(colLinY));
// or even shorter:
jkqtpstatAddLinearRegression(graphD);
// 3.1. We extend the example above by
//std::random_device rd; // random number generators:
//std::mt19937 gen{rd()};
std::uniform_real_distribution<> de{0.5,1.5};
std::uniform_int_distribution<> ddecide{0,4};
//double a0=-5;
//double b0=2;
size_t colWLinX=datastore1->addColumn("wlin data, x");
size_t colWLinY=datastore1->addColumn("wlin data, y");
size_t colWLinE=datastore1->addColumn("wlin data, errors");
for (double x=-5; x<=10; x++) {
double factor=1;
if (ddecide(gen)==4) {
factor=4;
}
const double err=de(gen)*factor;
datastore1->appendToColumn(colWLinX, x);
datastore1->appendToColumn(colWLinY, a0+b0*x+err);
datastore1->appendToColumn(colWLinE, err);
}
// we visualize this data with a simple scatter graph:
JKQTPXYLineErrorGraph* graphE;
plot2->addGraph(graphE=new JKQTPXYLineErrorGraph(plot2));
graphE->setXYColumns(colWLinX, colWLinY);
graphE->setYErrorColumn(static_cast<int>(colWLinE));
graphE->setDrawLine(false);
graphE->setTitle(QString("data $f(x)=%1+%2\\cdot x+\\mbox{Noise}$").arg(jkqtp_floattolatexqstr(a0,1)).arg(jkqtp_floattolatexqstr(b0,1)));
// 2.2. Now we calculate the regression line and add a plot to the graph:
/*double coeffA=0, coeffB=0;
jkqtpstatLinearWeightedRegression(datastore1->begin(colWLinX), datastore1->end(colWLinX),
datastore1->begin(colWLinY), datastore1->end(colWLinY),
datastore1->begin(colWLinE), datastore1->end(colWLinE),
coeffA, coeffB, false, false,
&jkqtp_inversePropSaveDefault<double>);
// note that in addition to the three data-columns we also provided a C++ functor
// jkqtp_inversePropSaveDefault(), which calculates 1/error. This is done, because the function
// jkqtpstatLinearWeightedRegression() uses the data from the range datastore1->begin(colWLinE) ... datastore1->end(colWLinE)
// directly as weights, but we calculated errors, which are inversely proportional to the
// weight of each data point when solving the least squares problem, as data points with
// larger errors should be weighted less than thos with smaller errors
//
// Now we can plot the resulting linear function:
JKQTPXFunctionLineGraph *graphRegLine=new JKQTPXFunctionLineGraph(plot2);
graphRegLine->setSpecialFunction(JKQTPXFunctionLineGraph::SpecialFunction::Line);
graphRegLine->setParamsV(coeffA, coeffB);
graphRegLine->setTitle(QString("weighted regression: $f(x) = %1 + %2 \\cdot x$").arg(jkqtp_floattolatexqstr(coeffA)).arg(jkqtp_floattolatexqstr(coeffB)));
plot2->addGraph(graphRegLine);*/
// this code can also be written with one function call, using the "adaptor" jkqtpstatAddLinearRegression():
//jkqtpstatAddLinearWeightedRegression(plot2->getPlotter(),
// datastore1->begin(colLinX), datastore1->end(colLinX),
// datastore1->begin(colLinY), datastore1->end(colLinY),
// datastore1->begin(colWLinE), datastore1->end(colWLinE),
// &coeffA, &coeffB, false, false,
// &jkqtp_inversePropSaveDefault<double>);
// or even shorter:
jkqtpstatAddLinearWeightedRegression(graphE);
// to demonstrate the effect of the weighting, we also add a simple linear regression that
// does not take into account the errors:
jkqtpstatAddLinearRegression(graphE);
// 4.1. To demonstrate IRLS linear regression, we create a dataset with a linear dependence between two
// columns and added gaussian noise and some outliers
//std::random_device rd; // random number generators:
//std::mt19937 gen{rd()};
//std::normal_distribution<> d1{0,1};
//double a0=-5;
//double b0=2;
size_t colRLinX=datastore1->addColumn("lin data, x");
size_t colRLinY=datastore1->addColumn("lin data, y");
for (double x=-5; x<=10; x++) {
datastore1->appendToColumn(colRLinX, x);
if (jkqtp_approximatelyEqual(x, -5)||jkqtp_approximatelyEqual(x, -3)) datastore1->appendToColumn(colRLinY, a0+b0*x+d1(gen)+12);
else datastore1->appendToColumn(colRLinY, a0+b0*x+d1(gen));
}
// we visualize this data with a simple scatter graph:
//JKQTPXYLineGraph* graphD;
plot3->addGraph(graphD=new JKQTPXYLineGraph(plot3));
graphD->setXYColumns(colRLinX, colRLinY);
graphD->setDrawLine(false);
graphD->setTitle(QString("data $f(x)=%1+%2\\cdot x+\\mathcal{N}(0,1)$").arg(jkqtp_floattolatexqstr(a0,1)).arg(jkqtp_floattolatexqstr(b0,1)));
// 4.2. Now we calculate the regression line and add a plot to the graph:
double coeffA=0, coeffB=0;
jkqtpstatRobustIRLSLinearRegression(datastore1->begin(colRLinX), datastore1->end(colRLinX), datastore1->begin(colRLinY), datastore1->end(colRLinY), coeffA, coeffB);
JKQTPXFunctionLineGraph *graphRegLine=new JKQTPXFunctionLineGraph(plot3);
graphRegLine->setSpecialFunction(JKQTPXFunctionLineGraph::SpecialFunction::Line);
graphRegLine->setParamsV(coeffA, coeffB);
graphRegLine->setTitle(QString("robust regression: $f(x) = %1 + %2 \\cdot x$, $p=1.1$").arg(jkqtp_floattolatexqstr(coeffA)).arg(jkqtp_floattolatexqstr(coeffB)));
plot3->addGraph(graphRegLine);
// this code can also be written with one function call, using the "adaptor" jkqtpstatAddLinearRegression():
//jkqtpstatAddRobustIRLSLinearRegression(plot3->getPlotter(), datastore1->begin(colRLinX), datastore1->end(colRLinX), datastore1->begin(colRLinY), datastore1->end(colRLinY));
// or even shorter:
//jkqtpstatAddRobustIRLSLinearRegression(graphD);
// as a comparison, we also add the result of the normal/non-robust linear regression:
jkqtpstatAddLinearRegression(graphD);
// the following code demonstrates the influence of the rgularization parameter p:
// - the closer it is to 1, the more robust the fit is (it is closer to the L1-norm)
// - the closer it is to 2, the closer the fit is to the least squares solution (i.e. the normal regression)
double p;
p=1.1;
auto g=jkqtpstatAddRobustIRLSLinearRegression(graphD, nullptr, nullptr, false, false, p);
g->setTitle(g->getTitle()+", $p="+jkqtp_floattolatexqstr(p)+"$");
p=1.5;
g=jkqtpstatAddRobustIRLSLinearRegression(graphD, nullptr, nullptr, false, false, p);
g->setTitle(g->getTitle()+", $p="+jkqtp_floattolatexqstr(p)+"$");
p=1.7;
g=jkqtpstatAddRobustIRLSLinearRegression(graphD, nullptr, nullptr, false, false, p);
g->setTitle(g->getTitle()+", $p="+jkqtp_floattolatexqstr(p)+"$");
p=2;
g=jkqtpstatAddRobustIRLSLinearRegression(graphD, nullptr, nullptr, false, false, p);
g->setTitle(g->getTitle()+", $p="+jkqtp_floattolatexqstr(p)+"$");
// 5.1. The functions for linear regression can also be used to calculate some non-linear models by transforming the input data.
// This is also supported by the statistics library. the supported models are defined in JKQTPStatRegressionModelType
//std::random_device rd; // random number generators:
//std::mt19937 gen{rd()};
//std::normal_distribution<> d1{0,1};
double a0_powerlaw=20;
double b0_powerlaw=0.25;
double a0_exp=5;
double b0_exp=0.5;
double a0_log=0;
double b0_log=1;
size_t colNLLinX=datastore1->addColumn("non-lin data, x");
size_t colNLLinYExp=datastore1->addColumn("non-lin data, y, exponential model");
size_t colNLLinYPow=datastore1->addColumn("non-lin data, y, power-law model");
size_t colNLLinYLog=datastore1->addColumn("non-lin data, y, log-law model");
auto model_powerlaw=jkqtpStatGenerateRegressionModel(JKQTPStatRegressionModelType::PowerLaw);
auto model_exp=jkqtpStatGenerateRegressionModel(JKQTPStatRegressionModelType::Exponential);
auto model_log=jkqtpStatGenerateRegressionModel(JKQTPStatRegressionModelType::Logarithm);
for (double x=0.1; x<=10; x+=0.5) {
datastore1->appendToColumn(colNLLinX, x);
double ypow=model_powerlaw(x, a0_powerlaw, b0_powerlaw)+d1(gen);
while (ypow<0) {
ypow=model_powerlaw(x, a0_powerlaw, b0_powerlaw)+d1(gen);
}
datastore1->appendToColumn(colNLLinYPow, ypow);
double yexp=model_exp(x, a0_exp, b0_exp)+d1(gen);
while (yexp<0) {
yexp=model_exp(x, a0_exp, b0_exp)+d1(gen);
}
datastore1->appendToColumn(colNLLinYExp, yexp);
datastore1->appendToColumn(colNLLinYLog, model_log(x, a0_log, b0_log));
}
// we visualize this data with a simple scatter graphs:
JKQTPXYLineGraph* graphD_powerlaw;
plot5->addGraph(graphD_powerlaw=new JKQTPXYLineGraph(plot5));
graphD_powerlaw->setXYColumns(colNLLinX, colNLLinYPow);
graphD_powerlaw->setDrawLine(false);
graphD_powerlaw->setTitle(QString("data $%1+\\mathcal{N}(0,1)$").arg(jkqtpstatRegressionModel2Latex(JKQTPStatRegressionModelType::PowerLaw, a0_powerlaw, b0_powerlaw)));
JKQTPXYLineGraph* graphD_exp;
plot4->addGraph(graphD_exp=new JKQTPXYLineGraph(plot4));
graphD_exp->setXYColumns(colNLLinX, colNLLinYExp);
graphD_exp->setDrawLine(false);
graphD_exp->setTitle(QString("data $%1+\\mathcal{N}(0,1)$").arg(jkqtpstatRegressionModel2Latex(JKQTPStatRegressionModelType::Exponential, a0_exp, b0_exp)));
JKQTPXYLineGraph* graphD_log;
plot5->addGraph(graphD_log=new JKQTPXYLineGraph(plot5));
graphD_log->setXYColumns(colNLLinX, colNLLinYLog);
graphD_log->setDrawLine(false);
graphD_log->setTitle(QString("data $%1+\\mathcal{N}(0,1)$").arg(jkqtpstatRegressionModel2Latex(JKQTPStatRegressionModelType::Logarithm, a0_log, b0_log)));
// 5.2. Now we calculate the regression models and add a plot to the graph:
double cA=0, cB=0;
JKQTPXFunctionLineGraph* gFunc;
jkqtpstatRegression(JKQTPStatRegressionModelType::Exponential, datastore1->begin(colNLLinX), datastore1->end(colNLLinX), datastore1->begin(colNLLinYExp), datastore1->end(colNLLinYExp), cA, cB);
plot4->addGraph(gFunc=new JKQTPXFunctionLineGraph(plot4));
gFunc->setPlotFunctionFunctor(jkqtpStatGenerateRegressionModel(JKQTPStatRegressionModelType::Exponential, cA, cB));
gFunc->setTitle(QString("regression: $%1$").arg(jkqtpstatRegressionModel2Latex(JKQTPStatRegressionModelType::Exponential, cA, cB)));
cA=0; cB=0;
jkqtpstatRegression(JKQTPStatRegressionModelType::PowerLaw, datastore1->begin(colNLLinX), datastore1->end(colNLLinX), datastore1->begin(colNLLinYPow), datastore1->end(colNLLinYPow), cA, cB);
plot5->addGraph(gFunc=new JKQTPXFunctionLineGraph(plot5));
gFunc->setPlotFunctionFunctor(jkqtpStatGenerateRegressionModel(JKQTPStatRegressionModelType::PowerLaw, cA, cB));
gFunc->setTitle(QString("regression: $%1$").arg(jkqtpstatRegressionModel2Latex(JKQTPStatRegressionModelType::PowerLaw, cA, cB)));
// Note: Here we used the normal linear regression functions, but variants for IRLS and weighted regression are also available!
// 5.3. Of course also adaptors exist:
//jkqtpstatAddRegression(plot4->getPlotter(), JKQTPStatRegressionModelType::Exponential, datastore1->begin(colNLLinX), datastore1->end(colNLLinX), datastore1->begin(colNLLinYExp), datastore1->end(colNLLinYExp));
//jkqtpstatAddRegression(plot5->getPlotter(), JKQTPStatRegressionModelType::PowerLaw, datastore1->begin(colNLLinX), datastore1->end(colNLLinX), datastore1->begin(colNLLinYPow), datastore1->end(colNLLinYPow));
//jkqtpstatAddRegression(graphD_exp, JKQTPStatRegressionModelType::Exponential);
//jkqtpstatAddRegression(graphD_powerlaw, JKQTPStatRegressionModelType::PowerLaw);
jkqtpstatAddRegression(graphD_log, JKQTPStatRegressionModelType::Logarithm);
// 6.1. To demonstrate polynomial fitting, we generate data for a polynomial model
std::vector<double> pPoly {1,2,-2,0.5};
size_t colPolyX=datastore1->addColumn("polynomial data, x");
size_t colPolyY=datastore1->addColumn("polynomial data, y");
for (double x=-10; x<=10; x++) {
datastore1->appendToColumn(colPolyX, x);
datastore1->appendToColumn(colPolyY, jkqtp_polyEval(x, pPoly.begin(), pPoly.end())+d1(gen)*50.0);
}
// we visualize this data with a simple scatter graph:
JKQTPXYLineGraph* graphP;
plot6->addGraph(graphP=new JKQTPXYLineGraph(plot6));
graphP->setXYColumns(colPolyX, colPolyY);
graphP->setDrawLine(false);
graphP->setTitle(QString("data $%1+\\mathcal{N}(0,50)$").arg(jkqtp_polynomialModel2Latex(pPoly.begin(), pPoly.end())));
// 6.2. now we can fit polynomials with different number of coefficients:
for (size_t p=0; p<=5; p++) {
std::vector<double> pFit;
JKQTPXFunctionLineGraph* gPoly;
jkqtpstatPolyFit(datastore1->begin(colPolyX), datastore1->end(colPolyX), datastore1->begin(colPolyY), datastore1->end(colPolyY), p, std::back_inserter(pFit));
plot6->addGraph(gPoly=new JKQTPXFunctionLineGraph(plot6));
gPoly->setPlotFunctionFunctor(jkqtp_generatePolynomialModel(pFit.begin(), pFit.end()));
gPoly->setTitle(QString("regression: $%1$").arg(jkqtp_polynomialModel2Latex(pFit.begin(), pFit.end())));
}
// 6.3. of course also the "adaptor" shortcuts are available:
//for (size_t p=0; p<=5; p++) {
// jkqtpstatAddPolyFit(plot6->getPlotter(), datastore1->begin(colPolyX), datastore1->end(colPolyX), datastore1->begin(colPolyY), datastore1->end(colPolyY), p);
// jkqtpstatAddPolyFit(graphP, p);
//}
// autoscale the plot so the graph is contained
plot1->zoomToFit();
plot1->getXAxis()->setShowZeroAxis(false);
plot1->getYAxis()->setShowZeroAxis(false);
plot1->getPlotter()->setKeyPosition(JKQTPKeyPosition::JKQTPKeyInsideTopLeft);
plot2->zoomToFit();
plot2->getXAxis()->setShowZeroAxis(false);
plot2->getYAxis()->setShowZeroAxis(false);
plot2->getPlotter()->setKeyPosition(JKQTPKeyPosition::JKQTPKeyInsideTopLeft);
plot3->zoomToFit();
plot3->getXAxis()->setShowZeroAxis(false);
plot3->getYAxis()->setShowZeroAxis(false);
plot3->getPlotter()->setKeyPosition(JKQTPKeyPosition::JKQTPKeyInsideTopLeft);
plot4->zoomToFit();
plot4->getXAxis()->setShowZeroAxis(false);
plot4->getYAxis()->setShowZeroAxis(false);
plot4->getPlotter()->setKeyPosition(JKQTPKeyPosition::JKQTPKeyInsideTopLeft);
plot4->setAbsoluteX(0.05, plot4->getXMax());
plot4->zoomToFit();
plot5->getXAxis()->setShowZeroAxis(false);
plot5->getYAxis()->setShowZeroAxis(false);
plot5->getPlotter()->setKeyPosition(JKQTPKeyPosition::JKQTPKeyInsideTopLeft);
plot5->setAbsoluteX(0.05, plot5->getXMax());
plot5->zoomToFit();
plot6->getXAxis()->setShowZeroAxis(false);
plot6->getYAxis()->setShowZeroAxis(false);
plot6->getPlotter()->setKeyPosition(JKQTPKeyPosition::JKQTPKeyInsideBottomRight);
plot6->zoomToFit();
// show plotter and make it a decent size
mainWidget.show();
mainWidget.resize(1600,800);
return app.exec();
}