2019-06-12 18:58:47 +08:00
/*
2022-07-19 19:40:43 +08:00
Copyright ( c ) 2008 - 2022 Jan W . Krieger ( < jan @ jkrieger . de > )
2019-06-12 18:58:47 +08:00
last modification : $ LastChangedDate $ ( revision $ Rev $ )
This software is free software : you can redistribute it and / or modify
it under the terms of the GNU Lesser General Public License ( LGPL ) as published by
the Free Software Foundation , either version 2.1 of the License , or
( at your option ) any later version .
This program is distributed in the hope that it will be useful ,
but WITHOUT ANY WARRANTY ; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE . See the
GNU Lesser General Public License ( LGPL ) for more details .
You should have received a copy of the GNU Lesser General Public License ( LGPL )
along with this program . If not , see < http : //www.gnu.org/licenses/>.
*/
# ifndef JKQTPSTATREGRESSION_H_INCLUDED
# define JKQTPSTATREGRESSION_H_INCLUDED
# include <stdint.h>
# include <cmath>
# include <stdlib.h>
# include <string.h>
# include <iostream>
# include <stdio.h>
# include <limits>
# include <vector>
# include <utility>
# include <cfloat>
# include <ostream>
# include <iomanip>
# include <sstream>
2019-06-22 20:21:32 +08:00
# include "jkqtcommon/jkqtcommon_imexport.h"
2019-06-12 18:58:47 +08:00
# include "jkqtcommon/jkqtplinalgtools.h"
# include "jkqtcommon/jkqtparraytools.h"
# include "jkqtcommon/jkqtpdebuggingtools.h"
# include "jkqtcommon/jkqtpstatbasics.h"
# include "jkqtcommon/jkqtpstatpoly.h"
/*! \brief calculate the linear regression coefficients for a given data range \a firstX / \a firstY ... \a lastX / \a lastY where the model is \f$ f(x)=a+b\cdot x \f$
So this function solves the least - squares optimization problem : \ f [ ( a ^ \ ast , b ^ \ ast ) = \ mathop { \ mathrm { arg \ ; min } } \ limits_ { a , b } \ sum \ limits_i \ left ( y_i - ( a + b \ cdot x_i ) \ right ) ^ 2 \ f ]
\ ingroup jkqtptools_math_statistics_regression
\ tparam InputItX standard iterator type of \ a firstX and \ a lastX .
\ tparam InputItY standard iterator type of \ a firstY and \ a lastY .
\ param firstX iterator pointing to the first item in the x - dataset to use \ f $ x_1 \ f $
\ param lastX iterator pointing behind the last item in the x - dataset to use \ f $ x_N \ f $
\ param firstY iterator pointing to the first item in the y - dataset to use \ f $ y_1 \ f $
\ param lastY iterator pointing behind the last item in the y - dataset to use \ f $ y_N \ f $
\ param [ in , out ] coeffA returns the offset of the linear model
\ param [ in , out ] coeffB returns the slope of the linear model
\ param fixA if \ c true , the offset coefficient \ f $ a \ f $ is not determined by the fit , but the value provided in \ a coeffA is used
\ param fixB if \ c true , the slope coefficient \ f $ b \ f $ is not determined by the fit , but the value provided in \ a coeffB is used
This function computes internally :
\ f [ a = \ overline { y } - b \ cdot \ overline { x } \ f ]
\ f [ b = \ frac { \ sum x_iy_i - N \ cdot \ overline { x } \ cdot \ overline { y } } { \ sum x_i ^ 2 - N \ cdot ( \ overline { x } ) ^ 2 } \ f ]
2019-06-21 04:24:47 +08:00
\ image html datastore_regression_lin . png
2019-06-12 18:58:47 +08:00
*/
template < class InputItX , class InputItY >
inline void jkqtpstatLinearRegression ( InputItX firstX , InputItX lastX , InputItY firstY , InputItY lastY , double & coeffA , double & coeffB , bool fixA = false , bool fixB = false ) {
if ( fixA & & fixB ) return ;
const int Nx = std : : distance ( firstX , lastX ) ;
const int Ny = std : : distance ( firstY , lastY ) ;
JKQTPASSERT ( Nx > 1 & & Ny > 1 ) ;
double sumx = 0 , sumy = 0 , sumxy = 0 , sumx2 = 0 ;
size_t N = 0 ;
auto itX = firstX ;
auto itY = firstY ;
for ( ; itX ! = lastX & & itY ! = lastY ; + + itX , + + itY ) {
const double fit_x = jkqtp_todouble ( * itX ) ;
const double fit_y = jkqtp_todouble ( * itY ) ;
if ( JKQTPIsOKFloat ( fit_x ) & & JKQTPIsOKFloat ( fit_y ) ) {
sumx = sumx + fit_x ;
sumy = sumy + fit_y ;
sumxy = sumxy + fit_x * fit_y ;
sumx2 = sumx2 + fit_x * fit_x ;
N + + ;
}
}
const double NN = static_cast < double > ( N ) ;
JKQTPASSERT_M ( NN > 1 , " too few datapoints " ) ;
if ( ! fixA & & ! fixB ) {
coeffB = ( double ( sumxy ) - double ( sumx ) * double ( sumy ) / NN ) / ( double ( sumx2 ) - double ( sumx ) * double ( sumx ) / NN ) ; ;
coeffA = double ( sumy ) / NN - coeffB * double ( sumx ) / NN ;
} else if ( fixA & & ! fixB ) {
coeffB = ( double ( sumy ) / NN - coeffA ) / ( double ( sumx ) / NN ) ;
} else if ( ! fixA & & fixB ) {
coeffA = double ( sumy ) / NN - coeffB * double ( sumx ) / NN ;
}
}
/*! \brief calculate the weighted linear regression coefficients for a given for a given data range \a firstX / \a firstY / \a firstW ... \a lastX / \a lastY / \a lastW where the model is \f$ f(x)=a+b\cdot x \f$
So this function solves the least - squares optimization problem : \ f [ ( a ^ \ ast , b ^ \ ast ) = \ mathop { \ mathrm { arg \ ; min } } \ limits_ { a , b } \ sum \ limits_iw_i ^ 2 \ cdot \ left ( y_i - ( a + b \ cdot x_i ) \ right ) ^ 2 \ f ]
\ ingroup jkqtptools_math_statistics_regression
\ tparam InputItX standard iterator type of \ a firstX and \ a lastX .
\ tparam InputItY standard iterator type of \ a firstY and \ a lastY .
\ tparam InputItW standard iterator type of \ a firstW and \ a lastW .
\ param firstX iterator pointing to the first item in the x - dataset to use \ f $ x_1 \ f $
\ param lastX iterator pointing behind the last item in the x - dataset to use \ f $ x_N \ f $
\ param firstY iterator pointing to the first item in the y - dataset to use \ f $ y_1 \ f $
\ param lastY iterator pointing behind the last item in the y - dataset to use \ f $ y_N \ f $
\ param firstW iterator pointing to the first item in the weight - dataset to use \ f $ w_1 \ f $
\ param lastW iterator pointing behind the last item in the weight - dataset to use \ f $ w_N \ f $
\ param [ in , out ] coeffA returns the offset of the linear model
\ param [ in , out ] coeffB returns the slope of the linear model
\ param fixA if \ c true , the offset coefficient \ f $ a \ f $ is not determined by the fit , but the value provided in \ a coeffA is used
\ param fixB if \ c true , the slope coefficient \ f $ b \ f $ is not determined by the fit , but the value provided in \ a coeffB is used
\ param fWeightDataToWi an optional function , which is applied to the data from \ a firstW . . . \ a lastW to convert them to weight , i . e . \ c wi = fWeightDataToWi ( * itW )
e . g . if you use data used to draw error bars , you can use jkqtp_inversePropSaveDefault ( ) . The default is jkqtp_identity ( ) , which just returns the values .
In the case of jkqtp_inversePropSaveDefault ( ) , a datapoint x , y , has a large weight , if it ' s error is small and in the case if jkqtp_identity ( ) it ' s weight
is directly proportional to the given value .
This function internally computes :
\ f [ a = \ frac { \ overline { y } - b \ cdot \ overline { x } } { \ overline { w ^ 2 } } \ f ]
\ f [ b = \ frac { \ overline { w ^ 2 } \ cdot \ overline { x \ cdot y } - \ overline { x } \ cdot \ overline { y } } { \ overline { x ^ 2 } \ cdot \ overline { w ^ 2 } - \ overline { x } ^ 2 } \ f ]
Here the averages are defined in terms of a weight vector \ f $ w_i \ f $ :
\ f [ \ overline { x } = \ sum \ limits_iw_i ^ 2 \ cdot x_i \ f ]
\ f [ \ overline { y } = \ sum \ limits_iw_i ^ 2 \ cdot y_i \ f ]
\ f [ \ overline { x \ cdot y } = \ sum \ limits_iw_i ^ 2 \ cdot x_i \ cdot y_i \ f ]
\ f [ \ overline { x ^ 2 } = \ sum \ limits_iw_i ^ 2 \ cdot x_i ^ 2 \ f ]
\ f [ \ overline { w ^ 2 } = \ sum \ limits_iw_i ^ 2 \ f ]
2019-06-21 04:24:47 +08:00
\ image html datastore_regression_linweight . png
2019-06-12 18:58:47 +08:00
*/
template < class InputItX , class InputItY , class InputItW >
inline void jkqtpstatLinearWeightedRegression ( InputItX firstX , InputItX lastX , InputItY firstY , InputItY lastY , InputItW firstW , InputItW lastW , double & coeffA , double & coeffB , bool fixA = false , bool fixB = false , std : : function < double ( double ) > fWeightDataToWi = & jkqtp_identity < double > ) {
if ( fixA & & fixB ) return ;
const int Nx = std : : distance ( firstX , lastX ) ;
const int Ny = std : : distance ( firstY , lastY ) ;
const int Nw = std : : distance ( firstW , lastW ) ;
JKQTPASSERT ( Nx > 1 & & Ny > 1 & & Nw > 1 ) ;
double sumx = 0 , sumy = 0 , sumxy = 0 , sumx2 = 0 , sumw2 = 0 ;
size_t N = 0 ;
auto itX = firstX ;
auto itY = firstY ;
auto itW = firstW ;
for ( ; itX ! = lastX & & itY ! = lastY & & itW ! = lastW ; + + itX , + + itY , + + itW ) {
const double fit_x = jkqtp_todouble ( * itX ) ;
const double fit_y = jkqtp_todouble ( * itY ) ;
const double fit_w2 = jkqtp_sqr ( fWeightDataToWi ( jkqtp_todouble ( * itW ) ) ) ;
if ( JKQTPIsOKFloat ( fit_x ) & & JKQTPIsOKFloat ( fit_y ) & & JKQTPIsOKFloat ( fit_w2 ) ) {
sumx = sumx + fit_w2 * fit_x ;
sumy = sumy + fit_w2 * fit_y ;
sumxy = sumxy + fit_w2 * fit_x * fit_y ;
sumx2 = sumx2 + fit_w2 * fit_x * fit_x ;
sumw2 = sumw2 + fit_w2 ;
N + + ;
}
}
const double NN = static_cast < double > ( N ) ;
JKQTPASSERT_M ( NN > 1 , " too few datapoints " ) ;
if ( ! fixA & & ! fixB ) {
coeffB = ( double ( sumxy ) * double ( sumw2 ) - double ( sumx ) * double ( sumy ) ) / ( double ( sumx2 ) * double ( sumw2 ) - double ( sumx ) * double ( sumx ) ) ;
coeffA = ( double ( sumy ) - coeffB * double ( sumx ) ) / double ( sumw2 ) ;
} else if ( fixA & & ! fixB ) {
coeffB = ( double ( sumy ) - coeffA * double ( sumw2 ) ) / double ( sumx ) ;
} else if ( ! fixA & & fixB ) {
coeffA = ( double ( sumy ) - coeffB * double ( sumx ) ) / double ( sumw2 ) ;
}
}
/*! \brief calculate the (robust) iteratively reweighted least-squares (IRLS) estimate for the parameters of the model \f$ f(x)=a+b\cdot x \f$
for a given data range \ a firstX / \ a firstY . . . \ a lastX / \ a lastY
So this function finds an outlier - robust solution to the optimization problem :
\ f [ ( a ^ \ ast , b ^ \ ast ) = \ mathop { \ mathrm { arg \ ; min } } \ limits_ { a , b } \ sum \ limits_i | a + b \ cdot x_i - y_i | ^ p \ f ]
\ ingroup jkqtptools_math_statistics_regression
\ ingroup jkqtptools_math_statistics_regression
\ tparam InputItX standard iterator type of \ a firstX and \ a lastX .
\ tparam InputItY standard iterator type of \ a firstY and \ a lastY .
\ param firstX iterator pointing to the first item in the x - dataset to use \ f $ x_1 \ f $
\ param lastX iterator pointing behind the last item in the x - dataset to use \ f $ x_N \ f $
\ param firstY iterator pointing to the first item in the y - dataset to use \ f $ y_1 \ f $
\ param lastY iterator pointing behind the last item in the y - dataset to use \ f $ y_N \ f $
\ param [ in , out ] coeffA returns the offset of the linear model
\ param [ in , out ] coeffB returns the slope of the linear model
\ param fixA if \ c true , the offset coefficient \ f $ a \ f $ is not determined by the fit , but the value provided in \ a coeffA is used
\ param fixB if \ c true , the slope coefficient \ f $ b \ f $ is not determined by the fit , but the value provided in \ a coeffB is used
\ param p regularization parameter , the optimization problem is formulated in the \ f $ L_p \ f $ norm , using this \ a p ( see image below for an example )
\ param iterations the number of iterations the IRLS algorithm performs
This is a simple form of the IRLS algorithm to estimate the parameters a and b in a linear model \ f $ f ( x ) = a + b \ cdot x \ f $ .
This algorithm solves the optimization problem for a \ f $ L_p \ f $ - norm :
\ f [ ( a ^ \ ast , b ^ \ ast ) = \ mathop { \ mathrm { arg \ ; min } } \ limits_ { a , b } \ sum \ limits_i | a + b \ cdot x_i - y_i | ^ p \ f ]
by iteratively optimization weights \ f $ \ vec { w } \ f $ and solving a weighted least squares problem in each iteration :
\ f [ ( a_n , b_n ) = \ mathop { \ mathrm { arg \ ; min } } \ limits_ { a , b } \ sum \ limits_i | a + b \ cdot x_i - y_i | ^ { ( p - 2 ) } \ cdot | a + b \ cdot x_i - y_i | ^ 2 \ f ]
The IRLS - algorithm works as follows :
- calculate initial \ f $ a_0 \ f $ and \ f $ b_0 \ f $ with unweighted regression from x and y
- perform a number of iterations ( parameter \ a iterations ) . In each iteration \ f $ n \ f $ :
- calculate the error vector \ f $ \ vec { e } \ f $ : \ f [ e_i = a + b \ cdot x_i - y_i \ f ]
- estimate new weights \ f $ \ vec { w } \ f $ : \ f [ w_i = | e_i | ^ { ( p - 2 ) / 2 } \ f ]
- calculate new estimates \ f $ a_n \ f $ and \ f $ b_n \ f $ with weighted regression from \ f $ \ vec { x } \ f $ and \ f $ \ vec { y } \ f $ and \ f $ \ vec { w } \ f $
.
- return the last estimates \ f $ a_n \ f $ and \ f $ b_n \ f $
.
\ image html irls . png
2019-06-21 04:24:47 +08:00
\ image html datastore_regression_linrobust_p . png
2019-06-12 18:58:47 +08:00
\ see https : //en.wikipedia.org/wiki/Iteratively_reweighted_least_squares, C. Sidney Burrus: "Iterative Reweighted Least Squares", <a href="http://cnx.org/content/m45285/latest/">http://cnx.org/content/m45285/latest/</a>
*/
template < class InputItX , class InputItY >
inline void jkqtpstatRobustIRLSLinearRegression ( InputItX firstX , InputItX lastX , InputItY firstY , InputItY lastY , double & coeffA , double & coeffB , bool fixA = false , bool fixB = false , double p = 1.1 , int iterations = 100 ) {
if ( fixA & & fixB ) return ;
const int Nx = std : : distance ( firstX , lastX ) ;
const int Ny = std : : distance ( firstY , lastY ) ;
const int N = std : : min ( Nx , Ny ) ;
JKQTPASSERT ( Nx > 1 & & Ny > 1 ) ;
std : : vector < double > weights ;
std : : fill_n ( std : : back_inserter ( weights ) , N , 1.0 ) ;
double alast = coeffA , blast = coeffB ;
jkqtpstatLinearWeightedRegression ( firstX , lastX , firstY , lastY , weights . begin ( ) , weights . end ( ) , alast , blast , fixA , fixB , & jkqtp_identity < double > ) ;
for ( int it = 0 ; it < iterations - 1 ; it + + ) {
// calculate weights
auto itX = firstX ;
auto itY = firstY ;
for ( double & w : weights ) {
const double fit_x = * itX ;
const double fit_y = * itY ;
const double e = alast + blast * fit_x - fit_y ;
w = pow ( std : : max < double > ( JKQTP_EPSILON * 100.0 , fabs ( e ) ) , ( p - 2.0 ) / 2.0 ) ;
+ + itX ;
+ + itY ;
}
// solve weighted linear least squares
jkqtpstatLinearWeightedRegression ( firstX , lastX , firstY , lastY , weights . begin ( ) , weights . end ( ) , alast , blast , fixA , fixB , & jkqtp_identity < double > ) ;
}
coeffA = alast ;
coeffB = blast ;
}
/*! \brief when performing linear regression, different target functions can be fitted, if the input data is transformed accordingly. This library provides the options in this enum by default.
\ ingroup jkqtptools_math_statistics_regression
*/
enum class JKQTPStatRegressionModelType {
Linear , /*!< \brief linear model \f$ f(x)=a+b\cdot x \f$ */
PowerLaw , /*!< \brief power law model \f$ f(x)=a\cdot x^b \f$ */
Exponential , /*!< \brief exponential model \f$ f(x)=a\cdot \exp(b\cdot x) \f$ */
Logarithm , /*!< \brief exponential model \f$ f(x)=a+b\cdot \ln(x) \f$ */
} ;
/*! \brief Generates functors \c f(x,a,b) for the models from JKQTPStatRegressionModelType in \a type
\ ingroup jkqtptools_math_statistics_regression
*/
2019-06-22 20:21:32 +08:00
JKQTCOMMON_LIB_EXPORT std : : function < double ( double , double , double ) > jkqtpStatGenerateRegressionModel ( JKQTPStatRegressionModelType type ) ;
2019-06-12 18:58:47 +08:00
/*! \brief Generates a LaTeX string for the models from JKQTPStatRegressionModelType in \a type
\ ingroup jkqtptools_math_statistics_regression
*/
2019-06-22 20:21:32 +08:00
JKQTCOMMON_LIB_EXPORT QString jkqtpstatRegressionModel2Latex ( JKQTPStatRegressionModelType type , double a , double b ) ;
2019-06-12 18:58:47 +08:00
/*! \brief Generates functors \c f(x) for the models from JKQTPStatRegressionModelType in \a type and binds the parameter values \a and \a b to the returned function
\ ingroup jkqtptools_math_statistics_regression
*/
2019-06-22 20:21:32 +08:00
JKQTCOMMON_LIB_EXPORT std : : function < double ( double ) > jkqtpStatGenerateRegressionModel ( JKQTPStatRegressionModelType type , double a , double b ) ;
2019-06-12 18:58:47 +08:00
/*! \brief Generates the transformation function for x-data (\c result.first ) and y-data (\c result.second ) for each regression model in JKQTPStatRegressionModelType in \a type
\ ingroup jkqtptools_math_statistics_regression
\ internal
*/
2019-06-22 20:21:32 +08:00
JKQTCOMMON_LIB_EXPORT std : : pair < std : : function < double ( double ) > , std : : function < double ( double ) > > jkqtpStatGenerateTransformation ( JKQTPStatRegressionModelType type ) ;
2019-06-12 18:58:47 +08:00
/*! \brief Generates the transformation function for a-parameter (offset, \c result.first : transform, \c result.second : back-transform) for each regression model in JKQTPStatRegressionModelType in \a type
\ ingroup jkqtptools_math_statistics_regression
\ internal
*/
2019-06-22 20:21:32 +08:00
JKQTCOMMON_LIB_EXPORT std : : pair < std : : function < double ( double ) > , std : : function < double ( double ) > > jkqtpStatGenerateParameterATransformation ( JKQTPStatRegressionModelType type ) ;
2019-06-12 18:58:47 +08:00
/*! \brief Generates the transformation function for b-parameter (slope, \c result.first : transform, \c result.second : back-transform) for each regression model in JKQTPStatRegressionModelType in \a type
\ ingroup jkqtptools_math_statistics_regression
\ internal
*/
2019-06-22 20:21:32 +08:00
JKQTCOMMON_LIB_EXPORT std : : pair < std : : function < double ( double ) > , std : : function < double ( double ) > > jkqtpStatGenerateParameterBTransformation ( JKQTPStatRegressionModelType type ) ;
2019-06-12 18:58:47 +08:00
/*! \brief calculate the linear regression coefficients for a given data range \a firstX / \a firstY ... \a lastX / \a lastY where the model is defined by \a type
So this function solves the least - squares optimization problem : \ f [ ( a ^ \ ast , b ^ \ ast ) = \ mathop { \ mathrm { arg \ ; min } } \ limits_ { a , b } \ sum \ limits_i \ left ( y_i - f_ { \ text { type } } ( x_i , a , b ) \ right ) ^ 2 \ f ]
by reducing it to a linear fit by transforming x - and / or y - data
\ ingroup jkqtptools_math_statistics_regression
\ tparam InputItX standard iterator type of \ a firstX and \ a lastX .
\ tparam InputItY standard iterator type of \ a firstY and \ a lastY .
\ param type model to be fitted
\ param firstX iterator pointing to the first item in the x - dataset to use \ f $ x_1 \ f $
\ param lastX iterator pointing behind the last item in the x - dataset to use \ f $ x_N \ f $
\ param firstY iterator pointing to the first item in the y - dataset to use \ f $ y_1 \ f $
\ param lastY iterator pointing behind the last item in the y - dataset to use \ f $ y_N \ f $
\ param [ in , out ] coeffA returns the offset of the linear model
\ param [ in , out ] coeffB returns the slope of the linear model
\ param fixA if \ c true , the offset coefficient \ f $ a \ f $ is not determined by the fit , but the value provided in \ a coeffA is used
\ param fixB if \ c true , the slope coefficient \ f $ b \ f $ is not determined by the fit , but the value provided in \ a coeffB is used
This function computes internally first transforms the data , as appropriate to fit the model defined by \ a type and then calls jkqtpstatLinearRegression ( )
to obtain the parameters . The output parameters are transformed , so they can be used with jkqtpStatGenerateRegressionModel ( ) to generate a functor
that evaluates the model
\ see JKQTPStatRegressionModelType , jkqtpStatGenerateRegressionModel ( ) , jkqtpstatLinearRegression ( ) , jkqtpStatGenerateTransformation ( )
*/
template < class InputItX , class InputItY >
inline void jkqtpstatRegression ( JKQTPStatRegressionModelType type , InputItX firstX , InputItX lastX , InputItY firstY , InputItY lastY , double & coeffA , double & coeffB , bool fixA = false , bool fixB = false ) {
std : : vector < double > x , y ;
auto trafo = jkqtpStatGenerateTransformation ( type ) ;
auto aTrafo = jkqtpStatGenerateParameterATransformation ( type ) ;
auto bTrafo = jkqtpStatGenerateParameterBTransformation ( type ) ;
std : : transform ( firstX , lastX , std : : back_inserter ( x ) , trafo . first ) ;
std : : transform ( firstY , lastY , std : : back_inserter ( y ) , trafo . second ) ;
double a = aTrafo . first ( coeffA ) ;
double b = bTrafo . first ( coeffB ) ;
jkqtpstatLinearRegression ( x . begin ( ) , x . end ( ) , y . begin ( ) , y . end ( ) , a , b , fixA , fixB ) ;
coeffA = aTrafo . second ( a ) ;
coeffB = bTrafo . second ( b ) ;
}
/*! \brief calculate the robust linear regression coefficients for a given data range \a firstX / \a firstY ... \a lastX / \a lastY where the model is defined by \a type
2019-06-13 18:46:49 +08:00
So this function solves the Lp - norm optimization problem : \ f [ ( a ^ \ ast , b ^ \ ast ) = \ mathop { \ mathrm { arg \ ; min } } \ limits_ { a , b } \ sum \ limits_i | y_i - f_ { \ text { type } } ( x_i , a , b ) | ^ p \ f ]
2019-06-12 18:58:47 +08:00
by reducing it to a linear fit by transforming x - and / or y - data
\ ingroup jkqtptools_math_statistics_regression
\ tparam InputItX standard iterator type of \ a firstX and \ a lastX .
\ tparam InputItY standard iterator type of \ a firstY and \ a lastY .
\ param type model to be fitted
\ param firstX iterator pointing to the first item in the x - dataset to use \ f $ x_1 \ f $
\ param lastX iterator pointing behind the last item in the x - dataset to use \ f $ x_N \ f $
\ param firstY iterator pointing to the first item in the y - dataset to use \ f $ y_1 \ f $
\ param lastY iterator pointing behind the last item in the y - dataset to use \ f $ y_N \ f $
\ param [ in , out ] coeffA returns the offset of the linear model
\ param [ in , out ] coeffB returns the slope of the linear model
\ param fixA if \ c true , the offset coefficient \ f $ a \ f $ is not determined by the fit , but the value provided in \ a coeffA is used
\ param fixB if \ c true , the slope coefficient \ f $ b \ f $ is not determined by the fit , but the value provided in \ a coeffB is used
\ param p regularization parameter , the optimization problem is formulated in the \ f $ L_p \ f $ norm , using this \ a p ( see image below for an example )
\ param iterations the number of iterations the IRLS algorithm performs
This function computes internally first transforms the data , as appropriate to fit the model defined by \ a type and then calls jkqtpstatRobustIRLSLinearRegression ( )
to obtain the parameters . The output parameters are transformed , so they can be used with jkqtpStatGenerateRegressionModel ( ) to generate a functor
that evaluates the model
\ see JKQTPStatRegressionModelType , jkqtpStatGenerateRegressionModel ( ) , jkqtpstatRobustIRLSLinearRegression ( ) , jkqtpStatGenerateTransformation ( )
*/
template < class InputItX , class InputItY >
inline void jkqtpstatRobustIRLSRegression ( JKQTPStatRegressionModelType type , InputItX firstX , InputItX lastX , InputItY firstY , InputItY lastY , double & coeffA , double & coeffB , bool fixA = false , bool fixB = false , double p = 1.1 , int iterations = 100 ) {
std : : vector < double > x , y ;
auto trafo = jkqtpStatGenerateTransformation ( type ) ;
auto aTrafo = jkqtpStatGenerateParameterATransformation ( type ) ;
auto bTrafo = jkqtpStatGenerateParameterBTransformation ( type ) ;
std : : transform ( firstX , lastX , std : : back_inserter ( x ) , trafo . first ) ;
std : : transform ( firstY , lastY , std : : back_inserter ( y ) , trafo . second ) ;
double a = aTrafo . first ( coeffA ) ;
double b = bTrafo . first ( coeffB ) ;
jkqtpstatRobustIRLSLinearRegression ( x . begin ( ) , x . end ( ) , y . begin ( ) , y . end ( ) , a , b , fixA , fixB , p , iterations ) ;
coeffA = aTrafo . second ( a ) ;
coeffB = bTrafo . second ( b ) ;
}
/*! \brief calculate the robust linear regression coefficients for a given data range \a firstX / \a firstY ... \a lastX / \a lastY where the model is defined by \a type
2019-06-13 18:46:49 +08:00
So this function solves the Lp - norm optimization problem : \ f [ ( a ^ \ ast , b ^ \ ast ) = \ mathop { \ mathrm { arg \ ; min } } \ limits_ { a , b } \ sum \ limits_iw_i ^ 2 \ left ( y_i - f_ { \ text { type } } ( x_i , a , b ) \ right ) ^ 2 \ f ]
2019-06-12 18:58:47 +08:00
by reducing it to a linear fit by transforming x - and / or y - data
\ ingroup jkqtptools_math_statistics_regression
\ tparam InputItX standard iterator type of \ a firstX and \ a lastX .
\ tparam InputItY standard iterator type of \ a firstY and \ a lastY .
\ tparam InputItW standard iterator type of \ a firstW and \ a lastW .
\ param type model to be fitted
\ param firstX iterator pointing to the first item in the x - dataset to use \ f $ x_1 \ f $
\ param lastX iterator pointing behind the last item in the x - dataset to use \ f $ x_N \ f $
\ param firstY iterator pointing to the first item in the y - dataset to use \ f $ y_1 \ f $
\ param lastY iterator pointing behind the last item in the y - dataset to use \ f $ y_N \ f $
\ param firstW iterator pointing to the first item in the weight - dataset to use \ f $ w_1 \ f $
\ param lastW iterator pointing behind the last item in the weight - dataset to use \ f $ w_N \ f $
\ param [ in , out ] coeffA returns the offset of the linear model
\ param [ in , out ] coeffB returns the slope of the linear model
\ param fixA if \ c true , the offset coefficient \ f $ a \ f $ is not determined by the fit , but the value provided in \ a coeffA is used
\ param fixB if \ c true , the slope coefficient \ f $ b \ f $ is not determined by the fit , but the value provided in \ a coeffB is used
\ param fWeightDataToWi an optional function , which is applied to the data from \ a firstW . . . \ a lastW to convert them to weight , i . e . \ c wi = fWeightDataToWi ( * itW )
e . g . if you use data used to draw error bars , you can use jkqtp_inversePropSaveDefault ( ) . The default is jkqtp_identity ( ) , which just returns the values .
In the case of jkqtp_inversePropSaveDefault ( ) , a datapoint x , y , has a large weight , if it ' s error is small and in the case if jkqtp_identity ( ) it ' s weight
is directly proportional to the given value .
This function computes internally first transforms the data , as appropriate to fit the model defined by \ a type and then calls jkqtpstatLinearWeightedRegression ( )
to obtain the parameters . The output parameters are transformed , so they can be used with jkqtpStatGenerateRegressionModel ( ) to generate a functor
that evaluates the model
\ see JKQTPStatRegressionModelType , jkqtpStatGenerateRegressionModel ( ) , jkqtpstatLinearWeightedRegression ( ) , jkqtpStatGenerateTransformation ( )
*/
template < class InputItX , class InputItY , class InputItW >
inline void jkqtpstatWeightedRegression ( JKQTPStatRegressionModelType type , InputItX firstX , InputItX lastX , InputItY firstY , InputItY lastY , InputItW firstW , InputItW lastW , double & coeffA , double & coeffB , bool fixA = false , bool fixB = false , std : : function < double ( double ) > fWeightDataToWi = & jkqtp_identity < double > ) {
std : : vector < double > x , y ;
auto trafo = jkqtpStatGenerateTransformation ( type ) ;
auto aTrafo = jkqtpStatGenerateParameterATransformation ( type ) ;
auto bTrafo = jkqtpStatGenerateParameterBTransformation ( type ) ;
std : : transform ( firstX , lastX , std : : back_inserter ( x ) , trafo . first ) ;
std : : transform ( firstY , lastY , std : : back_inserter ( y ) , trafo . second ) ;
double a = aTrafo . first ( coeffA ) ;
double b = bTrafo . first ( coeffB ) ;
jkqtpstatLinearWeightedRegression ( x . begin ( ) , x . end ( ) , y . begin ( ) , y . end ( ) , firstW , lastW , a , b , fixA , fixB , fWeightDataToWi ) ;
coeffA = aTrafo . second ( a ) ;
coeffB = bTrafo . second ( b ) ;
}
/*! \brief calculates the coefficient of determination \f$ R^2 \f$ for a set of measurements \f$ (x_i,y_i) \f$ with a fit function \f$ f(x) \f$
\ ingroup jkqtptools_math_statistics_regression
\ tparam InputItX standard iterator type of \ a firstX and \ a lastX .
\ tparam InputItY standard iterator type of \ a firstY and \ a lastY .
\ param firstX iterator pointing to the first item in the x - dataset to use \ f $ x_1 \ f $
\ param lastX iterator pointing behind the last item in the x - dataset to use \ f $ x_N \ f $
\ param firstY iterator pointing to the first item in the y - dataset to use \ f $ y_1 \ f $
\ param lastY iterator pointing behind the last item in the y - dataset to use \ f $ y_N \ f $
\ param f function \ f $ f ( x ) \ f $ , result of a fit to the data
\ return coeffcicient of determination \ f [ R ^ 2 = 1 - \ frac { \ sum_i \ bigl [ y_i - f ( x_i ) \ bigr ] ^ 2 } { \ sum_i \ bigl [ y_i - \ overline { y } \ bigr ] ^ 2 } \ f ] where \ f [ \ overline { y } = \ frac { 1 } { N } \ cdot \ sum_iy_i \ f ]
\ see https : //en.wikipedia.org/wiki/Coefficient_of_determination
*/
template < class InputItX , class InputItY >
inline double jkqtpstatCoefficientOfDetermination ( InputItX firstX , InputItX lastX , InputItY firstY , InputItY lastY , std : : function < double ( double ) > f ) {
auto itX = firstX ;
auto itY = firstY ;
const double yMean = jkqtpstatAverage ( firstX , lastX ) ;
double SSres = 0 ;
double SStot = 0 ;
for ( ; itX ! = lastX & & itY ! = lastY ; + + itX , + + itY ) {
const double fit_x = jkqtp_todouble ( * itX ) ;
const double fit_y = jkqtp_todouble ( * itY ) ;
if ( JKQTPIsOKFloat ( fit_x ) & & JKQTPIsOKFloat ( fit_y ) ) {
SStot + = jkqtp_sqr ( fit_y - yMean ) ;
SSres + = jkqtp_sqr ( fit_y - f ( fit_x ) ) ;
}
}
return 1.0 - SSres / SStot ;
}
/*! \brief calculates the weightedcoefficient of determination \f$ R^2 \f$ for a set of measurements \f$ (x_i,y_i,w_i) \f$ with a fit function \f$ f(x) \f$
\ ingroup jkqtptools_math_statistics_regression
\ tparam InputItX standard iterator type of \ a firstX and \ a lastX .
\ tparam InputItY standard iterator type of \ a firstY and \ a lastY .
\ tparam InputItW standard iterator type of \ a firstW and \ a lastW .
\ param firstX iterator pointing to the first item in the x - dataset to use \ f $ x_1 \ f $
\ param lastX iterator pointing behind the last item in the x - dataset to use \ f $ x_N \ f $
\ param firstY iterator pointing to the first item in the y - dataset to use \ f $ y_1 \ f $
\ param lastY iterator pointing behind the last item in the y - dataset to use \ f $ y_N \ f $
\ param firstW iterator pointing to the first item in the weight - dataset to use \ f $ w_1 \ f $
\ param lastW iterator pointing behind the last item in the weight - dataset to use \ f $ w_N \ f $
\ param f function \ f $ f ( x ) \ f $ , result of a fit to the data
\ param fWeightDataToWi an optional function , which is applied to the data from \ a firstW . . . \ a lastW to convert them to weight , i . e . \ c wi = fWeightDataToWi ( * itW )
e . g . if you use data used to draw error bars , you can use jkqtp_inversePropSaveDefault ( ) . The default is jkqtp_identity ( ) , which just returns the values .
In the case of jkqtp_inversePropSaveDefault ( ) , a datapoint x , y , has a large weight , if it ' s error is small and in the case if jkqtp_identity ( ) it ' s weight
is directly proportional to the given value .
\ return weighted coeffcicient of determination \ f [ R ^ 2 = 1 - \ frac { \ sum_iw_i ^ 2 \ bigl [ y_i - f ( x_i ) \ bigr ] ^ 2 } { \ sum_iw_i ^ 2 \ bigl [ y_i - \ overline { y } \ bigr ] ^ 2 } \ f ] where \ f [ \ overline { y } = \ frac { 1 } { N } \ cdot \ sum_iw_iy_i \ f ]
with \ f [ \ sum_iw_i = 1 \ f ]
\ see https : //en.wikipedia.org/wiki/Coefficient_of_determination
*/
template < class InputItX , class InputItY , class InputItW >
inline double jkqtpstatWeightedCoefficientOfDetermination ( InputItX firstX , InputItX lastX , InputItY firstY , InputItY lastY , InputItW firstW , InputItW lastW , std : : function < double ( double ) > f , std : : function < double ( double ) > fWeightDataToWi = & jkqtp_identity < double > ) {
auto itX = firstX ;
auto itY = firstY ;
auto itW = firstW ;
const double yMean = jkqtpstatWeightedAverage ( firstX , lastX , firstW ) ;
double SSres = 0 ;
double SStot = 0 ;
for ( ; itX ! = lastX & & itY ! = lastY & & itW ! = lastW ; + + itX , + + itY , + + itW ) {
const double fit_x = jkqtp_todouble ( * itX ) ;
const double fit_y = jkqtp_todouble ( * itY ) ;
const double fit_w2 = jkqtp_sqr ( fWeightDataToWi ( jkqtp_todouble ( * itW ) ) ) ;
if ( JKQTPIsOKFloat ( fit_x ) & & JKQTPIsOKFloat ( fit_y ) & & JKQTPIsOKFloat ( fit_w2 ) ) {
SSres + = ( fit_w2 * jkqtp_sqr ( fit_y - f ( fit_x ) ) ) ;
SStot + = ( fit_w2 * jkqtp_sqr ( fit_y - yMean ) ) ;
}
}
return 1.0 - SSres / SStot ;
}
/*! \brief calculates the sum of deviations \f$ \chi^2 \f$ for a set of measurements \f$ (x_i,y_i) \f$ with a fit function \f$ f(x) \f$
\ ingroup jkqtptools_math_statistics_regression
\ tparam InputItX standard iterator type of \ a firstX and \ a lastX .
\ tparam InputItY standard iterator type of \ a firstY and \ a lastY .
\ param firstX iterator pointing to the first item in the x - dataset to use \ f $ x_1 \ f $
\ param lastX iterator pointing behind the last item in the x - dataset to use \ f $ x_N \ f $
\ param firstY iterator pointing to the first item in the y - dataset to use \ f $ y_1 \ f $
\ param lastY iterator pointing behind the last item in the y - dataset to use \ f $ y_N \ f $
\ param f function \ f $ f ( x ) \ f $ , result of a fit to the data
\ return sum of deviations \ f [ \ chi ^ 2 = \ sum_i \ bigl [ y_i - f ( x_i ) \ bigr ] ^ 2 \ f ]
\ see https : //en.wikipedia.org/wiki/Coefficient_of_determination
*/
template < class InputItX , class InputItY >
inline double jkqtpstatSumOfDeviations ( InputItX firstX , InputItX lastX , InputItY firstY , InputItY lastY , std : : function < double ( double ) > f ) {
auto itX = firstX ;
auto itY = firstY ;
double SSres = 0 ;
for ( ; itX ! = lastX & & itY ! = lastY ; + + itX , + + itY ) {
const double fit_x = jkqtp_todouble ( * itX ) ;
const double fit_y = jkqtp_todouble ( * itY ) ;
if ( JKQTPIsOKFloat ( fit_x ) & & JKQTPIsOKFloat ( fit_y ) ) {
SSres + = jkqtp_sqr ( fit_y - f ( fit_x ) ) ;
}
}
return SSres ;
}
/*! \brief calculates the weighted sum of deviations \f$ \chi^2 \f$ for a set of measurements \f$ (x_i,y_i,w_i) \f$ with a fit function \f$ f(x) \f$
\ ingroup jkqtptools_math_statistics_regression
\ tparam InputItX standard iterator type of \ a firstX and \ a lastX .
\ tparam InputItY standard iterator type of \ a firstY and \ a lastY .
\ tparam InputItW standard iterator type of \ a firstW and \ a lastW .
\ param firstX iterator pointing to the first item in the x - dataset to use \ f $ x_1 \ f $
\ param lastX iterator pointing behind the last item in the x - dataset to use \ f $ x_N \ f $
\ param firstY iterator pointing to the first item in the y - dataset to use \ f $ y_1 \ f $
\ param lastY iterator pointing behind the last item in the y - dataset to use \ f $ y_N \ f $
\ param firstW iterator pointing to the first item in the weight - dataset to use \ f $ w_1 \ f $
\ param lastW iterator pointing behind the last item in the weight - dataset to use \ f $ w_N \ f $
\ param f function \ f $ f ( x ) \ f $ , result of a fit to the data
\ param fWeightDataToWi an optional function , which is applied to the data from \ a firstW . . . \ a lastW to convert them to weight , i . e . \ c wi = fWeightDataToWi ( * itW )
e . g . if you use data used to draw error bars , you can use jkqtp_inversePropSaveDefault ( ) . The default is jkqtp_identity ( ) , which just returns the values .
In the case of jkqtp_inversePropSaveDefault ( ) , a datapoint x , y , has a large weight , if it ' s error is small and in the case if jkqtp_identity ( ) it ' s weight
is directly proportional to the given value .
\ return weighted sum of deviations \ f [ \ chi ^ 2 = \ sum_iw_i ^ 2 \ cdot \ bigl [ y_i - f ( x_i ) \ bigr ] ^ 2 \ f ]
\ see https : //en.wikipedia.org/wiki/Reduced_chi-squared_statistic
*/
template < class InputItX , class InputItY , class InputItW >
inline double jkqtpstatWeightedSumOfDeviations ( InputItX firstX , InputItX lastX , InputItY firstY , InputItY lastY , InputItW firstW , InputItW lastW , std : : function < double ( double ) > f , std : : function < double ( double ) > fWeightDataToWi = & jkqtp_identity < double > ) {
auto itX = firstX ;
auto itY = firstY ;
auto itW = firstW ;
double SSres = 0 ;
for ( ; itX ! = lastX & & itY ! = lastY & & itW ! = lastW ; + + itX , + + itY , + + itW ) {
const double fit_x = jkqtp_todouble ( * itX ) ;
const double fit_y = jkqtp_todouble ( * itY ) ;
const double fit_w2 = jkqtp_sqr ( fWeightDataToWi ( jkqtp_todouble ( * itW ) ) ) ;
if ( JKQTPIsOKFloat ( fit_x ) & & JKQTPIsOKFloat ( fit_y ) & & JKQTPIsOKFloat ( fit_w2 ) ) {
SSres + = fit_w2 * jkqtp_sqr ( fit_y - f ( fit_x ) ) ;
}
}
return SSres ;
}
# endif // JKQTPSTATREGRESSION_H_INCLUDED