diff --git a/include/gwmodelpp/GWRBasic.h b/include/gwmodelpp/GWRBasic.h
index 30665d2..5e6905f 100644
--- a/include/gwmodelpp/GWRBasic.h
+++ b/include/gwmodelpp/GWRBasic.h
@@ -51,10 +51,25 @@ class GWRBasic : public GWRBase, public IBandwidthSelectable, public IVarialbeSe
     typedef arma::mat (GWRBasic::*FitCoreCalculator)(const arma::mat&, const arma::vec&, const SpatialWeight&);   //!< \~english Fit function declaration. \~chinese 拟合函数声明。
     typedef arma::mat (GWRBasic::*FitCoreSHatCalculator)(const arma::mat&, const arma::vec&, const SpatialWeight&, arma::vec&);   //!< \~english Fit function declaration. \~chinese 拟合函数声明。
     typedef arma::mat (GWRBasic::*FitCoreCVCalculator)(const arma::mat&, const arma::vec&, const SpatialWeight&);   //!< \~english Fit function declaration. \~chinese 拟合函数声明。
+    typedef void (GWRBasic::*FTestCalculator)();
+    typedef double (GWRBasic::*TrQtQCalculator)();
+    typedef double (GWRBasic::*TrQtQCoreCalculator)();
+    typedef arma::vec (GWRBasic::*DiagBCalculator)(arma::uword);
+    typedef arma::vec (GWRBasic::*DiagBCoreCalculator)(arma::uword, const arma::vec&);
 
     typedef double (GWRBasic::*BandwidthSelectionCriterionCalculator)(BandwidthWeight*);        //!< \~english Declaration of criterion calculator for bandwidth selection. \~chinese 带宽优选指标计算函数声明。
     typedef double (GWRBasic::*IndepVarsSelectCriterionCalculator)(const std::vector<std::size_t>&); //!< \~english Declaration of criterion calculator for variable selection. \~chinese 变量优选指标计算函数声明。
 
+    struct FTestResult
+    {
+        double s = 0.0;
+        double df1 = 0.0;
+        double df2 = 0.0;
+        double p = 0.0;
+    };
+
+    using FTestResultCombine = std::tuple<FTestResult, FTestResult, std::vector<FTestResult>, FTestResult>;
+
 private:
 
     /**
@@ -388,6 +403,15 @@ class GWRBasic : public GWRBase, public IBandwidthSelectable, public IVarialbeSe
 
     void setStoreC(bool flag) { mStoreC = flag; }
 
+    bool isDoFTest() { return mIsDoFTest; };
+
+    void setIsDoFtest(bool value) { mIsDoFTest = value; }
+
+    FTestResultCombine fTestResults()
+    {
+        return std::make_tuple(mF1Test, mF2Test, mF3Test, mF4Test);
+    }
+
 public:     // Implement Algorithm
     bool isValid() override;
 
@@ -396,6 +420,11 @@ class GWRBasic : public GWRBase, public IBandwidthSelectable, public IVarialbeSe
 
     arma::mat fit() override;
 
+    void fTest()
+    {
+        (this->*mFTestFunction)();
+    }
+
 public:     // Implement IVariableSelectable
     Status getCriterion(const std::vector<size_t>& variables, double& criterion) override
     {
@@ -508,6 +537,11 @@ class GWRBasic : public GWRBase, public IBandwidthSelectable, public IVarialbeSe
      */
     arma::mat fitBase();
 
+    void fTestBase();
+
+    double calcTrQtQBase();
+
+    arma::vec calcDiagBBase(arma::uword i);
 
 private:
 
@@ -517,6 +551,10 @@ class GWRBasic : public GWRBase, public IBandwidthSelectable, public IVarialbeSe
 
     arma::mat fitCoreCVSerial(const arma::mat& x, const arma::vec& y, const SpatialWeight& sw);
 
+    double calcTrQtQCoreSerial();
+
+    arma::vec calcDiagBCoreSerial(arma::uword i, const arma::vec& c);
+
 #ifdef ENABLE_OPENMP
 
     /**
@@ -593,6 +631,10 @@ class GWRBasic : public GWRBase, public IBandwidthSelectable, public IVarialbeSe
      */
     arma::mat fitCoreSHatOmp(const arma::mat& x, const arma::vec& y, const SpatialWeight& sw, arma::vec& shat);
 
+    double calcTrQtQCoreOmp();
+
+    arma::vec calcDiagBCoreOmp(arma::uword i, const arma::vec& c);
+
 #endif
 
 #ifdef ENABLE_CUDA
@@ -678,6 +720,8 @@ class GWRBasic : public GWRBase, public IBandwidthSelectable, public IVarialbeSe
     double bandwidthSizeCriterionCVMpi(BandwidthWeight* bandwidthWeight);
     double bandwidthSizeCriterionAICMpi(BandwidthWeight* bandwidthWeight);
     arma::mat fitMpi();
+    double calcTrQtQMpi();
+    arma::vec calcDiagBMpi(arma::uword i);
 #endif // ENABLE_MPI
 
 public:     // Implement IParallelizable
@@ -772,6 +816,17 @@ class GWRBasic : public GWRBase, public IBandwidthSelectable, public IVarialbeSe
     arma::cube mC;//!< \~english All \f$S\f$ matrices. \~chinese 所有 \f$C\f$ 矩阵。
     bool mStoreS = false; //!< \~english Whether to save S \~chinese 是否保存 S 矩阵
     bool mStoreC = false; //!< \~english Whether to save C \~chinese 是否保存 C 矩阵
+
+    bool mIsDoFTest = false;
+    FTestResult mF1Test;
+    FTestResult mF2Test;
+    std::vector<FTestResult> mF3Test;
+    FTestResult mF4Test;
+    FTestCalculator mFTestFunction = &GWRBasic::fTestBase;
+    TrQtQCalculator mCalcTrQtQFunction = &GWRBasic::calcTrQtQBase;
+    TrQtQCoreCalculator mCalcTrQtQCoreFunction = &GWRBasic::calcTrQtQCoreSerial;
+    DiagBCalculator mCalcDiagBFunction = &GWRBasic::calcDiagBBase;
+    DiagBCoreCalculator mCalcDiagBCoreFunction = &GWRBasic::calcDiagBCoreSerial;
 };
 
 }
diff --git a/include/gwmodelpp/utils/armampi.h b/include/gwmodelpp/utils/armampi.h
index d1866a0..9fa1758 100644
--- a/include/gwmodelpp/utils/armampi.h
+++ b/include/gwmodelpp/utils/armampi.h
@@ -7,4 +7,8 @@
 #define GWM_MPI_UWORD MPI_UNSIGNED_LONG_LONG
 #endif  // ARMA_32BIT_WORD
 
-void mat_mul_mpi(arma::mat& a, arma::mat& b, arma::mat& c, const int ip, const int np, const size_t range);
+void mat_mul_mpi(arma::mat& a, arma::mat& b, arma::mat& c, const int ip, const int np);
+
+void mat_quad_mpi(arma::mat& a, arma::mat& aTa, const int ip, const int np);
+
+double mat_trace_mpi(arma::mat& a, const int ip, const int np);
diff --git a/src/gwmodelpp/GWRBasic.cpp b/src/gwmodelpp/GWRBasic.cpp
index 12720c7..acd771c 100644
--- a/src/gwmodelpp/GWRBasic.cpp
+++ b/src/gwmodelpp/GWRBasic.cpp
@@ -1,3 +1,4 @@
+#include <gsl/gsl_cdf.h>
 #include "GWRBasic.h"
 #include "BandwidthSelector.h"
 #include "VariableForwardSelector.h"
@@ -158,6 +159,12 @@ mat GWRBasic::fit()
 
     GWM_LOG_STAGE("Model fitting");
     mBetas = (this->*mFitFunction)();
+
+    if (mIsDoFTest)
+    {
+        GWM_LOG_STAGE("F Test");
+        (this->*mFTestFunction)();
+    }
     
 #ifdef ENABLE_CUDA
     if (mParallelType & ParallelType::CUDA)
@@ -417,6 +424,174 @@ double gwm::GWRBasic::indepVarsSelectionCriterion(const vector<size_t>& indepVar
     }
 }
 
+void gwm::GWRBasic::fTestBase()
+{
+    double v1 = mSHat(0), v2 = mSHat(1);
+    double nDp = double(mCoords.n_rows), nVar = double(mX.n_cols);
+    double RSSg = RSS(mX, mY, mBetas);
+    vec betaOls = (mX.t() * mX).i() * (mX.t() * mY);
+    vec residualOls = mY - mX * betaOls;
+    double RSSo = sum(residualOls % residualOls);
+    double DFo = nDp - nVar;
+    double delta1 = nDp - 2.0 * v1 + v2;
+    double sigma21 = RSSg / delta1;
+    double lDelta1 = sum(mQDiag), lDelta2 = (this->*mCalcTrQtQFunction)();
+    //=========
+    // F1 Test
+    //=========
+    mF1Test.s = (RSSg/lDelta1)/(RSSo/DFo);
+    mF1Test.df1 = lDelta1 * lDelta1 / lDelta2;
+    mF1Test.df2 = DFo;
+    mF1Test.p = gsl_cdf_fdist_P(mF1Test.s, mF1Test.df1, mF1Test.df2);
+    //=========
+    // F2 Test
+    //=========
+    mF2Test.s = ((RSSo-RSSg)/(DFo-lDelta1))/(RSSo/DFo);
+    mF2Test.df1 = (DFo-lDelta1) * (DFo-lDelta1) / (DFo - 2 * lDelta1 + lDelta2);
+    mF2Test.df2 = DFo;
+    mF2Test.p = gsl_cdf_fdist_Q(mF2Test.s, mF2Test.df1, mF2Test.df2);
+    //=========
+    // F3 Test
+    //=========
+    mF3Test.resize(mX.n_cols);
+    vec vk2(nVar, arma::fill::zeros);
+    for (int i = 0; i < nVar; i++)
+    {
+        vec betasi = mBetas.col(i);
+        vec betasJndp = vec(nDp, fill::ones) * (sum(betasi) * 1.0 / nDp);
+        vk2(i) = (1.0 / nDp) * det(trans(betasi - betasJndp) * betasi);
+    }
+
+    for (uword i = 0; i < mX.n_cols; i++)
+    {
+        vec diagB = (this->*mCalcDiagBFunction)(i);
+        double g1 = diagB(0), g2 = diagB(1);
+        double numdf = g1 * g1 / g2;
+        FTestResult f3i;
+        f3i.s = (vk2(i) / g1) / sigma21;
+        f3i.df1 = numdf;
+        f3i.df2 = mF1Test.df1;
+        f3i.p = gsl_cdf_fdist_Q(f3i.s, numdf, mF1Test.df1);
+        mF3Test[i] = f3i;
+    }
+    //=========
+    // F4 Test
+    //=========
+    mF4Test.s = RSSg / RSSo;
+    mF4Test.df1 = delta1;
+    mF4Test.df2 = DFo;
+    mF4Test.p = gsl_cdf_fdist_P(mF4Test.s, mF4Test.df1, mF4Test.df2);
+}
+
+double GWRBasic::calcTrQtQBase()
+{
+    double trQtQ = 0.0;
+    uword nDp = mCoords.n_rows;
+    if (isStoreS())
+    {
+        mat EmS = eye(nDp, nDp) - mS;
+        mat Q = trans(EmS) * EmS;
+        trQtQ = sum(diagvec(trans(Q) * Q));
+    }
+    else
+    {
+        trQtQ = (this->*mCalcTrQtQCoreFunction)();
+    }
+    return trQtQ;
+}
+
+double GWRBasic::calcTrQtQCoreSerial()
+{
+    double trQtQ = 0.0;
+    uword nDp = mCoords.n_rows, nVar = mCoords.n_cols;
+    mat wspan(1, nVar, fill::ones);
+    std::pair<uword, uword> workRange = mWorkRange.value_or(make_pair(0, nDp));
+    for (uword i = workRange.first; i < workRange.second; i++)
+    {
+        vec wi = mSpatialWeight.weightVector(i);
+        mat xtwi = trans(mX % (wi * wspan));
+        try
+        {
+            mat xtwxR = inv_sympd(xtwi * mX);
+            mat ci = xtwxR * xtwi;
+            mat si = mX.row(i) * inv(xtwi * mX) * xtwi;
+            vec pi = -trans(si);
+            pi(i) += 1.0;
+            double qi = sum(pi % pi);
+            trQtQ += qi * qi;
+            for (arma::uword j = i + 1; j < nDp; j++)
+            {
+                vec wj = mSpatialWeight.weightVector(j);
+                mat xtwj = trans(mX % (wj * wspan));
+                try {
+                    mat sj = mX.row(j) * inv_sympd(xtwj * mX) * xtwj;
+                    vec pj = -trans(sj);
+                    pj(j) += 1.0;
+                    double qj = sum(pi % pj);
+                    trQtQ += qj * qj * 2.0;
+                }
+                catch (const std::exception& e)
+                {
+                    return DBL_MAX;
+                }
+            }
+        }
+        catch(const std::exception& e)
+        {
+            return DBL_MAX;
+        }
+        
+    }
+    return trQtQ;
+}
+
+arma::vec GWRBasic::calcDiagBBase(arma::uword i)
+{
+    arma::uword nDp = mX.n_rows;
+    vec c(nDp, fill::zeros);
+    for (arma::uword j = 0; j < nDp; j++)
+    {
+        vec w = mSpatialWeight.weightVector(j);
+        mat xtw = trans(mX.each_col() % w);
+        try
+        {
+            mat C = trans(xtw) * inv_sympd(xtw * mX);
+            c += C.col(i);
+        }
+        catch (const std::exception& e)
+        {
+            return { DBL_MAX, DBL_MAX };
+        }
+    }
+    vec diagB = (this->*mCalcDiagBCoreFunction)(i, c);
+    diagB = 1.0 / nDp * diagB;
+    return { sum(diagB), sum(diagB % diagB) };
+}
+
+vec GWRBasic::calcDiagBCoreSerial(uword i, const vec& c)
+{
+    arma::uword nDp = mX.n_rows;
+    vec diagB(nDp, fill::zeros);
+    std::pair<uword, uword> workRange = mWorkRange.value_or(make_pair(0, nDp));
+    for (arma::uword k = workRange.first; k < workRange.second; k++)
+    {
+        vec w = mSpatialWeight.weightVector(k);
+        mat xtw = (mX.each_col() % w).t();
+        try
+        {
+            mat C = xtw.t() * inv_sympd(xtw * mX);
+            vec b = C.col(i);
+            diagB += (b % b - (1.0 / nDp) * (b % c));
+        }
+        catch (const std::exception& e)
+        {
+            diagB.fill(DBL_MAX);
+            return diagB;
+        }
+    }
+    return diagB;
+}
+
 #ifdef ENABLE_OPENMP
 mat GWRBasic::predictOmp(const mat& locations, const mat& x, const vec& y)
 {
@@ -582,6 +757,87 @@ arma::mat GWRBasic::fitCoreSHatOmp(const arma::mat& x, const arma::vec& y, const
     shat = sum(shat_all, 1);
     return betas.t();
 }
+
+double GWRBasic::calcTrQtQCoreOmp()
+{
+    double trQtQ = 0.0;
+    uword nDp = mCoords.n_rows, nVar = mCoords.n_cols;
+    mat wspan(1, nVar, fill::ones);
+    std::pair<uword, uword> workRange = mWorkRange.value_or(make_pair(0, nDp));
+    int flag = true;
+#pragma omp parallel for num_threads(mOmpThreadNum)
+    for (uword i = workRange.first; i < workRange.second; i++)
+    {
+        if (flag) {
+            vec wi = mSpatialWeight.weightVector(i);
+            mat xtwi = trans(mX % (wi * wspan));
+            try
+            {
+                mat xtwxR = inv_sympd(xtwi * mX);
+                mat ci = xtwxR * xtwi;
+                mat si = mX.row(i) * inv(xtwi * mX) * xtwi;
+                vec pi = -trans(si);
+                pi(i) += 1.0;
+                double qi = sum(pi % pi);
+                trQtQ += qi * qi;
+                for (arma::uword j = i + 1; j < nDp; j++)
+                {
+                    vec wj = mSpatialWeight.weightVector(j);
+                    mat xtwj = trans(mX % (wj * wspan));
+                    try {
+                        mat sj = mX.row(j) * inv_sympd(xtwj * mX) * xtwj;
+                        vec pj = -trans(sj);
+                        pj(j) += 1.0;
+                        double qj = sum(pi % pj);
+                        trQtQ += qj * qj * 2.0;
+                    }
+                    catch (const std::exception& e)
+                    {
+                        flag = false;
+                    }
+                }
+            }
+            catch(const std::exception& e)
+            {
+                flag = false;
+            }
+        }
+    }
+    return flag ? trQtQ : DBL_MAX;
+}
+
+vec GWRBasic::calcDiagBCoreOmp(uword i, const vec& c)
+{
+    arma::uword nDp = mX.n_rows;
+    mat diagB_all(nDp, mOmpThreadNum, fill::zeros);
+    std::pair<uword, uword> workRange = mWorkRange.value_or(make_pair(0, nDp));
+    int flag = true;
+#pragma omp parallel for num_threads(mOmpThreadNum)
+    for (arma::uword k = workRange.first; k < workRange.second; k++)
+    {
+        if (flag) {
+            int thread = omp_get_thread_num();
+            vec w = mSpatialWeight.weightVector(k);
+            mat xtw = trans(mX.each_col() % w);
+            try
+            {
+                mat C = trans(xtw) * inv_sympd(xtw * mX);
+                vec b = C.col(i);
+                diagB_all.col(thread) += (b % b - (1.0 / nDp) * (b % c));
+            }
+            catch (const std::exception& e)
+            {
+                flag = false;
+            }
+        }
+    }
+    if (!flag)
+    {
+        diagB_all.fill(DBL_MAX);
+    }
+    vec diagB = sum(diagB_all, 1);
+    return diagB;
+}
 #endif
 
 #ifdef ENABLE_CUDA
@@ -1020,26 +1276,51 @@ arma::mat gwm::GWRBasic::fitMpi()
     return mBetas;
 }
 
-// double GWRBasic::indepVarsSelectionCriterion(const mat& x, const vec& y, vec& shat)
-// {
-//     try
-//     {
-//         mat S;
-//         mat betas = (this->*mFitCoreSHatFunction)(x, y, mSpatialWeight, shat, S);
-//         GWM_LOG_PROGRESS(++mIndepVarSelectionProgressCurrent, mIndepVarSelectionProgressTotal);
-//         if (mStatus == Status::Success)
-//         {
-//             double rss = GWRBase::RSS(x, y, betas);
-//             return rss;
-//         }
-//         else return DBL_MAX;
-//     }
-//     catch(const std::exception& e)
-//     {
-//         return DBL_MAX;
-//     }
-    
-// }
+double GWRBasic::calcTrQtQMpi()
+{
+    double trQtQ = 0.0;
+    uword nDp = mCoords.n_rows;
+    if (isStoreS())
+    {
+        std::pair<uword, uword> workRange = mWorkRange.value_or(make_pair(0, nDp));
+        mat EmS = eye(nDp, nDp).eval().rows(workRange.first, workRange.second - 1) - mS;
+        mat Q, QtQ;
+        mat_quad_mpi(EmS, Q, mWorkerId, mWorkerNum);
+        mat_quad_mpi(Q, QtQ, mWorkerId, mWorkerNum);
+        trQtQ = mat_trace_mpi(QtQ, mWorkerId, mWorkerNum);
+    }
+    else
+    {
+        double trQtQi = (this->*mCalcTrQtQCoreFunction)();
+        MPI_Allreduce(&trQtQi, &trQtQ, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+    }
+    return trQtQ;
+}
+
+vec GWRBasic::calcDiagBMpi(uword i)
+{
+    arma::uword nDp = mX.n_rows;
+    vec c(nDp, fill::zeros);
+    for (arma::uword j = 0; j < nDp; j++)
+    {
+        vec w = mSpatialWeight.weightVector(j);
+        mat xtw = trans(mX.each_col() % w);
+        try
+        {
+            mat C = trans(xtw) * inv_sympd(xtw * mX);
+            c += C.col(i);
+        }
+        catch (const std::exception& e)
+        {
+            return { DBL_MAX, DBL_MAX };
+        }
+    }
+    vec diagBi = (this->*mCalcDiagBCoreFunction)(i, c);
+    vec diagB(nDp, arma::fill::zeros);
+    MPI_Allreduce(diagBi.memptr(), diagB.memptr(), int(nDp), MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+    diagB = 1.0 / double(nDp) * diagB;
+    return { sum(diagB), sum(diagB % diagB) };
+}
 #endif // ENABLE_MPI
 
 void GWRBasic::setBandwidthSelectionCriterion(const BandwidthSelectionCriterionType& criterion)
@@ -1087,6 +1368,8 @@ void GWRBasic::setParallelType(const ParallelType& type)
             mFitCoreFunction = &GWRBasic::fitCoreSerial;
             mFitCoreCVFunction = &GWRBasic::fitCoreCVSerial;
             mFitCoreSHatFunction = &GWRBasic::fitCoreSHatSerial;
+            mCalcTrQtQCoreFunction = &GWRBasic::calcTrQtQCoreSerial;
+            mCalcDiagBCoreFunction = &GWRBasic::calcDiagBCoreSerial;
             break;
 #ifdef ENABLE_OPENMP
         case ParallelType::OpenMP:
@@ -1095,6 +1378,8 @@ void GWRBasic::setParallelType(const ParallelType& type)
             mFitCoreFunction = &GWRBasic::fitCoreOmp;
             mFitCoreCVFunction = &GWRBasic::fitCoreCVOmp;
             mFitCoreSHatFunction = &GWRBasic::fitCoreSHatOmp;
+            mCalcTrQtQCoreFunction = &GWRBasic::calcTrQtQCoreOmp;
+            mCalcDiagBCoreFunction = &GWRBasic::calcDiagBCoreOmp;
             break;
 #endif // ENABLE_OPENMP
 #ifdef ENABLE_CUDA
@@ -1104,6 +1389,8 @@ void GWRBasic::setParallelType(const ParallelType& type)
             mFitCoreFunction = &GWRBasic::fitCoreCuda;
             mFitCoreCVFunction = &GWRBasic::fitCoreCVCuda;
             mFitCoreSHatFunction = &GWRBasic::fitCoreSHatCuda;
+            mCalcTrQtQCoreFunction = &GWRBasic::calcTrQtQCoreSerial;
+            mCalcDiagBCoreFunction = &GWRBasic::calcDiagBCoreSerial;
             break;
 #endif // ENABLE_CUDA
         default:
@@ -1111,6 +1398,8 @@ void GWRBasic::setParallelType(const ParallelType& type)
             mFitCoreFunction = &GWRBasic::fitCoreSerial;
             mFitCoreCVFunction = &GWRBasic::fitCoreCVSerial;
             mFitCoreSHatFunction = &GWRBasic::fitCoreSHatSerial;
+            mCalcTrQtQCoreFunction = &GWRBasic::calcTrQtQCoreSerial;
+            mCalcDiagBCoreFunction = &GWRBasic::calcDiagBCoreSerial;
             break;
         }
 #ifdef ENABLE_MPI
@@ -1118,11 +1407,17 @@ void GWRBasic::setParallelType(const ParallelType& type)
         {
             mFitFunction = &GWRBasic::fitMpi;
             mIndepVarsSelectionCriterionFunction = &GWRBasic::indepVarsSelectionCriterionMpi;
+            mFTestFunction = &GWRBasic::fTestBase;
+            mCalcTrQtQFunction = &GWRBasic::calcTrQtQMpi;
+            mCalcDiagBFunction = &GWRBasic::calcDiagBMpi;
         }
         else
         {
             mFitFunction = &GWRBasic::fitBase;
             mIndepVarsSelectionCriterionFunction = &GWRBasic::indepVarsSelectionCriterion;
+            mFTestFunction = &GWRBasic::fTestBase;
+            mCalcTrQtQFunction = &GWRBasic::calcTrQtQBase;
+            mCalcDiagBFunction = &GWRBasic::calcDiagBBase;
         }
 #endif
         setBandwidthSelectionCriterion(mBandwidthSelectionCriterion);
diff --git a/src/gwmodelpp/GWRMultiscale.cpp b/src/gwmodelpp/GWRMultiscale.cpp
index 3e03f4a..41a2a55 100644
--- a/src/gwmodelpp/GWRMultiscale.cpp
+++ b/src/gwmodelpp/GWRMultiscale.cpp
@@ -954,7 +954,7 @@ vec GWRMultiscale::fitVarMpi(const size_t var)
     if (mHasHatMatrix)
     {
         mat SArrayi = mSArray.slice(var) - mS0;
-        mat_mul_mpi(S, SArrayi, mSArray.slice(var), mWorkerId, mWorkerNum, mWorkRangeSize);
+        mat_mul_mpi(S, SArrayi, mSArray.slice(var), mWorkerId, mWorkerNum);
         mSArray.slice(var) += S;
         mS0 = mSArray.slice(var) - SArrayi;
     }
diff --git a/src/gwmodelpp/utils/armampi.cpp b/src/gwmodelpp/utils/armampi.cpp
index 024db9b..69c3f1b 100644
--- a/src/gwmodelpp/utils/armampi.cpp
+++ b/src/gwmodelpp/utils/armampi.cpp
@@ -5,7 +5,7 @@
 using namespace std;
 using namespace arma;
 
-void mat_mul_mpi(mat& a, mat& b, mat& c, const int ip, const int np, const size_t range)
+void mat_mul_mpi(mat& a, mat& b, mat& c, const int ip, const int np)
 {
     arma::uword m = a.n_rows, n = b.n_cols;
     int k = (int) b.n_rows;
@@ -23,3 +23,33 @@ void mat_mul_mpi(mat& a, mat& b, mat& c, const int ip, const int np, const size_
         MPI_Reduce(ci.memptr(), c.memptr(), ci.n_elem, MPI_DOUBLE, MPI_SUM, pi, MPI_COMM_WORLD);
     }
 }
+
+void mat_quad_mpi(mat& a, mat& aTa, const int ip, const int np)
+{
+    arma::uword m = a.n_cols, n = a.n_rows;
+    int k = (int) a.n_rows;
+    arma::Col<int> a_rows(np, arma::fill::zeros);
+    MPI_Allgather(&k, 1, MPI_INT, a_rows.memptr(), 1, MPI_INT, MPI_COMM_WORLD);
+    aTa = mat(n, m, arma::fill::zeros);
+    mat a_buf;
+    for (int pi = 0; pi < np; pi++)
+    {
+        arma::Col<int> a_disp = arma::cumsum(a_rows) - a_rows;
+        a_buf = a.cols(a_disp(pi), a_disp(pi) + a_rows(pi) - 1);
+        mat aTai = a_buf.t() * a;
+        MPI_Reduce(aTai.memptr(), aTa.memptr(), aTai.n_elem, MPI_DOUBLE, MPI_SUM, pi, MPI_COMM_WORLD);
+    }
+}
+
+double mat_trace_mpi(arma::mat& a, const int ip, const int np)
+{
+    int k = (int) a.n_rows;
+    arma::Col<int> a_rows(np, arma::fill::zeros);
+    MPI_Allgather(&k, 1, MPI_INT, a_rows.memptr(), 1, MPI_INT, MPI_COMM_WORLD);
+    arma::Col<int> a_disp = arma::cumsum(a_rows) - a_rows;
+    mat ai = a.cols(a_disp(ip), a_disp(ip) + a_rows(ip) - 1);
+    double ti = trace(ai);
+    double tr = 0.0;
+    MPI_Allreduce(&ti, &tr, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+    return tr;
+}
diff --git a/test/mpi/testGWRBasicMpi.cpp b/test/mpi/testGWRBasicMpi.cpp
index 2c44061..d435540 100644
--- a/test/mpi/testGWRBasicMpi.cpp
+++ b/test/mpi/testGWRBasicMpi.cpp
@@ -21,6 +21,11 @@ using namespace std;
 using namespace arma;
 using namespace gwm;
 
+array<double, 4> convFTestArray(GWRBasic::FTestResult f)
+{
+    return { f.s, f.df1, f.df2, f.p };
+}
+
 TEST_CASE("BasicGWR: LondonHP")
 {
     int iProcess, nProcess;
@@ -303,6 +308,79 @@ TEST_CASE("BasicGWR: LondonHP")
         }
     }
 
+    SECTION("F test | adaptive bandwidth | no bandwidth optimization | no variable optimization") {
+        auto parallel = GENERATE_REF(values(parallel_list));
+        INFO("Parallel:" << ParallelTypeDict.at(parallel));
+
+        CRSDistance distance(false);
+        BandwidthWeight bandwidth(36, true, BandwidthWeight::Gaussian);
+        SpatialWeight spatial(&bandwidth, &distance);
+
+        GWRBasic algorithm;
+        algorithm.setCoords(londonhp100_coord);
+        algorithm.setDependentVariable(y);
+        algorithm.setIndependentVariables(x);
+        algorithm.setSpatialWeight(spatial);
+        algorithm.setParallelType(parallel);
+        algorithm.setWorkerId(iProcess);
+        algorithm.setWorkerNum(nProcess);
+#ifdef ENABLE_OPENMP
+        if (parallel == ParallelType::OpenMP)
+        {
+            algorithm.setOmpThreadNum(omp_get_num_threads());
+        }
+#endif // ENABLE_OPENMP
+#ifdef ENABLE_CUDA
+        if (parallel == ParallelType::CUDA)
+        {
+            algorithm.setGPUId(0);
+            algorithm.setGroupSize(64);
+        }
+#endif // ENABLE_CUDA
+        algorithm.setIsDoFtest(true);
+        REQUIRE_NOTHROW(algorithm.fit());
+        if (iProcess == 0)
+        {
+            auto results = algorithm.fTestResults();
+            auto f1 = convFTestArray(get<0>(results));
+            auto f2 = convFTestArray(get<1>(results));
+            vector<array<double, 4>> f3;
+            for (auto &&i : get<2>(results))
+            {
+                f3.push_back(convFTestArray(i));
+            }
+            auto f4 = convFTestArray(get<3>(results));
+            REQUIRE_THAT(f1[0], Catch::Matchers::WithinAbs(0.9342, 1e-3));
+            REQUIRE_THAT(f1[1], Catch::Matchers::WithinAbs(93.3300, 1e-3));
+            REQUIRE_THAT(f1[2], Catch::Matchers::WithinAbs(96.0000, 1e-3));
+            REQUIRE_THAT(f1[3], Catch::Matchers::WithinAbs(0.3710, 1e-3));
+            REQUIRE_THAT(f2[0], Catch::Matchers::WithinAbs(1.9762, 1e-3));
+            REQUIRE_THAT(f2[1], Catch::Matchers::WithinAbs(13.1571, 1e-3));
+            REQUIRE_THAT(f2[2], Catch::Matchers::WithinAbs(96.0000, 1e-3));
+            REQUIRE_THAT(f2[3], Catch::Matchers::WithinAbs(0.0303, 1e-3));
+            REQUIRE_THAT(f4[0], Catch::Matchers::WithinAbs(0.8752, 1e-3));
+            REQUIRE_THAT(f4[1], Catch::Matchers::WithinAbs(89.9377, 1e-3));
+            REQUIRE_THAT(f4[2], Catch::Matchers::WithinAbs(96.0000, 1e-3));
+            REQUIRE_THAT(f4[3], Catch::Matchers::WithinAbs(0.2619, 1e-3));
+            REQUIRE_THAT(f3[0][0], Catch::Matchers::WithinAbs(0.4655, 1e-3));
+            REQUIRE_THAT(f3[0][1], Catch::Matchers::WithinAbs(27.1298, 1e-3));
+            REQUIRE_THAT(f3[0][2], Catch::Matchers::WithinAbs(93.3300, 1e-3));
+            REQUIRE_THAT(f3[0][3], Catch::Matchers::WithinAbs(0.9872, 1e-3));
+            REQUIRE_THAT(f3[1][0], Catch::Matchers::WithinAbs(0.5022, 1e-3));
+            REQUIRE_THAT(f3[1][1], Catch::Matchers::WithinAbs(18.3315, 1e-3));
+            REQUIRE_THAT(f3[1][2], Catch::Matchers::WithinAbs(93.3300, 1e-3));
+            REQUIRE_THAT(f3[1][3], Catch::Matchers::WithinAbs(0.9526, 1e-3));
+            REQUIRE_THAT(f3[2][0], Catch::Matchers::WithinAbs(0.6415, 1e-3));
+            REQUIRE_THAT(f3[2][1], Catch::Matchers::WithinAbs(29.6827, 1e-3));
+            REQUIRE_THAT(f3[2][2], Catch::Matchers::WithinAbs(93.3300, 1e-3));
+            REQUIRE_THAT(f3[2][3], Catch::Matchers::WithinAbs(0.9151, 1e-3));
+            REQUIRE_THAT(f3[3][0], Catch::Matchers::WithinAbs(0.3019, 1e-3));
+            REQUIRE_THAT(f3[3][1], Catch::Matchers::WithinAbs(24.7164, 1e-3));
+            REQUIRE_THAT(f3[3][2], Catch::Matchers::WithinAbs(93.3300, 1e-3));
+            REQUIRE_THAT(f3[3][3], Catch::Matchers::WithinAbs(0.9994, 1e-3));
+        }
+    }
+
 }
 
 int main(int argc, char *argv[])
diff --git a/test/mpi/testMpiMatMul.cpp b/test/mpi/testMpiMatMul.cpp
index dc70b5c..2bb7299 100644
--- a/test/mpi/testMpiMatMul.cpp
+++ b/test/mpi/testMpiMatMul.cpp
@@ -32,7 +32,7 @@ TEST_CASE("mat mul mpi")
     A = A_all.rows(from, to - 1);
     B = B_all.rows(from, to - 1);
     // printf("process %d begin mat mul\n", ip);
-    REQUIRE_NOTHROW(mat_mul_mpi(A, B, C, ip, np, size));
+    REQUIRE_NOTHROW(mat_mul_mpi(A, B, C, ip, np));
     // printf("process %d end mat mul\n", ip);
 
     if (ip == 0)
@@ -59,6 +59,91 @@ TEST_CASE("mat mul mpi")
     }
 }
 
+TEST_CASE("mat quad mpi")
+{
+    int ip, np;
+    MPI_Comm_size(MPI_COMM_WORLD, &np);
+    MPI_Comm_rank(MPI_COMM_WORLD, &ip);
+
+    uword n = 100;
+    uword size = n / np + (n % np == 0 ? 0 : 1);
+    uword from = ip * size, to = min(from + size, n);
+
+    // printf("range from %llu to %llu\n", from, to);
+
+    mat A_all, C_all;
+    A_all = mat(n, n, arma::fill::randn);
+    if (ip == 0)
+    {
+        C_all = A_all.t() * A_all;
+    }
+
+    mat A, C;
+    A = A_all.rows(from, to - 1);
+    // printf("process %d begin mat mul\n", ip);
+    REQUIRE_NOTHROW(mat_quad_mpi(A, C, ip, np));
+    // printf("process %d end mat mul\n", ip);
+
+    if (ip == 0)
+    {
+        mat C_gather(n, n);
+        C_gather.rows(from, to - 1) = C;
+        uvec received(np, fill::zeros);
+        received(0) = 1;
+        auto bufsize = size * n;
+        double *buf = new double[size * n];
+        while (!all(received == 1))
+        {
+            MPI_Status status;
+            MPI_Recv(buf, bufsize, MPI_DOUBLE, MPI_ANY_SOURCE, 0, MPI_COMM_WORLD, &status);
+            received(status.MPI_SOURCE) = 1;
+            uword ifrom = status.MPI_SOURCE * size, ito = min(ifrom + size, n), irange = ito - ifrom; 
+            C_gather.rows(ifrom, ito - 1) = mat(buf, irange, n);
+        }
+        REQUIRE(approx_equal(C_gather, C_all, "absdiff", 1e-6));
+    }
+    else
+    {
+        MPI_Send(C.memptr(), C.n_elem, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);
+    }
+}
+
+TEST_CASE("mat trace mpi")
+{
+    int ip, np;
+    MPI_Comm_size(MPI_COMM_WORLD, &np);
+    MPI_Comm_rank(MPI_COMM_WORLD, &ip);
+
+    uword n = 100;
+    uword size = n / np + (n % np == 0 ? 0 : 1);
+    uword from = ip * size, to = min(from + size, n);
+
+    // printf("range from %llu to %llu\n", from, to);
+
+    mat A_all;
+    double trA_all;
+    A_all = mat(n, n, arma::fill::randn);
+    if (ip == 0)
+    {
+        trA_all = trace(A_all);
+    }
+
+    mat A;
+    double trA = 0.0;
+    A = A_all.rows(from, to - 1);
+    // printf("process %d begin mat mul\n", ip);
+    REQUIRE_NOTHROW(([&]()
+    {
+        trA = mat_trace_mpi(A, ip, np);
+    })());
+    // printf("process %d end mat mul\n", ip);
+
+    if (ip == 0)
+    {
+        REQUIRE_THAT(trA, Catch::Matchers::WithinAbs(trA_all, 1e-6));
+    }
+}
+
 int main(int argc, char *argv[])
 {
     MPI_Init(&argc, &argv);
diff --git a/test/testGWRBasic.cpp b/test/testGWRBasic.cpp
index b1e76f0..f490ab9 100644
--- a/test/testGWRBasic.cpp
+++ b/test/testGWRBasic.cpp
@@ -1,6 +1,7 @@
 #define CATCH_CONFIG_MAIN
 #include <catch2/catch_all.hpp>
 
+#include <array>
 #include <vector>
 #include <string>
 #include <armadillo>
@@ -20,6 +21,11 @@ using namespace std;
 using namespace arma;
 using namespace gwm;
 
+array<double, 4> convFTestArray(GWRBasic::FTestResult f)
+{
+    return { f.s, f.df1, f.df2, f.p };
+}
+
 TEST_CASE("BasicGWR: LondonHP")
 {
     mat londonhp100_coord, londonhp100_data;
@@ -259,6 +265,73 @@ TEST_CASE("BasicGWR: LondonHP")
         REQUIRE_THAT(diagnostic.RSquareAdjust, Catch::Matchers::WithinAbs(0.678982114793865, 1e-8));
     }
 
+    SECTION("F test | adaptive bandwidth | no bandwidth optimization | no variable optimization") {
+        auto parallel = GENERATE_REF(values(parallel_list));
+        INFO("Parallel:" << ParallelTypeDict.at(parallel));
+
+        CRSDistance distance(false);
+        BandwidthWeight bandwidth(36, true, BandwidthWeight::Gaussian);
+        SpatialWeight spatial(&bandwidth, &distance);
+
+        GWRBasic algorithm;
+        algorithm.setCoords(londonhp100_coord);
+        algorithm.setDependentVariable(y);
+        algorithm.setIndependentVariables(x);
+        algorithm.setSpatialWeight(spatial);
+        algorithm.setParallelType(parallel);
+#ifdef ENABLE_OPENMP
+        if (parallel == ParallelType::OpenMP)
+        {
+            algorithm.setOmpThreadNum(omp_get_num_threads());
+        }
+#endif // ENABLE_OPENMP
+#ifdef ENABLE_CUDA
+        if (parallel == ParallelType::CUDA)
+        {
+            algorithm.setGPUId(0);
+            algorithm.setGroupSize(64);
+        }
+#endif // ENABLE_CUDA
+        algorithm.setIsDoFtest(true);
+        REQUIRE_NOTHROW(algorithm.fit());
+        auto results = algorithm.fTestResults();
+        auto f1 = convFTestArray(get<0>(results));
+        auto f2 = convFTestArray(get<1>(results));
+        vector<array<double, 4>> f3;
+        for (auto &&i : get<2>(results))
+        {
+            f3.push_back(convFTestArray(i));
+        }
+        auto f4 = convFTestArray(get<3>(results));
+        REQUIRE_THAT(f1[0], Catch::Matchers::WithinAbs(0.9342, 1e-3));
+        REQUIRE_THAT(f1[1], Catch::Matchers::WithinAbs(93.3300, 1e-3));
+        REQUIRE_THAT(f1[2], Catch::Matchers::WithinAbs(96.0000, 1e-3));
+        REQUIRE_THAT(f1[3], Catch::Matchers::WithinAbs(0.3710, 1e-3));
+        REQUIRE_THAT(f2[0], Catch::Matchers::WithinAbs(1.9762, 1e-3));
+        REQUIRE_THAT(f2[1], Catch::Matchers::WithinAbs(13.1571, 1e-3));
+        REQUIRE_THAT(f2[2], Catch::Matchers::WithinAbs(96.0000, 1e-3));
+        REQUIRE_THAT(f2[3], Catch::Matchers::WithinAbs(0.0303, 1e-3));
+        REQUIRE_THAT(f4[0], Catch::Matchers::WithinAbs(0.8752, 1e-3));
+        REQUIRE_THAT(f4[1], Catch::Matchers::WithinAbs(89.9377, 1e-3));
+        REQUIRE_THAT(f4[2], Catch::Matchers::WithinAbs(96.0000, 1e-3));
+        REQUIRE_THAT(f4[3], Catch::Matchers::WithinAbs(0.2619, 1e-3));
+        REQUIRE_THAT(f3[0][0], Catch::Matchers::WithinAbs(0.4655, 1e-3));
+        REQUIRE_THAT(f3[0][1], Catch::Matchers::WithinAbs(27.1298, 1e-3));
+        REQUIRE_THAT(f3[0][2], Catch::Matchers::WithinAbs(93.3300, 1e-3));
+        REQUIRE_THAT(f3[0][3], Catch::Matchers::WithinAbs(0.9872, 1e-3));
+        REQUIRE_THAT(f3[1][0], Catch::Matchers::WithinAbs(0.5022, 1e-3));
+        REQUIRE_THAT(f3[1][1], Catch::Matchers::WithinAbs(18.3315, 1e-3));
+        REQUIRE_THAT(f3[1][2], Catch::Matchers::WithinAbs(93.3300, 1e-3));
+        REQUIRE_THAT(f3[1][3], Catch::Matchers::WithinAbs(0.9526, 1e-3));
+        REQUIRE_THAT(f3[2][0], Catch::Matchers::WithinAbs(0.6415, 1e-3));
+        REQUIRE_THAT(f3[2][1], Catch::Matchers::WithinAbs(29.6827, 1e-3));
+        REQUIRE_THAT(f3[2][2], Catch::Matchers::WithinAbs(93.3300, 1e-3));
+        REQUIRE_THAT(f3[2][3], Catch::Matchers::WithinAbs(0.9151, 1e-3));
+        REQUIRE_THAT(f3[3][0], Catch::Matchers::WithinAbs(0.3019, 1e-3));
+        REQUIRE_THAT(f3[3][1], Catch::Matchers::WithinAbs(24.7164, 1e-3));
+        REQUIRE_THAT(f3[3][2], Catch::Matchers::WithinAbs(93.3300, 1e-3));
+        REQUIRE_THAT(f3[3][3], Catch::Matchers::WithinAbs(0.9994, 1e-3));
+    }
 }